Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/Kconfig  168
-rw-r--r--  fs/xfs/Makefile  175
-rw-r--r--  fs/xfs/kmem.c  30
-rw-r--r--  fs/xfs/kmem.h  83
-rw-r--r--  fs/xfs/libxfs/xfs_ag.c  1042
-rw-r--r--  fs/xfs/libxfs/xfs_ag.h  242
-rw-r--r--  fs/xfs/libxfs/xfs_ag_resv.c  426
-rw-r--r--  fs/xfs/libxfs/xfs_ag_resv.h  55
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c  3568
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h  250
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.c  641
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.h  68
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c  1600
-rw-r--r--  fs/xfs/libxfs/xfs_attr.h  621
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c  3002
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.h  111
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.c  712
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.h  20
-rw-r--r--  fs/xfs/libxfs/xfs_attr_sf.h  51
-rw-r--r--  fs/xfs/libxfs/xfs_bit.c  106
-rw-r--r--  fs/xfs/libxfs/xfs_bit.h  75
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c  6230
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h  270
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c  703
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.h  118
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c  5099
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h  606
-rw-r--r--  fs/xfs/libxfs/xfs_btree_staging.c  880
-rw-r--r--  fs/xfs/libxfs/xfs_btree_staging.h  123
-rw-r--r--  fs/xfs/libxfs/xfs_cksum.h  82
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c  2698
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.h  239
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h  805
-rw-r--r--  fs/xfs/libxfs/xfs_defer.c  930
-rw-r--r--  fs/xfs/libxfs/xfs_defer.h  131
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c  769
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.h  251
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_block.c  1275
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_data.c  1223
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_leaf.c  1824
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_node.c  2337
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_priv.h  209
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_sf.c  1293
-rw-r--r--  fs/xfs/libxfs/xfs_dquot_buf.c  325
-rw-r--r--  fs/xfs/libxfs/xfs_errortag.h  113
-rw-r--r--  fs/xfs/libxfs/xfs_format.h  1829
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h  851
-rw-r--r--  fs/xfs/libxfs/xfs_health.h  190
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c  2969
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.h  112
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c  833
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.h  83
-rw-r--r--  fs/xfs/libxfs/xfs_iext_tree.c  1050
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c  773
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.h  54
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.c  779
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.h  268
-rw-r--r--  fs/xfs/libxfs/xfs_log_format.h  993
-rw-r--r--  fs/xfs/libxfs/xfs_log_recover.h  134
-rw-r--r--  fs/xfs/libxfs/xfs_log_rlimit.c  201
-rw-r--r--  fs/xfs/libxfs/xfs_quota_defs.h  146
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.c  1915
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.h  126
-rw-r--r--  fs/xfs/libxfs/xfs_refcount_btree.c  545
-rw-r--r--  fs/xfs/libxfs/xfs_refcount_btree.h  73
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.c  2826
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.h  220
-rw-r--r--  fs/xfs/libxfs/xfs_rmap_btree.c  696
-rw-r--r--  fs/xfs/libxfs/xfs_rmap_btree.h  67
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.c  1098
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c  1317
-rw-r--r--  fs/xfs/libxfs/xfs_sb.h  41
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h  193
-rw-r--r--  fs/xfs/libxfs/xfs_symlink_remote.c  233
-rw-r--r--  fs/xfs/libxfs/xfs_trans_inode.c  225
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.c  1028
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.h  105
-rw-r--r--  fs/xfs/libxfs/xfs_trans_space.h  106
-rw-r--r--  fs/xfs/libxfs/xfs_types.c  230
-rw-r--r--  fs/xfs/libxfs/xfs_types.h  230
-rw-r--r--  fs/xfs/mrlock.h  78
-rw-r--r--  fs/xfs/scrub/agheader.c  930
-rw-r--r--  fs/xfs/scrub/agheader_repair.c  947
-rw-r--r--  fs/xfs/scrub/alloc.c  155
-rw-r--r--  fs/xfs/scrub/attr.c  527
-rw-r--r--  fs/xfs/scrub/attr.h  68
-rw-r--r--  fs/xfs/scrub/bitmap.c  314
-rw-r--r--  fs/xfs/scrub/bitmap.h  37
-rw-r--r--  fs/xfs/scrub/bmap.c  741
-rw-r--r--  fs/xfs/scrub/btree.c  748
-rw-r--r--  fs/xfs/scrub/btree.h  62
-rw-r--r--  fs/xfs/scrub/common.c  867
-rw-r--r--  fs/xfs/scrub/common.h  152
-rw-r--r--  fs/xfs/scrub/dabtree.c  596
-rw-r--r--  fs/xfs/scrub/dabtree.h  43
-rw-r--r--  fs/xfs/scrub/dir.c  876
-rw-r--r--  fs/xfs/scrub/fscounters.c  381
-rw-r--r--  fs/xfs/scrub/health.c  233
-rw-r--r--  fs/xfs/scrub/health.h  14
-rw-r--r--  fs/xfs/scrub/ialloc.c  659
-rw-r--r--  fs/xfs/scrub/inode.c  628
-rw-r--r--  fs/xfs/scrub/parent.c  334
-rw-r--r--  fs/xfs/scrub/quota.c  248
-rw-r--r--  fs/xfs/scrub/refcount.c  473
-rw-r--r--  fs/xfs/scrub/repair.c  963
-rw-r--r--  fs/xfs/scrub/repair.h  94
-rw-r--r--  fs/xfs/scrub/rmap.c  235
-rw-r--r--  fs/xfs/scrub/rtbitmap.c  193
-rw-r--r--  fs/xfs/scrub/scrub.c  566
-rw-r--r--  fs/xfs/scrub/scrub.h  171
-rw-r--r--  fs/xfs/scrub/symlink.c  69
-rw-r--r--  fs/xfs/scrub/trace.c  40
-rw-r--r--  fs/xfs/scrub/trace.h  920
-rw-r--r--  fs/xfs/scrub/xfs_scrub.h  15
-rw-r--r--  fs/xfs/xfs.h  24
-rw-r--r--  fs/xfs/xfs_acl.c  292
-rw-r--r--  fs/xfs/xfs_acl.h  31
-rw-r--r--  fs/xfs/xfs_aops.c  584
-rw-r--r--  fs/xfs/xfs_aops.h  14
-rw-r--r--  fs/xfs/xfs_attr_inactive.c  392
-rw-r--r--  fs/xfs/xfs_attr_item.c  881
-rw-r--r--  fs/xfs/xfs_attr_item.h  54
-rw-r--r--  fs/xfs/xfs_attr_list.c  539
-rw-r--r--  fs/xfs/xfs_bio_io.c  59
-rw-r--r--  fs/xfs/xfs_bmap_item.c  712
-rw-r--r--  fs/xfs/xfs_bmap_item.h  71
-rw-r--r--  fs/xfs/xfs_bmap_util.c  1856
-rw-r--r--  fs/xfs/xfs_bmap_util.h  82
-rw-r--r--  fs/xfs/xfs_buf.c  2366
-rw-r--r--  fs/xfs/xfs_buf.h  367
-rw-r--r--  fs/xfs/xfs_buf_item.c  1046
-rw-r--r--  fs/xfs/xfs_buf_item.h  74
-rw-r--r--  fs/xfs/xfs_buf_item_recover.c  1071
-rw-r--r--  fs/xfs/xfs_dir2_readdir.c  550
-rw-r--r--  fs/xfs/xfs_discard.c  218
-rw-r--r--  fs/xfs/xfs_discard.h  10
-rw-r--r--  fs/xfs/xfs_dquot.c  1395
-rw-r--r--  fs/xfs/xfs_dquot.h  245
-rw-r--r--  fs/xfs/xfs_dquot_item.c  220
-rw-r--r--  fs/xfs/xfs_dquot_item.h  21
-rw-r--r--  fs/xfs/xfs_dquot_item_recover.c  201
-rw-r--r--  fs/xfs/xfs_error.c  483
-rw-r--r--  fs/xfs/xfs_error.h  90
-rw-r--r--  fs/xfs/xfs_export.c  237
-rw-r--r--  fs/xfs/xfs_export.h  60
-rw-r--r--  fs/xfs/xfs_extent_busy.c  630
-rw-r--r--  fs/xfs/xfs_extent_busy.h  70
-rw-r--r--  fs/xfs/xfs_extfree_item.c  782
-rw-r--r--  fs/xfs/xfs_extfree_item.h  91
-rw-r--r--  fs/xfs/xfs_file.c  1468
-rw-r--r--  fs/xfs/xfs_filestream.c  393
-rw-r--r--  fs/xfs/xfs_filestream.h  28
-rw-r--r--  fs/xfs/xfs_fsmap.c  974
-rw-r--r--  fs/xfs/xfs_fsmap.h  35
-rw-r--r--  fs/xfs/xfs_fsops.c  604
-rw-r--r--  fs/xfs/xfs_fsops.h  19
-rw-r--r--  fs/xfs/xfs_globals.c  46
-rw-r--r--  fs/xfs/xfs_health.c  395
-rw-r--r--  fs/xfs/xfs_icache.c  2251
-rw-r--r--  fs/xfs/xfs_icache.h  86
-rw-r--r--  fs/xfs/xfs_icreate_item.c  262
-rw-r--r--  fs/xfs/xfs_icreate_item.h  22
-rw-r--r--  fs/xfs/xfs_inode.c  3540
-rw-r--r--  fs/xfs/xfs_inode.h  578
-rw-r--r--  fs/xfs/xfs_inode_item.c  1012
-rw-r--r--  fs/xfs/xfs_inode_item.h  53
-rw-r--r--  fs/xfs/xfs_inode_item_recover.c  552
-rw-r--r--  fs/xfs/xfs_ioctl.c  2159
-rw-r--r--  fs/xfs/xfs_ioctl.h  72
-rw-r--r--  fs/xfs/xfs_ioctl32.c  517
-rw-r--r--  fs/xfs/xfs_ioctl32.h  193
-rw-r--r--  fs/xfs/xfs_iomap.c  1391
-rw-r--r--  fs/xfs/xfs_iomap.h  56
-rw-r--r--  fs/xfs/xfs_iops.c  1334
-rw-r--r--  fs/xfs/xfs_iops.h  22
-rw-r--r--  fs/xfs/xfs_itable.c  456
-rw-r--r--  fs/xfs/xfs_itable.h  68
-rw-r--r--  fs/xfs/xfs_iunlink_item.c  180
-rw-r--r--  fs/xfs/xfs_iunlink_item.h  27
-rw-r--r--  fs/xfs/xfs_iwalk.c  782
-rw-r--r--  fs/xfs/xfs_iwalk.h  49
-rw-r--r--  fs/xfs/xfs_linux.h  257
-rw-r--r--  fs/xfs/xfs_log.c  3907
-rw-r--r--  fs/xfs/xfs_log.h  166
-rw-r--r--  fs/xfs/xfs_log_cil.c  1897
-rw-r--r--  fs/xfs/xfs_log_priv.h  713
-rw-r--r--  fs/xfs/xfs_log_recover.c  3517
-rw-r--r--  fs/xfs/xfs_message.c  133
-rw-r--r--  fs/xfs/xfs_message.h  99
-rw-r--r--  fs/xfs/xfs_mount.c  1389
-rw-r--r--  fs/xfs/xfs_mount.h  554
-rw-r--r--  fs/xfs/xfs_mru_cache.c  542
-rw-r--r--  fs/xfs/xfs_mru_cache.h  34
-rw-r--r--  fs/xfs/xfs_notify_failure.c  234
-rw-r--r--  fs/xfs/xfs_ondisk.h  194
-rw-r--r--  fs/xfs/xfs_pnfs.c  335
-rw-r--r--  fs/xfs/xfs_pnfs.h  21
-rw-r--r--  fs/xfs/xfs_pwork.c  121
-rw-r--r--  fs/xfs/xfs_pwork.h  59
-rw-r--r--  fs/xfs/xfs_qm.c  1921
-rw-r--r--  fs/xfs/xfs_qm.h  171
-rw-r--r--  fs/xfs/xfs_qm_bhv.c  137
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c  517
-rw-r--r--  fs/xfs/xfs_quota.h  183
-rw-r--r--  fs/xfs/xfs_quotaops.c  280
-rw-r--r--  fs/xfs/xfs_refcount_item.c  723
-rw-r--r--  fs/xfs/xfs_refcount_item.h  74
-rw-r--r--  fs/xfs/xfs_reflink.c  1715
-rw-r--r--  fs/xfs/xfs_reflink.h  56
-rw-r--r--  fs/xfs/xfs_rmap_item.c  727
-rw-r--r--  fs/xfs/xfs_rmap_item.h  74
-rw-r--r--  fs/xfs/xfs_rtalloc.c  1409
-rw-r--r--  fs/xfs/xfs_rtalloc.h  164
-rw-r--r--  fs/xfs/xfs_stats.c  164
-rw-r--r--  fs/xfs/xfs_stats.h  218
-rw-r--r--  fs/xfs/xfs_super.c  2407
-rw-r--r--  fs/xfs/xfs_super.h  103
-rw-r--r--  fs/xfs/xfs_symlink.c  512
-rw-r--r--  fs/xfs/xfs_symlink.h  17
-rw-r--r--  fs/xfs/xfs_sysctl.c  244
-rw-r--r--  fs/xfs/xfs_sysctl.h  103
-rw-r--r--  fs/xfs/xfs_sysfs.c  731
-rw-r--r--  fs/xfs/xfs_sysfs.h  59
-rw-r--r--  fs/xfs/xfs_trace.c  43
-rw-r--r--  fs/xfs/xfs_trace.h  4246
-rw-r--r--  fs/xfs/xfs_trans.c  1433
-rw-r--r--  fs/xfs/xfs_trans.h  304
-rw-r--r--  fs/xfs/xfs_trans_ail.c  934
-rw-r--r--  fs/xfs/xfs_trans_buf.c  768
-rw-r--r--  fs/xfs/xfs_trans_dquot.c  859
-rw-r--r--  fs/xfs/xfs_trans_priv.h  172
-rw-r--r--  fs/xfs/xfs_xattr.c  308
-rw-r--r--  fs/xfs/xfs_xattr.h  13
233 files changed, 148898 insertions, 0 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
new file mode 100644
index 000000000..9fac5ea8d
--- /dev/null
+++ b/fs/xfs/Kconfig
@@ -0,0 +1,168 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config XFS_FS
+ tristate "XFS filesystem support"
+ depends on BLOCK
+ select EXPORTFS
+ select LIBCRC32C
+ select FS_IOMAP
+ help
+ XFS is a high performance journaling filesystem which originated
+ on the SGI IRIX platform. It is completely multi-threaded, can
+ support large files and large filesystems, extended attributes,
+ variable block sizes, is extent based, and makes extensive use of
+ Btrees (directories, extents, free space) to aid both performance
+ and scalability.
+
+ Refer to the documentation at <http://oss.sgi.com/projects/xfs/>
+ for complete details. This implementation is on-disk compatible
+ with the IRIX version of XFS.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called xfs. Be aware, however, that if the file
+ system of your root partition is compiled as a module, you'll need
+ to use an initial ramdisk (initrd) to boot.
+
+config XFS_SUPPORT_V4
+ bool "Support deprecated V4 (crc=0) format"
+ depends on XFS_FS
+ default y
+ help
+ The V4 filesystem format lacks certain features that are supported
+ by the V5 format, such as metadata checksumming, strengthened
+ metadata verification, and the ability to store timestamps past the
+ year 2038. Because of this, the V4 format is deprecated. All users
+ should upgrade by backing up their files, reformatting, and restoring
+ from the backup.
+
+ Administrators and users can detect a V4 filesystem by running
+ xfs_info against a filesystem mountpoint and checking for a string
+ beginning with "crc=". If the string "crc=0" is found, the
+ filesystem is a V4 filesystem. If no such string is found, please
+ upgrade xfsprogs to the latest version and try again.
+
+ This option will become default N in September 2025. Support for the
+ V4 format will be removed entirely in September 2030. Distributors
+ can say N here to withdraw support earlier.
+
+ To continue supporting the old V4 format (crc=0), say Y.
+ To close off an attack surface, say N.
+
+config XFS_QUOTA
+ bool "XFS Quota support"
+ depends on XFS_FS
+ select QUOTACTL
+ help
+ If you say Y here, you will be able to set limits for disk usage on
+ a per user and/or a per group basis under XFS. XFS considers quota
+ information as filesystem metadata and uses journaling to provide a
+ higher level guarantee of consistency. The on-disk data format for
+ quota is also compatible with the IRIX version of XFS, allowing a
+ filesystem to be migrated between Linux and IRIX without any need
+ for conversion.
+
+ If unsure, say N. More comprehensive documentation can be found in
+ README.quota in the xfsprogs package. XFS quota can be used either
+ with or without the generic quota support enabled (CONFIG_QUOTA) -
+ they are completely independent subsystems.
+
+config XFS_POSIX_ACL
+ bool "XFS POSIX ACL support"
+ depends on XFS_FS
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ If you don't know what Access Control Lists are, say N.
+
+config XFS_RT
+ bool "XFS Realtime subvolume support"
+ depends on XFS_FS
+ help
+ If you say Y here you will be able to mount and use XFS filesystems
+ which contain a realtime subvolume. The realtime subvolume is a
+ separate area of disk space where only file data is stored. It was
+ originally designed to provide deterministic data rates suitable
+ for media streaming applications, but is also useful as a generic
+ mechanism for ensuring data and metadata/log I/Os are completely
+ separated. Regular file I/Os are isolated to a separate device
+ from all other requests, and this can be done quite transparently
+ to applications via the inherit-realtime directory inode flag.
+
+ See the xfs man page in section 5 for additional information.
+
+ If unsure, say N.
+
+config XFS_ONLINE_SCRUB
+ bool "XFS online metadata check support"
+ default n
+ depends on XFS_FS
+ help
+ If you say Y here you will be able to check metadata on a
+ mounted XFS filesystem. This feature is intended to reduce
+ filesystem downtime by supplementing xfs_repair. The key
+ advantage here is to look for problems proactively so that
+ they can be dealt with in a controlled manner.
+
+ This feature is considered EXPERIMENTAL. Use with caution!
+
+ See the xfs_scrub man page in section 8 for additional information.
+
+ If unsure, say N.
+
+config XFS_ONLINE_REPAIR
+ bool "XFS online metadata repair support"
+ default n
+ depends on XFS_FS && XFS_ONLINE_SCRUB
+ help
+ If you say Y here you will be able to repair metadata on a
+ mounted XFS filesystem. This feature is intended to reduce
+ filesystem downtime by fixing minor problems before they cause the
+ filesystem to go down. However, it requires that the filesystem be
+ formatted with secondary metadata, such as reverse mappings and inode
+ parent pointers.
+
+ This feature is considered EXPERIMENTAL. Use with caution!
+
+ See the xfs_scrub man page in section 8 for additional information.
+
+ If unsure, say N.
+
+config XFS_WARN
+ bool "XFS Verbose Warnings"
+ depends on XFS_FS && !XFS_DEBUG
+ help
+ Say Y here to get an XFS build with many additional warnings.
+ It converts ASSERT checks to WARN, so will log any out-of-bounds
+ conditions that occur that would otherwise be missed. It is much
+ lighter weight than XFS_DEBUG and does not modify algorithms and will
+ not cause the kernel to panic on non-fatal errors.
+
+ However, similar to XFS_DEBUG, it is only advisable to use this if you
+ are debugging a particular problem.
+
+config XFS_DEBUG
+ bool "XFS Debugging support"
+ depends on XFS_FS
+ help
+ Say Y here to get an XFS build with many debugging features,
+ including ASSERT checks, function wrappers around macros,
+ and extra sanity-checking functions in various code paths.
+
+ Note that the resulting code will be HUGE and SLOW, and probably
+ not useful unless you are debugging a particular problem.
+
+ Say N unless you are an XFS developer, or you play one on TV.
+
+config XFS_ASSERT_FATAL
+ bool "XFS fatal asserts"
+ default y
+ depends on XFS_FS && XFS_DEBUG
+ help
+ Set the default DEBUG mode ASSERT failure behavior.
+
+ Say Y here to cause DEBUG mode ASSERT failures to result in fatal
+ errors that BUG() the kernel by default. If you say N, ASSERT failures
+ result in warnings.
+
+ This behavior can be modified at runtime via sysfs.
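
Illustrative example (not part of the patch): a typical modular build composes the
options above into a .config fragment like the one below. The selections are
hypothetical; XFS_ONLINE_REPAIR and XFS_ASSERT_FATAL do not appear because their
dependencies (XFS_ONLINE_SCRUB and XFS_DEBUG) are left disabled, while the selected
symbols (EXPORTFS, LIBCRC32C, FS_IOMAP, QUOTACTL, FS_POSIX_ACL) are pulled in
automatically.

CONFIG_XFS_FS=m
CONFIG_XFS_SUPPORT_V4=y
CONFIG_XFS_QUOTA=y
CONFIG_XFS_POSIX_ACL=y
# CONFIG_XFS_RT is not set
# CONFIG_XFS_ONLINE_SCRUB is not set
# CONFIG_XFS_WARN is not set
# CONFIG_XFS_DEBUG is not set
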
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
new file mode 100644
index 000000000..03135a1c3
--- /dev/null
+++ b/fs/xfs/Makefile
@@ -0,0 +1,175 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (c) 2000-2005 Silicon Graphics, Inc.
+# All Rights Reserved.
+#
+
+ccflags-y += -I $(srctree)/$(src) # needed for trace events
+ccflags-y += -I $(srctree)/$(src)/libxfs
+
+obj-$(CONFIG_XFS_FS) += xfs.o
+
+# this one should be compiled first, as the tracing macros can easily blow up
+xfs-y += xfs_trace.o
+
+# build the libxfs code first
+xfs-y += $(addprefix libxfs/, \
+ xfs_ag.o \
+ xfs_alloc.o \
+ xfs_alloc_btree.o \
+ xfs_attr.o \
+ xfs_attr_leaf.o \
+ xfs_attr_remote.o \
+ xfs_bit.o \
+ xfs_bmap.o \
+ xfs_bmap_btree.o \
+ xfs_btree.o \
+ xfs_btree_staging.o \
+ xfs_da_btree.o \
+ xfs_defer.o \
+ xfs_dir2.o \
+ xfs_dir2_block.o \
+ xfs_dir2_data.o \
+ xfs_dir2_leaf.o \
+ xfs_dir2_node.o \
+ xfs_dir2_sf.o \
+ xfs_dquot_buf.o \
+ xfs_ialloc.o \
+ xfs_ialloc_btree.o \
+ xfs_iext_tree.o \
+ xfs_inode_fork.o \
+ xfs_inode_buf.o \
+ xfs_log_rlimit.o \
+ xfs_ag_resv.o \
+ xfs_rmap.o \
+ xfs_rmap_btree.o \
+ xfs_refcount.o \
+ xfs_refcount_btree.o \
+ xfs_sb.o \
+ xfs_symlink_remote.o \
+ xfs_trans_inode.o \
+ xfs_trans_resv.o \
+ xfs_types.o \
+ )
+# xfs_rtbitmap is shared with libxfs
+xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
+ xfs_rtbitmap.o \
+ )
+
+# highlevel code
+xfs-y += xfs_aops.o \
+ xfs_attr_inactive.o \
+ xfs_attr_list.o \
+ xfs_bmap_util.o \
+ xfs_bio_io.o \
+ xfs_buf.o \
+ xfs_dir2_readdir.o \
+ xfs_discard.o \
+ xfs_error.o \
+ xfs_export.o \
+ xfs_extent_busy.o \
+ xfs_file.o \
+ xfs_filestream.o \
+ xfs_fsmap.o \
+ xfs_fsops.o \
+ xfs_globals.o \
+ xfs_health.o \
+ xfs_icache.o \
+ xfs_ioctl.o \
+ xfs_iomap.o \
+ xfs_iops.o \
+ xfs_inode.o \
+ xfs_itable.o \
+ xfs_iwalk.o \
+ xfs_message.o \
+ xfs_mount.o \
+ xfs_mru_cache.o \
+ xfs_pwork.o \
+ xfs_reflink.o \
+ xfs_stats.o \
+ xfs_super.o \
+ xfs_symlink.o \
+ xfs_sysfs.o \
+ xfs_trans.o \
+ xfs_xattr.o \
+ kmem.o
+
+# low-level transaction/log code
+xfs-y += xfs_log.o \
+ xfs_log_cil.o \
+ xfs_bmap_item.o \
+ xfs_buf_item.o \
+ xfs_buf_item_recover.o \
+ xfs_dquot_item_recover.o \
+ xfs_extfree_item.o \
+ xfs_attr_item.o \
+ xfs_icreate_item.o \
+ xfs_inode_item.o \
+ xfs_inode_item_recover.o \
+ xfs_iunlink_item.o \
+ xfs_refcount_item.o \
+ xfs_rmap_item.o \
+ xfs_log_recover.o \
+ xfs_trans_ail.o \
+ xfs_trans_buf.o
+
+# optional features
+xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
+ xfs_dquot_item.o \
+ xfs_trans_dquot.o \
+ xfs_qm_syscalls.o \
+ xfs_qm_bhv.o \
+ xfs_qm.o \
+ xfs_quotaops.o
+
+# xfs_rtbitmap is shared with libxfs
+xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
+
+xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
+xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
+xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
+xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o
+
+# notify failure
+ifeq ($(CONFIG_MEMORY_FAILURE),y)
+xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o
+endif
+
+# online scrub/repair
+ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
+
+# Tracepoints like to blow up, so build that before everything else
+
+xfs-y += $(addprefix scrub/, \
+ trace.o \
+ agheader.o \
+ alloc.o \
+ attr.o \
+ bmap.o \
+ btree.o \
+ common.o \
+ dabtree.o \
+ dir.o \
+ fscounters.o \
+ health.o \
+ ialloc.o \
+ inode.o \
+ parent.o \
+ refcount.o \
+ rmap.o \
+ scrub.o \
+ symlink.o \
+ )
+
+xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o
+xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
+
+# online repair
+ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
+xfs-y += $(addprefix scrub/, \
+ agheader_repair.o \
+ bitmap.o \
+ repair.o \
+ )
+endif
+endif
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
new file mode 100644
index 000000000..c557a030a
--- /dev/null
+++ b/fs/xfs/kmem.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_message.h"
+#include "xfs_trace.h"
+
+void *
+kmem_alloc(size_t size, xfs_km_flags_t flags)
+{
+ int retries = 0;
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
+
+ trace_kmem_alloc(size, flags, _RET_IP_);
+
+ do {
+ ptr = kmalloc(size, lflags);
+ if (ptr || (flags & KM_MAYFAIL))
+ return ptr;
+ if (!(++retries % 100))
+ xfs_err(NULL,
+ "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
+ current->comm, current->pid,
+ (unsigned int)size, __func__, lflags);
+ memalloc_retry_wait(lflags);
+ } while (1);
+}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
new file mode 100644
index 000000000..b987dc2c6
--- /dev/null
+++ b/fs/xfs/kmem.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_SUPPORT_KMEM_H__
+#define __XFS_SUPPORT_KMEM_H__
+
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+/*
+ * General memory allocation interfaces
+ */
+
+typedef unsigned __bitwise xfs_km_flags_t;
+#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
+#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
+#define KM_ZERO ((__force xfs_km_flags_t)0x0010u)
+#define KM_NOLOCKDEP ((__force xfs_km_flags_t)0x0020u)
+
+/*
+ * We use a special process flag to avoid recursive callbacks into
+ * the filesystem during transactions. We will also issue our own
+ * warnings, so we explicitly skip any generic ones (silly of us).
+ */
+static inline gfp_t
+kmem_flags_convert(xfs_km_flags_t flags)
+{
+ gfp_t lflags;
+
+ BUG_ON(flags & ~(KM_NOFS | KM_MAYFAIL | KM_ZERO | KM_NOLOCKDEP));
+
+ lflags = GFP_KERNEL | __GFP_NOWARN;
+ if (flags & KM_NOFS)
+ lflags &= ~__GFP_FS;
+
+ /*
+ * The default page/slab allocator behavior is to retry forever
+ * for small allocations. We can override this behavior by using
+ * __GFP_RETRY_MAYFAIL, which tells the allocator to retry as long
+ * as it is feasible but to fail rather than retry forever, for all
+ * request sizes.
+ */
+ if (flags & KM_MAYFAIL)
+ lflags |= __GFP_RETRY_MAYFAIL;
+
+ if (flags & KM_ZERO)
+ lflags |= __GFP_ZERO;
+
+ if (flags & KM_NOLOCKDEP)
+ lflags |= __GFP_NOLOCKDEP;
+
+ return lflags;
+}
+
+extern void *kmem_alloc(size_t, xfs_km_flags_t);
+static inline void kmem_free(const void *ptr)
+{
+ kvfree(ptr);
+}
+
+
+static inline void *
+kmem_zalloc(size_t size, xfs_km_flags_t flags)
+{
+ return kmem_alloc(size, flags | KM_ZERO);
+}
+
+/*
+ * Zone interfaces
+ */
+static inline struct page *
+kmem_to_page(void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ return vmalloc_to_page(addr);
+ return virt_to_page(addr);
+}
+
+#endif /* __XFS_SUPPORT_KMEM_H__ */
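
To make the flag conversion above concrete, here is a minimal sketch (not part of the
patch) of how a caller might combine the KM_* flags. The helper name is hypothetical;
only kmem_zalloc() and the flags declared in this header are assumed.

/* Hypothetical caller: a zeroed, failable allocation in NOFS context. */
static int example_alloc_record(size_t nbytes, void **recp)
{
        void *rec;

        /*
         * KM_NOFS keeps reclaim from recursing into the filesystem while a
         * transaction is running; KM_MAYFAIL sets __GFP_RETRY_MAYFAIL, so
         * this call may return NULL and the caller must handle it.
         */
        rec = kmem_zalloc(nbytes, KM_NOFS | KM_MAYFAIL);
        if (!rec)
                return -ENOMEM;

        *recp = rec;
        return 0;
}

Without KM_MAYFAIL, kmem_alloc() loops in the retry path shown in kmem.c and never
returns NULL, which is why the possible-deadlock warning there only matters for
non-failable allocations.
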
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
new file mode 100644
index 000000000..bb0c700af
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -0,0 +1,1042 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2018 Red Hat, Inc.
+ * All rights reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_health.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_defer.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+
+
+/*
+ * Passive reference counting access wrappers to the perag structures. If the
+ * per-ag structure is to be freed, the freeing code is responsible for cleaning
+ * up objects with passive references before freeing the structure. These are
+ * things like cached buffers.
+ */
+struct xfs_perag *
+xfs_perag_get(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int ref = 0;
+
+ rcu_read_lock();
+ pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+ if (pag) {
+ ASSERT(atomic_read(&pag->pag_ref) >= 0);
+ ref = atomic_inc_return(&pag->pag_ref);
+ }
+ rcu_read_unlock();
+ trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
+ return pag;
+}
+
+/*
+ * search from @first to find the next perag with the given tag set.
+ */
+struct xfs_perag *
+xfs_perag_get_tag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t first,
+ unsigned int tag)
+{
+ struct xfs_perag *pag;
+ int found;
+ int ref;
+
+ rcu_read_lock();
+ found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+ (void **)&pag, first, 1, tag);
+ if (found <= 0) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ ref = atomic_inc_return(&pag->pag_ref);
+ rcu_read_unlock();
+ trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
+ return pag;
+}
+
+void
+xfs_perag_put(
+ struct xfs_perag *pag)
+{
+ int ref;
+
+ ASSERT(atomic_read(&pag->pag_ref) > 0);
+ ref = atomic_dec_return(&pag->pag_ref);
+ trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+}
+
+/*
+ * xfs_initialize_perag_data
+ *
+ * Read in each per-ag structure so we can count up the number of
+ * allocated inodes, free inodes and used filesystem blocks as this
+ * information is no longer persistent in the superblock. Once we have
+ * this information, write it into the in-core superblock structure.
+ */
+int
+xfs_initialize_perag_data(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agcount)
+{
+ xfs_agnumber_t index;
+ struct xfs_perag *pag;
+ struct xfs_sb *sbp = &mp->m_sb;
+ uint64_t ifree = 0;
+ uint64_t ialloc = 0;
+ uint64_t bfree = 0;
+ uint64_t bfreelst = 0;
+ uint64_t btree = 0;
+ uint64_t fdblocks;
+ int error = 0;
+
+ for (index = 0; index < agcount; index++) {
+ /*
+ * Read the AGF and AGI buffers to populate the per-ag
+ * structures for us.
+ */
+ pag = xfs_perag_get(mp, index);
+ error = xfs_alloc_read_agf(pag, NULL, 0, NULL);
+ if (!error)
+ error = xfs_ialloc_read_agi(pag, NULL, NULL);
+ if (error) {
+ xfs_perag_put(pag);
+ return error;
+ }
+
+ ifree += pag->pagi_freecount;
+ ialloc += pag->pagi_count;
+ bfree += pag->pagf_freeblks;
+ bfreelst += pag->pagf_flcount;
+ btree += pag->pagf_btreeblks;
+ xfs_perag_put(pag);
+ }
+ fdblocks = bfree + bfreelst + btree;
+
+ /*
+ * If the new summary counts are obviously incorrect, fail the
+ * mount operation because that implies the AGFs are also corrupt.
+ * Clear FS_COUNTERS so that we don't unmount with a dirty log, which
+ * will prevent xfs_repair from fixing anything.
+ */
+ if (fdblocks > sbp->sb_dblocks || ifree > ialloc) {
+ xfs_alert(mp, "AGF corruption. Please run xfs_repair.");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* Overwrite incore superblock counters with just-read data */
+ spin_lock(&mp->m_sb_lock);
+ sbp->sb_ifree = ifree;
+ sbp->sb_icount = ialloc;
+ sbp->sb_fdblocks = fdblocks;
+ spin_unlock(&mp->m_sb_lock);
+
+ xfs_reinit_percpu_counters(mp);
+out:
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
+ return error;
+}
+
+STATIC void
+__xfs_free_perag(
+ struct rcu_head *head)
+{
+ struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
+
+ ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
+ kmem_free(pag);
+}
+
+/*
+ * Free up the per-ag resources associated with the mount structure.
+ */
+void
+xfs_free_perag(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ spin_lock(&mp->m_perag_lock);
+ pag = radix_tree_delete(&mp->m_perag_tree, agno);
+ spin_unlock(&mp->m_perag_lock);
+ ASSERT(pag);
+ XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
+
+ cancel_delayed_work_sync(&pag->pag_blockgc_work);
+ xfs_buf_hash_destroy(pag);
+
+ call_rcu(&pag->rcu_head, __xfs_free_perag);
+ }
+}
+
+/* Find the size of the AG, in blocks. */
+static xfs_agblock_t
+__xfs_ag_block_count(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agnumber_t agcount,
+ xfs_rfsblock_t dblocks)
+{
+ ASSERT(agno < agcount);
+
+ if (agno < agcount - 1)
+ return mp->m_sb.sb_agblocks;
+ return dblocks - (agno * mp->m_sb.sb_agblocks);
+}
+
+xfs_agblock_t
+xfs_ag_block_count(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ return __xfs_ag_block_count(mp, agno, mp->m_sb.sb_agcount,
+ mp->m_sb.sb_dblocks);
+}
+
+/* Calculate the first and last possible inode number in an AG. */
+static void
+__xfs_agino_range(
+ struct xfs_mount *mp,
+ xfs_agblock_t eoag,
+ xfs_agino_t *first,
+ xfs_agino_t *last)
+{
+ xfs_agblock_t bno;
+
+ /*
+ * Calculate the first inode, which will be in the first
+ * cluster-aligned block after the AGFL.
+ */
+ bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align);
+ *first = XFS_AGB_TO_AGINO(mp, bno);
+
+ /*
+ * Calculate the last inode, which will be at the end of the
+ * last (aligned) cluster that can be allocated in the AG.
+ */
+ bno = round_down(eoag, M_IGEO(mp)->cluster_align);
+ *last = XFS_AGB_TO_AGINO(mp, bno) - 1;
+}
+
+void
+xfs_agino_range(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t *first,
+ xfs_agino_t *last)
+{
+ return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last);
+}
+
+int
+xfs_initialize_perag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agcount,
+ xfs_rfsblock_t dblocks,
+ xfs_agnumber_t *maxagi)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t index;
+ xfs_agnumber_t first_initialised = NULLAGNUMBER;
+ int error;
+
+ /*
+ * Walk the current per-ag tree so we don't try to initialise AGs
+ * that already exist (growfs case). Allocate and insert all the
+ * AGs we don't find ready for initialisation.
+ */
+ for (index = 0; index < agcount; index++) {
+ pag = xfs_perag_get(mp, index);
+ if (pag) {
+ xfs_perag_put(pag);
+ continue;
+ }
+
+ pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+ if (!pag) {
+ error = -ENOMEM;
+ goto out_unwind_new_pags;
+ }
+ pag->pag_agno = index;
+ pag->pag_mount = mp;
+
+ error = radix_tree_preload(GFP_NOFS);
+ if (error)
+ goto out_free_pag;
+
+ spin_lock(&mp->m_perag_lock);
+ if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
+ WARN_ON_ONCE(1);
+ spin_unlock(&mp->m_perag_lock);
+ radix_tree_preload_end();
+ error = -EEXIST;
+ goto out_free_pag;
+ }
+ spin_unlock(&mp->m_perag_lock);
+ radix_tree_preload_end();
+
+#ifdef __KERNEL__
+ /* Place kernel structure only init below this point. */
+ spin_lock_init(&pag->pag_ici_lock);
+ spin_lock_init(&pag->pagb_lock);
+ spin_lock_init(&pag->pag_state_lock);
+ INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
+ INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+ init_waitqueue_head(&pag->pagb_wait);
+ pag->pagb_count = 0;
+ pag->pagb_tree = RB_ROOT;
+#endif /* __KERNEL__ */
+
+ error = xfs_buf_hash_init(pag);
+ if (error)
+ goto out_remove_pag;
+
+ /* first new pag is fully initialized */
+ if (first_initialised == NULLAGNUMBER)
+ first_initialised = index;
+
+ /*
+ * Pre-calculated geometry
+ */
+ pag->block_count = __xfs_ag_block_count(mp, index, agcount,
+ dblocks);
+ pag->min_block = XFS_AGFL_BLOCK(mp);
+ __xfs_agino_range(mp, pag->block_count, &pag->agino_min,
+ &pag->agino_max);
+ }
+
+ index = xfs_set_inode_alloc(mp, agcount);
+
+ if (maxagi)
+ *maxagi = index;
+
+ mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
+ return 0;
+
+out_remove_pag:
+ radix_tree_delete(&mp->m_perag_tree, index);
+out_free_pag:
+ kmem_free(pag);
+out_unwind_new_pags:
+ /* unwind any prior newly initialized pags */
+ for (index = first_initialised; index < agcount; index++) {
+ pag = radix_tree_delete(&mp->m_perag_tree, index);
+ if (!pag)
+ break;
+ xfs_buf_hash_destroy(pag);
+ kmem_free(pag);
+ }
+ return error;
+}
+
+static int
+xfs_get_aghdr_buf(
+ struct xfs_mount *mp,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp);
+ if (error)
+ return error;
+
+ bp->b_maps[0].bm_bn = blkno;
+ bp->b_ops = ops;
+
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Generic btree root block init function
+ */
+static void
+xfs_btroot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno);
+}
+
+/* Finish initializing a free space btree. */
+static void
+xfs_freesp_init_recs(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_alloc_rec *arec;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+
+ arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+ arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
+
+ if (xfs_ag_contains_log(mp, id->agno)) {
+ struct xfs_alloc_rec *nrec;
+ xfs_agblock_t start = XFS_FSB_TO_AGBNO(mp,
+ mp->m_sb.sb_logstart);
+
+ ASSERT(start >= mp->m_ag_prealloc_blocks);
+ if (start != mp->m_ag_prealloc_blocks) {
+ /*
+ * Modify first record to pad stripe align of log
+ */
+ arec->ar_blockcount = cpu_to_be32(start -
+ mp->m_ag_prealloc_blocks);
+ nrec = arec + 1;
+
+ /*
+ * Insert second record at start of internal log
+ * which then gets trimmed.
+ */
+ nrec->ar_startblock = cpu_to_be32(
+ be32_to_cpu(arec->ar_startblock) +
+ be32_to_cpu(arec->ar_blockcount));
+ arec = nrec;
+ be16_add_cpu(&block->bb_numrecs, 1);
+ }
+ /*
+ * Change record start to after the internal log
+ */
+ be32_add_cpu(&arec->ar_startblock, mp->m_sb.sb_logblocks);
+ }
+
+ /*
+ * Calculate the record block count and check for the case where
+ * the log might have consumed all available space in the AG. If
+ * so, reset the record count to 0 to avoid exposure of an invalid
+ * record start block.
+ */
+ arec->ar_blockcount = cpu_to_be32(id->agsize -
+ be32_to_cpu(arec->ar_startblock));
+ if (!arec->ar_blockcount)
+ block->bb_numrecs = 0;
+}
+
+/*
+ * Alloc btree root block init functions
+ */
+static void
+xfs_bnoroot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno);
+ xfs_freesp_init_recs(mp, bp, id);
+}
+
+static void
+xfs_cntroot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno);
+ xfs_freesp_init_recs(mp, bp, id);
+}
+
+/*
+ * Reverse map root block init
+ */
+static void
+xfs_rmaproot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_rmap_rec *rrec;
+
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno);
+
+ /*
+ * Mark the AG header regions as static metadata. The BNO
+ * btree block is the first block after the headers, so
+ * its location defines the size of the region the static
+ * metadata consumes.
+ *
+ * Note: unlike mkfs, we never have to account for log
+ * space when growing the data regions.
+ */
+ rrec = XFS_RMAP_REC_ADDR(block, 1);
+ rrec->rm_startblock = 0;
+ rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS);
+ rrec->rm_offset = 0;
+
+ /* account freespace btree root blocks */
+ rrec = XFS_RMAP_REC_ADDR(block, 2);
+ rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ rrec->rm_blockcount = cpu_to_be32(2);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+ rrec->rm_offset = 0;
+
+ /* account inode btree root blocks */
+ rrec = XFS_RMAP_REC_ADDR(block, 3);
+ rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp));
+ rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) -
+ XFS_IBT_BLOCK(mp));
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT);
+ rrec->rm_offset = 0;
+
+ /* account for rmap btree root */
+ rrec = XFS_RMAP_REC_ADDR(block, 4);
+ rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp));
+ rrec->rm_blockcount = cpu_to_be32(1);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+ rrec->rm_offset = 0;
+
+ /* account for refc btree root */
+ if (xfs_has_reflink(mp)) {
+ rrec = XFS_RMAP_REC_ADDR(block, 5);
+ rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp));
+ rrec->rm_blockcount = cpu_to_be32(1);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
+ rrec->rm_offset = 0;
+ be16_add_cpu(&block->bb_numrecs, 1);
+ }
+
+ /* account for the log space */
+ if (xfs_ag_contains_log(mp, id->agno)) {
+ rrec = XFS_RMAP_REC_ADDR(block,
+ be16_to_cpu(block->bb_numrecs) + 1);
+ rrec->rm_startblock = cpu_to_be32(
+ XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart));
+ rrec->rm_blockcount = cpu_to_be32(mp->m_sb.sb_logblocks);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_LOG);
+ rrec->rm_offset = 0;
+ be16_add_cpu(&block->bb_numrecs, 1);
+ }
+}
+
+/*
+ * Initialise new secondary superblocks with the pre-grow geometry, but mark
+ * them as "in progress" so we know they haven't yet been activated. This will
+ * get cleared when the update with the new geometry information is done after
+ * changes to the primary are committed. This isn't strictly necessary, but we
+ * get it for free with the delayed buffer write lists and it means we can tell
+ * if a grow operation didn't complete properly after the fact.
+ */
+static void
+xfs_sbblock_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_dsb *dsb = bp->b_addr;
+
+ xfs_sb_to_disk(dsb, &mp->m_sb);
+ dsb->sb_inprogress = 1;
+}
+
+static void
+xfs_agfblock_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_agf *agf = bp->b_addr;
+ xfs_extlen_t tmpsize;
+
+ agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
+ agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
+ agf->agf_seqno = cpu_to_be32(id->agno);
+ agf->agf_length = cpu_to_be32(id->agsize);
+ agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
+ agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
+ agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
+ if (xfs_has_rmapbt(mp)) {
+ agf->agf_roots[XFS_BTNUM_RMAPi] =
+ cpu_to_be32(XFS_RMAP_BLOCK(mp));
+ agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+ agf->agf_rmap_blocks = cpu_to_be32(1);
+ }
+
+ agf->agf_flfirst = cpu_to_be32(1);
+ agf->agf_fllast = 0;
+ agf->agf_flcount = 0;
+ tmpsize = id->agsize - mp->m_ag_prealloc_blocks;
+ agf->agf_freeblks = cpu_to_be32(tmpsize);
+ agf->agf_longest = cpu_to_be32(tmpsize);
+ if (xfs_has_crc(mp))
+ uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
+ if (xfs_has_reflink(mp)) {
+ agf->agf_refcount_root = cpu_to_be32(
+ xfs_refc_block(mp));
+ agf->agf_refcount_level = cpu_to_be32(1);
+ agf->agf_refcount_blocks = cpu_to_be32(1);
+ }
+
+ if (xfs_ag_contains_log(mp, id->agno)) {
+ int64_t logblocks = mp->m_sb.sb_logblocks;
+
+ be32_add_cpu(&agf->agf_freeblks, -logblocks);
+ agf->agf_longest = cpu_to_be32(id->agsize -
+ XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart) - logblocks);
+ }
+}
+
+static void
+xfs_agflblock_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+ __be32 *agfl_bno;
+ int bucket;
+
+ if (xfs_has_crc(mp)) {
+ agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
+ agfl->agfl_seqno = cpu_to_be32(id->agno);
+ uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
+ }
+
+ agfl_bno = xfs_buf_to_agfl_bno(bp);
+ for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
+ agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+}
+
+static void
+xfs_agiblock_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_agi *agi = bp->b_addr;
+ int bucket;
+
+ agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
+ agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
+ agi->agi_seqno = cpu_to_be32(id->agno);
+ agi->agi_length = cpu_to_be32(id->agsize);
+ agi->agi_count = 0;
+ agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
+ agi->agi_level = cpu_to_be32(1);
+ agi->agi_freecount = 0;
+ agi->agi_newino = cpu_to_be32(NULLAGINO);
+ agi->agi_dirino = cpu_to_be32(NULLAGINO);
+ if (xfs_has_crc(mp))
+ uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
+ if (xfs_has_finobt(mp)) {
+ agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
+ agi->agi_free_level = cpu_to_be32(1);
+ }
+ for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
+ agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+ if (xfs_has_inobtcounts(mp)) {
+ agi->agi_iblocks = cpu_to_be32(1);
+ if (xfs_has_finobt(mp))
+ agi->agi_fblocks = cpu_to_be32(1);
+ }
+}
+
+typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp,
+ struct aghdr_init_data *id);
+static int
+xfs_ag_init_hdr(
+ struct xfs_mount *mp,
+ struct aghdr_init_data *id,
+ aghdr_init_work_f work,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ error = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, &bp, ops);
+ if (error)
+ return error;
+
+ (*work)(mp, bp, id);
+
+ xfs_buf_delwri_queue(bp, &id->buffer_list);
+ xfs_buf_relse(bp);
+ return 0;
+}
+
+struct xfs_aghdr_grow_data {
+ xfs_daddr_t daddr;
+ size_t numblks;
+ const struct xfs_buf_ops *ops;
+ aghdr_init_work_f work;
+ xfs_btnum_t type;
+ bool need_init;
+};
+
+/*
+ * Prepare new AG headers to be written to disk. We use uncached buffers here,
+ * as it is assumed these new AG headers are currently beyond the valid
+ * filesystem address space. Using cached buffers would trip over EOFS
+ * corruption detection algorithms in the buffer cache lookup routines.
+ *
+ * This is a non-transactional function, but the prepared buffers are added to a
+ * delayed write buffer list supplied by the caller so they can submit them to
+ * disk and wait on them as required.
+ */
+int
+xfs_ag_init_headers(
+ struct xfs_mount *mp,
+ struct aghdr_init_data *id)
+
+{
+ struct xfs_aghdr_grow_data aghdr_data[] = {
+ { /* SB */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_SB_DADDR),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_sb_buf_ops,
+ .work = &xfs_sbblock_init,
+ .need_init = true
+ },
+ { /* AGF */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp)),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_agf_buf_ops,
+ .work = &xfs_agfblock_init,
+ .need_init = true
+ },
+ { /* AGFL */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGFL_DADDR(mp)),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_agfl_buf_ops,
+ .work = &xfs_agflblock_init,
+ .need_init = true
+ },
+ { /* AGI */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGI_DADDR(mp)),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_agi_buf_ops,
+ .work = &xfs_agiblock_init,
+ .need_init = true
+ },
+ { /* BNO root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_bnobt_buf_ops,
+ .work = &xfs_bnoroot_init,
+ .need_init = true
+ },
+ { /* CNT root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_cntbt_buf_ops,
+ .work = &xfs_cntroot_init,
+ .need_init = true
+ },
+ { /* INO root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_IBT_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_inobt_buf_ops,
+ .work = &xfs_btroot_init,
+ .type = XFS_BTNUM_INO,
+ .need_init = true
+ },
+ { /* FINO root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_finobt_buf_ops,
+ .work = &xfs_btroot_init,
+ .type = XFS_BTNUM_FINO,
+ .need_init = xfs_has_finobt(mp)
+ },
+ { /* RMAP root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_rmapbt_buf_ops,
+ .work = &xfs_rmaproot_init,
+ .need_init = xfs_has_rmapbt(mp)
+ },
+ { /* REFC root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_refcountbt_buf_ops,
+ .work = &xfs_btroot_init,
+ .type = XFS_BTNUM_REFC,
+ .need_init = xfs_has_reflink(mp)
+ },
+ { /* NULL terminating block */
+ .daddr = XFS_BUF_DADDR_NULL,
+ }
+ };
+ struct xfs_aghdr_grow_data *dp;
+ int error = 0;
+
+ /* Account for AG free space in new AG */
+ id->nfree += id->agsize - mp->m_ag_prealloc_blocks;
+ for (dp = &aghdr_data[0]; dp->daddr != XFS_BUF_DADDR_NULL; dp++) {
+ if (!dp->need_init)
+ continue;
+
+ id->daddr = dp->daddr;
+ id->numblks = dp->numblks;
+ id->type = dp->type;
+ error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops);
+ if (error)
+ break;
+ }
+ return error;
+}
+
+int
+xfs_ag_shrink_space(
+ struct xfs_perag *pag,
+ struct xfs_trans **tpp,
+ xfs_extlen_t delta)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_alloc_arg args = {
+ .tp = *tpp,
+ .mp = mp,
+ .type = XFS_ALLOCTYPE_THIS_BNO,
+ .minlen = delta,
+ .maxlen = delta,
+ .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
+ .resv = XFS_AG_RESV_NONE,
+ .prod = 1
+ };
+ struct xfs_buf *agibp, *agfbp;
+ struct xfs_agi *agi;
+ struct xfs_agf *agf;
+ xfs_agblock_t aglen;
+ int error, err2;
+
+ ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1);
+ error = xfs_ialloc_read_agi(pag, *tpp, &agibp);
+ if (error)
+ return error;
+
+ agi = agibp->b_addr;
+
+ error = xfs_alloc_read_agf(pag, *tpp, 0, &agfbp);
+ if (error)
+ return error;
+
+ agf = agfbp->b_addr;
+ aglen = be32_to_cpu(agi->agi_length);
+ /* some extra paranoid checks before we shrink the ag */
+ if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length))
+ return -EFSCORRUPTED;
+ if (delta >= aglen)
+ return -EINVAL;
+
+ args.fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta);
+
+ /*
+ * Make sure that the last inode cluster cannot overlap with the new
+ * end of the AG, even if it's sparse.
+ */
+ error = xfs_ialloc_check_shrink(*tpp, pag->pag_agno, agibp,
+ aglen - delta);
+ if (error)
+ return error;
+
+ /*
+ * Disable perag reservations so they don't cause the allocation request
+ * to fail. We'll re-establish the reservation before we return.
+ */
+ error = xfs_ag_resv_free(pag);
+ if (error)
+ return error;
+
+ /* internal log shouldn't also show up in the free space btrees */
+ error = xfs_alloc_vextent(&args);
+ if (!error && args.agbno == NULLAGBLOCK)
+ error = -ENOSPC;
+
+ if (error) {
+ /*
+ * if extent allocation fails, need to roll the transaction to
+ * ensure that the AGFL fixup has been committed anyway.
+ */
+ xfs_trans_bhold(*tpp, agfbp);
+ err2 = xfs_trans_roll(tpp);
+ if (err2)
+ return err2;
+ xfs_trans_bjoin(*tpp, agfbp);
+ goto resv_init_out;
+ }
+
+ /*
+ * if successfully deleted from freespace btrees, need to confirm
+ * per-AG reservation works as expected.
+ */
+ be32_add_cpu(&agi->agi_length, -delta);
+ be32_add_cpu(&agf->agf_length, -delta);
+
+ err2 = xfs_ag_resv_init(pag, *tpp);
+ if (err2) {
+ be32_add_cpu(&agi->agi_length, delta);
+ be32_add_cpu(&agf->agf_length, delta);
+ if (err2 != -ENOSPC)
+ goto resv_err;
+
+ __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true);
+
+ /*
+ * Roll the transaction before trying to re-init the per-ag
+ * reservation. The new transaction is clean so it will cancel
+ * without any side effects.
+ */
+ error = xfs_defer_finish(tpp);
+ if (error)
+ return error;
+
+ error = -ENOSPC;
+ goto resv_init_out;
+ }
+ xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH);
+ xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH);
+ return 0;
+
+resv_init_out:
+ err2 = xfs_ag_resv_init(pag, *tpp);
+ if (!err2)
+ return error;
+resv_err:
+ xfs_warn(mp, "Error %d reserving per-AG metadata reserve pool.", err2);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return err2;
+}
+
+/*
+ * Extend the AG indicated by @id by the length passed in.
+ */
+int
+xfs_ag_extend_space(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ xfs_extlen_t len)
+{
+ struct xfs_buf *bp;
+ struct xfs_agi *agi;
+ struct xfs_agf *agf;
+ int error;
+
+ ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1);
+
+ error = xfs_ialloc_read_agi(pag, tp, &bp);
+ if (error)
+ return error;
+
+ agi = bp->b_addr;
+ be32_add_cpu(&agi->agi_length, len);
+ xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
+
+ /*
+ * Change agf length.
+ */
+ error = xfs_alloc_read_agf(pag, tp, 0, &bp);
+ if (error)
+ return error;
+
+ agf = bp->b_addr;
+ be32_add_cpu(&agf->agf_length, len);
+ ASSERT(agf->agf_length == agi->agi_length);
+ xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
+
+ /*
+ * Free the new space.
+ *
+ * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that
+ * this doesn't actually exist in the rmap btree.
+ */
+ error = xfs_rmap_free(tp, bp, pag, be32_to_cpu(agf->agf_length) - len,
+ len, &XFS_RMAP_OINFO_SKIP_UPDATE);
+ if (error)
+ return error;
+
+ error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno,
+ be32_to_cpu(agf->agf_length) - len),
+ len, &XFS_RMAP_OINFO_SKIP_UPDATE,
+ XFS_AG_RESV_NONE);
+ if (error)
+ return error;
+
+ /* Update perag geometry */
+ pag->block_count = be32_to_cpu(agf->agf_length);
+ __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min,
+ &pag->agino_max);
+ return 0;
+}
+
+/* Retrieve AG geometry. */
+int
+xfs_ag_get_geometry(
+ struct xfs_perag *pag,
+ struct xfs_ag_geometry *ageo)
+{
+ struct xfs_buf *agi_bp;
+ struct xfs_buf *agf_bp;
+ struct xfs_agi *agi;
+ struct xfs_agf *agf;
+ unsigned int freeblks;
+ int error;
+
+ /* Lock the AG headers. */
+ error = xfs_ialloc_read_agi(pag, NULL, &agi_bp);
+ if (error)
+ return error;
+ error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp);
+ if (error)
+ goto out_agi;
+
+ /* Fill out form. */
+ memset(ageo, 0, sizeof(*ageo));
+ ageo->ag_number = pag->pag_agno;
+
+ agi = agi_bp->b_addr;
+ ageo->ag_icount = be32_to_cpu(agi->agi_count);
+ ageo->ag_ifree = be32_to_cpu(agi->agi_freecount);
+
+ agf = agf_bp->b_addr;
+ ageo->ag_length = be32_to_cpu(agf->agf_length);
+ freeblks = pag->pagf_freeblks +
+ pag->pagf_flcount +
+ pag->pagf_btreeblks -
+ xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE);
+ ageo->ag_freeblks = freeblks;
+ xfs_ag_geom_health(pag, ageo);
+
+ /* Release resources. */
+ xfs_buf_relse(agf_bp);
+out_agi:
+ xfs_buf_relse(agi_bp);
+ return error;
+}
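
The AG sizing rule implemented by __xfs_ag_block_count() above can be restated as a
standalone, runnable C sketch (example only; the sample geometry is made up): every AG
spans sb_agblocks except possibly the last one, which only gets whatever remains of
sb_dblocks.

#include <stdio.h>
#include <stdint.h>

/* Mirrors __xfs_ag_block_count(): the last AG may be a short "runt" AG. */
static uint32_t ag_block_count(uint32_t agno, uint32_t agcount,
                               uint32_t agblocks, uint64_t dblocks)
{
        if (agno < agcount - 1)
                return agblocks;
        return (uint32_t)(dblocks - (uint64_t)agno * agblocks);
}

int main(void)
{
        uint32_t agcount = 4, agblocks = 245760;        /* hypothetical geometry */
        uint64_t dblocks = 3 * 245760ULL + 131072;      /* last AG is a runt */

        for (uint32_t agno = 0; agno < agcount; agno++)
                printf("AG %u: %u blocks\n", agno,
                       ag_block_count(agno, agcount, agblocks, dblocks));
        return 0;
}
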
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
new file mode 100644
index 000000000..191b22b9a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2018 Red Hat, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __LIBXFS_AG_H
+#define __LIBXFS_AG_H 1
+
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_perag;
+
+/*
+ * Per-ag infrastructure
+ */
+
+/* per-AG block reservation data structures*/
+struct xfs_ag_resv {
+ /* number of blocks originally reserved here */
+ xfs_extlen_t ar_orig_reserved;
+ /* number of blocks reserved here */
+ xfs_extlen_t ar_reserved;
+ /* number of blocks originally asked for */
+ xfs_extlen_t ar_asked;
+};
+
+/*
+ * Per-ag incore structure, copies of information in agf and agi, to improve the
+ * performance of allocation group selection.
+ */
+struct xfs_perag {
+ struct xfs_mount *pag_mount; /* owner filesystem */
+ xfs_agnumber_t pag_agno; /* AG this structure belongs to */
+ atomic_t pag_ref; /* perag reference count */
+ char pagf_init; /* this agf's entry is initialized */
+ char pagi_init; /* this agi's entry is initialized */
+ char pagf_metadata; /* the agf is preferred to be metadata */
+ char pagi_inodeok; /* The agi is ok for inodes */
+ uint8_t pagf_levels[XFS_BTNUM_AGF];
+ /* # of levels in bno & cnt btree */
+ bool pagf_agflreset; /* agfl requires reset before use */
+ uint32_t pagf_flcount; /* count of blocks in freelist */
+ xfs_extlen_t pagf_freeblks; /* total free blocks */
+ xfs_extlen_t pagf_longest; /* longest free space */
+ uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
+ xfs_agino_t pagi_freecount; /* number of free inodes */
+ xfs_agino_t pagi_count; /* number of allocated inodes */
+
+ /*
+ * Inode allocation search lookup optimisation.
+ * If the pagino matches, the search for new inodes
+ * doesn't need to search the near ones again straight away
+ */
+ xfs_agino_t pagl_pagino;
+ xfs_agino_t pagl_leftrec;
+ xfs_agino_t pagl_rightrec;
+
+ int pagb_count; /* pagb slots in use */
+ uint8_t pagf_refcount_level; /* refcount btree height */
+
+ /* Blocks reserved for all kinds of metadata. */
+ struct xfs_ag_resv pag_meta_resv;
+ /* Blocks reserved for the reverse mapping btree. */
+ struct xfs_ag_resv pag_rmapbt_resv;
+
+ /* for rcu-safe freeing */
+ struct rcu_head rcu_head;
+
+ /* Precalculated geometry info */
+ xfs_agblock_t block_count;
+ xfs_agblock_t min_block;
+ xfs_agino_t agino_min;
+ xfs_agino_t agino_max;
+
+#ifdef __KERNEL__
+ /* -- kernel only structures below this line -- */
+
+ /*
+ * Bitsets of per-ag metadata that have been checked and/or are sick.
+ * Callers should hold pag_state_lock before accessing this field.
+ */
+ uint16_t pag_checked;
+ uint16_t pag_sick;
+ spinlock_t pag_state_lock;
+
+ spinlock_t pagb_lock; /* lock for pagb_tree */
+ struct rb_root pagb_tree; /* ordered tree of busy extents */
+ unsigned int pagb_gen; /* generation count for pagb_tree */
+ wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */
+
+ atomic_t pagf_fstrms; /* # of filestreams active in this AG */
+
+ spinlock_t pag_ici_lock; /* incore inode cache lock */
+ struct radix_tree_root pag_ici_root; /* incore inode cache root */
+ int pag_ici_reclaimable; /* reclaimable inodes */
+ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
+
+ /* buffer cache index */
+ spinlock_t pag_buf_lock; /* lock for pag_buf_hash */
+ struct rhashtable pag_buf_hash;
+
+ /* background prealloc block trimming */
+ struct delayed_work pag_blockgc_work;
+
+#endif /* __KERNEL__ */
+};
+
+int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
+ xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi);
+int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
+void xfs_free_perag(struct xfs_mount *mp);
+
+struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
+struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
+ unsigned int tag);
+void xfs_perag_put(struct xfs_perag *pag);
+
+/*
+ * Per-ag geometry information and validation
+ */
+xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno);
+void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t *first, xfs_agino_t *last);
+
+static inline bool
+xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno)
+{
+ if (agbno >= pag->block_count)
+ return false;
+ if (agbno <= pag->min_block)
+ return false;
+ return true;
+}
+
+static inline bool
+xfs_verify_agbext(
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno,
+ xfs_agblock_t len)
+{
+ if (agbno + len <= agbno)
+ return false;
+
+ if (!xfs_verify_agbno(pag, agbno))
+ return false;
+
+ return xfs_verify_agbno(pag, agbno + len - 1);
+}
+
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata.
+ */
+static inline bool
+xfs_verify_agino(struct xfs_perag *pag, xfs_agino_t agino)
+{
+ if (agino < pag->agino_min)
+ return false;
+ if (agino > pag->agino_max)
+ return false;
+ return true;
+}
+
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata, or is NULLAGINO.
+ */
+static inline bool
+xfs_verify_agino_or_null(struct xfs_perag *pag, xfs_agino_t agino)
+{
+ if (agino == NULLAGINO)
+ return true;
+ return xfs_verify_agino(pag, agino);
+}
+
+static inline bool
+xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno)
+{
+ return mp->m_sb.sb_logstart > 0 &&
+ agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
+}
+
+/*
+ * Perag iteration APIs
+ */
+static inline struct xfs_perag *
+xfs_perag_next(
+ struct xfs_perag *pag,
+ xfs_agnumber_t *agno,
+ xfs_agnumber_t end_agno)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ *agno = pag->pag_agno + 1;
+ xfs_perag_put(pag);
+ if (*agno > end_agno)
+ return NULL;
+ return xfs_perag_get(mp, *agno);
+}
+
+#define for_each_perag_range(mp, agno, end_agno, pag) \
+ for ((pag) = xfs_perag_get((mp), (agno)); \
+ (pag) != NULL; \
+ (pag) = xfs_perag_next((pag), &(agno), (end_agno)))
+
+#define for_each_perag_from(mp, agno, pag) \
+ for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag))
+
+
+#define for_each_perag(mp, agno, pag) \
+ (agno) = 0; \
+ for_each_perag_from((mp), (agno), (pag))
+
+#define for_each_perag_tag(mp, agno, pag, tag) \
+ for ((agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \
+ (pag) != NULL; \
+ (agno) = (pag)->pag_agno + 1, \
+ xfs_perag_put(pag), \
+ (pag) = xfs_perag_get_tag((mp), (agno), (tag)))
+
+struct aghdr_init_data {
+ /* per ag data */
+ xfs_agblock_t agno; /* ag to init */
+ xfs_extlen_t agsize; /* new AG size */
+ struct list_head buffer_list; /* buffer writeback list */
+ xfs_rfsblock_t nfree; /* cumulative new free space */
+
+ /* per header data */
+ xfs_daddr_t daddr; /* header location */
+ size_t numblks; /* size of header */
+ xfs_btnum_t type; /* type of btree root block */
+};
+
+int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
+int xfs_ag_shrink_space(struct xfs_perag *pag, struct xfs_trans **tpp,
+ xfs_extlen_t delta);
+int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp,
+ xfs_extlen_t len);
+int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
+
+#endif /* __LIBXFS_AG_H */
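
A short usage sketch for the iteration macros above (illustrative only; the helper is
hypothetical and only the macros and fields declared in this header are assumed):

static int example_count_reclaimable(struct xfs_mount *mp)
{
        struct xfs_perag        *pag;
        xfs_agnumber_t          agno;
        int                     total = 0;

        /*
         * for_each_perag() takes and drops the passive reference on each
         * pag via xfs_perag_get()/xfs_perag_next(); breaking out of the
         * loop early would leave a reference held, so a caller doing that
         * must call xfs_perag_put() itself.
         */
        for_each_perag(mp, agno, pag)
                total += pag->pag_ici_reclaimable;

        return total;   /* point-in-time sample only */
}
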
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
new file mode 100644
index 000000000..5af123d13
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_alloc.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+
+/*
+ * Per-AG Block Reservations
+ *
+ * For some kinds of allocation group metadata structures, it is advantageous
+ * to reserve a small number of blocks in each AG so that future expansions of
+ * that data structure do not encounter ENOSPC; an allocation failure in the
+ * middle of a btree split forces the filesystem offline.
+ *
+ * Prior to the introduction of reflink, this wasn't an issue because the free
+ * space btrees maintain a reserve of space (the AGFL) to handle any expansion
+ * that may be necessary; and allocations of other metadata (inodes, BMBT,
+ * dir/attr) aren't restricted to a single AG. However, with reflink it is
+ * possible to allocate all the space in an AG, have subsequent reflink/CoW
+ * activity expand the refcount btree, and discover that there's no space left
+ * to handle that expansion. Since we can calculate the maximum size of the
+ * refcount btree, we can reserve space for it and avoid ENOSPC.
+ *
+ * Handling per-AG reservations consists of four changes to the allocator's
+ * behavior: First, because these reservations are always needed, we decrease
+ * the ag_max_usable counter to reflect the size of the AG after the reserved
+ * blocks are taken. Second, the reservations must be reflected in the
+ * fdblocks count to maintain proper accounting. Third, each AG must maintain
+ * its own reserved block counter so that we can calculate the amount of space
+ * that must remain free to maintain the reservations. Fourth, the "remaining
+ * reserved blocks" count must be used when calculating the length of the
+ * longest free extent in an AG and to clamp maxlen in the per-AG allocation
+ * functions. In other words, we maintain a virtual allocation via in-core
+ * accounting tricks so that we don't have to clean up after a crash. :)
+ *
+ * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
+ * values via struct xfs_alloc_arg or directly to the xfs_free_extent
+ * function. It might seem a little funny to maintain a reservoir of blocks
+ * to feed another reservoir, but the AGFL only holds enough blocks to get
+ * through the next transaction. The per-AG reservation is to ensure (we
+ * hope) that each AG never runs out of blocks. Each data structure wanting
+ * to use the reservation system should update ask/used in xfs_ag_resv_init.
+ */
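+
+/*
+ * Minimal sketch of the interface described above (illustrative only; the
+ * transaction setup and the remaining argument fields are omitted, and the
+ * length values are invented): a metadata btree that wants its block
+ * allocations charged against the per-AG pool tags its allocation arguments
+ * with the reservation type.
+ *
+ * struct xfs_alloc_arg args = { .tp = tp, .mp = mp };
+ *
+ * args.resv = XFS_AG_RESV_METADATA;
+ * args.minlen = args.maxlen = args.prod = 1;
+ */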
+
+/*
+ * Are we critically low on blocks? For now we'll define that as the number
+ * of blocks we can get our hands on being less than 10% of what we reserved
+ * or less than some arbitrary number (maximum btree height).
+ */
+bool
+xfs_ag_resv_critical(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ xfs_extlen_t avail;
+ xfs_extlen_t orig;
+
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved;
+ orig = pag->pag_meta_resv.ar_asked;
+ break;
+ case XFS_AG_RESV_RMAPBT:
+ avail = pag->pagf_freeblks + pag->pagf_flcount -
+ pag->pag_meta_resv.ar_reserved;
+ orig = pag->pag_rmapbt_resv.ar_asked;
+ break;
+ default:
+ ASSERT(0);
+ return false;
+ }
+
+ trace_xfs_ag_resv_critical(pag, type, avail);
+
+ /* Critically low if less than 10% or max btree height remains. */
+ return XFS_TEST_ERROR(avail < orig / 10 ||
+ avail < pag->pag_mount->m_agbtree_maxlevels,
+ pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
+}
+
+/*
+ * How many blocks are reserved but not used, and therefore must not be
+ * allocated away?
+ */
+xfs_extlen_t
+xfs_ag_resv_needed(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ xfs_extlen_t len;
+
+ len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved;
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_RMAPBT:
+ len -= xfs_perag_resv(pag, type)->ar_reserved;
+ break;
+ case XFS_AG_RESV_NONE:
+ /* empty */
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ trace_xfs_ag_resv_needed(pag, type, len);
+
+ return len;
+}
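+
+/*
+ * Rough sketch of how the value above is consumed (illustrative, loosely
+ * modelled on the AG space checks in xfs_alloc.c; AGFL clamping and minleft
+ * handling are omitted, and min_free stands for the AGFL refill requirement
+ * computed elsewhere): the still-unused reservation is kept out of the space
+ * an allocation may consume.
+ *
+ * reservation = xfs_ag_resv_needed(pag, args->resv);
+ * available = pag->pagf_freeblks + pag->pagf_flcount - reservation - min_free;
+ */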
+
+/* Clean out a reservation */
+static int
+__xfs_ag_resv_free(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ struct xfs_ag_resv *resv;
+ xfs_extlen_t oldresv;
+ int error;
+
+ trace_xfs_ag_resv_free(pag, type, 0);
+
+ resv = xfs_perag_resv(pag, type);
+ if (pag->pag_agno == 0)
+ pag->pag_mount->m_ag_max_usable += resv->ar_asked;
+ /*
+ * RMAPBT blocks come from the AGFL and AGFL blocks are always
+ * considered "free", so whatever was reserved at mount time must be
+ * given back at umount.
+ */
+ if (type == XFS_AG_RESV_RMAPBT)
+ oldresv = resv->ar_orig_reserved;
+ else
+ oldresv = resv->ar_reserved;
+ error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
+ resv->ar_reserved = 0;
+ resv->ar_asked = 0;
+ resv->ar_orig_reserved = 0;
+
+ if (error)
+ trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/* Free a per-AG reservation. */
+int
+xfs_ag_resv_free(
+ struct xfs_perag *pag)
+{
+ int error;
+ int err2;
+
+ error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
+ err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
+ if (err2 && !error)
+ error = err2;
+ return error;
+}
+
+static int
+__xfs_ag_resv_init(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type,
+ xfs_extlen_t ask,
+ xfs_extlen_t used)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_ag_resv *resv;
+ int error;
+ xfs_extlen_t hidden_space;
+
+ if (used > ask)
+ ask = used;
+
+ switch (type) {
+ case XFS_AG_RESV_RMAPBT:
+ /*
+ * Space taken by the rmapbt is not subtracted from fdblocks
+ * because the rmapbt lives in the free space. Here we must
+ * subtract the entire reservation from fdblocks so that we
+ * always have blocks available for rmapbt expansion.
+ */
+ hidden_space = ask;
+ break;
+ case XFS_AG_RESV_METADATA:
+ /*
+ * Space taken by all other metadata btrees is accounted
+ * on-disk as used space. We therefore only hide the space
+ * that is reserved but not used by the trees.
+ */
+ hidden_space = ask - used;
+ break;
+ default:
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
+ error = -ENOSPC;
+ else
+ error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
+ if (error) {
+ trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
+ error, _RET_IP_);
+ xfs_warn(mp,
+"Per-AG reservation for AG %u failed. Filesystem may run out of space.",
+ pag->pag_agno);
+ return error;
+ }
+
+ /*
+ * Reduce the maximum per-AG allocation length by however much we're
+ * trying to reserve for an AG. Since this is a filesystem-wide
+ * counter, we only make the adjustment for AG 0. This assumes that
+ * there aren't any AGs hungrier for per-AG reservation than AG 0.
+ */
+ if (pag->pag_agno == 0)
+ mp->m_ag_max_usable -= ask;
+
+ resv = xfs_perag_resv(pag, type);
+ resv->ar_asked = ask;
+ resv->ar_orig_reserved = hidden_space;
+ resv->ar_reserved = ask - used;
+
+ trace_xfs_ag_resv_init(pag, type, ask);
+ return 0;
+}
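+
+/*
+ * Worked example for the accounting above (numbers chosen for illustration):
+ * with ask = 100 and used = 10, a METADATA reservation hides ask - used = 90
+ * blocks from fdblocks (ar_orig_reserved = 90), while an RMAPBT reservation
+ * hides the full ask of 100 blocks (ar_orig_reserved = 100) because rmapbt
+ * blocks live in free space that fdblocks already counts. Both end up with
+ * ar_reserved = ask - used = 90 blocks still unused.
+ */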
+
+/* Create a per-AG block reservation. */
+int
+xfs_ag_resv_init(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ xfs_extlen_t ask;
+ xfs_extlen_t used;
+ int error = 0, error2;
+ bool has_resv = false;
+
+ /* Create the metadata reservation. */
+ if (pag->pag_meta_resv.ar_asked == 0) {
+ ask = used = 0;
+
+ error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used);
+ if (error)
+ goto out;
+
+ error = xfs_finobt_calc_reserves(mp, tp, pag, &ask, &used);
+ if (error)
+ goto out;
+
+ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+ ask, used);
+ if (error) {
+ /*
+ * Because we didn't have per-AG reservations when the
+ * finobt feature was added we might not be able to
+ * reserve all needed blocks. Warn and fall back to the
+ * old and potentially buggy code in that case, but
+ * ensure we do have the reservation for the refcountbt.
+ */
+ ask = used = 0;
+
+ mp->m_finobt_nores = true;
+
+ error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask,
+ &used);
+ if (error)
+ goto out;
+
+ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+ ask, used);
+ if (error)
+ goto out;
+ }
+ if (ask)
+ has_resv = true;
+ }
+
+ /* Create the RMAPBT metadata reservation */
+ if (pag->pag_rmapbt_resv.ar_asked == 0) {
+ ask = used = 0;
+
+ error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used);
+ if (error)
+ goto out;
+
+ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
+ if (error)
+ goto out;
+ if (ask)
+ has_resv = true;
+ }
+
+out:
+ /*
+ * Initialize the pagf if we have at least one active reservation on the
+ * AG. This may have occurred already via reservation calculation, but
+ * fall back to an explicit init to ensure the in-core allocbt usage
+ * counters are initialized as soon as possible. This is important
+ * because filesystems with large perag reservations are susceptible to
+ * free space reservation problems that the allocbt counter is used to
+ * address.
+ */
+ if (has_resv) {
+ error2 = xfs_alloc_read_agf(pag, tp, 0, NULL);
+ if (error2)
+ return error2;
+
+ /*
+ * If there isn't enough space in the AG to satisfy the
+ * reservation, let the caller know that there wasn't enough
+ * space. Callers are responsible for deciding what to do
+ * next, since (in theory) we can stumble along with
+ * insufficient reservation if data blocks are being freed to
+ * replenish the AG's free space.
+ */
+ if (!error &&
+ xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
+ xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved >
+ pag->pagf_freeblks + pag->pagf_flcount)
+ error = -ENOSPC;
+ }
+
+ return error;
+}
+
+/* Allocate a block from the reservation. */
+void
+xfs_ag_resv_alloc_extent(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type,
+ struct xfs_alloc_arg *args)
+{
+ struct xfs_ag_resv *resv;
+ xfs_extlen_t len;
+ uint field;
+
+ trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
+
+ switch (type) {
+ case XFS_AG_RESV_AGFL:
+ return;
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_RMAPBT:
+ resv = xfs_perag_resv(pag, type);
+ break;
+ default:
+ ASSERT(0);
+ fallthrough;
+ case XFS_AG_RESV_NONE:
+ field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
+ XFS_TRANS_SB_FDBLOCKS;
+ xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
+ return;
+ }
+
+ len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
+ resv->ar_reserved -= len;
+ if (type == XFS_AG_RESV_RMAPBT)
+ return;
+ /* Allocations of reserved blocks only need on-disk sb updates... */
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
+ /* ...but non-reserved blocks need in-core and on-disk updates. */
+ if (args->len > len)
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
+ -((int64_t)args->len - len));
+}
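+
+/*
+ * Worked example for the split above (illustrative numbers, METADATA case):
+ * with args->len = 5 and ar_reserved = 3, three blocks come out of the
+ * reservation and only the on-disk RES_FDBLOCKS counter moves by -3; the
+ * remaining two blocks are charged to FDBLOCKS both in core and on disk.
+ */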
+
+/* Free a block to the reservation. */
+void
+xfs_ag_resv_free_extent(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type,
+ struct xfs_trans *tp,
+ xfs_extlen_t len)
+{
+ xfs_extlen_t leftover;
+ struct xfs_ag_resv *resv;
+
+ trace_xfs_ag_resv_free_extent(pag, type, len);
+
+ switch (type) {
+ case XFS_AG_RESV_AGFL:
+ return;
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_RMAPBT:
+ resv = xfs_perag_resv(pag, type);
+ break;
+ default:
+ ASSERT(0);
+ fallthrough;
+ case XFS_AG_RESV_NONE:
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
+ return;
+ }
+
+ leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
+ resv->ar_reserved += leftover;
+ if (type == XFS_AG_RESV_RMAPBT)
+ return;
+ /* Freeing into the reserved pool only requires on-disk update... */
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
+ /* ...but freeing beyond that requires in-core and on-disk update. */
+ if (len > leftover)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
+}
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
new file mode 100644
index 000000000..b74b21000
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_AG_RESV_H__
+#define __XFS_AG_RESV_H__
+
+int xfs_ag_resv_free(struct xfs_perag *pag);
+int xfs_ag_resv_init(struct xfs_perag *pag, struct xfs_trans *tp);
+
+bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type);
+xfs_extlen_t xfs_ag_resv_needed(struct xfs_perag *pag,
+ enum xfs_ag_resv_type type);
+
+void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+ struct xfs_alloc_arg *args);
+void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+ struct xfs_trans *tp, xfs_extlen_t len);
+
+static inline struct xfs_ag_resv *
+xfs_perag_resv(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ return &pag->pag_meta_resv;
+ case XFS_AG_RESV_RMAPBT:
+ return &pag->pag_rmapbt_resv;
+ default:
+ return NULL;
+ }
+}
+
+/*
+ * RMAPBT reservation accounting wrappers. Since rmapbt blocks are sourced from
+ * the AGFL, they are allocated one at a time and the reservation updates don't
+ * require a transaction.
+ */
+static inline void
+xfs_ag_resv_rmapbt_alloc(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_alloc_arg args = { NULL };
+ struct xfs_perag *pag;
+
+ args.len = 1;
+ pag = xfs_perag_get(mp, agno);
+ xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args);
+ xfs_perag_put(pag);
+}
+
+#endif /* __XFS_AG_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
new file mode 100644
index 000000000..de79f5d07
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -0,0 +1,3568 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_bmap.h"
+
+struct kmem_cache *xfs_extfree_item_cache;
+
+struct workqueue_struct *xfs_alloc_wq;
+
+#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
+
+#define XFSA_FIXUP_BNO_OK 1
+#define XFSA_FIXUP_CNT_OK 2
+
+STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
+
+/*
+ * Size of the AGFL. For CRC-enabled filesystems we steal a couple of slots in
+ * the beginning of the block for a proper header with the location information
+ * and CRC.
+ */
+unsigned int
+xfs_agfl_size(
+ struct xfs_mount *mp)
+{
+ unsigned int size = mp->m_sb.sb_sectsize;
+
+ if (xfs_has_crc(mp))
+ size -= sizeof(struct xfs_agfl);
+
+ return size / sizeof(xfs_agblock_t);
+}
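+
+/*
+ * For example, with the common 512-byte sector size this works out to
+ * (512 - 36) / 4 = 119 AGFL slots on a CRC-enabled filesystem (the struct
+ * xfs_agfl header is 36 bytes) and 512 / 4 = 128 slots otherwise.
+ */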
+
+unsigned int
+xfs_refc_block(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_rmapbt(mp))
+ return XFS_RMAP_BLOCK(mp) + 1;
+ if (xfs_has_finobt(mp))
+ return XFS_FIBT_BLOCK(mp) + 1;
+ return XFS_IBT_BLOCK(mp) + 1;
+}
+
+xfs_extlen_t
+xfs_prealloc_blocks(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_reflink(mp))
+ return xfs_refc_block(mp) + 1;
+ if (xfs_has_rmapbt(mp))
+ return XFS_RMAP_BLOCK(mp) + 1;
+ if (xfs_has_finobt(mp))
+ return XFS_FIBT_BLOCK(mp) + 1;
+ return XFS_IBT_BLOCK(mp) + 1;
+}
+
+/*
+ * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
+ * guarantee that we can refill the AGFL prior to allocating space in a nearly
+ * full AG. Although the space described by the free space btrees, the
+ * blocks used by the freesp btrees themselves, and the blocks owned by the
+ * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
+ * free space in the AG drop so low that the free space btrees cannot refill an
+ * empty AGFL up to the minimum level. Rather than grind through empty AGs
+ * until the fs goes down, we subtract this many AG blocks from the incore
+ * fdblocks to ensure user allocation does not overcommit the space the
+ * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to
+ * withhold space from xfs_mod_fdblocks, so we do not account for that here.
+ */
+#define XFS_ALLOCBT_AGFL_RESERVE 4
+
+/*
+ * Compute the number of blocks that we set aside to guarantee the ability to
+ * refill the AGFL and handle a full bmap btree split.
+ *
+ * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
+ * AGF buffer (PV 947395), we place constraints on the relationship among
+ * actual allocations for data blocks, freelist blocks, and potential file data
+ * bmap btree blocks. However, these restrictions may result in no actual space
+ * being allocated for a delayed extent: for example, a data block is allocated
+ * in a certain AG but no block is left over for the extra bmap btree block
+ * needed when the file's bmap btree splits. This can lead to an infinite loop
+ * when the file gets flushed to disk and all delayed extents need to be
+ * actually allocated.
+ * aside a few blocks which will not be reserved in delayed allocation.
+ *
+ * For each AG, we need to reserve enough blocks to replenish a totally empty
+ * AGFL and 4 more to handle a potential split of the file's bmap btree.
+ */
+unsigned int
+xfs_alloc_set_aside(
+ struct xfs_mount *mp)
+{
+ return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
+}
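+
+/*
+ * For example, a filesystem with 16 AGs sets aside 16 * (4 + 4) = 128 blocks
+ * of incore fdblocks for AGFL refills and bmap btree splits.
+ */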
+
+/*
+ * When deciding how much space to allocate out of an AG, we limit the
+ * allocation maximum size to the size of the AG. However, we cannot use all the
+ * blocks in the AG - some are permanently used by metadata. These
+ * blocks are generally:
+ * - the AG superblock, AGF, AGI and AGFL
+ * - the AGF (bno and cnt) and AGI btree root blocks
+ * - the AGI free inode btree root block, if the finobt feature is enabled
+ * - blocks on the AGFL according to xfs_alloc_set_aside() limits
+ * - the rmap and refcount btree root blocks, if those features are enabled
+ *
+ * The AG headers are sector sized, so the amount of space they take up is
+ * dependent on filesystem geometry. The others are all single blocks.
+ */
+unsigned int
+xfs_alloc_ag_max_usable(
+ struct xfs_mount *mp)
+{
+ unsigned int blocks;
+
+ blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
+ blocks += XFS_ALLOCBT_AGFL_RESERVE;
+ blocks += 3; /* AGF, AGI btree root blocks */
+ if (xfs_has_finobt(mp))
+ blocks++; /* finobt root block */
+ if (xfs_has_rmapbt(mp))
+ blocks++; /* rmap root block */
+ if (xfs_has_reflink(mp))
+ blocks++; /* refcount root block */
+
+ return mp->m_sb.sb_agblocks - blocks;
+}
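+
+/*
+ * Worked example (one plausible geometry): on a filesystem with 4k blocks,
+ * 512-byte sectors and the finobt, rmapbt and reflink features enabled, the
+ * four sector-sized headers round up to a single block, so
+ * blocks = 1 + 4 + 3 + 1 + 1 + 1 = 11 and the function returns
+ * sb_agblocks - 11.
+ */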
+
+/*
+ * Lookup the record equal to [bno, len] in the btree given by cur.
+ */
+STATIC int /* error */
+xfs_alloc_lookup_eq(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* success/failure */
+{
+ int error;
+
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+ cur->bc_ag.abt.active = (*stat == 1);
+ return error;
+}
+
+/*
+ * Lookup the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int /* error */
+xfs_alloc_lookup_ge(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* success/failure */
+{
+ int error;
+
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+ cur->bc_ag.abt.active = (*stat == 1);
+ return error;
+}
+
+/*
+ * Lookup the first record less than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int /* error */
+xfs_alloc_lookup_le(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* success/failure */
+{
+ int error;
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+ cur->bc_ag.abt.active = (*stat == 1);
+ return error;
+}
+
+static inline bool
+xfs_alloc_cur_active(
+ struct xfs_btree_cur *cur)
+{
+ return cur && cur->bc_ag.abt.active;
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int /* error */
+xfs_alloc_update(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len) /* length of extent */
+{
+ union xfs_btree_rec rec;
+
+ rec.alloc.ar_startblock = cpu_to_be32(bno);
+ rec.alloc.ar_blockcount = cpu_to_be32(len);
+ return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int /* error */
+xfs_alloc_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t *bno, /* output: starting block of extent */
+ xfs_extlen_t *len, /* output: length of extent */
+ int *stat) /* output: success/failure */
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (error || !(*stat))
+ return error;
+
+ *bno = be32_to_cpu(rec->alloc.ar_startblock);
+ *len = be32_to_cpu(rec->alloc.ar_blockcount);
+
+ if (*len == 0)
+ goto out_bad_rec;
+
+ /* check for valid extent range, including overflow */
+ if (!xfs_verify_agbext(pag, *bno, *len))
+ goto out_bad_rec;
+
+ return 0;
+
+out_bad_rec:
+ xfs_warn(mp,
+ "%s Freespace BTree record corruption in AG %d detected!",
+ cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
+ pag->pag_agno);
+ xfs_warn(mp,
+ "start block 0x%x block count 0x%x", *bno, *len);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Compute aligned version of the found extent.
+ * Takes alignment and min length into account.
+ */
+STATIC bool
+xfs_alloc_compute_aligned(
+ xfs_alloc_arg_t *args, /* allocation argument structure */
+ xfs_agblock_t foundbno, /* starting block in found extent */
+ xfs_extlen_t foundlen, /* length in found extent */
+ xfs_agblock_t *resbno, /* result block number */
+ xfs_extlen_t *reslen, /* result length */
+ unsigned *busy_gen)
+{
+ xfs_agblock_t bno = foundbno;
+ xfs_extlen_t len = foundlen;
+ xfs_extlen_t diff;
+ bool busy;
+
+ /* Trim busy sections out of found extent */
+ busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
+
+ /*
+ * If we have a largish extent that happens to start before min_agbno,
+ * see if we can shift it into range...
+ */
+ if (bno < args->min_agbno && bno + len > args->min_agbno) {
+ diff = args->min_agbno - bno;
+ if (len > diff) {
+ bno += diff;
+ len -= diff;
+ }
+ }
+
+ if (args->alignment > 1 && len >= args->minlen) {
+ xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
+
+ diff = aligned_bno - bno;
+
+ *resbno = aligned_bno;
+ *reslen = diff >= len ? 0 : len - diff;
+ } else {
+ *resbno = bno;
+ *reslen = len;
+ }
+
+ return busy;
+}
+
+/*
+ * Compute best start block and diff for "near" allocations.
+ * freelen >= wantlen already checked by caller.
+ */
+STATIC xfs_extlen_t /* difference value (absolute) */
+xfs_alloc_compute_diff(
+ xfs_agblock_t wantbno, /* target starting block */
+ xfs_extlen_t wantlen, /* target length */
+ xfs_extlen_t alignment, /* target alignment */
+ int datatype, /* are we allocating data? */
+ xfs_agblock_t freebno, /* freespace's starting block */
+ xfs_extlen_t freelen, /* freespace's length */
+ xfs_agblock_t *newbnop) /* result: best start block from free */
+{
+ xfs_agblock_t freeend; /* end of freespace extent */
+ xfs_agblock_t newbno1; /* return block number */
+ xfs_agblock_t newbno2; /* other new block number */
+ xfs_extlen_t newlen1=0; /* length with newbno1 */
+ xfs_extlen_t newlen2=0; /* length with newbno2 */
+ xfs_agblock_t wantend; /* end of target extent */
+ bool userdata = datatype & XFS_ALLOC_USERDATA;
+
+ ASSERT(freelen >= wantlen);
+ freeend = freebno + freelen;
+ wantend = wantbno + wantlen;
+ /*
+ * We want to allocate from the start of a free extent if it is past
+ * the desired block or if we are allocating user data and the free
+ * extent is before desired block. The second case is there to allow
+ * for contiguous allocation from the remaining free space if the file
+ * grows in the short term.
+ */
+ if (freebno >= wantbno || (userdata && freeend < wantend)) {
+ if ((newbno1 = roundup(freebno, alignment)) >= freeend)
+ newbno1 = NULLAGBLOCK;
+ } else if (freeend >= wantend && alignment > 1) {
+ newbno1 = roundup(wantbno, alignment);
+ newbno2 = newbno1 - alignment;
+ if (newbno1 >= freeend)
+ newbno1 = NULLAGBLOCK;
+ else
+ newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
+ if (newbno2 < freebno)
+ newbno2 = NULLAGBLOCK;
+ else
+ newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
+ if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
+ if (newlen1 < newlen2 ||
+ (newlen1 == newlen2 &&
+ XFS_ABSDIFF(newbno1, wantbno) >
+ XFS_ABSDIFF(newbno2, wantbno)))
+ newbno1 = newbno2;
+ } else if (newbno2 != NULLAGBLOCK)
+ newbno1 = newbno2;
+ } else if (freeend >= wantend) {
+ newbno1 = wantbno;
+ } else if (alignment > 1) {
+ newbno1 = roundup(freeend - wantlen, alignment);
+ if (newbno1 > freeend - wantlen &&
+ newbno1 - alignment >= freebno)
+ newbno1 -= alignment;
+ else if (newbno1 >= freeend)
+ newbno1 = NULLAGBLOCK;
+ } else
+ newbno1 = freeend - wantlen;
+ *newbnop = newbno1;
+ return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
+}
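+
+/*
+ * Worked example (illustrative): with wantbno = 100, wantlen = 10,
+ * alignment = 1 and a free extent at freebno = 120, freelen = 50, the free
+ * extent starts past the target, so the first branch picks newbno = 120 and
+ * the returned diff is |120 - 100| = 20.
+ */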
+
+/*
+ * Fix up the length, based on mod and prod.
+ * len should be k * prod + mod for some k.
+ * If len is too small it is returned unchanged.
+ * If len hits maxlen it is left alone.
+ */
+STATIC void
+xfs_alloc_fix_len(
+ xfs_alloc_arg_t *args) /* allocation argument structure */
+{
+ xfs_extlen_t k;
+ xfs_extlen_t rlen;
+
+ ASSERT(args->mod < args->prod);
+ rlen = args->len;
+ ASSERT(rlen >= args->minlen);
+ ASSERT(rlen <= args->maxlen);
+ if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
+ (args->mod == 0 && rlen < args->prod))
+ return;
+ k = rlen % args->prod;
+ if (k == args->mod)
+ return;
+ if (k > args->mod)
+ rlen = rlen - (k - args->mod);
+ else
+ rlen = rlen - args->prod + (args->mod - k);
+ /* casts to (int) catch length underflows */
+ if ((int)rlen < (int)args->minlen)
+ return;
+ ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
+ ASSERT(rlen % args->prod == args->mod);
+ ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
+ rlen + args->minleft);
+ args->len = rlen;
+}
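+
+/*
+ * Worked example (illustrative, assuming maxlen is larger than the candidate):
+ * with prod = 4, mod = 1, minlen = 2 and rlen = 11, k = 11 % 4 = 3 > mod, so
+ * the length is trimmed to 11 - (3 - 1) = 9 and 9 % 4 == 1 == mod as required.
+ */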
+
+/*
+ * Update the two btrees, logically removing from freespace the extent
+ * starting at rbno, rlen blocks. The extent is contained within the
+ * actual (current) free extent fbno for flen blocks.
+ * Flags are passed in indicating whether the cursors are set to the
+ * relevant records.
+ */
+STATIC int /* error code */
+xfs_alloc_fixup_trees(
+ struct xfs_btree_cur *cnt_cur, /* cursor for by-size btree */
+ struct xfs_btree_cur *bno_cur, /* cursor for by-block btree */
+ xfs_agblock_t fbno, /* starting block of free extent */
+ xfs_extlen_t flen, /* length of free extent */
+ xfs_agblock_t rbno, /* starting block of returned extent */
+ xfs_extlen_t rlen, /* length of returned extent */
+ int flags) /* flags, XFSA_FIXUP_... */
+{
+ int error; /* error code */
+ int i; /* operation results */
+ xfs_agblock_t nfbno1; /* first new free startblock */
+ xfs_agblock_t nfbno2; /* second new free startblock */
+ xfs_extlen_t nflen1=0; /* first new free length */
+ xfs_extlen_t nflen2=0; /* second new free length */
+ struct xfs_mount *mp;
+
+ mp = cnt_cur->bc_mp;
+
+ /*
+ * Look up the record in the by-size tree if necessary.
+ */
+ if (flags & XFSA_FIXUP_CNT_OK) {
+#ifdef DEBUG
+ if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp,
+ i != 1 ||
+ nfbno1 != fbno ||
+ nflen1 != flen))
+ return -EFSCORRUPTED;
+#endif
+ } else {
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+ }
+ /*
+ * Look up the record in the by-block tree if necessary.
+ */
+ if (flags & XFSA_FIXUP_BNO_OK) {
+#ifdef DEBUG
+ if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp,
+ i != 1 ||
+ nfbno1 != fbno ||
+ nflen1 != flen))
+ return -EFSCORRUPTED;
+#endif
+ } else {
+ if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+ }
+
+#ifdef DEBUG
+ if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
+ struct xfs_btree_block *bnoblock;
+ struct xfs_btree_block *cntblock;
+
+ bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
+ cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
+
+ if (XFS_IS_CORRUPT(mp,
+ bnoblock->bb_numrecs !=
+ cntblock->bb_numrecs))
+ return -EFSCORRUPTED;
+ }
+#endif
+
+ /*
+ * Deal with all four cases: the allocated record is contained
+ * within the freespace record, so we can have new freespace
+ * at either (or both) end, or no freespace remaining.
+ */
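+ /*
+ * For example (illustrative), carving rbno = 14, rlen = 6 out of a free
+ * extent with fbno = 10, flen = 20 leaves two new free extents: [10, 4]
+ * in front of the allocation and [20, 10] behind it.
+ */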
+ if (rbno == fbno && rlen == flen)
+ nfbno1 = nfbno2 = NULLAGBLOCK;
+ else if (rbno == fbno) {
+ nfbno1 = rbno + rlen;
+ nflen1 = flen - rlen;
+ nfbno2 = NULLAGBLOCK;
+ } else if (rbno + rlen == fbno + flen) {
+ nfbno1 = fbno;
+ nflen1 = flen - rlen;
+ nfbno2 = NULLAGBLOCK;
+ } else {
+ nfbno1 = fbno;
+ nflen1 = rbno - fbno;
+ nfbno2 = rbno + rlen;
+ nflen2 = (fbno + flen) - nfbno2;
+ }
+ /*
+ * Delete the entry from the by-size btree.
+ */
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+ /*
+ * Add new by-size btree entry(s).
+ */
+ if (nfbno1 != NULLAGBLOCK) {
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 0))
+ return -EFSCORRUPTED;
+ if ((error = xfs_btree_insert(cnt_cur, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+ }
+ if (nfbno2 != NULLAGBLOCK) {
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 0))
+ return -EFSCORRUPTED;
+ if ((error = xfs_btree_insert(cnt_cur, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+ }
+ /*
+ * Fix up the by-block btree entry(s).
+ */
+ if (nfbno1 == NULLAGBLOCK) {
+ /*
+ * No remaining freespace, just delete the by-block tree entry.
+ */
+ if ((error = xfs_btree_delete(bno_cur, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+ } else {
+ /*
+ * Update the by-block entry to start later|be shorter.
+ */
+ if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
+ return error;
+ }
+ if (nfbno2 != NULLAGBLOCK) {
+ /*
+ * 2 resulting free entries, need to add one.
+ */
+ if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 0))
+ return -EFSCORRUPTED;
+ if ((error = xfs_btree_insert(bno_cur, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+static xfs_failaddr_t
+xfs_agfl_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+ __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp);
+ int i;
+
+ /*
+ * There is no verification of non-crc AGFLs because mkfs does not
+ * initialise the AGFL to zero or NULL. Hence the only valid part of the
+ * AGFL is what the AGF says is active. We can't get to the AGF, so we
+ * can't verify just those entries are valid.
+ */
+ if (!xfs_has_crc(mp))
+ return NULL;
+
+ if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
+ return __this_address;
+ if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ /*
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+ return __this_address;
+
+ for (i = 0; i < xfs_agfl_size(mp); i++) {
+ if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
+ be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+ return __this_address;
+ }
+
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
+ return __this_address;
+ return NULL;
+}
+
+static void
+xfs_agfl_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ /*
+ * There is no verification of non-crc AGFLs because mkfs does not
+ * initialise the AGFL to zero or NULL. Hence the only valid part of the
+ * AGFL is what the AGF says is active. We can't get to the AGF, so we
+ * can't verify just those entries are valid.
+ */
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_agfl_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+static void
+xfs_agfl_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ xfs_failaddr_t fa;
+
+ /* no verification of non-crc AGFLs */
+ if (!xfs_has_crc(mp))
+ return;
+
+ fa = xfs_agfl_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (bip)
+ XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+ .name = "xfs_agfl",
+ .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
+ .verify_read = xfs_agfl_read_verify,
+ .verify_write = xfs_agfl_write_verify,
+ .verify_struct = xfs_agfl_verify,
+};
+
+/*
+ * Read in the allocation group free block array.
+ */
+int
+xfs_alloc_read_agfl(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_buf *bp;
+ int error;
+
+ error = xfs_trans_read_buf(
+ mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
+ if (error)
+ return error;
+ xfs_buf_set_ref(bp, XFS_AGFL_REF);
+ *bpp = bp;
+ return 0;
+}
+
+STATIC int
+xfs_alloc_update_counters(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ long len)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+
+ agbp->b_pag->pagf_freeblks += len;
+ be32_add_cpu(&agf->agf_freeblks, len);
+
+ if (unlikely(be32_to_cpu(agf->agf_freeblks) >
+ be32_to_cpu(agf->agf_length))) {
+ xfs_buf_mark_corrupt(agbp);
+ return -EFSCORRUPTED;
+ }
+
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
+ return 0;
+}
+
+/*
+ * Block allocation algorithm and data structures.
+ */
+struct xfs_alloc_cur {
+ struct xfs_btree_cur *cnt; /* btree cursors */
+ struct xfs_btree_cur *bnolt;
+ struct xfs_btree_cur *bnogt;
+ xfs_extlen_t cur_len;/* current search length */
+ xfs_agblock_t rec_bno;/* extent startblock */
+ xfs_extlen_t rec_len;/* extent length */
+ xfs_agblock_t bno; /* alloc bno */
+ xfs_extlen_t len; /* alloc len */
+ xfs_extlen_t diff; /* diff from search bno */
+ unsigned int busy_gen;/* busy state */
+ bool busy;
+};
+
+/*
+ * Set up cursors, etc. in the extent allocation cursor. This function can be
+ * called multiple times to reset an initialized structure without having to
+ * reallocate cursors.
+ */
+static int
+xfs_alloc_cur_setup(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur)
+{
+ int error;
+ int i;
+
+ ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);
+
+ acur->cur_len = args->maxlen;
+ acur->rec_bno = 0;
+ acur->rec_len = 0;
+ acur->bno = 0;
+ acur->len = 0;
+ acur->diff = -1;
+ acur->busy = false;
+ acur->busy_gen = 0;
+
+ /*
+ * Perform an initial cntbt lookup to check for availability of maxlen
+ * extents. If this fails, we'll return -ENOSPC to signal the caller to
+ * attempt a small allocation.
+ */
+ if (!acur->cnt)
+ acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag, XFS_BTNUM_CNT);
+ error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
+ if (error)
+ return error;
+
+ /*
+ * Allocate the bnobt left and right search cursors.
+ */
+ if (!acur->bnolt)
+ acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag, XFS_BTNUM_BNO);
+ if (!acur->bnogt)
+ acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag, XFS_BTNUM_BNO);
+ return i == 1 ? 0 : -ENOSPC;
+}
+
+static void
+xfs_alloc_cur_close(
+ struct xfs_alloc_cur *acur,
+ bool error)
+{
+ int cur_error = XFS_BTREE_NOERROR;
+
+ if (error)
+ cur_error = XFS_BTREE_ERROR;
+
+ if (acur->cnt)
+ xfs_btree_del_cursor(acur->cnt, cur_error);
+ if (acur->bnolt)
+ xfs_btree_del_cursor(acur->bnolt, cur_error);
+ if (acur->bnogt)
+ xfs_btree_del_cursor(acur->bnogt, cur_error);
+ acur->cnt = acur->bnolt = acur->bnogt = NULL;
+}
+
+/*
+ * Check an extent for allocation and track the best available candidate in the
+ * allocation structure. The cursor is deactivated if it has entered an out of
+ * range state based on allocation arguments. Optionally return the extent
+ * geometry and allocation status if requested by the caller.
+ */
+static int
+xfs_alloc_cur_check(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ struct xfs_btree_cur *cur,
+ int *new)
+{
+ int error, i;
+ xfs_agblock_t bno, bnoa, bnew;
+ xfs_extlen_t len, lena, diff = -1;
+ bool busy;
+ unsigned busy_gen = 0;
+ bool deactivate = false;
+ bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
+
+ *new = 0;
+
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(args->mp, i != 1))
+ return -EFSCORRUPTED;
+
+ /*
+ * Check minlen and deactivate a cntbt cursor if out of acceptable size
+ * range (i.e., walking backwards looking for a minlen extent).
+ */
+ if (len < args->minlen) {
+ deactivate = !isbnobt;
+ goto out;
+ }
+
+ busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
+ &busy_gen);
+ acur->busy |= busy;
+ if (busy)
+ acur->busy_gen = busy_gen;
+ /* deactivate a bnobt cursor outside of locality range */
+ if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
+ deactivate = isbnobt;
+ goto out;
+ }
+ if (lena < args->minlen)
+ goto out;
+
+ args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
+ xfs_alloc_fix_len(args);
+ ASSERT(args->len >= args->minlen);
+ if (args->len < acur->len)
+ goto out;
+
+ /*
+ * We have an aligned record that satisfies minlen and beats or matches
+ * the candidate extent size. Compare locality for near allocation mode.
+ */
+ ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
+ diff = xfs_alloc_compute_diff(args->agbno, args->len,
+ args->alignment, args->datatype,
+ bnoa, lena, &bnew);
+ if (bnew == NULLAGBLOCK)
+ goto out;
+
+ /*
+ * Deactivate a bnobt cursor with worse locality than the current best.
+ */
+ if (diff > acur->diff) {
+ deactivate = isbnobt;
+ goto out;
+ }
+
+ ASSERT(args->len > acur->len ||
+ (args->len == acur->len && diff <= acur->diff));
+ acur->rec_bno = bno;
+ acur->rec_len = len;
+ acur->bno = bnew;
+ acur->len = args->len;
+ acur->diff = diff;
+ *new = 1;
+
+ /*
+ * We're done if we found a perfect allocation. This only deactivates
+ * the current cursor, but this is just an optimization to terminate a
+ * cntbt search that otherwise runs to the edge of the tree.
+ */
+ if (acur->diff == 0 && acur->len == args->maxlen)
+ deactivate = true;
+out:
+ if (deactivate)
+ cur->bc_ag.abt.active = false;
+ trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
+ *new);
+ return 0;
+}
+
+/*
+ * Complete an allocation of a candidate extent. Remove the extent from both
+ * trees and update the args structure.
+ */
+STATIC int
+xfs_alloc_cur_finish(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur)
+{
+ struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
+ int error;
+
+ ASSERT(acur->cnt && acur->bnolt);
+ ASSERT(acur->bno >= acur->rec_bno);
+ ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
+ ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
+
+ error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
+ acur->rec_len, acur->bno, acur->len, 0);
+ if (error)
+ return error;
+
+ args->agbno = acur->bno;
+ args->len = acur->len;
+ args->wasfromfl = 0;
+
+ trace_xfs_alloc_cur(args);
+ return 0;
+}
+
+/*
+ * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
+ * bno optimized lookup to search for extents with ideal size and locality.
+ */
+STATIC int
+xfs_alloc_cntbt_iter(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur)
+{
+ struct xfs_btree_cur *cur = acur->cnt;
+ xfs_agblock_t bno;
+ xfs_extlen_t len, cur_len;
+ int error;
+ int i;
+
+ if (!xfs_alloc_cur_active(cur))
+ return 0;
+
+ /* locality optimized lookup */
+ cur_len = acur->cur_len;
+ error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
+ if (error)
+ return error;
+ if (i == 0)
+ return 0;
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (error)
+ return error;
+
+ /* check the current record and update search length from it */
+ error = xfs_alloc_cur_check(args, acur, cur, &i);
+ if (error)
+ return error;
+ ASSERT(len >= acur->cur_len);
+ acur->cur_len = len;
+
+ /*
+ * We looked up the first record >= [agbno, len] above. The agbno is a
+ * secondary key and so the current record may lie just before or after
+ * agbno. If it is past agbno, check the previous record too so long as
+ * the length matches as it may be closer. Don't check a smaller record
+ * because that could deactivate our cursor.
+ */
+ if (bno > args->agbno) {
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (!error && i) {
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (!error && i && len == acur->cur_len)
+ error = xfs_alloc_cur_check(args, acur, cur,
+ &i);
+ }
+ if (error)
+ return error;
+ }
+
+ /*
+ * Bump the search key by one if we have not yet found an allocation
+ * candidate, or if the record we just found is already at least as large as
+ * the doubled key. Otherwise, double the search key to speed up the search.
+ * Efficiency is more important here than absolute best locality.
+ */
+ cur_len <<= 1;
+ if (!acur->len || acur->cur_len >= cur_len)
+ acur->cur_len++;
+ else
+ acur->cur_len = cur_len;
+
+ return error;
+}
+
+/*
+ * Deal with the case where only small freespaces remain. Either return the
+ * contents of the last freespace record, or allocate space from the freelist if
+ * there is nothing in the tree.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent_small(
+ struct xfs_alloc_arg *args, /* allocation argument structure */
+ struct xfs_btree_cur *ccur, /* optional by-size cursor */
+ xfs_agblock_t *fbnop, /* result block number */
+ xfs_extlen_t *flenp, /* result length */
+ int *stat) /* status: 0-freelist, 1-normal/none */
+{
+ struct xfs_agf *agf = args->agbp->b_addr;
+ int error = 0;
+ xfs_agblock_t fbno = NULLAGBLOCK;
+ xfs_extlen_t flen = 0;
+ int i = 0;
+
+ /*
+ * If a cntbt cursor is provided, try to allocate the largest record in
+ * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
+ * allocation. Make sure to respect minleft even when pulling from the
+ * freelist.
+ */
+ if (ccur)
+ error = xfs_btree_decrement(ccur, 0, &i);
+ if (error)
+ goto error;
+ if (i) {
+ error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
+ if (error)
+ goto error;
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ goto out;
+ }
+
+ if (args->minlen != 1 || args->alignment != 1 ||
+ args->resv == XFS_AG_RESV_AGFL ||
+ be32_to_cpu(agf->agf_flcount) <= args->minleft)
+ goto out;
+
+ error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
+ &fbno, 0);
+ if (error)
+ goto error;
+ if (fbno == NULLAGBLOCK)
+ goto out;
+
+ xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
+ (args->datatype & XFS_ALLOC_NOBUSY));
+
+ if (args->datatype & XFS_ALLOC_USERDATA) {
+ struct xfs_buf *bp;
+
+ error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
+ args->mp->m_bsize, 0, &bp);
+ if (error)
+ goto error;
+ xfs_trans_binval(args->tp, bp);
+ }
+ *fbnop = args->agbno = fbno;
+ *flenp = args->len = 1;
+ if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ args->wasfromfl = 1;
+ trace_xfs_alloc_small_freelist(args);
+
+ /*
+ * If we're feeding an AGFL block to something that doesn't live in the
+ * free space, we need to clear out the OWN_AG rmap.
+ */
+ error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
+ &XFS_RMAP_OINFO_AG);
+ if (error)
+ goto error;
+
+ *stat = 0;
+ return 0;
+
+out:
+ /*
+ * Can't do the allocation, give up.
+ */
+ if (flen < args->minlen) {
+ args->agbno = NULLAGBLOCK;
+ trace_xfs_alloc_small_notenough(args);
+ flen = 0;
+ }
+ *fbnop = fbno;
+ *flenp = flen;
+ *stat = 1;
+ trace_xfs_alloc_small_done(args);
+ return 0;
+
+error:
+ trace_xfs_alloc_small_error(args);
+ return error;
+}
+
+/*
+ * Allocate a variable extent in the allocation group agno.
+ * Type and bno are used to determine where in the allocation group the
+ * extent will start.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent(
+ xfs_alloc_arg_t *args) /* argument structure for allocation */
+{
+ int error=0;
+
+ ASSERT(args->minlen > 0);
+ ASSERT(args->maxlen > 0);
+ ASSERT(args->minlen <= args->maxlen);
+ ASSERT(args->mod < args->prod);
+ ASSERT(args->alignment > 0);
+
+ /*
+ * Branch to correct routine based on the type.
+ */
+ args->wasfromfl = 0;
+ switch (args->type) {
+ case XFS_ALLOCTYPE_THIS_AG:
+ error = xfs_alloc_ag_vextent_size(args);
+ break;
+ case XFS_ALLOCTYPE_NEAR_BNO:
+ error = xfs_alloc_ag_vextent_near(args);
+ break;
+ case XFS_ALLOCTYPE_THIS_BNO:
+ error = xfs_alloc_ag_vextent_exact(args);
+ break;
+ default:
+ ASSERT(0);
+ /* NOTREACHED */
+ }
+
+ if (error || args->agbno == NULLAGBLOCK)
+ return error;
+
+ ASSERT(args->len >= args->minlen);
+ ASSERT(args->len <= args->maxlen);
+ ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
+ ASSERT(args->agbno % args->alignment == 0);
+
+ /* if not file data, insert new block into the reverse map btree */
+ if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
+ error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
+ args->agbno, args->len, &args->oinfo);
+ if (error)
+ return error;
+ }
+
+ if (!args->wasfromfl) {
+ error = xfs_alloc_update_counters(args->tp, args->agbp,
+ -((long)(args->len)));
+ if (error)
+ return error;
+
+ ASSERT(!xfs_extent_busy_search(args->mp, args->pag,
+ args->agbno, args->len));
+ }
+
+ xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
+
+ XFS_STATS_INC(args->mp, xs_allocx);
+ XFS_STATS_ADD(args->mp, xs_allocb, args->len);
+ return error;
+}
+
+/*
+ * Allocate a variable extent at exactly agno/bno.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent_exact(
+ xfs_alloc_arg_t *args) /* allocation argument structure */
+{
+ struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
+ struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
+ struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
+ int error;
+ xfs_agblock_t fbno; /* start block of found extent */
+ xfs_extlen_t flen; /* length of found extent */
+ xfs_agblock_t tbno; /* start block of busy extent */
+ xfs_extlen_t tlen; /* length of busy extent */
+ xfs_agblock_t tend; /* end block of busy extent */
+ int i; /* success/failure of operation */
+ unsigned busy_gen;
+
+ ASSERT(args->alignment == 1);
+
+ /*
+ * Allocate/initialize a cursor for the by-number freespace btree.
+ */
+ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag, XFS_BTNUM_BNO);
+
+ /*
+ * Lookup bno and minlen in the btree (minlen is irrelevant, really).
+ * Look for the closest free block <= bno; it must contain bno
+ * if any free block does.
+ */
+ error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
+ if (error)
+ goto error0;
+ if (!i)
+ goto not_found;
+
+ /*
+ * Grab the freespace record.
+ */
+ error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ ASSERT(fbno <= args->agbno);
+
+ /*
+ * Check for overlapping busy extents.
+ */
+ tbno = fbno;
+ tlen = flen;
+ xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
+
+ /*
+ * Give up if the start of the extent is busy, or the freespace isn't
+ * long enough for the minimum request.
+ */
+ if (tbno > args->agbno)
+ goto not_found;
+ if (tlen < args->minlen)
+ goto not_found;
+ tend = tbno + tlen;
+ if (tend < args->agbno + args->minlen)
+ goto not_found;
+
+ /*
+ * End of extent will be smaller of the freespace end and the
+ * maximal requested end.
+ *
+ * Fix the length according to mod and prod if given.
+ */
+ args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
+ - args->agbno;
+ xfs_alloc_fix_len(args);
+ ASSERT(args->agbno + args->len <= tend);
+
+ /*
+ * We are allocating agbno for args->len
+ * Allocate/initialize a cursor for the by-size btree.
+ */
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag, XFS_BTNUM_CNT);
+ ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
+ error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
+ args->len, XFSA_FIXUP_BNO_OK);
+ if (error) {
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+ goto error0;
+ }
+
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+
+ args->wasfromfl = 0;
+ trace_xfs_alloc_exact_done(args);
+ return 0;
+
+not_found:
+ /* Didn't find it, return null. */
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+ args->agbno = NULLAGBLOCK;
+ trace_xfs_alloc_exact_notfound(args);
+ return 0;
+
+error0:
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+ trace_xfs_alloc_exact_error(args);
+ return error;
+}
+
+/*
+ * Search a given number of btree records in a given direction. Check each
+ * record against the good extent we've already found.
+ */
+STATIC int
+xfs_alloc_walk_iter(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ struct xfs_btree_cur *cur,
+ bool increment,
+ bool find_one, /* quit on first candidate */
+ int count, /* rec count (-1 for infinite) */
+ int *stat)
+{
+ int error;
+ int i;
+
+ *stat = 0;
+
+ /*
+ * Search so long as the cursor is active or we find a better extent.
+ * The cursor is deactivated if it extends beyond the range of the
+ * current allocation candidate.
+ */
+ while (xfs_alloc_cur_active(cur) && count) {
+ error = xfs_alloc_cur_check(args, acur, cur, &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ *stat = 1;
+ if (find_one)
+ break;
+ }
+ if (!xfs_alloc_cur_active(cur))
+ break;
+
+ if (increment)
+ error = xfs_btree_increment(cur, 0, &i);
+ else
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ return error;
+ if (i == 0)
+ cur->bc_ag.abt.active = false;
+
+ if (count > 0)
+ count--;
+ }
+
+ return 0;
+}
+
+/*
+ * Search the by-bno and by-size btrees in parallel in search of an extent with
+ * ideal locality based on the NEAR mode ->agbno locality hint.
+ */
+STATIC int
+xfs_alloc_ag_vextent_locality(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ int *stat)
+{
+ struct xfs_btree_cur *fbcur = NULL;
+ int error;
+ int i;
+ bool fbinc;
+
+ ASSERT(acur->len == 0);
+ ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
+
+ *stat = 0;
+
+ error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
+ if (error)
+ return error;
+ error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
+ if (error)
+ return error;
+ error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
+ if (error)
+ return error;
+
+ /*
+ * Search the bnobt and cntbt in parallel. Search the bnobt left and
+ * right and lookup the closest extent to the locality hint for each
+ * extent size key in the cntbt. The entire search terminates
+ * immediately on a bnobt hit because that means we've found best case
+ * locality. Otherwise the search continues until the cntbt cursor runs
+ * off the end of the tree. If no allocation candidate is found at this
+ * point, give up on locality, walk backwards from the end of the cntbt
+ * and take the first available extent.
+ *
+ * The parallel tree searches balance each other out to provide fairly
+ * consistent performance for various situations. The bnobt search can
+ * have pathological behavior in the worst case scenario of larger
+ * allocation requests and fragmented free space. On the other hand, the
+ * bnobt is able to satisfy most smaller allocation requests much more
+ * quickly than the cntbt. The cntbt search can sift through fragmented
+ * free space and sets of free extents for larger allocation requests
+ * more quickly than the bnobt. Since the locality hint is just a hint
+ * and we don't want to scan the entire bnobt for perfect locality, the
+ * cntbt search essentially bounds the bnobt search such that we can
+ * find good enough locality at reasonable performance in most cases.
+ */
+ while (xfs_alloc_cur_active(acur->bnolt) ||
+ xfs_alloc_cur_active(acur->bnogt) ||
+ xfs_alloc_cur_active(acur->cnt)) {
+
+ trace_xfs_alloc_cur_lookup(args);
+
+ /*
+ * Search the bnobt left and right. In the case of a hit, finish
+ * the search in the opposite direction and we're done.
+ */
+ error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
+ true, 1, &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ trace_xfs_alloc_cur_left(args);
+ fbcur = acur->bnogt;
+ fbinc = true;
+ break;
+ }
+ error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
+ 1, &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ trace_xfs_alloc_cur_right(args);
+ fbcur = acur->bnolt;
+ fbinc = false;
+ break;
+ }
+
+ /*
+ * Check the extent with best locality based on the current
+ * extent size search key and keep track of the best candidate.
+ */
+ error = xfs_alloc_cntbt_iter(args, acur);
+ if (error)
+ return error;
+ if (!xfs_alloc_cur_active(acur->cnt)) {
+ trace_xfs_alloc_cur_lookup_done(args);
+ break;
+ }
+ }
+
+ /*
+ * If we failed to find anything due to busy extents, return empty
+ * handed so the caller can flush and retry. If no busy extents were
+ * found, walk backwards from the end of the cntbt as a last resort.
+ */
+ if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
+ error = xfs_btree_decrement(acur->cnt, 0, &i);
+ if (error)
+ return error;
+ if (i) {
+ acur->cnt->bc_ag.abt.active = true;
+ fbcur = acur->cnt;
+ fbinc = false;
+ }
+ }
+
+ /*
+ * Search in the opposite direction for a better entry in the case of
+ * a bnobt hit or walk backwards from the end of the cntbt.
+ */
+ if (fbcur) {
+ error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
+ &i);
+ if (error)
+ return error;
+ }
+
+ if (acur->len)
+ *stat = 1;
+
+ return 0;
+}
+
+/* Check the last block of the cnt btree for allocations. */
+static int
+xfs_alloc_ag_vextent_lastblock(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ xfs_agblock_t *bno,
+ xfs_extlen_t *len,
+ bool *allocated)
+{
+ int error;
+ int i;
+
+#ifdef DEBUG
+ /* Randomly don't execute the first algorithm. */
+ if (prandom_u32_max(2))
+ return 0;
+#endif
+
+ /*
+ * Start from the entry that lookup found, sequence through all larger
+ * free blocks. If we're actually pointing at a record smaller than
+ * maxlen, go to the start of this block, and skip all those smaller
+ * than minlen.
+ */
+ if (*len || args->alignment > 1) {
+ acur->cnt->bc_levels[0].ptr = 1;
+ do {
+ error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(args->mp, i != 1))
+ return -EFSCORRUPTED;
+ if (*len >= args->minlen)
+ break;
+ error = xfs_btree_increment(acur->cnt, 0, &i);
+ if (error)
+ return error;
+ } while (i);
+ ASSERT(*len >= args->minlen);
+ if (!i)
+ return 0;
+ }
+
+ error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
+ if (error)
+ return error;
+
+ /*
+ * It didn't work. We COULD be in a case where there's a good record
+ * somewhere, so try again.
+ */
+ if (acur->len == 0)
+ return 0;
+
+ trace_xfs_alloc_near_first(args);
+ *allocated = true;
+ return 0;
+}
+
+/*
+ * Allocate a variable extent near bno in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
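+ *
+ * ("k * prod + mod" means the returned length is congruent to mod modulo
+ * prod; for example, prod = 4 and mod = 1 permit lengths such as 5, 9 or
+ * 13 blocks.)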
+ */
+STATIC int
+xfs_alloc_ag_vextent_near(
+ struct xfs_alloc_arg *args)
+{
+ struct xfs_alloc_cur acur = {};
+ int error; /* error code */
+ int i; /* result code, temporary */
+ xfs_agblock_t bno;
+ xfs_extlen_t len;
+
+ /* handle uninitialized agbno range so caller doesn't have to */
+ if (!args->min_agbno && !args->max_agbno)
+ args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
+ ASSERT(args->min_agbno <= args->max_agbno);
+
+ /* clamp agbno to the range if it's outside */
+ if (args->agbno < args->min_agbno)
+ args->agbno = args->min_agbno;
+ if (args->agbno > args->max_agbno)
+ args->agbno = args->max_agbno;
+
+restart:
+ len = 0;
+
+ /*
+ * Set up cursors and see if there are any free extents as big as
+ * maxlen. If not, pick the last entry in the tree unless the tree is
+ * empty.
+ */
+ error = xfs_alloc_cur_setup(args, &acur);
+ if (error == -ENOSPC) {
+ error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
+ &len, &i);
+ if (error)
+ goto out;
+ if (i == 0 || len == 0) {
+ trace_xfs_alloc_near_noentry(args);
+ goto out;
+ }
+ ASSERT(i == 1);
+ } else if (error) {
+ goto out;
+ }
+
+ /*
+ * First algorithm.
+ * If the requested extent is large relative to the free space available
+ * in this a.g., then the cursor will be pointing to a btree entry
+ * near the right edge of the tree. If it's in the last btree leaf
+ * block, then we just examine all the entries in that block
+ * that are big enough, and pick the best one.
+ */
+ if (xfs_btree_islastblock(acur.cnt, 0)) {
+ bool allocated = false;
+
+ error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
+ &allocated);
+ if (error)
+ goto out;
+ if (allocated)
+ goto alloc_finish;
+ }
+
+ /*
+ * Second algorithm. Combined cntbt and bnobt search to find ideal
+ * locality.
+ */
+ error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
+ if (error)
+ goto out;
+
+ /*
+ * If we couldn't get anything, give up.
+ */
+ if (!acur.len) {
+ if (acur.busy) {
+ trace_xfs_alloc_near_busy(args);
+ xfs_extent_busy_flush(args->mp, args->pag,
+ acur.busy_gen);
+ goto restart;
+ }
+ trace_xfs_alloc_size_neither(args);
+ args->agbno = NULLAGBLOCK;
+ goto out;
+ }
+
+alloc_finish:
+ /* fix up btrees on a successful allocation */
+ error = xfs_alloc_cur_finish(args, &acur);
+
+out:
+ xfs_alloc_cur_close(&acur, error);
+ return error;
+}
+
+/*
+ * Allocate a variable extent anywhere in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent_size(
+ xfs_alloc_arg_t *args) /* allocation argument structure */
+{
+ struct xfs_agf *agf = args->agbp->b_addr;
+ struct xfs_btree_cur *bno_cur; /* cursor for bno btree */
+ struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */
+ int error; /* error result */
+ xfs_agblock_t fbno; /* start of found freespace */
+ xfs_extlen_t flen; /* length of found freespace */
+ int i; /* temp status variable */
+ xfs_agblock_t rbno; /* returned block number */
+ xfs_extlen_t rlen; /* length of returned extent */
+ bool busy;
+ unsigned busy_gen;
+
+restart:
+ /*
+ * Allocate and initialize a cursor for the by-size btree.
+ */
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag, XFS_BTNUM_CNT);
+ bno_cur = NULL;
+
+ /*
+ * Look for an entry >= maxlen+alignment-1 blocks.
+ */
+ if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
+ args->maxlen + args->alignment - 1, &i)))
+ goto error0;
+
+ /*
+ * If none was found, we have to settle for a smaller extent. In the case that
+ * there are no large extents, this will return the last entry in the
+ * tree unless the tree is empty. In the case that there are only busy
+ * large extents, this will return the largest small extent unless there
+ * are no smaller extents available.
+ */
+ if (!i) {
+ error = xfs_alloc_ag_vextent_small(args, cnt_cur,
+ &fbno, &flen, &i);
+ if (error)
+ goto error0;
+ if (i == 0 || flen == 0) {
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_noentry(args);
+ return 0;
+ }
+ ASSERT(i == 1);
+ busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
+ &rlen, &busy_gen);
+ } else {
+ /*
+ * Search for a non-busy extent that is large enough.
+ */
+ for (;;) {
+ error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ busy = xfs_alloc_compute_aligned(args, fbno, flen,
+ &rbno, &rlen, &busy_gen);
+
+ if (rlen >= args->maxlen)
+ break;
+
+ error = xfs_btree_increment(cnt_cur, 0, &i);
+ if (error)
+ goto error0;
+ if (i == 0) {
+ /*
+ * Our only valid extents must have been busy.
+ * Make them unbusy by forcing the log out and
+ * retrying.
+ */
+ xfs_btree_del_cursor(cnt_cur,
+ XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_busy(args);
+ xfs_extent_busy_flush(args->mp,
+ args->pag, busy_gen);
+ goto restart;
+ }
+ }
+ }
+
+ /*
+ * In the first case above, we got the last entry in the
+ * by-size btree. Now we check to see if the space hits maxlen
+ * once aligned; if not, we search left for something better.
+ * This can't happen in the second case above.
+ */
+ rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+ if (XFS_IS_CORRUPT(args->mp,
+ rlen != 0 &&
+ (rlen > flen ||
+ rbno + rlen > fbno + flen))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if (rlen < args->maxlen) {
+ xfs_agblock_t bestfbno;
+ xfs_extlen_t bestflen;
+ xfs_agblock_t bestrbno;
+ xfs_extlen_t bestrlen;
+
+ bestrlen = rlen;
+ bestrbno = rbno;
+ bestflen = flen;
+ bestfbno = fbno;
+ for (;;) {
+ if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
+ goto error0;
+ if (i == 0)
+ break;
+ if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
+ &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if (flen < bestrlen)
+ break;
+ busy = xfs_alloc_compute_aligned(args, fbno, flen,
+ &rbno, &rlen, &busy_gen);
+ rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+ if (XFS_IS_CORRUPT(args->mp,
+ rlen != 0 &&
+ (rlen > flen ||
+ rbno + rlen > fbno + flen))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if (rlen > bestrlen) {
+ bestrlen = rlen;
+ bestrbno = rbno;
+ bestflen = flen;
+ bestfbno = fbno;
+ if (rlen == args->maxlen)
+ break;
+ }
+ }
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
+ &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ rlen = bestrlen;
+ rbno = bestrbno;
+ flen = bestflen;
+ fbno = bestfbno;
+ }
+ args->wasfromfl = 0;
+ /*
+ * Fix up the length.
+ */
+ args->len = rlen;
+ if (rlen < args->minlen) {
+ if (busy) {
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_busy(args);
+ xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
+ goto restart;
+ }
+ goto out_nominleft;
+ }
+ xfs_alloc_fix_len(args);
+
+ rlen = args->len;
+ if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * Allocate and initialize a cursor for the by-block tree.
+ */
+ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag, XFS_BTNUM_BNO);
+ if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
+ rbno, rlen, XFSA_FIXUP_CNT_OK)))
+ goto error0;
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+ cnt_cur = bno_cur = NULL;
+ args->len = rlen;
+ args->agbno = rbno;
+ if (XFS_IS_CORRUPT(args->mp,
+ args->agbno + args->len >
+ be32_to_cpu(agf->agf_length))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ trace_xfs_alloc_size_done(args);
+ return 0;
+
+error0:
+ trace_xfs_alloc_size_error(args);
+ if (cnt_cur)
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+ if (bno_cur)
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+ return error;
+
+out_nominleft:
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_nominleft(args);
+ args->agbno = NULLAGBLOCK;
+ return 0;
+}
+
+/*
+ * Free the extent starting at agno/bno for length.
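+ *
+ * Contiguous free space on either side is merged with the freed extent:
+ * for example, a left neighbor [ltbno, ltlen] ending at bno and a right
+ * neighbor [gtbno, gtlen] starting at bno + len collapse with it into a
+ * single record [ltbno, ltlen + len + gtlen].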
+ */
+STATIC int
+xfs_free_ag_extent(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
+{
+ struct xfs_mount *mp;
+ struct xfs_btree_cur *bno_cur;
+ struct xfs_btree_cur *cnt_cur;
+ xfs_agblock_t gtbno; /* start of right neighbor */
+ xfs_extlen_t gtlen; /* length of right neighbor */
+ xfs_agblock_t ltbno; /* start of left neighbor */
+ xfs_extlen_t ltlen; /* length of left neighbor */
+ xfs_agblock_t nbno; /* new starting block of freesp */
+ xfs_extlen_t nlen; /* new length of freespace */
+ int haveleft; /* have a left neighbor */
+ int haveright; /* have a right neighbor */
+ int i;
+ int error;
+ struct xfs_perag *pag = agbp->b_pag;
+
+ bno_cur = cnt_cur = NULL;
+ mp = tp->t_mountp;
+
+ if (!xfs_rmap_should_skip_owner_update(oinfo)) {
+ error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * Allocate and initialize a cursor for the by-block btree.
+ */
+ bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
+ /*
+ * Look for a neighboring block on the left (lower block numbers)
+ * that is contiguous with this space.
+ */
+ if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
+ goto error0;
+ if (haveleft) {
+ /*
+ * There is a block to our left.
+ */
+ if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * It's not contiguous, though.
+ */
+ if (ltbno + ltlen < bno)
+ haveleft = 0;
+ else {
+ /*
+ * If this failure happens the request to free this
+ * space was invalid, it's (partly) already free.
+ * Very bad.
+ */
+ if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ }
+ }
+ /*
+ * Look for a neighboring block on the right (higher block numbers)
+ * that is contiguous with this space.
+ */
+ if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
+ goto error0;
+ if (haveright) {
+ /*
+ * There is a block to our right.
+ */
+ if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * It's not contiguous, though.
+ */
+ if (bno + len < gtbno)
+ haveright = 0;
+ else {
+ /*
+ * If this failure happens the request to free this
+ * space was invalid, it's (partly) already free.
+ * Very bad.
+ */
+ if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ }
+ }
+ /*
+ * Now allocate and initialize a cursor for the by-size tree.
+ */
+ cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
+ /*
+ * Have both left and right contiguous neighbors.
+ * Merge all three into a single free block.
+ */
+ if (haveleft && haveright) {
+ /*
+ * Delete the old by-size entry on the left.
+ */
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * Delete the old by-size entry on the right.
+ */
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * Delete the old by-block entry for the right block.
+ */
+ if ((error = xfs_btree_delete(bno_cur, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * Move the by-block cursor back to the left neighbor.
+ */
+ if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+#ifdef DEBUG
+ /*
+ * Check that this is the right record: delete didn't
+ * mangle the cursor.
+ */
+ {
+ xfs_agblock_t xxbno;
+ xfs_extlen_t xxlen;
+
+ if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
+ &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp,
+ i != 1 ||
+ xxbno != ltbno ||
+ xxlen != ltlen)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ }
+#endif
+ /*
+ * Update remaining by-block entry to the new, joined block.
+ */
+ nbno = ltbno;
+ nlen = len + ltlen + gtlen;
+ if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+ goto error0;
+ }
+ /*
+ * Have only a left contiguous neighbor.
+ * Merge it together with the new freespace.
+ */
+ else if (haveleft) {
+ /*
+ * Delete the old by-size entry on the left.
+ */
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * Back up the by-block cursor to the left neighbor, and
+ * update its length.
+ */
+ if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ nbno = ltbno;
+ nlen = len + ltlen;
+ if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+ goto error0;
+ }
+ /*
+ * Have only a right contiguous neighbor.
+ * Merge it together with the new freespace.
+ */
+ else if (haveright) {
+ /*
+ * Delete the old by-size entry on the right.
+ */
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * Update the starting block and length of the right
+ * neighbor in the by-block tree.
+ */
+ nbno = bno;
+ nlen = len + gtlen;
+ if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+ goto error0;
+ }
+ /*
+ * No contiguous neighbors.
+ * Insert the new freespace into the by-block tree.
+ */
+ else {
+ nbno = bno;
+ nlen = len;
+ if ((error = xfs_btree_insert(bno_cur, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ }
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+ bno_cur = NULL;
+ /*
+ * In all cases we need to insert the new freespace in the by-size tree.
+ */
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if ((error = xfs_btree_insert(cnt_cur, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ cnt_cur = NULL;
+
+ /*
+ * Update the freespace totals in the ag and superblock.
+ */
+ error = xfs_alloc_update_counters(tp, agbp, len);
+ xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
+ if (error)
+ goto error0;
+
+ XFS_STATS_INC(mp, xs_freex);
+ XFS_STATS_ADD(mp, xs_freeb, len);
+
+ trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
+
+ return 0;
+
+ error0:
+ trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
+ if (bno_cur)
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+ if (cnt_cur)
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Visible (exported) allocation/free functions.
+ * Some of these are used just by xfs_alloc_btree.c and this file.
+ */
+
+/*
+ * Compute and fill in value of m_alloc_maxlevels.
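+ *
+ * The worst-case record count passed to xfs_btree_compute_maxlevels() is
+ * (agblocks + 1) / 2: free extents are at least one block long and must be
+ * separated by at least one allocated block, so at most every other block
+ * in the AG can start a free space record.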
+ */
+void
+xfs_alloc_compute_maxlevels(
+ xfs_mount_t *mp) /* file system mount structure */
+{
+ mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
+ (mp->m_sb.sb_agblocks + 1) / 2);
+ ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
+}
+
+/*
+ * Find the length of the longest extent in an AG. The 'need' parameter
+ * specifies how much space we're going to need for the AGFL and the
+ * 'reserved' parameter tells us how many blocks in this AG are reserved for
+ * other callers.
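+ *
+ * Illustrative example (hypothetical numbers): with pagf_longest = 100,
+ * pagf_freeblks = 120, need = 8, pagf_flcount = 4 and reserved = 30, the
+ * AGFL shortfall is 4 blocks and the reservation shortfall is
+ * 30 - (120 - 100) = 10 blocks, so the longest usable extent is
+ * 100 - 14 = 86 blocks (further capped by m_ag_max_usable).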
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(
+ struct xfs_perag *pag,
+ xfs_extlen_t need,
+ xfs_extlen_t reserved)
+{
+ xfs_extlen_t delta = 0;
+
+ /*
+ * If the AGFL needs a recharge, we'll have to subtract that from the
+ * longest extent.
+ */
+ if (need > pag->pagf_flcount)
+ delta = need - pag->pagf_flcount;
+
+ /*
+ * If we cannot maintain others' reservations with space from the
+ * not-longest freesp extents, we'll have to subtract /that/ from
+ * the longest extent too.
+ */
+ if (pag->pagf_freeblks - pag->pagf_longest < reserved)
+ delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
+
+ /*
+ * If the longest extent is long enough to satisfy all the
+ * reservations and AGFL rules in place, we can return this extent.
+ */
+ if (pag->pagf_longest > delta)
+ return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
+ pag->pagf_longest - delta);
+
+ /* Otherwise, let the caller try for 1 block if there's space. */
+ return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
+}
+
+/*
+ * Compute the minimum length of the AGFL in the given AG. If @pag is NULL,
+ * return the largest possible minimum length.
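+ *
+ * For example (hypothetical AG state): with single-level bnobt, cntbt and
+ * rmapbt, each tree contributes min(1 + 1, maxlevels) = 2 blocks, giving a
+ * minimum freelist length of 6 blocks (4 if the rmapbt is disabled).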
+ */
+unsigned int
+xfs_alloc_min_freelist(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag)
+{
+ /* AG btrees have at least 1 level. */
+ static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
+ const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
+ unsigned int min_free;
+
+ ASSERT(mp->m_alloc_maxlevels > 0);
+
+ /* space needed by-bno freespace btree */
+ min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
+ mp->m_alloc_maxlevels);
+ /* space needed by-size freespace btree */
+ min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
+ mp->m_alloc_maxlevels);
+ /* space needed reverse mapping used space btree */
+ if (xfs_has_rmapbt(mp))
+ min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
+ mp->m_rmap_maxlevels);
+
+ return min_free;
+}
+
+/*
+ * Check if the operation we are fixing up the freelist for should go ahead or
+ * not. If we are freeing blocks, we always allow it, otherwise the allocation
+ * is dependent on whether the size and shape of free space available will
+ * permit the requested allocation to take place.
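+ *
+ * Rough sketch with made-up numbers: pagf_freeblks = 50, agflcount = 2,
+ * reservation = 10, min_free = 6 and minleft = 0 leave
+ * 50 + 2 - 10 - 6 - 0 = 36 blocks available; the allocation is rejected if
+ * max(args->total, minlen + alignment - 1 + minalignslop) exceeds that.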
+ */
+static bool
+xfs_alloc_space_available(
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t min_free,
+ int flags)
+{
+ struct xfs_perag *pag = args->pag;
+ xfs_extlen_t alloc_len, longest;
+ xfs_extlen_t reservation; /* blocks that are still reserved */
+ int available;
+ xfs_extlen_t agflcount;
+
+ if (flags & XFS_ALLOC_FLAG_FREEING)
+ return true;
+
+ reservation = xfs_ag_resv_needed(pag, args->resv);
+
+ /* do we have enough contiguous free space for the allocation? */
+ alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
+ longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
+ if (longest < alloc_len)
+ return false;
+
+ /*
+ * Do we have enough free space remaining for the allocation? Don't
+ * account extra agfl blocks because we are about to defer free them,
+ * making them unavailable until the current transaction commits.
+ */
+ agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
+ available = (int)(pag->pagf_freeblks + agflcount -
+ reservation - min_free - args->minleft);
+ if (available < (int)max(args->total, alloc_len))
+ return false;
+
+ /*
+ * Clamp maxlen to the amount of free space available for the actual
+ * extent allocation.
+ */
+ if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
+ args->maxlen = available;
+ ASSERT(args->maxlen > 0);
+ ASSERT(args->maxlen >= args->minlen);
+ }
+
+ return true;
+}
+
+int
+xfs_free_agfl_block(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ struct xfs_buf *agbp,
+ struct xfs_owner_info *oinfo)
+{
+ int error;
+ struct xfs_buf *bp;
+
+ error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
+ XFS_AG_RESV_AGFL);
+ if (error)
+ return error;
+
+ error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
+ tp->t_mountp->m_bsize, 0, &bp);
+ if (error)
+ return error;
+ xfs_trans_binval(tp, bp);
+
+ return 0;
+}
+
+/*
+ * Check the agfl fields of the agf for inconsistency or corruption. The purpose
+ * is to detect an agfl header padding mismatch between current and early v5
+ * kernels. This problem manifests as a 1-slot size difference between the
+ * on-disk flcount and the active [first, last] range of a wrapped agfl. This
+ * may also catch variants of agfl count corruption unrelated to padding. Either
+ * way, we'll reset the agfl and warn the user.
+ *
+ * Return true if a reset is required before the agfl can be used, false
+ * otherwise.
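+ *
+ * For example, with a hypothetical agfl_size of 119, flfirst = 117 and
+ * fllast = 1 describe a wrapped active range of 119 - 117 + 1 + 1 = 4
+ * slots; an on-disk flcount of 3 would therefore trigger a reset.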
+ */
+static bool
+xfs_agfl_needs_reset(
+ struct xfs_mount *mp,
+ struct xfs_agf *agf)
+{
+ uint32_t f = be32_to_cpu(agf->agf_flfirst);
+ uint32_t l = be32_to_cpu(agf->agf_fllast);
+ uint32_t c = be32_to_cpu(agf->agf_flcount);
+ int agfl_size = xfs_agfl_size(mp);
+ int active;
+
+ /* no agfl header on v4 supers */
+ if (!xfs_has_crc(mp))
+ return false;
+
+ /*
+ * The agf read verifier catches severe corruption of these fields.
+ * Repeat some sanity checks to cover a packed -> unpacked mismatch if
+ * the verifier allows it.
+ */
+ if (f >= agfl_size || l >= agfl_size)
+ return true;
+ if (c > agfl_size)
+ return true;
+
+ /*
+ * Check consistency between the on-disk count and the active range. An
+ * agfl padding mismatch manifests as an inconsistent flcount.
+ */
+ if (c && l >= f)
+ active = l - f + 1;
+ else if (c)
+ active = agfl_size - f + l + 1;
+ else
+ active = 0;
+
+ return active != c;
+}
+
+/*
+ * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
+ * agfl content cannot be trusted. Warn the user that a repair is required to
+ * recover leaked blocks.
+ *
+ * The purpose of this mechanism is to handle filesystems affected by the agfl
+ * header padding mismatch problem. A reset keeps the filesystem online with a
+ * relatively minor free space accounting inconsistency rather than suffer the
+ * inevitable crash from use of an invalid agfl block.
+ */
+static void
+xfs_agfl_reset(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agf *agf = agbp->b_addr;
+
+ ASSERT(pag->pagf_agflreset);
+ trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
+
+ xfs_warn(mp,
+ "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
+ "Please unmount and run xfs_repair.",
+ pag->pag_agno, pag->pagf_flcount);
+
+ agf->agf_flfirst = 0;
+ agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
+ agf->agf_flcount = 0;
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
+ XFS_AGF_FLCOUNT);
+
+ pag->pagf_flcount = 0;
+ pag->pagf_agflreset = false;
+}
+
+/*
+ * Defer an AGFL block free. This is effectively equivalent to
+ * xfs_free_extent_later() with some special handling particular to AGFL blocks.
+ *
+ * Deferring AGFL frees helps prevent log reservation overruns due to too many
+ * allocation operations in a transaction. AGFL frees are prone to this problem
+ * because for one they are always freed one at a time. Further, an immediate
+ * AGFL block free can cause a btree join and require another block free before
+ * the real allocation can proceed. Deferring the free disconnects freeing up
+ * the AGFL slot from freeing the block.
+ */
+STATIC void
+xfs_defer_agfl_block(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_fsblock_t agbno,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_extent_free_item *new; /* new element */
+
+ ASSERT(xfs_extfree_item_cache != NULL);
+ ASSERT(oinfo != NULL);
+
+ new = kmem_cache_zalloc(xfs_extfree_item_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
+ new->xefi_blockcount = 1;
+ new->xefi_owner = oinfo->oi_owner;
+
+ trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
+
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
+}
+
+/*
+ * Add the extent to the list of extents to be freed at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+__xfs_free_extent_later(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ const struct xfs_owner_info *oinfo,
+ bool skip_discard)
+{
+ struct xfs_extent_free_item *new; /* new element */
+#ifdef DEBUG
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+
+ ASSERT(bno != NULLFSBLOCK);
+ ASSERT(len > 0);
+ ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
+ ASSERT(!isnullstartblock(bno));
+ agno = XFS_FSB_TO_AGNO(mp, bno);
+ agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(agbno < mp->m_sb.sb_agblocks);
+ ASSERT(len < mp->m_sb.sb_agblocks);
+ ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+ ASSERT(xfs_extfree_item_cache != NULL);
+
+ new = kmem_cache_zalloc(xfs_extfree_item_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ new->xefi_startblock = bno;
+ new->xefi_blockcount = (xfs_extlen_t)len;
+ if (skip_discard)
+ new->xefi_flags |= XFS_EFI_SKIP_DISCARD;
+ if (oinfo) {
+ ASSERT(oinfo->oi_offset == 0);
+
+ if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+ new->xefi_flags |= XFS_EFI_ATTR_FORK;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+ new->xefi_flags |= XFS_EFI_BMBT_BLOCK;
+ new->xefi_owner = oinfo->oi_owner;
+ } else {
+ new->xefi_owner = XFS_RMAP_OWN_NULL;
+ }
+ trace_xfs_bmap_free_defer(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
+}
+
+#ifdef DEBUG
+/*
+ * Check if an AGF has a free extent record whose length is equal to
+ * args->minlen.
+ */
+STATIC int
+xfs_exact_minlen_extent_available(
+ struct xfs_alloc_arg *args,
+ struct xfs_buf *agbp,
+ int *stat)
+{
+ struct xfs_btree_cur *cnt_cur;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int error = 0;
+
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
+ args->pag, XFS_BTNUM_CNT);
+ error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
+ if (error)
+ goto out;
+
+ if (*stat == 0) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
+ if (error)
+ goto out;
+
+ if (*stat == 1 && flen != args->minlen)
+ *stat = 0;
+
+out:
+ xfs_btree_del_cursor(cnt_cur, error);
+
+ return error;
+}
+#endif
+
+/*
+ * Decide whether to use this allocation group for this allocation.
+ * If so, fix up the btree freelist's size.
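+ *
+ * "Fixing up" means bringing the AGFL to the length this AG requires:
+ * surplus blocks are returned via deferred AGFL frees, while a shortfall
+ * is filled by allocating blocks from this AG under the XFS_AG_RESV_AGFL
+ * reservation and putting them on the freelist.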
+ */
+int /* error */
+xfs_alloc_fix_freelist(
+ struct xfs_alloc_arg *args, /* allocation argument structure */
+ int flags) /* XFS_ALLOC_FLAG_... */
+{
+ struct xfs_mount *mp = args->mp;
+ struct xfs_perag *pag = args->pag;
+ struct xfs_trans *tp = args->tp;
+ struct xfs_buf *agbp = NULL;
+ struct xfs_buf *agflbp = NULL;
+ struct xfs_alloc_arg targs; /* local allocation arguments */
+ xfs_agblock_t bno; /* freelist block */
+ xfs_extlen_t need; /* total blocks needed in freelist */
+ int error = 0;
+
+ /* deferred ops (AGFL block frees) require permanent transactions */
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ if (!pag->pagf_init) {
+ error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
+ if (error) {
+ /* Couldn't lock the AGF so skip this AG. */
+ if (error == -EAGAIN)
+ error = 0;
+ goto out_no_agbp;
+ }
+ }
+
+ /*
+ * If this is a metadata-preferred pag and we are allocating user
+ * data, try somewhere else unless we are being asked to try harder
+ * at this point.
+ */
+ if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
+ (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
+ ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+ goto out_agbp_relse;
+ }
+
+ need = xfs_alloc_min_freelist(mp, pag);
+ if (!xfs_alloc_space_available(args, need, flags |
+ XFS_ALLOC_FLAG_CHECK))
+ goto out_agbp_relse;
+
+ /*
+ * Get the a.g. freespace buffer.
+ * Can fail if we're not blocking on locks, and it's held.
+ */
+ if (!agbp) {
+ error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
+ if (error) {
+ /* Couldn't lock the AGF so skip this AG. */
+ if (error == -EAGAIN)
+ error = 0;
+ goto out_no_agbp;
+ }
+ }
+
+ /* reset a padding mismatched agfl before final free space check */
+ if (pag->pagf_agflreset)
+ xfs_agfl_reset(tp, agbp, pag);
+
+ /* If there isn't enough total space or a large enough single extent, reject it. */
+ need = xfs_alloc_min_freelist(mp, pag);
+ if (!xfs_alloc_space_available(args, need, flags))
+ goto out_agbp_relse;
+
+#ifdef DEBUG
+ if (args->alloc_minlen_only) {
+ int stat;
+
+ error = xfs_exact_minlen_extent_available(args, agbp, &stat);
+ if (error || !stat)
+ goto out_agbp_relse;
+ }
+#endif
+ /*
+ * Make the freelist shorter if it's too long.
+ *
+ * Note that from this point onwards, we will always release the agf and
+ * agfl buffers on error. This handles the case where we error out and
+ * the buffers are clean or may not have been joined to the transaction
+ * and hence need to be released manually. If they have been joined to
+ * the transaction, then xfs_trans_brelse() will handle them
+ * appropriately based on the recursion count and dirty state of the
+ * buffer.
+ *
+ * XXX (dgc): When we have lots of free space, does this buy us
+ * anything other than extra overhead when we need to put more blocks
+ * back on the free list? Maybe we should only do this when space is
+ * getting low or the AGFL is more than half full?
+ *
+ * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
+ * big; the NORMAP flag prevents AGFL expand/shrink operations from
+ * updating the rmapbt. Both flags are used in xfs_repair while we're
+ * rebuilding the rmapbt, and neither are used by the kernel. They're
+ * both required to ensure that rmaps are correctly recorded for the
+ * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and
+ * repair/rmap.c in xfsprogs for details.
+ */
+ memset(&targs, 0, sizeof(targs));
+ /* struct copy below */
+ if (flags & XFS_ALLOC_FLAG_NORMAP)
+ targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
+ else
+ targs.oinfo = XFS_RMAP_OINFO_AG;
+ while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
+ error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
+ if (error)
+ goto out_agbp_relse;
+
+ /* defer agfl frees */
+ xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
+ }
+
+ targs.tp = tp;
+ targs.mp = mp;
+ targs.agbp = agbp;
+ targs.agno = args->agno;
+ targs.alignment = targs.minlen = targs.prod = 1;
+ targs.type = XFS_ALLOCTYPE_THIS_AG;
+ targs.pag = pag;
+ error = xfs_alloc_read_agfl(pag, tp, &agflbp);
+ if (error)
+ goto out_agbp_relse;
+
+ /* Make the freelist longer if it's too short. */
+ while (pag->pagf_flcount < need) {
+ targs.agbno = 0;
+ targs.maxlen = need - pag->pagf_flcount;
+ targs.resv = XFS_AG_RESV_AGFL;
+
+ /* Allocate as many blocks as possible at once. */
+ error = xfs_alloc_ag_vextent(&targs);
+ if (error)
+ goto out_agflbp_relse;
+
+ /*
+ * Stop if we run out. Won't happen if callers are obeying
+ * the restrictions correctly. Can happen for free calls
+ * on a completely full ag.
+ */
+ if (targs.agbno == NULLAGBLOCK) {
+ if (flags & XFS_ALLOC_FLAG_FREEING)
+ break;
+ goto out_agflbp_relse;
+ }
+ /*
+ * Put each allocated block on the list.
+ */
+ for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
+ error = xfs_alloc_put_freelist(pag, tp, agbp,
+ agflbp, bno, 0);
+ if (error)
+ goto out_agflbp_relse;
+ }
+ }
+ xfs_trans_brelse(tp, agflbp);
+ args->agbp = agbp;
+ return 0;
+
+out_agflbp_relse:
+ xfs_trans_brelse(tp, agflbp);
+out_agbp_relse:
+ if (agbp)
+ xfs_trans_brelse(tp, agbp);
+out_no_agbp:
+ args->agbp = NULL;
+ return error;
+}
+
+/*
+ * Get a block from the freelist.
+ * Returns the block number taken from the freelist in *bnop, or
+ * NULLAGBLOCK if the freelist is empty.
+ */
+int
+xfs_alloc_get_freelist(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agblock_t *bnop,
+ int btreeblk)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_buf *agflbp;
+ xfs_agblock_t bno;
+ __be32 *agfl_bno;
+ int error;
+ uint32_t logflags;
+ struct xfs_mount *mp = tp->t_mountp;
+
+ /*
+ * Freelist is empty, give up.
+ */
+ if (!agf->agf_flcount) {
+ *bnop = NULLAGBLOCK;
+ return 0;
+ }
+ /*
+ * Read the array of free blocks.
+ */
+ error = xfs_alloc_read_agfl(pag, tp, &agflbp);
+ if (error)
+ return error;
+
+ /*
+ * Get the block number and update the data structures.
+ */
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
+ bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+ be32_add_cpu(&agf->agf_flfirst, 1);
+ xfs_trans_brelse(tp, agflbp);
+ if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
+ agf->agf_flfirst = 0;
+
+ ASSERT(!pag->pagf_agflreset);
+ be32_add_cpu(&agf->agf_flcount, -1);
+ pag->pagf_flcount--;
+
+ logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
+ if (btreeblk) {
+ be32_add_cpu(&agf->agf_btreeblks, 1);
+ pag->pagf_btreeblks++;
+ logflags |= XFS_AGF_BTREEBLKS;
+ }
+
+ xfs_alloc_log_agf(tp, agbp, logflags);
+ *bnop = bno;
+
+ return 0;
+}
+
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint32_t fields)
+{
+ int first; /* first byte offset */
+ int last; /* last byte offset */
+ static const short offsets[] = {
+ offsetof(xfs_agf_t, agf_magicnum),
+ offsetof(xfs_agf_t, agf_versionnum),
+ offsetof(xfs_agf_t, agf_seqno),
+ offsetof(xfs_agf_t, agf_length),
+ offsetof(xfs_agf_t, agf_roots[0]),
+ offsetof(xfs_agf_t, agf_levels[0]),
+ offsetof(xfs_agf_t, agf_flfirst),
+ offsetof(xfs_agf_t, agf_fllast),
+ offsetof(xfs_agf_t, agf_flcount),
+ offsetof(xfs_agf_t, agf_freeblks),
+ offsetof(xfs_agf_t, agf_longest),
+ offsetof(xfs_agf_t, agf_btreeblks),
+ offsetof(xfs_agf_t, agf_uuid),
+ offsetof(xfs_agf_t, agf_rmap_blocks),
+ offsetof(xfs_agf_t, agf_refcount_blocks),
+ offsetof(xfs_agf_t, agf_refcount_root),
+ offsetof(xfs_agf_t, agf_refcount_level),
+ /* needed so that we don't log the whole rest of the structure: */
+ offsetof(xfs_agf_t, agf_spare64),
+ sizeof(xfs_agf_t)
+ };
+
+ trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
+
+ xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
+ xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
+}
+
+/*
+ * Put the block on the freelist for the allocation group.
+ */
+int
+xfs_alloc_put_freelist(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_buf *agflbp,
+ xfs_agblock_t bno,
+ int btreeblk)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agf *agf = agbp->b_addr;
+ __be32 *blockp;
+ int error;
+ uint32_t logflags;
+ __be32 *agfl_bno;
+ int startoff;
+
+ if (!agflbp) {
+ error = xfs_alloc_read_agfl(pag, tp, &agflbp);
+ if (error)
+ return error;
+ }
+
+ be32_add_cpu(&agf->agf_fllast, 1);
+ if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
+ agf->agf_fllast = 0;
+
+ ASSERT(!pag->pagf_agflreset);
+ be32_add_cpu(&agf->agf_flcount, 1);
+ pag->pagf_flcount++;
+
+ logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
+ if (btreeblk) {
+ be32_add_cpu(&agf->agf_btreeblks, -1);
+ pag->pagf_btreeblks--;
+ logflags |= XFS_AGF_BTREEBLKS;
+ }
+
+ xfs_alloc_log_agf(tp, agbp, logflags);
+
+ ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
+
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
+ blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
+ *blockp = cpu_to_be32(bno);
+ startoff = (char *)blockp - (char *)agflbp->b_addr;
+
+ xfs_alloc_log_agf(tp, agbp, logflags);
+
+ xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
+ xfs_trans_log_buf(tp, agflbp, startoff,
+ startoff + sizeof(xfs_agblock_t) - 1);
+ return 0;
+}
+
+static xfs_failaddr_t
+xfs_agf_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_agf *agf = bp->b_addr;
+
+ if (xfs_has_crc(mp)) {
+ if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
+ return __this_address;
+ }
+
+ if (!xfs_verify_magic(bp, agf->agf_magicnum))
+ return __this_address;
+
+ if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+ be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
+ be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
+ be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
+ return __this_address;
+
+ if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
+ return __this_address;
+
+ if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
+ be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
+ return __this_address;
+
+ if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
+ mp->m_alloc_maxlevels ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
+ mp->m_alloc_maxlevels)
+ return __this_address;
+
+ if (xfs_has_rmapbt(mp) &&
+ (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
+ mp->m_rmap_maxlevels))
+ return __this_address;
+
+ if (xfs_has_rmapbt(mp) &&
+ be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
+ return __this_address;
+
+ /*
+ * During growfs operations the perag is not fully initialised, so it
+ * cannot be used for any useful checking. growfs makes this detectable
+ * by using uncached buffers that have no perag attached, so the check
+ * below is simply skipped in that case.
+ */
+ if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
+ return __this_address;
+
+ if (xfs_has_lazysbcount(mp) &&
+ be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
+ return __this_address;
+
+ if (xfs_has_reflink(mp) &&
+ be32_to_cpu(agf->agf_refcount_blocks) >
+ be32_to_cpu(agf->agf_length))
+ return __this_address;
+
+ if (xfs_has_reflink(mp) &&
+ (be32_to_cpu(agf->agf_refcount_level) < 1 ||
+ be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels))
+ return __this_address;
+
+ return NULL;
+}
+
+static void
+xfs_agf_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (xfs_has_crc(mp) &&
+ !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_agf_verify(bp);
+ if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+static void
+xfs_agf_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_agf *agf = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_agf_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agf_buf_ops = {
+ .name = "xfs_agf",
+ .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
+ .verify_read = xfs_agf_read_verify,
+ .verify_write = xfs_agf_write_verify,
+ .verify_struct = xfs_agf_verify,
+};
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int
+xfs_read_agf(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ int flags,
+ struct xfs_buf **agfbpp)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ int error;
+
+ trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
+
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
+ if (error)
+ return error;
+
+ xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
+ return 0;
+}
+
+/*
+ * Read in the allocation group header (free/alloc section) and initialise the
+ * perag structure if necessary. If the caller provides @agfbpp, then return the
+ * locked buffer to the caller, otherwise free it.
+ */
+int
+xfs_alloc_read_agf(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ int flags,
+ struct xfs_buf **agfbpp)
+{
+ struct xfs_buf *agfbp;
+ struct xfs_agf *agf;
+ int error;
+ int allocbt_blks;
+
+ trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
+
+ /* We don't support trylock when freeing. */
+ ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
+ (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
+ error = xfs_read_agf(pag, tp,
+ (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
+ &agfbp);
+ if (error)
+ return error;
+
+ agf = agfbp->b_addr;
+ if (!pag->pagf_init) {
+ pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
+ pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
+ pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
+ pag->pagf_longest = be32_to_cpu(agf->agf_longest);
+ pag->pagf_levels[XFS_BTNUM_BNOi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
+ pag->pagf_levels[XFS_BTNUM_CNTi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
+ pag->pagf_levels[XFS_BTNUM_RMAPi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+ pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
+ pag->pagf_init = 1;
+ pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf);
+
+ /*
+ * Update the in-core allocbt counter. Filter out the rmapbt
+ * subset of the btreeblks counter because the rmapbt is managed
+ * by perag reservation. Subtract one for the rmapbt root block
+ * because the rmap counter includes it while the btreeblks
+ * counter only tracks non-root blocks.
+ */
+ allocbt_blks = pag->pagf_btreeblks;
+ if (xfs_has_rmapbt(pag->pag_mount))
+ allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
+ if (allocbt_blks > 0)
+ atomic64_add(allocbt_blks,
+ &pag->pag_mount->m_allocbt_blks);
+ }
+#ifdef DEBUG
+ else if (!xfs_is_shutdown(pag->pag_mount)) {
+ ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
+ ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
+ ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
+ ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
+ ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
+ ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
+ }
+#endif
+ if (agfbpp)
+ *agfbpp = agfbp;
+ else
+ xfs_trans_brelse(tp, agfbp);
+ return 0;
+}
+
+/*
+ * Allocate an extent (variable-size).
+ * Depending on the allocation type, we either look in a single allocation
+ * group or loop over the allocation groups to find the result.
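+ *
+ * THIS_AG, NEAR_BNO and THIS_BNO are confined to the AG that contains
+ * args->fsbno; START_BNO tries a near allocation in that AG first and then
+ * rotates through the remaining AGs; FIRST_AG rotates through the AGs
+ * accepting space anywhere within each one.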
+ */
+int /* error */
+xfs_alloc_vextent(
+ struct xfs_alloc_arg *args) /* allocation argument structure */
+{
+ xfs_agblock_t agsize; /* allocation group size */
+ int error;
+ int flags; /* XFS_ALLOC_FLAG_... locking flags */
+ struct xfs_mount *mp; /* mount structure pointer */
+ xfs_agnumber_t sagno; /* starting allocation group number */
+ xfs_alloctype_t type; /* input allocation type */
+ int bump_rotor = 0;
+ xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
+
+ mp = args->mp;
+ type = args->otype = args->type;
+ args->agbno = NULLAGBLOCK;
+ /*
+ * Just fix this up, for the case where the last a.g. is shorter
+ * (or there's only one a.g.) and the caller couldn't easily figure
+ * that out (xfs_bmap_alloc).
+ */
+ agsize = mp->m_sb.sb_agblocks;
+ if (args->maxlen > agsize)
+ args->maxlen = agsize;
+ if (args->alignment == 0)
+ args->alignment = 1;
+ ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
+ ASSERT(args->minlen <= args->maxlen);
+ ASSERT(args->minlen <= agsize);
+ ASSERT(args->mod < args->prod);
+ if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
+ XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
+ args->minlen > args->maxlen || args->minlen > agsize ||
+ args->mod >= args->prod) {
+ args->fsbno = NULLFSBLOCK;
+ trace_xfs_alloc_vextent_badargs(args);
+ return 0;
+ }
+
+ switch (type) {
+ case XFS_ALLOCTYPE_THIS_AG:
+ case XFS_ALLOCTYPE_NEAR_BNO:
+ case XFS_ALLOCTYPE_THIS_BNO:
+ /*
+ * These three force us into a single a.g.
+ */
+ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ args->pag = xfs_perag_get(mp, args->agno);
+ error = xfs_alloc_fix_freelist(args, 0);
+ if (error) {
+ trace_xfs_alloc_vextent_nofix(args);
+ goto error0;
+ }
+ if (!args->agbp) {
+ trace_xfs_alloc_vextent_noagbp(args);
+ break;
+ }
+ args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+ if ((error = xfs_alloc_ag_vextent(args)))
+ goto error0;
+ break;
+ case XFS_ALLOCTYPE_START_BNO:
+ /*
+ * Try near allocation first, then anywhere-in-ag after
+ * the first a.g. fails.
+ */
+ if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
+ xfs_is_inode32(mp)) {
+ args->fsbno = XFS_AGB_TO_FSB(mp,
+ ((mp->m_agfrotor / rotorstep) %
+ mp->m_sb.sb_agcount), 0);
+ bump_rotor = 1;
+ }
+ args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+ args->type = XFS_ALLOCTYPE_NEAR_BNO;
+ fallthrough;
+ case XFS_ALLOCTYPE_FIRST_AG:
+ /*
+ * Rotate through the allocation groups looking for a winner.
+ */
+ if (type == XFS_ALLOCTYPE_FIRST_AG) {
+ /*
+ * Start with allocation group given by bno.
+ */
+ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ args->type = XFS_ALLOCTYPE_THIS_AG;
+ sagno = 0;
+ flags = 0;
+ } else {
+ /*
+ * Start with the given allocation group.
+ */
+ args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ flags = XFS_ALLOC_FLAG_TRYLOCK;
+ }
+ /*
+ * Loop over allocation groups twice; first time with
+ * trylock set, second time without.
+ */
+ for (;;) {
+ args->pag = xfs_perag_get(mp, args->agno);
+ error = xfs_alloc_fix_freelist(args, flags);
+ if (error) {
+ trace_xfs_alloc_vextent_nofix(args);
+ goto error0;
+ }
+ /*
+ * If we get a buffer back then the allocation will fly.
+ */
+ if (args->agbp) {
+ if ((error = xfs_alloc_ag_vextent(args)))
+ goto error0;
+ break;
+ }
+
+ trace_xfs_alloc_vextent_loopfailed(args);
+
+ /*
+ * Didn't work, figure out the next iteration.
+ */
+ if (args->agno == sagno &&
+ type == XFS_ALLOCTYPE_START_BNO)
+ args->type = XFS_ALLOCTYPE_THIS_AG;
+ /*
+ * For the first allocation, we can try any AG to get
+ * space. However, if we already have allocated a
+ * block, we don't want to try AGs whose number is below
+ * sagno. Otherwise, we may end up with out-of-order
+ * locking of AGF, which might cause deadlock.
+ */
+ if (++(args->agno) == mp->m_sb.sb_agcount) {
+ if (args->tp->t_firstblock != NULLFSBLOCK)
+ args->agno = sagno;
+ else
+ args->agno = 0;
+ }
+ /*
+ * Reached the starting a.g., must either be done
+ * or switch to non-trylock mode.
+ */
+ if (args->agno == sagno) {
+ if (flags == 0) {
+ args->agbno = NULLAGBLOCK;
+ trace_xfs_alloc_vextent_allfailed(args);
+ break;
+ }
+
+ flags = 0;
+ if (type == XFS_ALLOCTYPE_START_BNO) {
+ args->agbno = XFS_FSB_TO_AGBNO(mp,
+ args->fsbno);
+ args->type = XFS_ALLOCTYPE_NEAR_BNO;
+ }
+ }
+ xfs_perag_put(args->pag);
+ }
+ if (bump_rotor) {
+ if (args->agno == sagno)
+ mp->m_agfrotor = (mp->m_agfrotor + 1) %
+ (mp->m_sb.sb_agcount * rotorstep);
+ else
+ mp->m_agfrotor = (args->agno * rotorstep + 1) %
+ (mp->m_sb.sb_agcount * rotorstep);
+ }
+ break;
+ default:
+ ASSERT(0);
+ /* NOTREACHED */
+ }
+ if (args->agbno == NULLAGBLOCK)
+ args->fsbno = NULLFSBLOCK;
+ else {
+ args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
+#ifdef DEBUG
+ ASSERT(args->len >= args->minlen);
+ ASSERT(args->len <= args->maxlen);
+ ASSERT(args->agbno % args->alignment == 0);
+ XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
+ args->len);
+#endif
+
+ }
+ xfs_perag_put(args->pag);
+ return 0;
+error0:
+ xfs_perag_put(args->pag);
+ return error;
+}
+
+/* Ensure that the freelist is at full capacity. */
+int
+xfs_free_extent_fix_freelist(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_buf **agbp)
+{
+ struct xfs_alloc_arg args;
+ int error;
+
+ memset(&args, 0, sizeof(struct xfs_alloc_arg));
+ args.tp = tp;
+ args.mp = tp->t_mountp;
+ args.agno = pag->pag_agno;
+ args.pag = pag;
+
+ /*
+ * validate that the block number is legal - this enables us to detect
+ * and handle a silent filesystem corruption rather than crashing.
+ */
+ if (args.agno >= args.mp->m_sb.sb_agcount)
+ return -EFSCORRUPTED;
+
+ error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+ if (error)
+ return error;
+
+ *agbp = args.agbp;
+ return 0;
+}
+
+/*
+ * Free an extent.
+ * Just break up the extent address and hand off to xfs_free_ag_extent
+ * after fixing up the freelist.
+ */
+int
+__xfs_free_extent(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type,
+ bool skip_discard)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *agbp;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ struct xfs_agf *agf;
+ int error;
+ unsigned int busy_flags = 0;
+ struct xfs_perag *pag;
+
+ ASSERT(len != 0);
+ ASSERT(type != XFS_AG_RESV_AGFL);
+
+ if (XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_FREE_EXTENT))
+ return -EIO;
+
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
+ if (error)
+ goto err;
+ agf = agbp->b_addr;
+
+ if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
+ error = -EFSCORRUPTED;
+ goto err_release;
+ }
+
+ /* validate that the extent size is legal now that we have the agf locked */
+ if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
+ error = -EFSCORRUPTED;
+ goto err_release;
+ }
+
+ error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
+ if (error)
+ goto err_release;
+
+ if (skip_discard)
+ busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
+ xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
+ xfs_perag_put(pag);
+ return 0;
+
+err_release:
+ xfs_trans_brelse(tp, agbp);
+err:
+ xfs_perag_put(pag);
+ return error;
+}
+
+struct xfs_alloc_query_range_info {
+ xfs_alloc_query_range_fn fn;
+ void *priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_alloc_query_range_helper(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_alloc_query_range_info *query = priv;
+ struct xfs_alloc_rec_incore irec;
+
+ irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
+ irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
+ return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all free space within a given range of blocks. */
+int
+xfs_alloc_query_range(
+ struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *low_rec,
+ const struct xfs_alloc_rec_incore *high_rec,
+ xfs_alloc_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_irec low_brec;
+ union xfs_btree_irec high_brec;
+ struct xfs_alloc_query_range_info query;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ low_brec.a = *low_rec;
+ high_brec.a = *high_rec;
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_range(cur, &low_brec, &high_brec,
+ xfs_alloc_query_range_helper, &query);
+}
+
+/* Find all free space records. */
+int
+xfs_alloc_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_alloc_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_alloc_query_range_info query;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
+}
+
+/* Is there a record covering a given extent? */
+int
+xfs_alloc_has_record(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool *exists)
+{
+ union xfs_btree_irec low;
+ union xfs_btree_irec high;
+
+ memset(&low, 0, sizeof(low));
+ low.a.ar_startblock = bno;
+ memset(&high, 0xFF, sizeof(high));
+ high.a.ar_startblock = bno + len - 1;
+
+ return xfs_btree_has_record(cur, &low, &high, exists);
+}
+
+/*
+ * Walk all the blocks in the AGFL. The @walk_fn can return any negative
+ * error code or XFS_ITER_*.
+ */
+int
+xfs_agfl_walk(
+ struct xfs_mount *mp,
+ struct xfs_agf *agf,
+ struct xfs_buf *agflbp,
+ xfs_agfl_walk_fn walk_fn,
+ void *priv)
+{
+ __be32 *agfl_bno;
+ unsigned int i;
+ int error;
+
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
+ i = be32_to_cpu(agf->agf_flfirst);
+
+ /* Nothing to walk in an empty AGFL. */
+ if (agf->agf_flcount == cpu_to_be32(0))
+ return 0;
+
+ /* Otherwise, walk from first to last, wrapping as needed. */
+ for (;;) {
+ error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
+ if (error)
+ return error;
+ if (i == be32_to_cpu(agf->agf_fllast))
+ break;
+ if (++i == xfs_agfl_size(mp))
+ i = 0;
+ }
+
+ return 0;
+}
+
+int __init
+xfs_extfree_intent_init_cache(void)
+{
+ xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
+ sizeof(struct xfs_extent_free_item),
+ 0, 0, NULL);
+
+ return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_extfree_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_extfree_item_cache);
+ xfs_extfree_item_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
new file mode 100644
index 000000000..2c3f762df
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -0,0 +1,250 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ALLOC_H__
+#define __XFS_ALLOC_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_trans;
+
+extern struct workqueue_struct *xfs_alloc_wq;
+
+unsigned int xfs_agfl_size(struct xfs_mount *mp);
+
+/*
+ * Freespace allocation types. Argument to xfs_alloc_[v]extent.
+ */
+#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */
+#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */
+#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */
+#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */
+#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */
+
+/* this should become an enum again when the tracing code is fixed */
+typedef unsigned int xfs_alloctype_t;
+
+#define XFS_ALLOC_TYPES \
+ { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \
+ { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \
+ { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \
+ { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \
+ { XFS_ALLOCTYPE_THIS_BNO, "THIS_BNO" }
+
+/*
+ * Flags for xfs_alloc_fix_freelist.
+ */
+#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */
+#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/
+#define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */
+#define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */
+#define XFS_ALLOC_FLAG_CHECK 0x00000010 /* test only, don't modify args */
+
+/*
+ * Argument structure for xfs_alloc routines.
+ * This is turned into a structure to avoid having 20 arguments passed
+ * down several levels of the stack.
+ */
+typedef struct xfs_alloc_arg {
+ struct xfs_trans *tp; /* transaction pointer */
+ struct xfs_mount *mp; /* file system mount point */
+ struct xfs_buf *agbp; /* buffer for a.g. freelist header */
+ struct xfs_perag *pag; /* per-ag struct for this agno */
+ xfs_fsblock_t fsbno; /* file system block number */
+ xfs_agnumber_t agno; /* allocation group number */
+ xfs_agblock_t agbno; /* allocation group-relative block # */
+ xfs_extlen_t minlen; /* minimum size of extent */
+ xfs_extlen_t maxlen; /* maximum size of extent */
+ xfs_extlen_t mod; /* mod value for extent size */
+ xfs_extlen_t prod; /* prod value for extent size */
+ xfs_extlen_t minleft; /* min blocks must be left after us */
+ xfs_extlen_t total; /* total blocks needed in xaction */
+ xfs_extlen_t alignment; /* align answer to multiple of this */
+ xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */
+ xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */
+ xfs_agblock_t max_agbno; /* ... */
+ xfs_extlen_t len; /* output: actual size of extent */
+ xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
+ xfs_alloctype_t otype; /* original allocation type */
+ int datatype; /* mask defining data type treatment */
+ char wasdel; /* set if allocation was prev delayed */
+ char wasfromfl; /* set if allocation is from freelist */
+ struct xfs_owner_info oinfo; /* owner of blocks being allocated */
+ enum xfs_ag_resv_type resv; /* block reservation to use */
+#ifdef DEBUG
+ bool alloc_minlen_only; /* allocate exact minlen extent */
+#endif
+} xfs_alloc_arg_t;
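+
+/*
+ * Illustrative sketch only (not part of this change): a caller typically
+ * zeroes the structure, fills in the transaction, mount and allocation
+ * constraints, and then calls xfs_alloc_vextent().  "target_fsb" and "len"
+ * below stand in for the caller's own values.
+ *
+ *	struct xfs_alloc_arg	args = { NULL };
+ *
+ *	args.tp = tp;
+ *	args.mp = mp;
+ *	args.fsbno = target_fsb;		// hint: allocate near here
+ *	args.type = XFS_ALLOCTYPE_START_BNO;
+ *	args.minlen = 1;
+ *	args.maxlen = len;
+ *	args.prod = 1;
+ *	args.oinfo = XFS_RMAP_OINFO_ANY_OWNER;
+ *	args.resv = XFS_AG_RESV_NONE;
+ *
+ *	error = xfs_alloc_vextent(&args);
+ *	if (!error && args.fsbno != NULLFSBLOCK)
+ *		// args.fsbno and args.len describe the allocated extent
+ */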
+
+/*
+ * Defines for datatype
+ */
+#define XFS_ALLOC_USERDATA (1 << 0) /* allocation is for user data */
+#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1) /* special case start of file */
+#define XFS_ALLOC_NOBUSY (1 << 2) /* busy extents not allowed */
+
+/* freespace limit calculations */
+unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
+unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
+
+xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_perag *pag,
+ xfs_extlen_t need, xfs_extlen_t reserved);
+unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
+ struct xfs_perag *pag);
+int xfs_alloc_get_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf *agfbp, xfs_agblock_t *bnop, int btreeblk);
+int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf *agfbp, struct xfs_buf *agflbp,
+ xfs_agblock_t bno, int btreeblk);
+
+/*
+ * Compute and fill in value of m_alloc_maxlevels.
+ */
+void
+xfs_alloc_compute_maxlevels(
+ struct xfs_mount *mp); /* file system mount structure */
+
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *bp, /* buffer for a.g. freelist header */
+ uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */
+
+/*
+ * Allocate an extent (variable-size).
+ */
+int /* error */
+xfs_alloc_vextent(
+ xfs_alloc_arg_t *args); /* allocation argument structure */
+
+/*
+ * Free an extent.
+ */
+int /* error */
+__xfs_free_extent(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t bno, /* starting block number of extent */
+ xfs_extlen_t len, /* length of extent */
+ const struct xfs_owner_info *oinfo, /* extent owner */
+ enum xfs_ag_resv_type type, /* block reservation type */
+ bool skip_discard);
+
+static inline int
+xfs_free_extent(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
+{
+ return __xfs_free_extent(tp, bno, len, oinfo, type, false);
+}
+
+int /* error */
+xfs_alloc_lookup_le(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat); /* success/failure */
+
+int /* error */
+xfs_alloc_lookup_ge(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat); /* success/failure */
+
+int /* error */
+xfs_alloc_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t *bno, /* output: starting block of extent */
+ xfs_extlen_t *len, /* output: length of extent */
+ int *stat); /* output: success/failure */
+
+int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
+ struct xfs_buf **agfbpp);
+int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
+ struct xfs_buf **agfbpp);
+int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf **bpp);
+int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t,
+ struct xfs_buf *, struct xfs_owner_info *);
+int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
+int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag,
+ struct xfs_buf **agbp);
+
+xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp);
+
+typedef int (*xfs_alloc_query_range_fn)(
+ struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *rec,
+ void *priv);
+
+int xfs_alloc_query_range(struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *low_rec,
+ const struct xfs_alloc_rec_incore *high_rec,
+ xfs_alloc_query_range_fn fn, void *priv);
+int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn,
+ void *priv);
+
+int xfs_alloc_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, bool *exist);
+
+typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno,
+ void *priv);
+int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf,
+ struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv);
+
+static inline __be32 *
+xfs_buf_to_agfl_bno(
+ struct xfs_buf *bp)
+{
+ if (xfs_has_crc(bp->b_mount))
+ return bp->b_addr + sizeof(struct xfs_agfl);
+ return bp->b_addr;
+}
+
+void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
+ xfs_filblks_t len, const struct xfs_owner_info *oinfo,
+ bool skip_discard);
+
+/*
+ * List of extents to be freed "later".
+ * The list is kept sorted on xefi_startblock.
+ */
+struct xfs_extent_free_item {
+ struct list_head xefi_list;
+ uint64_t xefi_owner;
+ xfs_fsblock_t xefi_startblock;/* starting fs block number */
+ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
+ unsigned int xefi_flags;
+};
+
+#define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */
+#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
+#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
+
+static inline void
+xfs_free_extent_later(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ const struct xfs_owner_info *oinfo)
+{
+ __xfs_free_extent_later(tp, bno, len, oinfo, false);
+}
+
+
+extern struct kmem_cache *xfs_extfree_item_cache;
+
+int __init xfs_extfree_intent_init_cache(void);
+void xfs_extfree_intent_destroy_cache(void);
+
+#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
new file mode 100644
index 000000000..549a3cba0
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -0,0 +1,641 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_ag.h"
+
+static struct kmem_cache *xfs_allocbt_cur_cache;
+
+STATIC struct xfs_btree_cur *
+xfs_allocbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum);
+}
+
+STATIC void
+xfs_allocbt_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ int btnum = cur->bc_btnum;
+
+ ASSERT(ptr->s != 0);
+
+ agf->agf_roots[btnum] = ptr->s;
+ be32_add_cpu(&agf->agf_levels[btnum], inc);
+ cur->bc_ag.pag->pagf_levels[btnum] += inc;
+
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+}
+
+STATIC int
+xfs_allocbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ int error;
+ xfs_agblock_t bno;
+
+ /* Allocate the new block from the freelist. If we can't, give up. */
+ error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp,
+ cur->bc_ag.agbp, &bno, 1);
+ if (error)
+ return error;
+
+ if (bno == NULLAGBLOCK) {
+ *stat = 0;
+ return 0;
+ }
+
+ atomic64_inc(&cur->bc_mp->m_allocbt_blks);
+ xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false);
+
+ new->s = cpu_to_be32(bno);
+
+ *stat = 1;
+ return 0;
+}
+
+STATIC int
+xfs_allocbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ xfs_agblock_t bno;
+ int error;
+
+ bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
+ error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL,
+ bno, 1);
+ if (error)
+ return error;
+
+ atomic64_dec(&cur->bc_mp->m_allocbt_blks);
+ xfs_extent_busy_insert(cur->bc_tp, agbp->b_pag, bno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
+ return 0;
+}
+
+/*
+ * Update the longest extent in the AGF
+ */
+STATIC void
+xfs_allocbt_update_lastrec(
+ struct xfs_btree_cur *cur,
+ const struct xfs_btree_block *block,
+ const union xfs_btree_rec *rec,
+ int ptr,
+ int reason)
+{
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
+ struct xfs_perag *pag;
+ __be32 len;
+ int numrecs;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+
+ switch (reason) {
+ case LASTREC_UPDATE:
+ /*
+ * If this is the last leaf block and it's the last record,
+ * then update the size of the longest extent in the AG.
+ */
+ if (ptr != xfs_btree_get_numrecs(block))
+ return;
+ len = rec->alloc.ar_blockcount;
+ break;
+ case LASTREC_INSREC:
+ if (be32_to_cpu(rec->alloc.ar_blockcount) <=
+ be32_to_cpu(agf->agf_longest))
+ return;
+ len = rec->alloc.ar_blockcount;
+ break;
+ case LASTREC_DELREC:
+ numrecs = xfs_btree_get_numrecs(block);
+ if (ptr <= numrecs)
+ return;
+ ASSERT(ptr == numrecs + 1);
+
+ if (numrecs) {
+ xfs_alloc_rec_t *rrp;
+
+ rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
+ len = rrp->ar_blockcount;
+ } else {
+ len = 0;
+ }
+
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ agf->agf_longest = len;
+ pag = cur->bc_ag.agbp->b_pag;
+ pag->pagf_longest = be32_to_cpu(len);
+ xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST);
+}
+
+STATIC int
+xfs_allocbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_alloc_mnr[level != 0];
+}
+
+STATIC int
+xfs_allocbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_alloc_mxr[level != 0];
+}
+
+STATIC void
+xfs_allocbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->alloc.ar_startblock = rec->alloc.ar_startblock;
+ key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
+}
+
+STATIC void
+xfs_bnobt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ __u32 x;
+
+ x = be32_to_cpu(rec->alloc.ar_startblock);
+ x += be32_to_cpu(rec->alloc.ar_blockcount) - 1;
+ key->alloc.ar_startblock = cpu_to_be32(x);
+ key->alloc.ar_blockcount = 0;
+}
+
+STATIC void
+xfs_cntbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
+ key->alloc.ar_startblock = 0;
+}
+
+STATIC void
+xfs_allocbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
+ rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
+}
+
+STATIC void
+xfs_allocbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
+
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
+
+ ptr->s = agf->agf_roots[cur->bc_btnum];
+}
+
+STATIC int64_t
+xfs_bnobt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a;
+ const struct xfs_alloc_rec *kp = &key->alloc;
+
+ return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+}
+
+STATIC int64_t
+xfs_cntbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a;
+ const struct xfs_alloc_rec *kp = &key->alloc;
+ int64_t diff;
+
+ diff = (int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
+ if (diff)
+ return diff;
+
+ return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+}
+
+STATIC int64_t
+xfs_bnobt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
+ be32_to_cpu(k2->alloc.ar_startblock);
+}
+
+STATIC int64_t
+xfs_cntbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ int64_t diff;
+
+ diff = be32_to_cpu(k1->alloc.ar_blockcount) -
+ be32_to_cpu(k2->alloc.ar_blockcount);
+ if (diff)
+ return diff;
+
+ return be32_to_cpu(k1->alloc.ar_startblock) -
+ be32_to_cpu(k2->alloc.ar_startblock);
+}
+
+static xfs_failaddr_t
+xfs_allocbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+ xfs_failaddr_t fa;
+ unsigned int level;
+ xfs_btnum_t btnum = XFS_BTNUM_BNOi;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_has_crc(mp)) {
+ fa = xfs_btree_sblock_v5hdr_verify(bp);
+ if (fa)
+ return fa;
+ }
+
+ /*
+ * The perag may not be attached during grow operations or fully
+ * initialized from the AGF during log recovery. Therefore we can only
+ * check against maximum tree depth from those contexts.
+ *
+ * Otherwise check against the per-tree limit. Peek at one of the
+ * verifier magic values to determine the type of tree we're verifying
+ * against.
+ */
+ level = be16_to_cpu(block->bb_level);
+ if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
+ btnum = XFS_BTNUM_CNTi;
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[btnum])
+ return __this_address;
+ } else if (level >= mp->m_alloc_maxlevels)
+ return __this_address;
+
+ return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
+}
+
+static void
+xfs_allocbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_allocbt_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+
+ if (bp->b_error)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_allocbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_allocbt_verify(bp);
+ if (fa) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+ xfs_btree_sblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_bnobt_buf_ops = {
+ .name = "xfs_bnobt",
+ .magic = { cpu_to_be32(XFS_ABTB_MAGIC),
+ cpu_to_be32(XFS_ABTB_CRC_MAGIC) },
+ .verify_read = xfs_allocbt_read_verify,
+ .verify_write = xfs_allocbt_write_verify,
+ .verify_struct = xfs_allocbt_verify,
+};
+
+const struct xfs_buf_ops xfs_cntbt_buf_ops = {
+ .name = "xfs_cntbt",
+ .magic = { cpu_to_be32(XFS_ABTC_MAGIC),
+ cpu_to_be32(XFS_ABTC_CRC_MAGIC) },
+ .verify_read = xfs_allocbt_read_verify,
+ .verify_write = xfs_allocbt_write_verify,
+ .verify_struct = xfs_allocbt_verify,
+};
+
+STATIC int
+xfs_bnobt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->alloc.ar_startblock) <
+ be32_to_cpu(k2->alloc.ar_startblock);
+}
+
+STATIC int
+xfs_bnobt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ return be32_to_cpu(r1->alloc.ar_startblock) +
+ be32_to_cpu(r1->alloc.ar_blockcount) <=
+ be32_to_cpu(r2->alloc.ar_startblock);
+}
+
+STATIC int
+xfs_cntbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->alloc.ar_blockcount) <
+ be32_to_cpu(k2->alloc.ar_blockcount) ||
+ (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
+ be32_to_cpu(k1->alloc.ar_startblock) <
+ be32_to_cpu(k2->alloc.ar_startblock));
+}
+
+STATIC int
+xfs_cntbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ return be32_to_cpu(r1->alloc.ar_blockcount) <
+ be32_to_cpu(r2->alloc.ar_blockcount) ||
+ (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
+ be32_to_cpu(r1->alloc.ar_startblock) <
+ be32_to_cpu(r2->alloc.ar_startblock));
+}
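+
+/*
+ * Example of the ordering enforced above (illustrative): in the by-size
+ * (cntbt) tree, the record (ar_blockcount=4, ar_startblock=10) sorts before
+ * (4, 20), which in turn sorts before (8, 5) -- records are keyed by length
+ * first and start block second.
+ */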
+
+static const struct xfs_btree_ops xfs_bnobt_ops = {
+ .rec_len = sizeof(xfs_alloc_rec_t),
+ .key_len = sizeof(xfs_alloc_key_t),
+
+ .dup_cursor = xfs_allocbt_dup_cursor,
+ .set_root = xfs_allocbt_set_root,
+ .alloc_block = xfs_allocbt_alloc_block,
+ .free_block = xfs_allocbt_free_block,
+ .update_lastrec = xfs_allocbt_update_lastrec,
+ .get_minrecs = xfs_allocbt_get_minrecs,
+ .get_maxrecs = xfs_allocbt_get_maxrecs,
+ .init_key_from_rec = xfs_allocbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_bnobt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
+ .key_diff = xfs_bnobt_key_diff,
+ .buf_ops = &xfs_bnobt_buf_ops,
+ .diff_two_keys = xfs_bnobt_diff_two_keys,
+ .keys_inorder = xfs_bnobt_keys_inorder,
+ .recs_inorder = xfs_bnobt_recs_inorder,
+};
+
+static const struct xfs_btree_ops xfs_cntbt_ops = {
+ .rec_len = sizeof(xfs_alloc_rec_t),
+ .key_len = sizeof(xfs_alloc_key_t),
+
+ .dup_cursor = xfs_allocbt_dup_cursor,
+ .set_root = xfs_allocbt_set_root,
+ .alloc_block = xfs_allocbt_alloc_block,
+ .free_block = xfs_allocbt_free_block,
+ .update_lastrec = xfs_allocbt_update_lastrec,
+ .get_minrecs = xfs_allocbt_get_minrecs,
+ .get_maxrecs = xfs_allocbt_get_maxrecs,
+ .init_key_from_rec = xfs_allocbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_cntbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
+ .key_diff = xfs_cntbt_key_diff,
+ .buf_ops = &xfs_cntbt_buf_ops,
+ .diff_two_keys = xfs_cntbt_diff_two_keys,
+ .keys_inorder = xfs_cntbt_keys_inorder,
+ .recs_inorder = xfs_cntbt_recs_inorder,
+};
+
+/* Allocate most of a new allocation btree cursor. */
+STATIC struct xfs_btree_cur *
+xfs_allocbt_init_common(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+
+ ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
+
+ cur = xfs_btree_alloc_cursor(mp, tp, btnum, mp->m_alloc_maxlevels,
+ xfs_allocbt_cur_cache);
+ cur->bc_ag.abt.active = false;
+
+ if (btnum == XFS_BTNUM_CNT) {
+ cur->bc_ops = &xfs_cntbt_ops;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
+ cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
+ } else {
+ cur->bc_ops = &xfs_bnobt_ops;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
+ }
+
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
+
+ if (xfs_has_crc(mp))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+ return cur;
+}
+
+/*
+ * Allocate a new allocation btree cursor.
+ */
+struct xfs_btree_cur * /* new alloc btree cursor */
+xfs_allocbt_init_cursor(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *agbp, /* buffer for agf structure */
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum) /* btree identifier */
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_allocbt_init_common(mp, tp, pag, btnum);
+ if (btnum == XFS_BTNUM_CNT)
+ cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+ else
+ cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+
+ cur->bc_ag.agbp = agbp;
+
+ return cur;
+}
+
+/* Create a free space btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_allocbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_allocbt_init_common(mp, NULL, pag, btnum);
+ xfs_btree_stage_afakeroot(cur, afake);
+ return cur;
+}
+
+/*
+ * Install a new free space btree root. Caller is responsible for invalidating
+ * and freeing the old btree blocks.
+ */
+void
+xfs_allocbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
+ agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+
+ if (cur->bc_btnum == XFS_BTNUM_BNO) {
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops);
+ } else {
+ cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE;
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops);
+ }
+}
+
+/* Calculate number of records in an alloc btree block. */
+static inline unsigned int
+xfs_allocbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_alloc_rec_t);
+ return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
+}
+
+/*
+ * Calculate number of records in an alloc btree block.
+ */
+int
+xfs_allocbt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
+{
+ blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
+ return xfs_allocbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Free space btrees are at their largest when every other block is free. */
+#define XFS_MAX_FREESP_RECORDS ((XFS_MAX_AG_BLOCKS + 1) / 2)
+
+/* Compute the max possible height for free space btrees. */
+unsigned int
+xfs_allocbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_allocbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_allocbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_FREESP_RECORDS);
+}
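+
+/*
+ * Worked example (approximate figures, assuming a 512 byte minimum block
+ * size): the short-form header leaves roughly 496 bytes per block, i.e.
+ * about 62 records in a leaf or 41 key/pointer pairs in a node, so the
+ * minimum fanout is about 31/20.  A maximally sized AG of 2^31 blocks can
+ * hold about 2^30 free space records, which such a tree covers in 7 levels.
+ */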
+
+/* Calculate the freespace btree size for some records. */
+xfs_extlen_t
+xfs_allocbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_alloc_mnr, len);
+}
+
+int __init
+xfs_allocbt_init_cur_cache(void)
+{
+ xfs_allocbt_cur_cache = kmem_cache_create("xfs_bnobt_cur",
+ xfs_btree_cur_sizeof(xfs_allocbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_allocbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_allocbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_allocbt_cur_cache);
+ xfs_allocbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
new file mode 100644
index 000000000..45df893ef
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ALLOC_BTREE_H__
+#define __XFS_ALLOC_BTREE_H__
+
+/*
+ * Freespace on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xfs_perag;
+struct xbtree_afakeroot;
+
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+#define XFS_ALLOC_BLOCK_LEN(mp) \
+ (xfs_has_crc(((mp))) ? \
+ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_ALLOC_REC_ADDR(mp, block, index) \
+ ((xfs_alloc_rec_t *) \
+ ((char *)(block) + \
+ XFS_ALLOC_BLOCK_LEN(mp) + \
+ (((index) - 1) * sizeof(xfs_alloc_rec_t))))
+
+#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
+ ((xfs_alloc_key_t *) \
+ ((char *)(block) + \
+ XFS_ALLOC_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_alloc_key_t)))
+
+#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
+ ((xfs_alloc_ptr_t *) \
+ ((char *)(block) + \
+ XFS_ALLOC_BLOCK_LEN(mp) + \
+ (maxrecs) * sizeof(xfs_alloc_key_t) + \
+ ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
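+
+/*
+ * Illustrative sketch: walking the records of a leaf block with the macros
+ * above (note the 1-based indexing):
+ *
+ *	for (i = 1; i <= be16_to_cpu(block->bb_numrecs); i++) {
+ *		xfs_alloc_rec_t	*rec = XFS_ALLOC_REC_ADDR(mp, block, i);
+ *
+ *		// rec->ar_startblock and rec->ar_blockcount are big-endian
+ *	}
+ */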
+
+extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_perag *pag, xfs_btnum_t btnum);
+struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag,
+ xfs_btnum_t btnum);
+extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
+extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+
+void xfs_allocbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+
+unsigned int xfs_allocbt_maxlevels_ondisk(void);
+
+int __init xfs_allocbt_init_cur_cache(void);
+void xfs_allocbt_destroy_cur_cache(void);
+
+#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
new file mode 100644
index 000000000..e28d93d23
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -0,0 +1,1600 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_attr_item.h"
+#include "xfs_xattr.h"
+
+struct kmem_cache *xfs_attr_intent_cache;
+
+/*
+ * xfs_attr.c
+ *
+ * Provide the external interfaces to manage attribute lists.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when attribute list fits inside the inode.
+ */
+STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
+
+/*
+ * Internal routines when attribute list is one block.
+ */
+STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp);
+STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args);
+
+/*
+ * Internal routines when attribute list is more than one block.
+ */
+STATIC int xfs_attr_node_get(xfs_da_args_t *args);
+STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args);
+static int xfs_attr_node_try_addname(struct xfs_attr_intent *attr);
+STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_intent *attr);
+STATIC int xfs_attr_node_remove_attr(struct xfs_attr_intent *attr);
+STATIC int xfs_attr_node_lookup(struct xfs_da_args *args,
+ struct xfs_da_state *state);
+
+int
+xfs_inode_hasattr(
+ struct xfs_inode *ip)
+{
+ if (!xfs_inode_has_attr_fork(ip))
+ return 0;
+ if (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_af.if_nextents == 0)
+ return 0;
+ return 1;
+}
+
+/*
+ * Returns true if there is exactly one block in the attr fork, in which
+ * case the attribute fork consists of a single leaf block entry.
+ */
+bool
+xfs_attr_is_leaf(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *ifp = &ip->i_af;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap;
+
+ if (ifp->if_nextents != 1 || ifp->if_format != XFS_DINODE_FMT_EXTENTS)
+ return false;
+
+ xfs_iext_first(ifp, &icur);
+ xfs_iext_get_extent(ifp, &icur, &imap);
+ return imap.br_startoff == 0 && imap.br_blockcount == 1;
+}
+
+/*
+ * XXX (dchinner): name path state saving and refilling is an optimisation to
+ * avoid needing to look up name entries after rolling transactions removing
+ * remote xattr blocks between the name entry lookup and name entry removal.
+ * This optimisation got sidelined when combining the set and remove state
+ * machines, but the code has been left in place because it is worthwhile to
+ * restore the optimisation once the combined state machine paths have settled.
+ *
+ * This comment is a public service announcement to remind Future Dave that he
+ * still needs to restore this code to working order.
+ */
+#if 0
+/*
+ * Fill in the disk block numbers in the state structure for the buffers
+ * that are attached to the state structure.
+ * This is done so that we can quickly reattach ourselves to those buffers
+ * after some set of transaction commits have released these buffers.
+ */
+static int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level;
+
+ trace_xfs_attr_fillstate(state->args);
+
+ /*
+ * Roll down the "path" in the state structure, storing the on-disk
+ * block number for those buffers in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, storing the on-disk
+ * block number for those buffers in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Reattach the buffers to the state structure based on the disk block
+ * numbers stored in the state structure.
+ * This is done after some set of transaction commits have released those
+ * buffers from our grip.
+ */
+static int
+xfs_attr_refillstate(xfs_da_state_t *state)
+{
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level, error;
+
+ trace_xfs_attr_refillstate(state->args);
+
+ /*
+ * Roll down the "path" in the state structure, storing the on-disk
+ * block number for those buffers in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, storing the on-disk
+ * block number for those buffers in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ return 0;
+}
+#else
+static int xfs_attr_fillstate(xfs_da_state_t *state) { return 0; }
+#endif
+
+/*========================================================================
+ * Overall external interface routines.
+ *========================================================================*/
+
+/*
+ * Retrieve an extended attribute and its value. Must have ilock.
+ * Returns 0 on successful retrieval, otherwise an error.
+ */
+int
+xfs_attr_get_ilocked(
+ struct xfs_da_args *args)
+{
+ ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+
+ if (!xfs_inode_hasattr(args->dp))
+ return -ENOATTR;
+
+ if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
+ return xfs_attr_shortform_getvalue(args);
+ if (xfs_attr_is_leaf(args->dp))
+ return xfs_attr_leaf_get(args);
+ return xfs_attr_node_get(args);
+}
+
+/*
+ * Retrieve an extended attribute by name, and its value if requested.
+ *
+ * If args->valuelen is zero, then the caller does not want the value, just an
+ * indication whether the attribute exists and the size of the value if it
+ * exists. The size is returned in args.valuelen.
+ *
+ * If args->value is NULL but args->valuelen is non-zero, allocate the buffer
+ * for the value after existence of the attribute has been determined. The
+ * caller always has to free args->value if it is set, no matter if this
+ * function was successful or not.
+ *
+ * If the attribute is found, but exceeds the size limit set by the caller in
+ * args->valuelen, return -ERANGE with the size of the attribute that was found
+ * in args->valuelen.
+ */
+int
+xfs_attr_get(
+ struct xfs_da_args *args)
+{
+ uint lock_mode;
+ int error;
+
+ XFS_STATS_INC(args->dp->i_mount, xs_attr_get);
+
+ if (xfs_is_shutdown(args->dp->i_mount))
+ return -EIO;
+
+ args->geo = args->dp->i_mount->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
+
+ /* Entirely possible to look up a name which doesn't exist */
+ args->op_flags = XFS_DA_OP_OKNOENT;
+
+ lock_mode = xfs_ilock_attr_map_shared(args->dp);
+ error = xfs_attr_get_ilocked(args);
+ xfs_iunlock(args->dp, lock_mode);
+
+ return error;
+}
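+
+/*
+ * Usage sketch (illustrative; "ip", "name" and "namelen" stand in for the
+ * caller's values): letting xfs_attr_get() allocate the value buffer.
+ *
+ *	struct xfs_da_args	args = {
+ *		.dp		= ip,
+ *		.name		= name,
+ *		.namelen	= namelen,
+ *		.valuelen	= XFS_XATTR_SIZE_MAX,
+ *	};
+ *
+ *	error = xfs_attr_get(&args);
+ *	// on success args.value points at an allocated buffer of
+ *	// args.valuelen bytes that the caller must free when done
+ */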
+
+/*
+ * Calculate how many blocks we need for the new attribute.
+ */
+int
+xfs_attr_calc_size(
+ struct xfs_da_args *args,
+ int *local)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+ int size;
+ int nblks;
+
+ /*
+ * Determine space new attribute will use, and if it would be
+ * "local" or "remote" (note: local != inline).
+ */
+ size = xfs_attr_leaf_newentsize(args, local);
+ nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+ if (*local) {
+ if (size > (args->geo->blksize / 2)) {
+ /* Double split possible */
+ nblks *= 2;
+ }
+ } else {
+ /*
+ * Out of line attribute, cannot double split, but
+ * make room for the attribute value itself.
+ */
+ uint dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen);
+ nblks += dblocks;
+ nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
+ }
+
+ return nblks;
+}
+
+/* Initialize transaction reservation for attr operations */
+void
+xfs_init_attr_trans(
+ struct xfs_da_args *args,
+ struct xfs_trans_res *tres,
+ unsigned int *total)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+
+ if (args->value) {
+ tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres *
+ args->total;
+ tres->tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ tres->tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ *total = args->total;
+ } else {
+ *tres = M_RES(mp)->tr_attrrm;
+ *total = XFS_ATTRRM_SPACE_RES(mp);
+ }
+}
+
+/*
+ * Add an attr to a shortform fork. If there is no space,
+ * xfs_attr_shortform_addname() will convert the fork to leaf format and
+ * return -ENOSPC.
+ */
+STATIC int
+xfs_attr_try_sf_addname(
+ struct xfs_inode *dp,
+ struct xfs_da_args *args)
+{
+
+ int error;
+
+ /*
+ * Build initial attribute list (if required).
+ */
+ if (dp->i_af.if_format == XFS_DINODE_FMT_EXTENTS)
+ xfs_attr_shortform_create(args);
+
+ error = xfs_attr_shortform_addname(args);
+ if (error == -ENOSPC)
+ return error;
+
+ /*
+ * Commit the shortform mods, and we're done.
+ * NOTE: this is also the error path (EEXIST, etc).
+ */
+ if (!error && !(args->op_flags & XFS_DA_OP_NOTIME))
+ xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
+
+ if (xfs_has_wsync(dp->i_mount))
+ xfs_trans_set_sync(args->trans);
+
+ return error;
+}
+
+static int
+xfs_attr_sf_addname(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ int error = 0;
+
+ error = xfs_attr_try_sf_addname(dp, args);
+ if (error != -ENOSPC) {
+ ASSERT(!error || error == -EEXIST);
+ attr->xattri_dela_state = XFS_DAS_DONE;
+ goto out;
+ }
+
+ /*
+ * It won't fit in the shortform, so transform to a leaf block. GROT:
+ * another possible req'mt for a double-split btree op.
+ */
+ error = xfs_attr_shortform_to_leaf(args);
+ if (error)
+ return error;
+
+ attr->xattri_dela_state = XFS_DAS_LEAF_ADD;
+out:
+ trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp);
+ return error;
+}
+
+/*
+ * Handle the state change on completion of a multi-state attr operation.
+ *
+ * If the XFS_DA_OP_REPLACE flag is set, this means the operation was the first
+ * modification in an attr replace operation and we still have to do the second
+ * state, indicated by @replace_state.
+ *
+ * We consume the XFS_DA_OP_REPLACE flag so that when we are called again on
+ * completion of the second half of the attr replace operation we correctly
+ * signal that it is done.
+ */
+static enum xfs_delattr_state
+xfs_attr_complete_op(
+ struct xfs_attr_intent *attr,
+ enum xfs_delattr_state replace_state)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ bool do_replace = args->op_flags & XFS_DA_OP_REPLACE;
+
+ args->op_flags &= ~XFS_DA_OP_REPLACE;
+ if (do_replace) {
+ args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ return replace_state;
+ }
+ return XFS_DAS_DONE;
+}
+
+static int
+xfs_attr_leaf_addname(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ ASSERT(xfs_attr_is_leaf(args->dp));
+
+ /*
+ * Use the leaf buffer we may already hold locked as a result of
+ * a sf-to-leaf conversion.
+ */
+ error = xfs_attr_leaf_try_add(args);
+
+ if (error == -ENOSPC) {
+ error = xfs_attr3_leaf_to_node(args);
+ if (error)
+ return error;
+
+ /*
+ * We're not in leaf format anymore, so roll the transaction and
+ * retry the add to the newly allocated node block.
+ */
+ attr->xattri_dela_state = XFS_DAS_NODE_ADD;
+ goto out;
+ }
+ if (error)
+ return error;
+
+ /*
+ * We need to commit and roll if we need to allocate remote xattr blocks
+ * or perform more xattr manipulations. Otherwise there is nothing more
+ * to do and we can return success.
+ */
+ if (args->rmtblkno)
+ attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT;
+ else
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_LEAF_REPLACE);
+out:
+ trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp);
+ return error;
+}
+
+/*
+ * Add an entry to a node format attr tree.
+ *
+ * Note that we might still have a leaf here - xfs_attr_is_leaf() cannot tell
+ * the difference between leaf + remote attr blocks and a node format tree,
+ * so we may still end up having to convert from leaf to node format here.
+ */
+static int
+xfs_attr_node_addname(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ error = xfs_attr_node_addname_find_attr(attr);
+ if (error)
+ return error;
+
+ error = xfs_attr_node_try_addname(attr);
+ if (error == -ENOSPC) {
+ error = xfs_attr3_leaf_to_node(args);
+ if (error)
+ return error;
+ /*
+ * No state change, we really are in node form now
+ * but we need the transaction rolled to continue.
+ */
+ goto out;
+ }
+ if (error)
+ return error;
+
+ if (args->rmtblkno)
+ attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT;
+ else
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_NODE_REPLACE);
+out:
+ trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp);
+ return error;
+}
+
+static int
+xfs_attr_rmtval_alloc(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error = 0;
+
+ /*
+ * If there was an out-of-line value, allocate the blocks we
+ * identified for its storage and copy the value. This is done
+ * after we create the attribute so that we don't overflow the
+ * maximum size of a transaction and/or hit a deadlock.
+ */
+ if (attr->xattri_blkcnt > 0) {
+ error = xfs_attr_rmtval_set_blk(attr);
+ if (error)
+ return error;
+ /* Roll the transaction only if there is more to allocate. */
+ if (attr->xattri_blkcnt > 0)
+ goto out;
+ }
+
+ error = xfs_attr_rmtval_set_value(args);
+ if (error)
+ return error;
+
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ ++attr->xattri_dela_state);
+ /*
+ * If we are not doing a rename, we've finished the operation but still
+ * have to clear the incomplete flag protecting the new attr from
+ * exposing partially initialised state if we crash during creation.
+ */
+ if (attr->xattri_dela_state == XFS_DAS_DONE)
+ error = xfs_attr3_leaf_clearflag(args);
+out:
+ trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp);
+ return error;
+}
+
+/*
+ * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers
+ * for later deletion of the entry.
+ */
+static int
+xfs_attr_leaf_mark_incomplete(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ int error;
+
+ /*
+ * Fill in disk block numbers in the state structure
+ * so that we can get the buffers back after we commit
+ * several transactions in the following calls.
+ */
+ error = xfs_attr_fillstate(state);
+ if (error)
+ return error;
+
+ /*
+ * Mark the attribute as INCOMPLETE
+ */
+ return xfs_attr3_leaf_setflag(args);
+}
+
+/* Ensure the da state of an xattr deferred work item is ready to go. */
+static inline void
+xfs_attr_item_init_da_state(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+
+ if (!attr->xattri_da_state)
+ attr->xattri_da_state = xfs_da_state_alloc(args);
+ else
+ xfs_da_state_reset(attr->xattri_da_state, args);
+}
+
+/*
+ * Initial setup for xfs_attr_node_removename. Make sure the attr is there and
+ * the blocks are valid. Attr keys with remote blocks will be marked
+ * incomplete.
+ */
+static int
+xfs_attr_node_removename_setup(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_da_state *state;
+ int error;
+
+ xfs_attr_item_init_da_state(attr);
+ error = xfs_attr_node_lookup(args, attr->xattri_da_state);
+ if (error != -EEXIST)
+ goto out;
+ error = 0;
+
+ state = attr->xattri_da_state;
+ ASSERT(state->path.blk[state->path.active - 1].bp != NULL);
+ ASSERT(state->path.blk[state->path.active - 1].magic ==
+ XFS_ATTR_LEAF_MAGIC);
+
+ error = xfs_attr_leaf_mark_incomplete(args, state);
+ if (error)
+ goto out;
+ if (args->rmtblkno > 0)
+ error = xfs_attr_rmtval_invalidate(args);
+out:
+ if (error) {
+ xfs_da_state_free(attr->xattri_da_state);
+ attr->xattri_da_state = NULL;
+ }
+
+ return error;
+}
+
+/*
+ * Remove the original attr we have just replaced. This is dependent on the
+ * original lookup and insert placing the old attr in args->blkno/args->index
+ * and the new attr in args->blkno2/args->index2.
+ */
+static int
+xfs_attr_leaf_remove_attr(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = NULL;
+ int forkoff;
+ int error;
+
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+ &bp);
+ if (error)
+ return error;
+
+ xfs_attr3_leaf_remove(bp, args);
+
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff)
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+
+ return error;
+}
+
+/*
+ * Shrink an attribute fork from leaf to shortform. Used by the node format
+ * remove path when the tree has collapsed to a single leaf block, in which
+ * case we check whether it can be shrunk further into the inode.
+ */
+static int
+xfs_attr_leaf_shrink(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp;
+ int forkoff;
+ int error;
+
+ if (!xfs_attr_is_leaf(dp))
+ return 0;
+
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
+
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff) {
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+ } else {
+ xfs_trans_brelse(args->trans, bp);
+ }
+
+ return error;
+}
+
+/*
+ * Run the attribute operation specified in @attr.
+ *
+ * This routine is meant to function as a delayed operation and will set the
+ * state to XFS_DAS_DONE when the operation is complete. Calling functions will
+ * need to handle this, and recall the function until either an error or
+ * XFS_DAS_DONE is detected.
+ */
+int
+xfs_attr_set_iter(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error = 0;
+
+ /* State machine switch */
+next_state:
+ switch (attr->xattri_dela_state) {
+ case XFS_DAS_UNINIT:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ case XFS_DAS_SF_ADD:
+ return xfs_attr_sf_addname(attr);
+ case XFS_DAS_LEAF_ADD:
+ return xfs_attr_leaf_addname(attr);
+ case XFS_DAS_NODE_ADD:
+ return xfs_attr_node_addname(attr);
+
+ case XFS_DAS_SF_REMOVE:
+ error = xfs_attr_sf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ case XFS_DAS_LEAF_REMOVE:
+ error = xfs_attr_leaf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ case XFS_DAS_NODE_REMOVE:
+ error = xfs_attr_node_removename_setup(attr);
+ if (error == -ENOATTR &&
+ (args->op_flags & XFS_DA_OP_RECOVERY)) {
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ error = 0;
+ break;
+ }
+ if (error)
+ return error;
+ attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT;
+ if (args->rmtblkno == 0)
+ attr->xattri_dela_state++;
+ break;
+
+ case XFS_DAS_LEAF_SET_RMT:
+ case XFS_DAS_NODE_SET_RMT:
+ error = xfs_attr_rmtval_find_space(attr);
+ if (error)
+ return error;
+ attr->xattri_dela_state++;
+ fallthrough;
+
+ case XFS_DAS_LEAF_ALLOC_RMT:
+ case XFS_DAS_NODE_ALLOC_RMT:
+ error = xfs_attr_rmtval_alloc(attr);
+ if (error)
+ return error;
+ if (attr->xattri_dela_state == XFS_DAS_DONE)
+ break;
+ goto next_state;
+
+ case XFS_DAS_LEAF_REPLACE:
+ case XFS_DAS_NODE_REPLACE:
+ /*
+ * We must "flip" the incomplete flags on the "new" and "old"
+ * attribute/value pairs so that one disappears and one appears
+ * atomically.
+ */
+ error = xfs_attr3_leaf_flipflags(args);
+ if (error)
+ return error;
+ /*
+ * We must commit the flag value change now to make it atomic
+ * and then we can start the next trans in series at REMOVE_OLD.
+ */
+ attr->xattri_dela_state++;
+ break;
+
+ case XFS_DAS_LEAF_REMOVE_OLD:
+ case XFS_DAS_NODE_REMOVE_OLD:
+ /*
+ * If we have a remote attr, start the process of removing it
+ * by invalidating any cached buffers.
+ *
+ * If we don't have a remote attr, we skip the remote block
+ * removal state altogether with a second state increment.
+ */
+ xfs_attr_restore_rmt_blk(args);
+ if (args->rmtblkno) {
+ error = xfs_attr_rmtval_invalidate(args);
+ if (error)
+ return error;
+ } else {
+ attr->xattri_dela_state++;
+ }
+
+ attr->xattri_dela_state++;
+ goto next_state;
+
+ case XFS_DAS_LEAF_REMOVE_RMT:
+ case XFS_DAS_NODE_REMOVE_RMT:
+ error = xfs_attr_rmtval_remove(attr);
+ if (error == -EAGAIN) {
+ error = 0;
+ break;
+ }
+ if (error)
+ return error;
+
+ /*
+ * We've finished removing the remote attr blocks, so commit the
+ * transaction and move on to removing the attr name from the
+ * leaf/node block. Removing the attr might require a full
+ * transaction reservation for btree block freeing, so we
+ * can't do that in the same transaction where we removed the
+ * remote attr blocks.
+ */
+ attr->xattri_dela_state++;
+ break;
+
+ case XFS_DAS_LEAF_REMOVE_ATTR:
+ error = xfs_attr_leaf_remove_attr(attr);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+
+ case XFS_DAS_NODE_REMOVE_ATTR:
+ error = xfs_attr_node_remove_attr(attr);
+ if (!error)
+ error = xfs_attr_leaf_shrink(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp);
+ return error;
+}
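+
+/*
+ * Caller-side sketch (illustrative; the real driver is the deferred attr
+ * item code): the state machine is advanced one step at a time, rolling the
+ * transaction between steps, until it reports XFS_DAS_DONE or an error.
+ *
+ *	do {
+ *		error = xfs_attr_set_iter(attr);
+ *		if (error)
+ *			break;
+ *		// roll the transaction here before the next step
+ *	} while (attr->xattri_dela_state != XFS_DAS_DONE);
+ */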
+
+
+/*
+ * Return EEXIST if attr is found, or ENOATTR if not
+ */
+static int
+xfs_attr_lookup(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = NULL;
+ struct xfs_da_state *state;
+ int error;
+
+ if (!xfs_inode_hasattr(dp))
+ return -ENOATTR;
+
+ if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
+ return xfs_attr_sf_findname(args, NULL, NULL);
+
+ if (xfs_attr_is_leaf(dp)) {
+ error = xfs_attr_leaf_hasname(args, &bp);
+
+ if (bp)
+ xfs_trans_brelse(args->trans, bp);
+
+ return error;
+ }
+
+ state = xfs_da_state_alloc(args);
+ error = xfs_attr_node_lookup(args, state);
+ xfs_da_state_free(state);
+ return error;
+}
+
+static int
+xfs_attr_intent_init(
+ struct xfs_da_args *args,
+ unsigned int op_flags, /* op flag (set or remove) */
+ struct xfs_attr_intent **attr) /* new xfs_attr_intent */
+{
+
+ struct xfs_attr_intent *new;
+
+ new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ new->xattri_op_flags = op_flags;
+ new->xattri_da_args = args;
+
+ *attr = new;
+ return 0;
+}
+
+/* Sets an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_add(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_intent *new;
+ int error = 0;
+
+ error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new);
+ if (error)
+ return error;
+
+ new->xattri_dela_state = xfs_attr_init_add_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
+
+ return 0;
+}
+
+/* Sets an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_replace(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_intent *new;
+ int error = 0;
+
+ error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new);
+ if (error)
+ return error;
+
+ new->xattri_dela_state = xfs_attr_init_replace_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp);
+
+ return 0;
+}
+
+/* Removes an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_remove(
+ struct xfs_da_args *args)
+{
+
+ struct xfs_attr_intent *new;
+ int error;
+
+ error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new);
+ if (error)
+ return error;
+
+ new->xattri_dela_state = xfs_attr_init_remove_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp);
+
+ return 0;
+}
+
+/*
+ * Note: If args->value is NULL the attribute will be removed, just like the
+ * Linux ->setxattr API.
+ */
+int
+xfs_attr_set(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_trans_res tres;
+ bool rsvd = (args->attr_filter & XFS_ATTR_ROOT);
+ int error, local;
+ int rmt_blks = 0;
+ unsigned int total;
+
+ if (xfs_is_shutdown(dp->i_mount))
+ return -EIO;
+
+ error = xfs_qm_dqattach(dp);
+ if (error)
+ return error;
+
+ args->geo = mp->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
+
+ /*
+ * We have no control over the attribute names that userspace passes us
+ * to remove, so we have to allow the name lookup prior to attribute
+ * removal to fail as well. Preserve the logged flag, since we need
+ * to pass that through to the logging code.
+ */
+ args->op_flags = XFS_DA_OP_OKNOENT |
+ (args->op_flags & XFS_DA_OP_LOGGED);
+
+ if (args->value) {
+ XFS_STATS_INC(mp, xs_attr_set);
+ args->total = xfs_attr_calc_size(args, &local);
+
+ /*
+ * If the inode doesn't have an attribute fork, add one.
+ * (inode must not be locked when we call this routine)
+ */
+ if (xfs_inode_has_attr_fork(dp) == 0) {
+ int sf_size = sizeof(struct xfs_attr_sf_hdr) +
+ xfs_attr_sf_entsize_byname(args->namelen,
+ args->valuelen);
+
+ error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+ if (error)
+ return error;
+ }
+
+ if (!local)
+ rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen);
+ } else {
+ XFS_STATS_INC(mp, xs_attr_remove);
+ rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
+ }
+
+ /*
+ * Root fork attributes can use reserved data blocks for this
+ * operation if necessary
+ */
+ xfs_init_attr_trans(args, &tres, &total);
+ error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans);
+ if (error)
+ return error;
+
+ if (args->value || xfs_inode_hasattr(dp)) {
+ error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
+ XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(args->trans, dp,
+ XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ error = xfs_attr_lookup(args);
+ switch (error) {
+ case -EEXIST:
+ /* if no value, we are performing a remove operation */
+ if (!args->value) {
+ error = xfs_attr_defer_remove(args);
+ break;
+ }
+ /* Pure create fails if the attr already exists */
+ if (args->attr_flags & XATTR_CREATE)
+ goto out_trans_cancel;
+
+ error = xfs_attr_defer_replace(args);
+ break;
+ case -ENOATTR:
+ /* Can't remove what isn't there. */
+ if (!args->value)
+ goto out_trans_cancel;
+
+ /* Pure replace fails if no existing attr to replace. */
+ if (args->attr_flags & XATTR_REPLACE)
+ goto out_trans_cancel;
+
+ error = xfs_attr_defer_add(args);
+ break;
+ default:
+ goto out_trans_cancel;
+ }
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * transaction goes to disk before returning to the user.
+ */
+ if (xfs_has_wsync(mp))
+ xfs_trans_set_sync(args->trans);
+
+ if (!(args->op_flags & XFS_DA_OP_NOTIME))
+ xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
+
+ /*
+ * Commit the last in the sequence of transactions.
+ */
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+ error = xfs_trans_commit(args->trans);
+out_unlock:
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ return error;
+
+out_trans_cancel:
+ if (args->trans)
+ xfs_trans_cancel(args->trans);
+ goto out_unlock;
+}
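+
+/*
+ * Usage sketch (illustrative): both setting and removing go through this one
+ * entry point, distinguished only by args->value (with args.dp, args.name and
+ * args.namelen already filled in by the caller).
+ *
+ *	args.value = value;		// non-NULL: create or replace
+ *	args.valuelen = valuelen;
+ *	error = xfs_attr_set(&args);
+ *
+ *	args.value = NULL;		// NULL: remove the attribute
+ *	args.valuelen = 0;
+ *	error = xfs_attr_set(&args);
+ */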
+
+/*========================================================================
+ * External routines when attribute list is inside the inode
+ *========================================================================*/
+
+static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
+{
+ struct xfs_attr_shortform *sf;
+
+ sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
+ return be16_to_cpu(sf->hdr.totsize);
+}
+
+/*
+ * Add a name to the shortform attribute list structure
+ * This is the external routine.
+ */
+static int
+xfs_attr_shortform_addname(
+ struct xfs_da_args *args)
+{
+ int newsize, forkoff;
+ int error;
+
+ trace_xfs_attr_sf_addname(args);
+
+ error = xfs_attr_shortform_lookup(args);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ return error;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
+ return error;
+
+ error = xfs_attr_sf_removename(args);
+ if (error)
+ return error;
+
+ /*
+ * Since we have removed the old attr, clear XFS_DA_OP_REPLACE so
+ * that if the new attr doesn't fit in shortform format, the leaf
+ * format add routine won't trip over the attr not being around.
+ */
+ args->op_flags &= ~XFS_DA_OP_REPLACE;
+ break;
+ case 0:
+ break;
+ default:
+ return error;
+ }
+
+ if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
+ args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+ return -ENOSPC;
+
+ newsize = xfs_attr_sf_totsize(args->dp);
+ newsize += xfs_attr_sf_entsize_byname(args->namelen, args->valuelen);
+
+ forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
+ if (!forkoff)
+ return -ENOSPC;
+
+ xfs_attr_shortform_add(args, forkoff);
+ return 0;
+}
+
+
+/*========================================================================
+ * External routines when attribute list is one block
+ *========================================================================*/
+
+/* Save the current remote block info and clear the current pointers. */
+static void
+xfs_attr_save_rmt_blk(
+ struct xfs_da_args *args)
+{
+ args->blkno2 = args->blkno;
+ args->index2 = args->index;
+ args->rmtblkno2 = args->rmtblkno;
+ args->rmtblkcnt2 = args->rmtblkcnt;
+ args->rmtvaluelen2 = args->rmtvaluelen;
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
+ args->rmtvaluelen = 0;
+}
+
+/* Set stored info about a remote block */
+static void
+xfs_attr_restore_rmt_blk(
+ struct xfs_da_args *args)
+{
+ args->blkno = args->blkno2;
+ args->index = args->index2;
+ args->rmtblkno = args->rmtblkno2;
+ args->rmtblkcnt = args->rmtblkcnt2;
+ args->rmtvaluelen = args->rmtvaluelen2;
+}
+
+/*
+ * Tries to add an attribute to an inode in leaf form
+ *
+ * This function is meant to execute as part of a delayed operation and leaves
+ * the transaction handling to the caller. On success the attribute is added
+ * and the inode and transaction are left dirty. If there is not enough space,
+ * the attr data is converted to node format and -ENOSPC is returned. Caller is
+ * responsible for handling the dirty inode and transaction or adding the attr
+ * in node format.
+ */
+STATIC int
+xfs_attr_leaf_try_add(
+ struct xfs_da_args *args)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
+
+ /*
+ * Look up the xattr name to set the insertion point for the new xattr.
+ */
+ error = xfs_attr3_leaf_lookup_int(bp, args);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ goto out_brelse;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
+ goto out_brelse;
+
+ trace_xfs_attr_leaf_replace(args);
+ /*
+ * Save the existing remote attr state so that the current
+ * values reflect the state of the new attribute we are about to
+ * add, not the attribute we just found and will remove later.
+ */
+ xfs_attr_save_rmt_blk(args);
+ break;
+ case 0:
+ break;
+ default:
+ goto out_brelse;
+ }
+
+ return xfs_attr3_leaf_add(bp, args);
+
+out_brelse:
+ xfs_trans_brelse(args->trans, bp);
+ return error;
+}
+
+/*
+ * Return EEXIST if attr is found, or ENOATTR if not
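+ *
+ * In both of those cases the leaf buffer is returned held in *bp and it is
+ * the caller's responsibility to release it; the buffer is only released
+ * here on any other error.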
+ */
+STATIC int
+xfs_attr_leaf_hasname(
+ struct xfs_da_args *args,
+ struct xfs_buf **bp)
+{
+ int error = 0;
+
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp);
+ if (error)
+ return error;
+
+ error = xfs_attr3_leaf_lookup_int(*bp, args);
+ if (error != -ENOATTR && error != -EEXIST)
+ xfs_trans_brelse(args->trans, *bp);
+
+ return error;
+}
+
+/*
+ * Remove a name from the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value; we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_removename(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp;
+ struct xfs_buf *bp;
+ int error, forkoff;
+
+ trace_xfs_attr_leaf_removename(args);
+
+ /*
+ * Remove the attribute.
+ */
+ dp = args->dp;
+
+ error = xfs_attr_leaf_hasname(args, &bp);
+ if (error == -ENOATTR) {
+ xfs_trans_brelse(args->trans, bp);
+ if (args->op_flags & XFS_DA_OP_RECOVERY)
+ return 0;
+ return error;
+ } else if (error != -EEXIST)
+ return error;
+
+ xfs_attr3_leaf_remove(bp, args);
+
+ /*
+ * If the result is small enough, shrink it all into the inode.
+ */
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff)
+ return xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+
+ return 0;
+}
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ *
+ * This leaf block cannot have a "remote" value; we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
+ */
+STATIC int
+xfs_attr_leaf_get(xfs_da_args_t *args)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ trace_xfs_attr_leaf_get(args);
+
+ error = xfs_attr_leaf_hasname(args, &bp);
+
+ if (error == -ENOATTR) {
+ xfs_trans_brelse(args->trans, bp);
+ return error;
+ } else if (error != -EEXIST)
+ return error;
+
+ error = xfs_attr3_leaf_getvalue(bp, args);
+ xfs_trans_brelse(args->trans, bp);
+ return error;
+}
+
+/* Return EEXIST if attr is found, or ENOATTR if not. */
+STATIC int
+xfs_attr_node_lookup(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ int retval, error;
+
+ /*
+ * Search to see if name exists, and get back a pointer to it.
+ */
+ error = xfs_da3_node_lookup_int(state, &retval);
+ if (error)
+ return error;
+
+ return retval;
+}
+
+/*========================================================================
+ * External routines when attribute list size > geo->blksize
+ *========================================================================*/
+
+STATIC int
+xfs_attr_node_addname_find_attr(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ /*
+ * Search to see if name already exists, and get back a pointer
+ * to where it should go.
+ */
+ xfs_attr_item_init_da_state(attr);
+ error = xfs_attr_node_lookup(args, attr->xattri_da_state);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ goto error;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
+ goto error;
+
+ trace_xfs_attr_node_replace(args);
+ /*
+ * Save the existing remote attr state so that the current
+ * values reflect the state of the new attribute we are about to
+ * add, not the attribute we just found and will remove later.
+ */
+ xfs_attr_save_rmt_blk(args);
+ break;
+ case 0:
+ break;
+ default:
+ goto error;
+ }
+
+ return 0;
+error:
+ if (attr->xattri_da_state) {
+ xfs_da_state_free(attr->xattri_da_state);
+ attr->xattri_da_state = NULL;
+ }
+ return error;
+}
+
+/*
+ * Add a name to a Btree-format attribute list.
+ *
+ * This will involve walking down the Btree, and may involve splitting
+ * leaf nodes and even splitting intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ */
+static int
+xfs_attr_node_try_addname(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_state *state = attr->xattri_da_state;
+ struct xfs_da_state_blk *blk;
+ int error;
+
+ trace_xfs_attr_node_addname(state->args);
+
+ blk = &state->path.blk[state->path.active-1];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+ error = xfs_attr3_leaf_add(blk->bp, state->args);
+ if (error == -ENOSPC) {
+ if (state->path.active == 1) {
+ /*
+ * It's really a single leaf node, but it had
+ * out-of-line values so it looked like it *might*
+ * have been a b-tree. Let the caller deal with this.
+ */
+ goto out;
+ }
+
+ /*
+ * Split as many Btree elements as required.
+ * This code tracks the new and old attr's location
+ * in the index/blkno/rmtblkno/rmtblkcnt fields and
+ * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
+ */
+ error = xfs_da3_split(state);
+ if (error)
+ goto out;
+ } else {
+ /*
+ * Addition succeeded, update Btree hashvals.
+ */
+ xfs_da3_fixhashpath(state, &state->path);
+ }
+
+out:
+ xfs_da_state_free(state);
+ attr->xattri_da_state = NULL;
+ return error;
+}
+
+static int
+xfs_attr_node_removename(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ struct xfs_da_state_blk *blk;
+ int retval;
+
+ /*
+ * Remove the name and update the hashvals in the tree.
+ */
+ blk = &state->path.blk[state->path.active-1];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+ retval = xfs_attr3_leaf_remove(blk->bp, args);
+ xfs_da3_fixhashpath(state, &state->path);
+
+ return retval;
+}
+
+static int
+xfs_attr_node_remove_attr(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_da_state *state = xfs_da_state_alloc(args);
+ int retval = 0;
+ int error = 0;
+
+ /*
+ * The attr we are removing has already been marked incomplete, so
+ * we need to set the filter appropriately to re-find the "old"
+ * attribute entry after any split ops.
+ */
+ args->attr_filter |= XFS_ATTR_INCOMPLETE;
+ error = xfs_da3_node_lookup_int(state, &retval);
+ if (error)
+ goto out;
+
+ error = xfs_attr_node_removename(args, state);
+
+ /*
+ * Check to see if the tree needs to be collapsed.
+ */
+ if (retval && (state->path.active > 1)) {
+ error = xfs_da3_join(state);
+ if (error)
+ goto out;
+ }
+ retval = error = 0;
+
+out:
+ xfs_da_state_free(state);
+ if (error)
+ return error;
+ return retval;
+}
+
+/*
+ * Retrieve the attribute data from a node attribute list.
+ *
+ * This routine gets called for any attribute fork that has more than one
+ * block, ie: both true Btree attr lists and single-leaf-blocks with
+ * "remote" values taking up more blocks.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
+ */
+STATIC int
+xfs_attr_node_get(
+ struct xfs_da_args *args)
+{
+ struct xfs_da_state *state;
+ struct xfs_da_state_blk *blk;
+ int i;
+ int error;
+
+ trace_xfs_attr_node_get(args);
+
+ /*
+ * Search to see if name exists, and get back a pointer to it.
+ */
+ state = xfs_da_state_alloc(args);
+ error = xfs_attr_node_lookup(args, state);
+ if (error != -EEXIST)
+ goto out_release;
+
+ /*
+ * Get the value, local or "remote"
+ */
+ blk = &state->path.blk[state->path.active - 1];
+ error = xfs_attr3_leaf_getvalue(blk->bp, args);
+
+ /*
+ * If not in a transaction, we have to release all the buffers.
+ */
+out_release:
+ for (i = 0; i < state->path.active; i++) {
+ xfs_trans_brelse(args->trans, state->path.blk[i].bp);
+ state->path.blk[i].bp = NULL;
+ }
+
+ xfs_da_state_free(state);
+ return error;
+}
+
+/* Returns true if the attribute entry name is valid. */
+bool
+xfs_attr_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any nulls here */
+ return !memchr(name, 0, length);
+}
+
+int __init
+xfs_attr_intent_init_cache(void)
+{
+ xfs_attr_intent_cache = kmem_cache_create("xfs_attr_intent",
+ sizeof(struct xfs_attr_intent),
+ 0, 0, NULL);
+
+ return xfs_attr_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_attr_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_attr_intent_cache);
+ xfs_attr_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
new file mode 100644
index 000000000..81be9b3e4
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -0,0 +1,621 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ATTR_H__
+#define __XFS_ATTR_H__
+
+struct xfs_inode;
+struct xfs_da_args;
+struct xfs_attr_list_context;
+
+/*
+ * Large attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes. Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree. Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys.
+ * The internal links in the Btree are logical block offsets into the file.
+ *
+ * Small attribute lists use a different format and are packed as tightly
+ * as possible so as to fit into the literal area of the inode.
+ */
+
+/*
+ * The maximum size (into the kernel or returned from the kernel) of an
+ * attribute value or the buffer used for an attr_list() call. Larger
+ * sizes will result in an ERANGE return code.
+ */
+#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */
+
+/*
+ * Kernel-internal version of the attrlist cursor.
+ */
+struct xfs_attrlist_cursor_kern {
+ __u32 hashval; /* hash value of next entry to add */
+ __u32 blkno; /* block containing entry (suggestion) */
+ __u32 offset; /* offset in list of equal-hashvals */
+ __u16 pad1; /* padding to match user-level */
+ __u8 pad2; /* padding to match user-level */
+ __u8 initted; /* T/F: cursor has been initialized */
+};
+
+
+/*========================================================================
+ * Structure used to pass context around among the routines.
+ *========================================================================*/
+
+
+/* void; state communicated via *context */
+typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int,
+ unsigned char *, int, int);
+
+struct xfs_attr_list_context {
+ struct xfs_trans *tp;
+ struct xfs_inode *dp; /* inode */
+ struct xfs_attrlist_cursor_kern cursor; /* position in list */
+ void *buffer; /* output buffer */
+
+ /*
+ * Abort attribute list iteration if non-zero. Can be used to pass
+ * error values to the xfs_attr_list caller.
+ */
+ int seen_enough;
+ bool allow_incomplete;
+
+ ssize_t count; /* num used entries */
+ int dupcnt; /* count dup hashvals seen */
+ int bufsize; /* total buffer size */
+ int firstu; /* first used byte in buffer */
+ unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE} */
+ int resynch; /* T/F: resynch with cursor */
+ put_listent_func_t put_listent; /* list output fmt function */
+ int index; /* index into output buffer */
+};
+
+
+/*
+ * ========================================================================
+ * Structure used to pass context around among the delayed routines.
+ * ========================================================================
+ */
+
+/*
+ * Below is a state machine diagram for attr remove operations. The XFS_DAS_*
+ * states indicate places where the function would return -EAGAIN, and then
+ * immediately resume from after being called by the calling function. States
+ * marked as a "subroutine state" indicate that they belong to a subroutine, and
+ * so the calling function needs to pass them back to that subroutine to allow
+ * it to finish where it left off. But they otherwise do not have a role in the
+ * calling function other than just passing through.
+ *
+ * xfs_attr_remove_iter()
+ * │
+ * v
+ * have attr to remove? ──n──> done
+ * │
+ * y
+ * │
+ * v
+ * are we short form? ──y──> xfs_attr_shortform_remove ──> done
+ * │
+ * n
+ * │
+ * V
+ * are we leaf form? ──y──> xfs_attr_leaf_removename ──> done
+ * │
+ * n
+ * │
+ * V
+ * ┌── need to setup state?
+ * │ │
+ * n y
+ * │ │
+ * │ v
+ * │ find attr and get state
+ * │ attr has remote blks? ──n─┐
+ * │ │ v
+ * │ │ find and invalidate
+ * │ y the remote blocks.
+ * │ │ mark attr incomplete
+ * │ ├────────────────┘
+ * └──────────┤
+ * │
+ * v
+ * Have remote blks to remove? ───y─────┐
+ * │ ^ remove the blks
+ * │ │ │
+ * │ │ v
+ * │ XFS_DAS_RMTBLK <─n── done?
+ * │ re-enter with │
+ * │ one less blk to y
+ * │ remove │
+ * │ V
+ * │ refill the state
+ * n │
+ * │ v
+ * │ XFS_DAS_RM_NAME
+ * │ │
+ * ├─────────────────────────┘
+ * │
+ * v
+ * remove leaf and
+ * update hash with
+ * xfs_attr_node_remove_cleanup
+ * │
+ * v
+ * need to
+ * shrink tree? ─n─┐
+ * │ │
+ * y │
+ * │ │
+ * v │
+ * join leaf │
+ * │ │
+ * v │
+ * XFS_DAS_RM_SHRINK │
+ * │ │
+ * v │
+ * do the shrink │
+ * │ │
+ * v │
+ * free state <──┘
+ * │
+ * v
+ * done
+ *
+ *
+ * Below is a state machine diagram for attr set operations.
+ *
+ * The challenge in understanding this system comes from trying to absorb the
+ * state machine all at once, when really it should only be looked at within
+ * the context of a single function. Once a state-sensitive function is
+ * called, the idea is that it "takes ownership" of the state machine. It
+ * isn't concerned with the states that may have belonged to its calling
+ * parent, only with the states relevant to itself or any other subroutines
+ * therein. Once a calling function hands off the state machine to a
+ * subroutine, it needs to respect the simple rule that it doesn't "own" the
+ * state machine anymore, and it's the responsibility of that calling function
+ * to propagate the -EAGAIN back up the call stack. Upon reentry, it is
+ * committed to re-calling that subroutine until it returns something other
+ * than -EAGAIN. Once that subroutine signals completion (by returning
+ * anything other than -EAGAIN), the calling function can resume using the
+ * state machine.
+ *
+ * xfs_attr_set_iter()
+ * │
+ * v
+ * ┌─y─ has an attr fork?
+ * │ |
+ * │ n
+ * │ |
+ * │ V
+ * │ add a fork
+ * │ │
+ * └──────────┤
+ * │
+ * V
+ * ┌─── is shortform?
+ * │ │
+ * │ y
+ * │ │
+ * │ V
+ * │ xfs_attr_set_fmt
+ * │ |
+ * │ V
+ * │ xfs_attr_try_sf_addname
+ * │ │
+ * │ V
+ * │ had enough ──y──> done
+ * │ space?
+ * n │
+ * │ n
+ * │ │
+ * │ V
+ * │ transform to leaf
+ * │ │
+ * │ V
+ * │ hold the leaf buffer
+ * │ │
+ * │ V
+ * │ return -EAGAIN
+ * │ Re-enter in
+ * │ leaf form
+ * │
+ * └─> release leaf buffer
+ * if needed
+ * │
+ * V
+ * ┌───n── fork has
+ * │ only 1 blk?
+ * │ │
+ * │ y
+ * │ │
+ * │ v
+ * │ xfs_attr_leaf_try_add()
+ * │ │
+ * │ v
+ * │ had enough ──────────────y─────────────┐
+ * │ space? │
+ * │ │ │
+ * │ n │
+ * │ │ │
+ * │ v │
+ * │ return -EAGAIN │
+ * │ re-enter in │
+ * │ node form │
+ * │ │ │
+ * └──────────┤ │
+ * │ │
+ * V │
+ * xfs_attr_node_addname_find_attr │
+ * determines if this │
+ * is create or rename │
+ * find space to store attr │
+ * │ │
+ * v │
+ * xfs_attr_node_addname │
+ * │ │
+ * v │
+ * fits in a node leaf? ────n─────┐ │
+ * │ ^ v │
+ * │ │ single leaf node? │
+ * │ │ │ │ │
+ * y │ y n │
+ * │ │ │ │ │
+ * v │ v v │
+ * update │ grow the leaf split if │
+ * hashvals └── return -EAGAIN needed │
+ * │ retry leaf add │ │
+ * │ on reentry │ │
+ * ├────────────────────────────┘ │
+ * │ │
+ * v │
+ * need to alloc │
+ * ┌─y── or flip flag? │
+ * │ │ │
+ * │ n │
+ * │ │ │
+ * │ v │
+ * │ done │
+ * │ │
+ * │ │
+ * │ XFS_DAS_FOUND_LBLK <────────────────┘
+ * │ │
+ * │ V
+ * │ xfs_attr_leaf_addname()
+ * │ │
+ * │ v
+ * │ ┌──first time through?
+ * │ │ │
+ * │ │ y
+ * │ │ │
+ * │ n v
+ * │ │ if we have rmt blks
+ * │ │ find space for them
+ * │ │ │
+ * │ └──────────┤
+ * │ │
+ * │ v
+ * │ still have
+ * │ ┌─n─ blks to alloc? <──┐
+ * │ │ │ │
+ * │ │ y │
+ * │ │ │ │
+ * │ │ v │
+ * │ │ alloc one blk │
+ * │ │ return -EAGAIN ──┘
+ * │ │ re-enter with one
+ * │ │ less blk to alloc
+ * │ │
+ * │ │
+ * │ └───> set the rmt
+ * │ value
+ * │ │
+ * │ v
+ * │ was this
+ * │ a rename? ──n─┐
+ * │ │ │
+ * │ y │
+ * │ │ │
+ * │ v │
+ * │ flip incomplete │
+ * │ flag │
+ * │ │ │
+ * │ v │
+ * │ XFS_DAS_FLIP_LFLAG │
+ * │ │ │
+ * │ v │
+ * │ need to remove │
+ * │ old bks? ──n──┤
+ * │ │ │
+ * │ y │
+ * │ │ │
+ * │ V │
+ * │ remove │
+ * │ ┌───> old blks │
+ * │ │ │ │
+ * │ XFS_DAS_RM_LBLK │ │
+ * │ ^ │ │
+ * │ │ v │
+ * │ └──y── more to │
+ * │ remove? │
+ * │ │ │
+ * │ n │
+ * │ │ │
+ * │ v │
+ * │ XFS_DAS_RD_LEAF │
+ * │ │ │
+ * │ v │
+ * │ remove leaf │
+ * │ │ │
+ * │ v │
+ * │ shrink to sf │
+ * │ if needed │
+ * │ │ │
+ * │ v │
+ * │ done <──────┘
+ * │
+ * └──────> XFS_DAS_FOUND_NBLK
+ * │
+ * v
+ * ┌─────n── need to
+ * │ alloc blks?
+ * │ │
+ * │ y
+ * │ │
+ * │ v
+ * │ find space
+ * │ │
+ * │ v
+ * │ ┌─>XFS_DAS_ALLOC_NODE
+ * │ │ │
+ * │ │ v
+ * │ │ alloc blk
+ * │ │ │
+ * │ │ v
+ * │ └──y── need to alloc
+ * │ more blocks?
+ * │ │
+ * │ n
+ * │ │
+ * │ v
+ * │ set the rmt value
+ * │ │
+ * │ v
+ * │ was this
+ * └────────> a rename? ──n─┐
+ * │ │
+ * y │
+ * │ │
+ * v │
+ * flip incomplete │
+ * flag │
+ * │ │
+ * v │
+ * XFS_DAS_FLIP_NFLAG │
+ * │ │
+ * v │
+ * need to │
+ * remove blks? ─n──┤
+ * │ │
+ * y │
+ * │ │
+ * v │
+ * remove │
+ * ┌────────> old blks │
+ * │ │ │
+ * XFS_DAS_RM_NBLK │ │
+ * ^ │ │
+ * │ v │
+ * └──────y── more to │
+ * remove │
+ * │ │
+ * n │
+ * │ │
+ * v │
+ * XFS_DAS_CLR_FLAG │
+ * │ │
+ * v │
+ * clear flags │
+ * │ │
+ * ├──────────┘
+ * │
+ * v
+ * done
+ */
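+
+/*
+ * A rough sketch of how the diagrams above are driven (illustrative only; the
+ * real caller is the delayed attr intent code, not this header): each XFS_DAS_*
+ * checkpoint is a point where the iterator functions return -EAGAIN, and the
+ * caller is expected to roll the transaction and re-enter until a final result
+ * is returned, conceptually:
+ *
+ *	do {
+ *		error = xfs_attr_set_iter(attr);
+ *		if (error != -EAGAIN)
+ *			break;
+ *		<roll/relog the transaction>
+ *	} while (1);
+ */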
+
+/*
+ * Enum values for xfs_attr_intent.xattri_dela_state
+ *
+ * These values are used by delayed attribute operations to keep track of where
+ * they were before they returned -EAGAIN. A return code of -EAGAIN signals the
+ * calling function to roll the transaction, and then call the subroutine to
+ * finish the operation. The enum is then used by the subroutine to jump back
+ * to where it was and resume executing where it left off.
+ */
+enum xfs_delattr_state {
+ XFS_DAS_UNINIT = 0, /* No state has been set yet */
+
+ /*
+ * Initial sequence states. The replace setup code relies on the
+ * ADD and REMOVE states for a specific format to be sequential so
+ * that we can transform the initial operation to be performed
+ * according to the xfs_has_larp() state easily.
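+ *
+ * (Concretely, each *_REMOVE value directly follows its matching *_ADD
+ * value, e.g. XFS_DAS_SF_REMOVE == XFS_DAS_SF_ADD + 1.)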
+ */
+ XFS_DAS_SF_ADD, /* Initial sf add state */
+ XFS_DAS_SF_REMOVE, /* Initial sf replace/remove state */
+
+ XFS_DAS_LEAF_ADD, /* Initial leaf add state */
+ XFS_DAS_LEAF_REMOVE, /* Initial leaf replace/remove state */
+
+ XFS_DAS_NODE_ADD, /* Initial node add state */
+ XFS_DAS_NODE_REMOVE, /* Initial node replace/remove state */
+
+ /* Leaf state set/replace/remove sequence */
+ XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */
+ XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */
+ XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */
+ XFS_DAS_LEAF_REMOVE_OLD, /* Start removing old attr from leaf */
+ XFS_DAS_LEAF_REMOVE_RMT, /* A rename is removing remote blocks */
+ XFS_DAS_LEAF_REMOVE_ATTR, /* Remove the old attr from a leaf */
+
+ /* Node state sequence, must match leaf state above */
+ XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */
+ XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */
+ XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */
+ XFS_DAS_NODE_REMOVE_OLD, /* Start removing old attr from node */
+ XFS_DAS_NODE_REMOVE_RMT, /* A rename is removing remote blocks */
+ XFS_DAS_NODE_REMOVE_ATTR, /* Remove the old attr from a node */
+
+ XFS_DAS_DONE, /* finished operation */
+};
+
+#define XFS_DAS_STRINGS \
+ { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \
+ { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \
+ { XFS_DAS_SF_REMOVE, "XFS_DAS_SF_REMOVE" }, \
+ { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \
+ { XFS_DAS_LEAF_REMOVE, "XFS_DAS_LEAF_REMOVE" }, \
+ { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \
+ { XFS_DAS_NODE_REMOVE, "XFS_DAS_NODE_REMOVE" }, \
+ { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \
+ { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \
+ { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \
+ { XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \
+ { XFS_DAS_LEAF_REMOVE_RMT, "XFS_DAS_LEAF_REMOVE_RMT" }, \
+ { XFS_DAS_LEAF_REMOVE_ATTR, "XFS_DAS_LEAF_REMOVE_ATTR" }, \
+ { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \
+ { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \
+ { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \
+ { XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \
+ { XFS_DAS_NODE_REMOVE_RMT, "XFS_DAS_NODE_REMOVE_RMT" }, \
+ { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \
+ { XFS_DAS_DONE, "XFS_DAS_DONE" }
+
+struct xfs_attri_log_nameval;
+
+/*
+ * Context used for keeping track of delayed attribute operations
+ */
+struct xfs_attr_intent {
+ /*
+ * used to log this item to an intent containing a list of attrs to
+ * commit later
+ */
+ struct list_head xattri_list;
+
+ /* Used in xfs_attr_node_removename to roll through removing blocks */
+ struct xfs_da_state *xattri_da_state;
+
+ struct xfs_da_args *xattri_da_args;
+
+ /*
+ * Shared buffer containing the attr name and value so that the logging
+ * code can share large memory buffers between log items.
+ */
+ struct xfs_attri_log_nameval *xattri_nameval;
+
+ /* Used to keep track of current state of delayed operation */
+ enum xfs_delattr_state xattri_dela_state;
+
+ /*
+ * Attr operation being performed - XFS_ATTRI_OP_FLAGS_*
+ */
+ unsigned int xattri_op_flags;
+
+ /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */
+ xfs_dablk_t xattri_lblkno;
+ int xattri_blkcnt;
+ struct xfs_bmbt_irec xattri_map;
+};
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Overall external interface routines.
+ */
+int xfs_attr_inactive(struct xfs_inode *dp);
+int xfs_attr_list_ilocked(struct xfs_attr_list_context *);
+int xfs_attr_list(struct xfs_attr_list_context *);
+int xfs_inode_hasattr(struct xfs_inode *ip);
+bool xfs_attr_is_leaf(struct xfs_inode *ip);
+int xfs_attr_get_ilocked(struct xfs_da_args *args);
+int xfs_attr_get(struct xfs_da_args *args);
+int xfs_attr_set(struct xfs_da_args *args);
+int xfs_attr_set_iter(struct xfs_attr_intent *attr);
+int xfs_attr_remove_iter(struct xfs_attr_intent *attr);
+bool xfs_attr_namecheck(const void *name, size_t length);
+int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
+void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres,
+ unsigned int *total);
+
+/*
+ * Check to see if the attr should be upgraded from non-existent or shortform to
+ * single-leaf-block attribute list.
+ */
+static inline bool
+xfs_attr_is_shortform(
+ struct xfs_inode *ip)
+{
+ return ip->i_af.if_format == XFS_DINODE_FMT_LOCAL ||
+ (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_af.if_nextents == 0);
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_add_state(struct xfs_da_args *args)
+{
+ /*
+ * When called from the completion of an attr remove to determine the
+ * next state, the attribute fork may be null. This can only occur
+ * on a pure remove, but we grab the next state before we check if a
+ * replace operation is being performed. If we are called from any other
+ * context, i_af is guaranteed to exist. Hence if the attr fork is
+ * null, we were called from a pure remove operation and so we are done.
+ */
+ if (!xfs_inode_has_attr_fork(args->dp))
+ return XFS_DAS_DONE;
+
+ args->op_flags |= XFS_DA_OP_ADDNAME;
+ if (xfs_attr_is_shortform(args->dp))
+ return XFS_DAS_SF_ADD;
+ if (xfs_attr_is_leaf(args->dp))
+ return XFS_DAS_LEAF_ADD;
+ return XFS_DAS_NODE_ADD;
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_remove_state(struct xfs_da_args *args)
+{
+ args->op_flags |= XFS_DA_OP_REMOVE;
+ if (xfs_attr_is_shortform(args->dp))
+ return XFS_DAS_SF_REMOVE;
+ if (xfs_attr_is_leaf(args->dp))
+ return XFS_DAS_LEAF_REMOVE;
+ return XFS_DAS_NODE_REMOVE;
+}
+
+/*
+ * If we are logging the attributes, then we have to start with removal of the
+ * old attribute so that there is always consistent state that we can recover
+ * from if the system goes down part way through. We always log the new attr
+ * value, so even when we remove the attr first we still have the information in
+ * the log to finish the replace operation atomically.
+ */
+static inline enum xfs_delattr_state
+xfs_attr_init_replace_state(struct xfs_da_args *args)
+{
+ args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE;
+ if (args->op_flags & XFS_DA_OP_LOGGED)
+ return xfs_attr_init_remove_state(args);
+ return xfs_attr_init_add_state(args);
+}
+
+extern struct kmem_cache *xfs_attr_intent_cache;
+int __init xfs_attr_intent_init_cache(void);
+void xfs_attr_intent_destroy_cache(void);
+
+#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
new file mode 100644
index 000000000..beee51ad7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -0,0 +1,3002 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr.h"
+#include "xfs_attr_remote.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_dir2.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_errortag.h"
+
+
+/*
+ * xfs_attr_leaf.c
+ *
+ * Routines to implement leaf blocks of attributes as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args,
+ xfs_dablk_t which_block, struct xfs_buf **bpp);
+STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer,
+ struct xfs_attr3_icleaf_hdr *ichdr,
+ struct xfs_da_args *args, int freemap_index);
+STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args,
+ struct xfs_attr3_icleaf_hdr *ichdr,
+ struct xfs_buf *leaf_buffer);
+STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *blk1,
+ xfs_da_state_blk_t *blk2);
+STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *leaf_blk_1,
+ struct xfs_attr3_icleaf_hdr *ichdr1,
+ xfs_da_state_blk_t *leaf_blk_2,
+ struct xfs_attr3_icleaf_hdr *ichdr2,
+ int *number_entries_in_blk1,
+ int *number_usedbytes_in_blk1);
+
+/*
+ * Utility routines.
+ */
+STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
+ struct xfs_attr_leafblock *src_leaf,
+ struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
+ struct xfs_attr_leafblock *dst_leaf,
+ struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
+ int move_count);
+STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
+
+/*
+ * attr3 block 'firstused' conversion helpers.
+ *
+ * firstused refers to the offset of the first used byte of the nameval region
+ * of an attr leaf block. The region starts at the tail of the block and expands
+ * backwards towards the middle. As such, firstused is initialized to the block
+ * size for an empty leaf block and is reduced from there.
+ *
+ * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k.
+ * The in-core firstused field is 32-bit and thus supports the maximum fsb size.
+ * The on-disk field is only 16-bit, however, and overflows at 64k. Since this
+ * only occurs at exactly 64k, we use zero as a magic on-disk value to represent
+ * the attr block size. The following helpers manage the conversion between the
+ * in-core and on-disk formats.
+ */
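+
+/*
+ * As a concrete example, on a filesystem with 64k attr blocks an empty attr3
+ * leaf starts with an in-core firstused of 65536; that does not fit in the
+ * 16-bit on-disk field, so it is written out as the magic
+ * XFS_ATTR3_LEAF_NULLOFF value and converted back to the block size on read.
+ */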
+
+static void
+xfs_attr3_leaf_firstused_from_disk(
+ struct xfs_da_geometry *geo,
+ struct xfs_attr3_icleaf_hdr *to,
+ struct xfs_attr_leafblock *from)
+{
+ struct xfs_attr3_leaf_hdr *hdr3;
+
+ if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+ hdr3 = (struct xfs_attr3_leaf_hdr *) from;
+ to->firstused = be16_to_cpu(hdr3->firstused);
+ } else {
+ to->firstused = be16_to_cpu(from->hdr.firstused);
+ }
+
+ /*
+ * Convert from the magic fsb size value to actual blocksize. This
+ * should only occur for empty blocks when the block size overflows
+ * 16-bits.
+ */
+ if (to->firstused == XFS_ATTR3_LEAF_NULLOFF) {
+ ASSERT(!to->count && !to->usedbytes);
+ ASSERT(geo->blksize > USHRT_MAX);
+ to->firstused = geo->blksize;
+ }
+}
+
+static void
+xfs_attr3_leaf_firstused_to_disk(
+ struct xfs_da_geometry *geo,
+ struct xfs_attr_leafblock *to,
+ struct xfs_attr3_icleaf_hdr *from)
+{
+ struct xfs_attr3_leaf_hdr *hdr3;
+ uint32_t firstused;
+
+ /* magic value should only be seen on disk */
+ ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF);
+
+ /*
+ * Scale down the 32-bit in-core firstused value to the 16-bit on-disk
+ * value. This only overflows at the max supported value of 64k. Use the
+ * magic on-disk value to represent block size in this case.
+ */
+ firstused = from->firstused;
+ if (firstused > USHRT_MAX) {
+ ASSERT(from->firstused == geo->blksize);
+ firstused = XFS_ATTR3_LEAF_NULLOFF;
+ }
+
+ if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
+ hdr3 = (struct xfs_attr3_leaf_hdr *) to;
+ hdr3->firstused = cpu_to_be16(firstused);
+ } else {
+ to->hdr.firstused = cpu_to_be16(firstused);
+ }
+}
+
+void
+xfs_attr3_leaf_hdr_from_disk(
+ struct xfs_da_geometry *geo,
+ struct xfs_attr3_icleaf_hdr *to,
+ struct xfs_attr_leafblock *from)
+{
+ int i;
+
+ ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+ from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+ if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+ struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from;
+
+ to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+ to->back = be32_to_cpu(hdr3->info.hdr.back);
+ to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+ to->count = be16_to_cpu(hdr3->count);
+ to->usedbytes = be16_to_cpu(hdr3->usedbytes);
+ xfs_attr3_leaf_firstused_from_disk(geo, to, from);
+ to->holes = hdr3->holes;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base);
+ to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size);
+ }
+ return;
+ }
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.count);
+ to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
+ xfs_attr3_leaf_firstused_from_disk(geo, to, from);
+ to->holes = from->hdr.holes;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base);
+ to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size);
+ }
+}
+
+void
+xfs_attr3_leaf_hdr_to_disk(
+ struct xfs_da_geometry *geo,
+ struct xfs_attr_leafblock *to,
+ struct xfs_attr3_icleaf_hdr *from)
+{
+ int i;
+
+ ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
+ from->magic == XFS_ATTR3_LEAF_MAGIC);
+
+ if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
+ struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to;
+
+ hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+ hdr3->info.hdr.back = cpu_to_be32(from->back);
+ hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+ hdr3->count = cpu_to_be16(from->count);
+ hdr3->usedbytes = cpu_to_be16(from->usedbytes);
+ xfs_attr3_leaf_firstused_to_disk(geo, to, from);
+ hdr3->holes = from->holes;
+ hdr3->pad1 = 0;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base);
+ hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size);
+ }
+ return;
+ }
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.count = cpu_to_be16(from->count);
+ to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
+ xfs_attr3_leaf_firstused_to_disk(geo, to, from);
+ to->hdr.holes = from->holes;
+ to->hdr.pad1 = 0;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base);
+ to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size);
+ }
+}
+
+static xfs_failaddr_t
+xfs_attr3_leaf_verify_entry(
+ struct xfs_mount *mp,
+ char *buf_end,
+ struct xfs_attr_leafblock *leaf,
+ struct xfs_attr3_icleaf_hdr *leafhdr,
+ struct xfs_attr_leaf_entry *ent,
+ int idx,
+ __u32 *last_hashval)
+{
+ struct xfs_attr_leaf_name_local *lentry;
+ struct xfs_attr_leaf_name_remote *rentry;
+ char *name_end;
+ unsigned int nameidx;
+ unsigned int namesize;
+ __u32 hashval;
+
+ /* hash order check */
+ hashval = be32_to_cpu(ent->hashval);
+ if (hashval < *last_hashval)
+ return __this_address;
+ *last_hashval = hashval;
+
+ nameidx = be16_to_cpu(ent->nameidx);
+ if (nameidx < leafhdr->firstused || nameidx >= mp->m_attr_geo->blksize)
+ return __this_address;
+
+ /*
+ * Check the name information. The namelen fields are u8 so we can't
+ * possibly exceed the maximum name length of 255 bytes.
+ */
+ if (ent->flags & XFS_ATTR_LOCAL) {
+ lentry = xfs_attr3_leaf_name_local(leaf, idx);
+ namesize = xfs_attr_leaf_entsize_local(lentry->namelen,
+ be16_to_cpu(lentry->valuelen));
+ name_end = (char *)lentry + namesize;
+ if (lentry->namelen == 0)
+ return __this_address;
+ } else {
+ rentry = xfs_attr3_leaf_name_remote(leaf, idx);
+ namesize = xfs_attr_leaf_entsize_remote(rentry->namelen);
+ name_end = (char *)rentry + namesize;
+ if (rentry->namelen == 0)
+ return __this_address;
+ if (!(ent->flags & XFS_ATTR_INCOMPLETE) &&
+ rentry->valueblk == 0)
+ return __this_address;
+ }
+
+ if (name_end > buf_end)
+ return __this_address;
+
+ return NULL;
+}
+
+/*
+ * Validate an attribute leaf block.
+ *
+ * Empty leaf blocks can occur under the following circumstances:
+ *
+ * 1. setxattr adds a new extended attribute to a file;
+ * 2. The file has zero existing attributes;
+ * 3. The attribute is too large to fit in the attribute fork;
+ * 4. The attribute is small enough to fit in a leaf block;
+ * 5. A log flush occurs after committing the transaction that creates
+ * the (empty) leaf block; and
+ * 6. The filesystem goes down after the log flush but before the new
+ * attribute can be committed to the leaf block.
+ *
+ * Hence we need to ensure that we don't fail the validation purely
+ * because the leaf is empty.
+ */
+static xfs_failaddr_t
+xfs_attr3_leaf_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_attr_leaf_entry *ent;
+ char *buf_end;
+ uint32_t end; /* must be 32bit - see below */
+ __u32 last_hashval = 0;
+ int i;
+ xfs_failaddr_t fa;
+
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
+
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
+
+ /*
+ * firstused is the block offset of the first name info structure.
+ * Make sure it doesn't go off the block or crash into the header.
+ */
+ if (ichdr.firstused > mp->m_attr_geo->blksize)
+ return __this_address;
+ if (ichdr.firstused < xfs_attr3_leaf_hdr_size(leaf))
+ return __this_address;
+
+ /* Make sure the entries array doesn't crash into the name info. */
+ entries = xfs_attr3_leaf_entryp(bp->b_addr);
+ if ((char *)&entries[ichdr.count] >
+ (char *)bp->b_addr + ichdr.firstused)
+ return __this_address;
+
+ /*
+ * NOTE: This verifier historically failed empty leaf buffers because
+ * we expect the fork to be in another format. Empty attr fork format
+ * conversions are possible during xattr set, however, and format
+ * conversion is not atomic with the xattr set that triggers it. We
+ * cannot assume leaf blocks are non-empty until that is addressed.
+ */
+ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize;
+ for (i = 0, ent = entries; i < ichdr.count; ent++, i++) {
+ fa = xfs_attr3_leaf_verify_entry(mp, buf_end, leaf, &ichdr,
+ ent, i, &last_hashval);
+ if (fa)
+ return fa;
+ }
+
+ /*
+ * Quickly check the freemap information. Attribute data has to be
+ * aligned to 4-byte boundaries, and likewise for the free space.
+ *
+ * Note that for 64k block size filesystems, the freemap entries cannot
+ * overflow as they are only be16 fields. However, when checking the end
+ * pointer of the freemap, we have to be careful to detect overflows and
+ * so use uint32_t for those checks.
+ */
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ if (ichdr.freemap[i].base > mp->m_attr_geo->blksize)
+ return __this_address;
+ if (ichdr.freemap[i].base & 0x3)
+ return __this_address;
+ if (ichdr.freemap[i].size > mp->m_attr_geo->blksize)
+ return __this_address;
+ if (ichdr.freemap[i].size & 0x3)
+ return __this_address;
+
+ /* beware of 16-bit overflows here */
+ end = (uint32_t)ichdr.freemap[i].base + ichdr.freemap[i].size;
+ if (end < ichdr.freemap[i].base)
+ return __this_address;
+ if (end > mp->m_attr_geo->blksize)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+static void
+xfs_attr3_leaf_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_attr3_leaf_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
+}
+
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
+static void
+xfs_attr3_leaf_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (xfs_has_crc(mp) &&
+ !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_attr3_leaf_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+ .name = "xfs_attr3_leaf",
+ .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC),
+ cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) },
+ .verify_read = xfs_attr3_leaf_read_verify,
+ .verify_write = xfs_attr3_leaf_write_verify,
+ .verify_struct = xfs_attr3_leaf_verify,
+};
+
+int
+xfs_attr3_leaf_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ struct xfs_buf **bpp)
+{
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, bno, 0, bpp, XFS_ATTR_FORK,
+ &xfs_attr3_leaf_buf_ops);
+ if (!err && tp && *bpp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
+ return err;
+}
+
+/*========================================================================
+ * Namespace helper routines
+ *========================================================================*/
+
+/*
+ * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE
+ * flag on disk - if there's an incomplete attr then recovery needs to tear it
+ * down. If there's no incomplete attr, then recovery needs to tear that attr
+ * down to replace it with the attr that has been logged. In this case, the
+ * INCOMPLETE flag will not be set in attr->attr_filter, but rather
+ * XFS_DA_OP_RECOVERY will be set in args->op_flags.
+ */
+static bool
+xfs_attr_match(
+ struct xfs_da_args *args,
+ uint8_t namelen,
+ unsigned char *name,
+ int flags)
+{
+
+ if (args->namelen != namelen)
+ return false;
+ if (memcmp(args->name, name, namelen) != 0)
+ return false;
+
+ /* Recovery ignores the INCOMPLETE flag. */
+ if ((args->op_flags & XFS_DA_OP_RECOVERY) &&
+ args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK))
+ return true;
+
+ /* All remaining matches need to be filtered by INCOMPLETE state. */
+ if (args->attr_filter !=
+ (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE)))
+ return false;
+ return true;
+}
+
+static int
+xfs_attr_copy_value(
+ struct xfs_da_args *args,
+ unsigned char *value,
+ int valuelen)
+{
+ /*
+ * No copy if all we have to do is get the length
+ */
+ if (!args->valuelen) {
+ args->valuelen = valuelen;
+ return 0;
+ }
+
+ /*
+ * No copy if the length of the existing buffer is too small
+ */
+ if (args->valuelen < valuelen) {
+ args->valuelen = valuelen;
+ return -ERANGE;
+ }
+
+ if (!args->value) {
+ args->value = kvmalloc(valuelen, GFP_KERNEL | __GFP_NOLOCKDEP);
+ if (!args->value)
+ return -ENOMEM;
+ }
+ args->valuelen = valuelen;
+
+ /* remote block xattr requires IO for copy-in */
+ if (args->rmtblkno)
+ return xfs_attr_rmtval_get(args);
+
+ /*
+ * This is to prevent a GCC warning because the remote xattr case
+ * doesn't have a value to pass in. In that case, we never reach here,
+ * but GCC can't work that out and so throws a "passing NULL to
+ * memcpy" warning.
+ */
+ if (!value)
+ return -EINVAL;
+ memcpy(args->value, value, valuelen);
+ return 0;
+}
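+
+/*
+ * Note that the conventions above give callers of the get paths a two-pass
+ * way to size a buffer: a first call with args->valuelen == 0 returns only
+ * the length, after which the caller can allocate args->value and call
+ * again. If the supplied buffer turns out to be too small, -ERANGE is
+ * returned with args->valuelen updated to the size actually required.
+ */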
+
+/*========================================================================
+ * External routines when attribute fork size < XFS_LITINO(mp).
+ *========================================================================*/
+
+/*
+ * Query whether the total requested number of attr fork bytes of extended
+ * attribute space will be able to fit inline.
+ *
+ * Returns zero if not, else the i_forkoff fork offset to be used in the
+ * literal area for attribute data once the new bytes have been added.
+ *
+ * i_forkoff must be 8 byte aligned, hence is stored as a >>3 value;
+ * special case for dev/uuid inodes, they have fixed size data forks.
+ */
+int
+xfs_attr_shortform_bytesfit(
+ struct xfs_inode *dp,
+ int bytes)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ int64_t dsize;
+ int minforkoff;
+ int maxforkoff;
+ int offset;
+
+ /*
+ * Check if the new size could fit at all first:
+ */
+ if (bytes > XFS_LITINO(mp))
+ return 0;
+
+ /* rounded down */
+ offset = (XFS_LITINO(mp) - bytes) >> 3;
+
+ if (dp->i_df.if_format == XFS_DINODE_FMT_DEV) {
+ minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+ return (offset >= minforkoff) ? minforkoff : 0;
+ }
+
+ /*
+ * If the requested number of bytes is smaller than or equal to the
+ * current attribute fork size we can always proceed.
+ *
+ * Note that if_bytes in the data fork might actually be larger than
+ * the current data fork size due to delalloc extents. In that
+ * case either the extent count will go down when they are converted
+ * to real extents, or the delalloc conversion will take care of the
+ * literal area rebalancing.
+ */
+ if (bytes <= xfs_inode_attr_fork_size(dp))
+ return dp->i_forkoff;
+
+ /*
+ * For attr2 we can try to move the forkoff if there is space in the
+ * literal area, but for the old format we are done if there is no
+ * space in the fixed attribute fork.
+ */
+ if (!xfs_has_attr2(mp))
+ return 0;
+
+ dsize = dp->i_df.if_bytes;
+
+ switch (dp->i_df.if_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ /*
+ * If there is no attr fork and the data fork is extents,
+ * determine if creating the default attr fork will result
+ * in the extents form migrating to btree. If so, the
+ * minimum offset only needs to be the space required for
+ * the btree root.
+ */
+ if (!dp->i_forkoff && dp->i_df.if_bytes >
+ xfs_default_attroffset(dp))
+ dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ /*
+ * If we already have a data btree then keep the existing forkoff;
+ * otherwise we are adding a new attr, so set minforkoff to where
+ * the btree root can end so that we have plenty of room for attrs.
+ */
+ if (dp->i_forkoff) {
+ if (offset < dp->i_forkoff)
+ return 0;
+ return dp->i_forkoff;
+ }
+ dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
+ break;
+ }
+
+ /*
+ * A data fork btree root must have space for at least
+ * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
+ */
+ minforkoff = max_t(int64_t, dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
+ minforkoff = roundup(minforkoff, 8) >> 3;
+
+ /* attr fork btree root can have at least this many key/ptr pairs */
+ maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ maxforkoff = maxforkoff >> 3; /* rounded down */
+
+ if (offset >= maxforkoff)
+ return maxforkoff;
+ if (offset >= minforkoff)
+ return offset;
+ return 0;
+}
+
+/*
+ * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless:
+ * - noattr2 mount option is set,
+ * - on-disk version bit says it is already set, or
+ * - the attr2 mount option is not set to enable automatic upgrade from attr1.
+ */
+STATIC void
+xfs_sbversion_add_attr2(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp)
+{
+ if (xfs_has_noattr2(mp))
+ return;
+ if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
+ return;
+ if (!xfs_has_attr2(mp))
+ return;
+
+ spin_lock(&mp->m_sb_lock);
+ xfs_add_attr2(mp);
+ spin_unlock(&mp->m_sb_lock);
+ xfs_log_sb(tp);
+}
+
+/*
+ * Create the initial contents of a shortform attribute list.
+ */
+void
+xfs_attr_shortform_create(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_ifork *ifp = &dp->i_af;
+ struct xfs_attr_sf_hdr *hdr;
+
+ trace_xfs_attr_sf_create(args);
+
+ ASSERT(ifp->if_bytes == 0);
+ if (ifp->if_format == XFS_DINODE_FMT_EXTENTS)
+ ifp->if_format = XFS_DINODE_FMT_LOCAL;
+ xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
+ hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data;
+ memset(hdr, 0, sizeof(*hdr));
+ hdr->totsize = cpu_to_be16(sizeof(*hdr));
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+}
+
+/*
+ * Return -EEXIST if attr is found, or -ENOATTR if not
+ * args: args containing attribute name and namelen
+ * sfep: If not null, pointer will be set to the last attr entry found on
+ * -EEXIST. On -ENOATTR pointer is left at the last entry in the list
+ * basep: If not null, pointer is set to the byte offset of the entry in the
+ * list on -EEXIST. On -ENOATTR, pointer is left at the byte offset of
+ * the last entry in the list
+ */
+int
+xfs_attr_sf_findname(
+ struct xfs_da_args *args,
+ struct xfs_attr_sf_entry **sfep,
+ unsigned int *basep)
+{
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ unsigned int base = sizeof(struct xfs_attr_sf_hdr);
+ int size = 0;
+ int end;
+ int i;
+
+ sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
+ sfe = &sf->list[0];
+ end = sf->hdr.count;
+ for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe),
+ base += size, i++) {
+ size = xfs_attr_sf_entsize(sfe);
+ if (!xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ continue;
+ break;
+ }
+
+ if (sfep != NULL)
+ *sfep = sfe;
+
+ if (basep != NULL)
+ *basep = base;
+
+ if (i == end)
+ return -ENOATTR;
+ return -EEXIST;
+}
+
+/*
+ * Add a name/value pair to the shortform attribute list.
+ * Overflow from the inode has already been checked for.
+ */
+void
+xfs_attr_shortform_add(
+ struct xfs_da_args *args,
+ int forkoff)
+{
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int offset, size;
+ struct xfs_mount *mp;
+ struct xfs_inode *dp;
+ struct xfs_ifork *ifp;
+
+ trace_xfs_attr_sf_add(args);
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ dp->i_forkoff = forkoff;
+
+ ifp = &dp->i_af;
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
+ ASSERT(0);
+
+ offset = (char *)sfe - (char *)sf;
+ size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen);
+ xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset);
+
+ sfe->namelen = args->namelen;
+ sfe->valuelen = args->valuelen;
+ sfe->flags = args->attr_filter;
+ memcpy(sfe->nameval, args->name, args->namelen);
+ memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
+ sf->hdr.count++;
+ be16_add_cpu(&sf->hdr.totsize, size);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+
+ xfs_sbversion_add_attr2(mp, args->trans);
+}
+
+/*
+ * After the last attribute is removed revert to original inode format,
+ * making all literal area available to the data fork once more.
+ */
+void
+xfs_attr_fork_remove(
+ struct xfs_inode *ip,
+ struct xfs_trans *tp)
+{
+ ASSERT(ip->i_af.if_nextents == 0);
+
+ xfs_ifork_zap_attr(ip);
+ ip->i_forkoff = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
+ * Remove an attribute from the shortform attribute list structure.
+ */
+int
+xfs_attr_sf_removename(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int size = 0, end, totsize;
+ unsigned int base;
+ struct xfs_mount *mp;
+ struct xfs_inode *dp;
+ int error;
+
+ trace_xfs_attr_sf_remove(args);
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
+
+ error = xfs_attr_sf_findname(args, &sfe, &base);
+
+ /*
+ * If we are recovering an operation, finding nothing to
+ * remove is not an error - it just means there was nothing
+ * to clean up.
+ */
+ if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY))
+ return 0;
+ if (error != -EEXIST)
+ return error;
+ size = xfs_attr_sf_entsize(sfe);
+
+ /*
+ * Fix up the attribute fork data, covering the hole
+ */
+ end = base + size;
+ totsize = be16_to_cpu(sf->hdr.totsize);
+ if (end != totsize)
+ memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end);
+ sf->hdr.count--;
+ be16_add_cpu(&sf->hdr.totsize, -size);
+
+ /*
+ * Fix up the start offset of the attribute fork
+ */
+ totsize -= size;
+ if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) &&
+ (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
+ !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) {
+ xfs_attr_fork_remove(dp, args->trans);
+ } else {
+ xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+ dp->i_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
+ ASSERT(dp->i_forkoff);
+ ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
+ (args->op_flags & XFS_DA_OP_ADDNAME) ||
+ !xfs_has_attr2(mp) ||
+ dp->i_df.if_format == XFS_DINODE_FMT_BTREE);
+ xfs_trans_log_inode(args->trans, dp,
+ XFS_ILOG_CORE | XFS_ILOG_ADATA);
+ }
+
+ xfs_sbversion_add_attr2(mp, args->trans);
+
+ return 0;
+}
+
+/*
+ * Look up a name in a shortform attribute list structure.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_lookup(xfs_da_args_t *args)
+{
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int i;
+ struct xfs_ifork *ifp;
+
+ trace_xfs_attr_sf_lookup(args);
+
+ ifp = &args->dp->i_af;
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ sfe = &sf->list[0];
+ for (i = 0; i < sf->hdr.count;
+ sfe = xfs_attr_sf_nextentry(sfe), i++) {
+ if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ return -EEXIST;
+ }
+ return -ENOATTR;
+}
+
+/*
+ * Retrieve the attribute value and length.
+ *
+ * If args->valuelen is zero, only the length needs to be returned. Unlike a
+ * lookup, we only return an error if the attribute does not exist or we can't
+ * retrieve the value.
+ */
+int
+xfs_attr_shortform_getvalue(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int i;
+
+ ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL);
+ sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
+ sfe = &sf->list[0];
+ for (i = 0; i < sf->hdr.count;
+ sfe = xfs_attr_sf_nextentry(sfe), i++) {
+ if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ return xfs_attr_copy_value(args,
+ &sfe->nameval[args->namelen], sfe->valuelen);
+ }
+ return -ENOATTR;
+}
+
+/* Convert from using the shortform to the leaf format. */
+int
+xfs_attr_shortform_to_leaf(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ struct xfs_da_args nargs;
+ char *tmpbuffer;
+ int error, i, size;
+ xfs_dablk_t blkno;
+ struct xfs_buf *bp;
+ struct xfs_ifork *ifp;
+
+ trace_xfs_attr_sf_to_leaf(args);
+
+ dp = args->dp;
+ ifp = &dp->i_af;
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ size = be16_to_cpu(sf->hdr.totsize);
+ tmpbuffer = kmem_alloc(size, 0);
+ ASSERT(tmpbuffer != NULL);
+ memcpy(tmpbuffer, ifp->if_u1.if_data, size);
+ sf = (struct xfs_attr_shortform *)tmpbuffer;
+
+ xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+ xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
+
+ bp = NULL;
+ error = xfs_da_grow_inode(args, &blkno);
+ if (error)
+ goto out;
+
+ ASSERT(blkno == 0);
+ error = xfs_attr3_leaf_create(args, blkno, &bp);
+ if (error)
+ goto out;
+
+ memset((char *)&nargs, 0, sizeof(nargs));
+ nargs.dp = dp;
+ nargs.geo = args->geo;
+ nargs.total = args->total;
+ nargs.whichfork = XFS_ATTR_FORK;
+ nargs.trans = args->trans;
+ nargs.op_flags = XFS_DA_OP_OKNOENT;
+
+ sfe = &sf->list[0];
+ for (i = 0; i < sf->hdr.count; i++) {
+ nargs.name = sfe->nameval;
+ nargs.namelen = sfe->namelen;
+ nargs.value = &sfe->nameval[nargs.namelen];
+ nargs.valuelen = sfe->valuelen;
+ nargs.hashval = xfs_da_hashname(sfe->nameval,
+ sfe->namelen);
+ nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK;
+ error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
+ ASSERT(error == -ENOATTR);
+ error = xfs_attr3_leaf_add(bp, &nargs);
+ ASSERT(error != -ENOSPC);
+ if (error)
+ goto out;
+ sfe = xfs_attr_sf_nextentry(sfe);
+ }
+ error = 0;
+out:
+ kmem_free(tmpbuffer);
+ return error;
+}
+
+/*
+ * Check a leaf attribute block to see if all the entries would fit into
+ * a shortform attribute list.
+ */
+int
+xfs_attr_shortform_allfit(
+ struct xfs_buf *bp,
+ struct xfs_inode *dp)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ xfs_attr_leaf_name_local_t *name_loc;
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ int bytes;
+ int i;
+ struct xfs_mount *mp = bp->b_mount;
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+ entry = xfs_attr3_leaf_entryp(leaf);
+
+ bytes = sizeof(struct xfs_attr_sf_hdr);
+ for (i = 0; i < leafhdr.count; entry++, i++) {
+ if (entry->flags & XFS_ATTR_INCOMPLETE)
+ continue; /* don't copy partial entries */
+ if (!(entry->flags & XFS_ATTR_LOCAL))
+ return 0;
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
+ if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+ return 0;
+ if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
+ return 0;
+ bytes += xfs_attr_sf_entsize_byname(name_loc->namelen,
+ be16_to_cpu(name_loc->valuelen));
+ }
+ if (xfs_has_attr2(dp->i_mount) &&
+ (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
+ (bytes == sizeof(struct xfs_attr_sf_hdr)))
+ return -1;
+ return xfs_attr_shortform_bytesfit(dp, bytes);
+}
+
+/* Verify the consistency of an inline attribute fork. */
+xfs_failaddr_t
+xfs_attr_shortform_verify(
+ struct xfs_inode *ip)
+{
+ struct xfs_attr_shortform *sfp;
+ struct xfs_attr_sf_entry *sfep;
+ struct xfs_attr_sf_entry *next_sfep;
+ char *endp;
+ struct xfs_ifork *ifp;
+ int i;
+ int64_t size;
+
+ ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL);
+ ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK);
+ sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ size = ifp->if_bytes;
+
+ /*
+ * Give up if the attribute is way too short.
+ */
+ if (size < sizeof(struct xfs_attr_sf_hdr))
+ return __this_address;
+
+ endp = (char *)sfp + size;
+
+ /* Check all reported entries */
+ sfep = &sfp->list[0];
+ for (i = 0; i < sfp->hdr.count; i++) {
+ /*
+ * struct xfs_attr_sf_entry has a variable length.
+ * Check the fixed-offset parts of the structure are
+ * within the data buffer.
+ * xfs_attr_sf_entry is defined with a 1-byte variable
+ * array at the end, so we must subtract that off.
+ */
+ if (((char *)sfep + sizeof(*sfep)) >= endp)
+ return __this_address;
+
+ /* Don't allow names with known bad length. */
+ if (sfep->namelen == 0)
+ return __this_address;
+
+ /*
+ * Check that the variable-length part of the structure is
+ * within the data buffer. The next entry starts after the
+ * name component, so nextentry is an acceptable test.
+ */
+ next_sfep = xfs_attr_sf_nextentry(sfep);
+ if ((char *)next_sfep > endp)
+ return __this_address;
+
+ /*
+ * Check for unknown flags. Short form doesn't support
+ * the incomplete or local bits, so we can use the namespace
+ * mask here.
+ */
+ if (sfep->flags & ~XFS_ATTR_NSP_ONDISK_MASK)
+ return __this_address;
+
+ /*
+ * Check for invalid namespace combinations. We only allow
+ * one namespace flag per xattr, so we can just count the
+ * bits (i.e. hweight) here.
+ */
+ if (hweight8(sfep->flags & XFS_ATTR_NSP_ONDISK_MASK) > 1)
+ return __this_address;
+
+ sfep = next_sfep;
+ }
+ if ((void *)sfep != (void *)endp)
+ return __this_address;
+
+ return NULL;
+}
+
+/*
+ * Convert a leaf attribute list to shortform attribute list
+ */
+int
+xfs_attr3_leaf_to_shortform(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args,
+ int forkoff)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_da_args nargs;
+ struct xfs_inode *dp = args->dp;
+ char *tmpbuffer;
+ int error;
+ int i;
+
+ trace_xfs_attr_leaf_to_sf(args);
+
+ tmpbuffer = kmem_alloc(args->geo->blksize, 0);
+ if (!tmpbuffer)
+ return -ENOMEM;
+
+ memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+
+ leaf = (xfs_attr_leafblock_t *)tmpbuffer;
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+ entry = xfs_attr3_leaf_entryp(leaf);
+
+ /* XXX (dgc): buffer is about to be marked stale - why zero it? */
+ memset(bp->b_addr, 0, args->geo->blksize);
+
+ /*
+ * Clean out the prior contents of the attribute list.
+ */
+ error = xfs_da_shrink_inode(args, 0, bp);
+ if (error)
+ goto out;
+
+ if (forkoff == -1) {
+ /*
+ * Don't remove the attr fork if this operation is the first
+ * part of an attr replace operation. We're going to add a new
+ * attr immediately, so we need to keep the attr fork around in
+ * this case.
+ */
+ if (!(args->op_flags & XFS_DA_OP_REPLACE)) {
+ ASSERT(xfs_has_attr2(dp->i_mount));
+ ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
+ xfs_attr_fork_remove(dp, args->trans);
+ }
+ goto out;
+ }
+
+ xfs_attr_shortform_create(args);
+
+ /*
+ * Copy the attributes
+ */
+ memset((char *)&nargs, 0, sizeof(nargs));
+ nargs.geo = args->geo;
+ nargs.dp = dp;
+ nargs.total = args->total;
+ nargs.whichfork = XFS_ATTR_FORK;
+ nargs.trans = args->trans;
+ nargs.op_flags = XFS_DA_OP_OKNOENT;
+
+ for (i = 0; i < ichdr.count; entry++, i++) {
+ if (entry->flags & XFS_ATTR_INCOMPLETE)
+ continue; /* don't copy partial entries */
+ if (!entry->nameidx)
+ continue;
+ ASSERT(entry->flags & XFS_ATTR_LOCAL);
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
+ nargs.name = name_loc->nameval;
+ nargs.namelen = name_loc->namelen;
+ nargs.value = &name_loc->nameval[nargs.namelen];
+ nargs.valuelen = be16_to_cpu(name_loc->valuelen);
+ nargs.hashval = be32_to_cpu(entry->hashval);
+ nargs.attr_filter = entry->flags & XFS_ATTR_NSP_ONDISK_MASK;
+ xfs_attr_shortform_add(&nargs, forkoff);
+ }
+ error = 0;
+
+out:
+ kmem_free(tmpbuffer);
+ return error;
+}
+
+/*
+ * Convert from using a single leaf to a root node and a leaf.
+ */
+int
+xfs_attr3_leaf_to_node(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr icleafhdr;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_da3_icnode_hdr icnodehdr;
+ struct xfs_da_intnode *node;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp1 = NULL;
+ struct xfs_buf *bp2 = NULL;
+ xfs_dablk_t blkno;
+ int error;
+
+ trace_xfs_attr_leaf_to_node(args);
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
+ error = -EIO;
+ goto out;
+ }
+
+ error = xfs_da_grow_inode(args, &blkno);
+ if (error)
+ goto out;
+ error = xfs_attr3_leaf_read(args->trans, dp, 0, &bp1);
+ if (error)
+ goto out;
+
+ error = xfs_da_get_buf(args->trans, dp, blkno, &bp2, XFS_ATTR_FORK);
+ if (error)
+ goto out;
+
+ /* copy leaf to new buffer, update identifiers */
+ xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
+ bp2->b_ops = bp1->b_ops;
+ memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
+ if (xfs_has_crc(mp)) {
+ struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2));
+ }
+ xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
+
+ /*
+ * Set up the new root node.
+ */
+ error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
+ if (error)
+ goto out;
+ node = bp1->b_addr;
+ xfs_da3_node_hdr_from_disk(mp, &icnodehdr, node);
+
+ leaf = bp2->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+
+ /* both on-disk, don't endian-flip twice */
+ icnodehdr.btree[0].hashval = entries[icleafhdr.count - 1].hashval;
+ icnodehdr.btree[0].before = cpu_to_be32(blkno);
+ icnodehdr.count = 1;
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &icnodehdr);
+ xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1);
+ error = 0;
+out:
+ return error;
+}
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of a leaf attribute list
+ * or a leaf in a node attribute list.
+ */
+STATIC int
+xfs_attr3_leaf_create(
+ struct xfs_da_args *args,
+ xfs_dablk_t blkno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ int error;
+
+ trace_xfs_attr_leaf_create(args);
+
+ error = xfs_da_get_buf(args->trans, args->dp, blkno, &bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
+ leaf = bp->b_addr;
+ memset(leaf, 0, args->geo->blksize);
+
+ memset(&ichdr, 0, sizeof(ichdr));
+ ichdr.firstused = args->geo->blksize;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
+
+ ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
+
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ hdr3->owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
+
+ ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
+ } else {
+ ichdr.magic = XFS_ATTR_LEAF_MAGIC;
+ ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr);
+ }
+ ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
+
+ xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
+ xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
+
+ *bpp = bp;
+ return 0;
+}
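+
+/*
+ * Layout note: a freshly created leaf has a single free region, freemap[0],
+ * running from the end of the on-disk header up to firstused (the end of
+ * the block).  Later insertions take their entry slot from the low end of
+ * the block (growing the entry table) and their name/value bytes from the
+ * high end of the chosen free region, so firstused only moves downwards
+ * until the block is compacted or entries are removed.
+ */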
+
+/*
+ * Split the leaf node, rebalance, then add the new entry.
+ */
+int
+xfs_attr3_leaf_split(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk)
+{
+ xfs_dablk_t blkno;
+ int error;
+
+ trace_xfs_attr_leaf_split(state->args);
+
+ /*
+ * Allocate space for a new leaf node.
+ */
+ ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
+ error = xfs_da_grow_inode(state->args, &blkno);
+ if (error)
+ return error;
+ error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
+ if (error)
+ return error;
+ newblk->blkno = blkno;
+ newblk->magic = XFS_ATTR_LEAF_MAGIC;
+
+ /*
+ * Rebalance the entries across the two leaves.
+ * NOTE: rebalance() currently depends on the 2nd block being empty.
+ */
+ xfs_attr3_leaf_rebalance(state, oldblk, newblk);
+ error = xfs_da3_blk_link(state, oldblk, newblk);
+ if (error)
+ return error;
+
+ /*
+ * Save info on "old" attribute for "atomic rename" ops, leaf_add()
+ * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
+ * "new" attrs info. Will need the "old" info to remove it later.
+ *
+ * Insert the "new" entry in the correct block.
+ */
+ if (state->inleaf) {
+ trace_xfs_attr_leaf_add_old(state->args);
+ error = xfs_attr3_leaf_add(oldblk->bp, state->args);
+ } else {
+ trace_xfs_attr_leaf_add_new(state->args);
+ error = xfs_attr3_leaf_add(newblk->bp, state->args);
+ }
+
+ /*
+ * Update last hashval in each block since we added the name.
+ */
+ oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
+ newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
+ return error;
+}
+
+/*
+ * Add a name to the leaf attribute list structure.
+ */
+int
+xfs_attr3_leaf_add(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ int tablesize;
+ int entsize;
+ int sum;
+ int tmp;
+ int i;
+
+ trace_xfs_attr_leaf_add(args);
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+ ASSERT(args->index >= 0 && args->index <= ichdr.count);
+ entsize = xfs_attr_leaf_newentsize(args, NULL);
+
+ /*
+ * Search through freemap for first-fit on new name length.
+ * (may need to figure in size of entry struct too)
+ */
+ tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf);
+ for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) {
+ if (tablesize > ichdr.firstused) {
+ sum += ichdr.freemap[i].size;
+ continue;
+ }
+ if (!ichdr.freemap[i].size)
+ continue; /* no space in this map */
+ tmp = entsize;
+ if (ichdr.freemap[i].base < ichdr.firstused)
+ tmp += sizeof(xfs_attr_leaf_entry_t);
+ if (ichdr.freemap[i].size >= tmp) {
+ tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i);
+ goto out_log_hdr;
+ }
+ sum += ichdr.freemap[i].size;
+ }
+
+ /*
+ * If there are no holes in the address space of the block,
+ * and we don't have enough freespace, then compaction will do us
+ * no good and we should just give up.
+ */
+ if (!ichdr.holes && sum < entsize)
+ return -ENOSPC;
+
+ /*
+ * Compact the entries to coalesce free space.
+ * This may change the hdr->count via dropping INCOMPLETE entries.
+ */
+ xfs_attr3_leaf_compact(args, &ichdr, bp);
+
+ /*
+ * After compaction, the block is guaranteed to have only one
+ * free region, in freemap[0]. If it is not big enough, give up.
+ */
+ if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
+ tmp = -ENOSPC;
+ goto out_log_hdr;
+ }
+
+ tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
+
+out_log_hdr:
+ xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+ xfs_attr3_leaf_hdr_size(leaf)));
+ return tmp;
+}
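+
+/*
+ * Note on the flow above: the freemap scan is a first-fit search over the
+ * (at most XFS_ATTR_LEAF_MAPSIZE) tracked free regions.  Only when none of
+ * them can take the new entry and the block actually contains holes do we
+ * pay for a full compaction, after which freemap[0] is the only free region
+ * left and the add either succeeds there or returns -ENOSPC.
+ */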
+
+/*
+ * Add a name to a leaf attribute list structure.
+ */
+STATIC int
+xfs_attr3_leaf_add_work(
+ struct xfs_buf *bp,
+ struct xfs_attr3_icleaf_hdr *ichdr,
+ struct xfs_da_args *args,
+ int mapindex)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_mount *mp;
+ int tmp;
+ int i;
+
+ trace_xfs_attr_leaf_add_work(args);
+
+ leaf = bp->b_addr;
+ ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE);
+ ASSERT(args->index >= 0 && args->index <= ichdr->count);
+
+ /*
+ * Force open some space in the entry array and fill it in.
+ */
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+ if (args->index < ichdr->count) {
+ tmp = ichdr->count - args->index;
+ tmp *= sizeof(xfs_attr_leaf_entry_t);
+ memmove(entry + 1, entry, tmp);
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
+ }
+ ichdr->count++;
+
+ /*
+ * Allocate space for the new string (at the end of the run).
+ */
+ mp = args->trans->t_mountp;
+ ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize);
+ ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
+ ASSERT(ichdr->freemap[mapindex].size >=
+ xfs_attr_leaf_newentsize(args, NULL));
+ ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize);
+ ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);
+
+ ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp);
+
+ entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
+ ichdr->freemap[mapindex].size);
+ entry->hashval = cpu_to_be32(args->hashval);
+ entry->flags = args->attr_filter;
+ if (tmp)
+ entry->flags |= XFS_ATTR_LOCAL;
+ if (args->op_flags & XFS_DA_OP_REPLACE) {
+ if (!(args->op_flags & XFS_DA_OP_LOGGED))
+ entry->flags |= XFS_ATTR_INCOMPLETE;
+ if ((args->blkno2 == args->blkno) &&
+ (args->index2 <= args->index)) {
+ args->index2++;
+ }
+ }
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+ ASSERT((args->index == 0) ||
+ (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
+ ASSERT((args->index == ichdr->count - 1) ||
+ (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
+
+ /*
+ * For "remote" attribute values, simply note that we need to
+ * allocate space for the "remote" value. We can't actually
+ * allocate the extents in this transaction, and we can't decide
+ * which blocks they should be in, as we might allocate more blocks
+ * as part of this transaction (a split operation for example).
+ */
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+ name_loc->namelen = args->namelen;
+ name_loc->valuelen = cpu_to_be16(args->valuelen);
+ memcpy((char *)name_loc->nameval, args->name, args->namelen);
+ memcpy((char *)&name_loc->nameval[args->namelen], args->value,
+ be16_to_cpu(name_loc->valuelen));
+ } else {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+ name_rmt->namelen = args->namelen;
+ memcpy((char *)name_rmt->name, args->name, args->namelen);
+ entry->flags |= XFS_ATTR_INCOMPLETE;
+ /* just in case */
+ name_rmt->valuelen = 0;
+ name_rmt->valueblk = 0;
+ args->rmtblkno = 1;
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+ args->rmtvaluelen = args->valuelen;
+ }
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
+ xfs_attr_leaf_entsize(leaf, args->index)));
+
+ /*
+ * Update the control info for this leaf node
+ */
+ if (be16_to_cpu(entry->nameidx) < ichdr->firstused)
+ ichdr->firstused = be16_to_cpu(entry->nameidx);
+
+ ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf));
+ tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf);
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ if (ichdr->freemap[i].base == tmp) {
+ ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
+ ichdr->freemap[i].size -=
+ min_t(uint16_t, ichdr->freemap[i].size,
+ sizeof(xfs_attr_leaf_entry_t));
+ }
+ }
+ ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
+ return 0;
+}
+
+/*
+ * Garbage collect a leaf attribute list block by copying it to a new buffer.
+ */
+STATIC void
+xfs_attr3_leaf_compact(
+ struct xfs_da_args *args,
+ struct xfs_attr3_icleaf_hdr *ichdr_dst,
+ struct xfs_buf *bp)
+{
+ struct xfs_attr_leafblock *leaf_src;
+ struct xfs_attr_leafblock *leaf_dst;
+ struct xfs_attr3_icleaf_hdr ichdr_src;
+ struct xfs_trans *trans = args->trans;
+ char *tmpbuffer;
+
+ trace_xfs_attr_leaf_compact(args);
+
+ tmpbuffer = kmem_alloc(args->geo->blksize, 0);
+ memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+ memset(bp->b_addr, 0, args->geo->blksize);
+ leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
+ leaf_dst = bp->b_addr;
+
+ /*
+ * Copy the on-disk header back into the destination buffer to ensure
+ * all the information in the header that is not part of the incore
+ * header structure is preserved.
+ */
+ memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
+
+ /* Initialise the incore headers */
+ ichdr_src = *ichdr_dst; /* struct copy */
+ ichdr_dst->firstused = args->geo->blksize;
+ ichdr_dst->usedbytes = 0;
+ ichdr_dst->count = 0;
+ ichdr_dst->holes = 0;
+ ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
+ ichdr_dst->freemap[0].size = ichdr_dst->firstused -
+ ichdr_dst->freemap[0].base;
+
+ /* write the header back to initialise the underlying buffer */
+ xfs_attr3_leaf_hdr_to_disk(args->geo, leaf_dst, ichdr_dst);
+
+ /*
+ * Copy all entries in the same (sorted) order,
+ * but allocate name/value pairs packed and in sequence.
+ */
+ xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0,
+ leaf_dst, ichdr_dst, 0, ichdr_src.count);
+ /*
+ * this logs the entire buffer, but the caller must write the header
+ * back to the buffer when it is finished modifying it.
+ */
+ xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
+
+ kmem_free(tmpbuffer);
+}
+
+/*
+ * Compare two leaf blocks "order".
+ * Return 0 unless leaf2 should go before leaf1.
+ */
+static int
+xfs_attr3_leaf_order(
+ struct xfs_buf *leaf1_bp,
+ struct xfs_attr3_icleaf_hdr *leaf1hdr,
+ struct xfs_buf *leaf2_bp,
+ struct xfs_attr3_icleaf_hdr *leaf2hdr)
+{
+ struct xfs_attr_leaf_entry *entries1;
+ struct xfs_attr_leaf_entry *entries2;
+
+ entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr);
+ entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr);
+ if (leaf1hdr->count > 0 && leaf2hdr->count > 0 &&
+ ((be32_to_cpu(entries2[0].hashval) <
+ be32_to_cpu(entries1[0].hashval)) ||
+ (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) <
+ be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) {
+ return 1;
+ }
+ return 0;
+}
+
+int
+xfs_attr_leaf_order(
+ struct xfs_buf *leaf1_bp,
+ struct xfs_buf *leaf2_bp)
+{
+ struct xfs_attr3_icleaf_hdr ichdr1;
+ struct xfs_attr3_icleaf_hdr ichdr2;
+ struct xfs_mount *mp = leaf1_bp->b_mount;
+
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr);
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr);
+ return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
+}
+
+/*
+ * Redistribute the attribute list entries between two leaf nodes,
+ * taking into account the size of the new entry.
+ *
+ * NOTE: if the new block is empty, it will get the upper half of the
+ * old block. At present, the only caller passes in an empty second block.
+ *
+ * This code adjusts the args->index/blkno and args->index2/blkno2 fields
+ * to match what it is doing in splitting the attribute leaf block. Those
+ * values are used in "atomic rename" operations on attributes. Note that
+ * the "new" and "old" values can end up in different blocks.
+ */
+STATIC void
+xfs_attr3_leaf_rebalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_da_state_blk *blk2)
+{
+ struct xfs_da_args *args;
+ struct xfs_attr_leafblock *leaf1;
+ struct xfs_attr_leafblock *leaf2;
+ struct xfs_attr3_icleaf_hdr ichdr1;
+ struct xfs_attr3_icleaf_hdr ichdr2;
+ struct xfs_attr_leaf_entry *entries1;
+ struct xfs_attr_leaf_entry *entries2;
+ int count;
+ int totallen;
+ int max;
+ int space;
+ int swap;
+
+ /*
+ * Set up environment.
+ */
+ ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
+ leaf1 = blk1->bp->b_addr;
+ leaf2 = blk2->bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1);
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2);
+ ASSERT(ichdr2.count == 0);
+ args = state->args;
+
+ trace_xfs_attr_leaf_rebalance(args);
+
+ /*
+ * Check ordering of blocks, reverse if it makes things simpler.
+ *
+ * NOTE: Given that all (current) callers pass in an empty
+ * second block, this code should never set "swap".
+ */
+ swap = 0;
+ if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) {
+ swap(blk1, blk2);
+
+ /* swap structures rather than reconverting them */
+ swap(ichdr1, ichdr2);
+
+ leaf1 = blk1->bp->b_addr;
+ leaf2 = blk2->bp->b_addr;
+ swap = 1;
+ }
+
+ /*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum. Then get
+ * the direction to copy and the number of elements to move.
+ *
+ * "inleaf" is true if the new entry should be inserted into blk1.
+ * If "swap" is also true, then reverse the sense of "inleaf".
+ */
+ state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1,
+ blk2, &ichdr2,
+ &count, &totallen);
+ if (swap)
+ state->inleaf = !state->inleaf;
+
+ /*
+ * Move any entries required from leaf to leaf:
+ */
+ if (count < ichdr1.count) {
+ /*
+ * Figure the total bytes to be added to the destination leaf.
+ */
+ /* number of entries being moved */
+ count = ichdr1.count - count;
+ space = ichdr1.usedbytes - totallen;
+ space += count * sizeof(xfs_attr_leaf_entry_t);
+
+ /*
+ * leaf2 is the destination, compact it if it looks tight.
+ */
+ max = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+ max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t);
+ if (space > max)
+ xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp);
+
+ /*
+ * Move high entries from leaf1 to low end of leaf2.
+ */
+ xfs_attr3_leaf_moveents(args, leaf1, &ichdr1,
+ ichdr1.count - count, leaf2, &ichdr2, 0, count);
+
+ } else if (count > ichdr1.count) {
+ /*
+ * I assert that since all callers pass in an empty
+ * second buffer, this code should never execute.
+ */
+ ASSERT(0);
+
+ /*
+ * Figure the total bytes to be added to the destination leaf.
+ */
+ /* number of entries being moved */
+ count -= ichdr1.count;
+ space = totallen - ichdr1.usedbytes;
+ space += count * sizeof(xfs_attr_leaf_entry_t);
+
+ /*
+ * leaf1 is the destination, compact it if it looks tight.
+ */
+ max = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+ max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t);
+ if (space > max)
+ xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp);
+
+ /*
+ * Move low entries from leaf2 to high end of leaf1.
+ */
+ xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1,
+ ichdr1.count, count);
+ }
+
+ xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1);
+ xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2);
+ xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
+ xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
+
+ /*
+ * Copy out last hashval in each block for B-tree code.
+ */
+ entries1 = xfs_attr3_leaf_entryp(leaf1);
+ entries2 = xfs_attr3_leaf_entryp(leaf2);
+ blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval);
+ blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval);
+
+ /*
+ * Adjust the expected index for insertion.
+ * NOTE: this code depends on the (current) situation that the
+ * second block was originally empty.
+ *
+ * If the insertion point moved to the 2nd block, we must adjust
+ * the index. We must also track the entry just following the
+ * new entry for use in an "atomic rename" operation; that entry
+ * is always the "old" entry and the "new" entry is what we are
+ * inserting. The index/blkno fields refer to the "old" entry,
+ * while the index2/blkno2 fields refer to the "new" entry.
+ */
+ if (blk1->index > ichdr1.count) {
+ ASSERT(state->inleaf == 0);
+ blk2->index = blk1->index - ichdr1.count;
+ args->index = args->index2 = blk2->index;
+ args->blkno = args->blkno2 = blk2->blkno;
+ } else if (blk1->index == ichdr1.count) {
+ if (state->inleaf) {
+ args->index = blk1->index;
+ args->blkno = blk1->blkno;
+ args->index2 = 0;
+ args->blkno2 = blk2->blkno;
+ } else {
+ /*
+ * On a double leaf split, the original attr location
+ * is already stored in blkno2/index2, so don't
+ * overwrite it, otherwise we corrupt the tree.
+ */
+ blk2->index = blk1->index - ichdr1.count;
+ args->index = blk2->index;
+ args->blkno = blk2->blkno;
+ if (!state->extravalid) {
+ /*
+ * set the new attr location to match the old
+ * one and let the higher level split code
+ * decide where in the leaf to place it.
+ */
+ args->index2 = blk2->index;
+ args->blkno2 = blk2->blkno;
+ }
+ }
+ } else {
+ ASSERT(state->inleaf == 1);
+ args->index = args->index2 = blk1->index;
+ args->blkno = args->blkno2 = blk1->blkno;
+ }
+}
+
+/*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum.
+ * GROT: Is this really necessary? With other than a 512 byte blocksize,
+ * GROT: there will always be enough room in either block for a new entry.
+ * GROT: Do a double-split for this case?
+ */
+STATIC int
+xfs_attr3_leaf_figure_balance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_attr3_icleaf_hdr *ichdr1,
+ struct xfs_da_state_blk *blk2,
+ struct xfs_attr3_icleaf_hdr *ichdr2,
+ int *countarg,
+ int *usedbytesarg)
+{
+ struct xfs_attr_leafblock *leaf1 = blk1->bp->b_addr;
+ struct xfs_attr_leafblock *leaf2 = blk2->bp->b_addr;
+ struct xfs_attr_leaf_entry *entry;
+ int count;
+ int max;
+ int index;
+ int totallen = 0;
+ int half;
+ int lastdelta;
+ int foundit = 0;
+ int tmp;
+
+ /*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum.
+ */
+ max = ichdr1->count + ichdr2->count;
+ half = (max + 1) * sizeof(*entry);
+ half += ichdr1->usedbytes + ichdr2->usedbytes +
+ xfs_attr_leaf_newentsize(state->args, NULL);
+ half /= 2;
+ lastdelta = state->args->geo->blksize;
+ entry = xfs_attr3_leaf_entryp(leaf1);
+ for (count = index = 0; count < max; entry++, index++, count++) {
+
+#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A))
+ /*
+ * The new entry is in the first block, account for it.
+ */
+ if (count == blk1->index) {
+ tmp = totallen + sizeof(*entry) +
+ xfs_attr_leaf_newentsize(state->args, NULL);
+ if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+ break;
+ lastdelta = XFS_ATTR_ABS(half - tmp);
+ totallen = tmp;
+ foundit = 1;
+ }
+
+ /*
+ * Wrap around into the second block if necessary.
+ */
+ if (count == ichdr1->count) {
+ leaf1 = leaf2;
+ entry = xfs_attr3_leaf_entryp(leaf1);
+ index = 0;
+ }
+
+ /*
+ * Figure out if next leaf entry would be too much.
+ */
+ tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
+ index);
+ if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+ break;
+ lastdelta = XFS_ATTR_ABS(half - tmp);
+ totallen = tmp;
+#undef XFS_ATTR_ABS
+ }
+
+ /*
+ * Calculate the number of usedbytes that will end up in the lower block.
+ * If the new entry is not in the lower block, fix up the count.
+ */
+ totallen -= count * sizeof(*entry);
+ if (foundit) {
+ totallen -= sizeof(*entry) +
+ xfs_attr_leaf_newentsize(state->args, NULL);
+ }
+
+ *countarg = count;
+ *usedbytesarg = totallen;
+ return foundit;
+}
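+
+/*
+ * Worked example (illustrative figures only): if blk1 holds entries using
+ * ~2000 bytes of entry slots plus name/value data, blk2 is empty and the
+ * new entry needs ~100 bytes, then "half" above is ~1050 bytes.  The loop
+ * walks the entries in hash order, accumulating sizes (plus the new entry's
+ * size once it reaches blk1->index), and stops at the entry where the
+ * running total stops getting closer to that midpoint, so roughly the first
+ * ~1050 bytes worth of entries stay in blk1 and the rest move to blk2.
+ */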
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor. Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ *
+ * GROT: allow for INCOMPLETE entries in calculation.
+ */
+int
+xfs_attr3_leaf_toosmall(
+ struct xfs_da_state *state,
+ int *action)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_da_state_blk *blk;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_buf *bp;
+ xfs_dablk_t blkno;
+ int bytes;
+ int forward;
+ int error;
+ int retval;
+ int i;
+
+ trace_xfs_attr_leaf_toosmall(state->args);
+
+ /*
+ * Check for the degenerate case of the block being over 50% full.
+ * If so, it's not worth even looking to see if we might be able
+ * to coalesce with a sibling.
+ */
+ blk = &state->path.blk[ state->path.active-1 ];
+ leaf = blk->bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr, leaf);
+ bytes = xfs_attr3_leaf_hdr_size(leaf) +
+ ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
+ ichdr.usedbytes;
+ if (bytes > (state->args->geo->blksize >> 1)) {
+ *action = 0; /* blk over 50%, don't try to join */
+ return 0;
+ }
+
+ /*
+ * Check for the degenerate case of the block being empty.
+ * If the block is empty, we'll simply delete it; there's no need to
+ * coalesce it with a sibling block. We choose (arbitrarily)
+ * to merge with the forward block unless it is NULL.
+ */
+ if (ichdr.count == 0) {
+ /*
+ * Make altpath point to the block we want to keep and
+ * path point to the block we want to drop (this one).
+ */
+ forward = (ichdr.forw != 0);
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
+ 0, &retval);
+ if (error)
+ return error;
+ if (retval) {
+ *action = 0;
+ } else {
+ *action = 2;
+ }
+ return 0;
+ }
+
+ /*
+ * Examine each sibling block to see if we can coalesce with it
+ * and still have at least 25% free space to spare. We need to figure out
+ * whether to merge with the forward or the backward block.
+ * We prefer coalescing with the lower numbered sibling so as
+ * to shrink an attribute list over time.
+ */
+ /* start with smaller blk num */
+ forward = ichdr.forw < ichdr.back;
+ for (i = 0; i < 2; forward = !forward, i++) {
+ struct xfs_attr3_icleaf_hdr ichdr2;
+ if (forward)
+ blkno = ichdr.forw;
+ else
+ blkno = ichdr.back;
+ if (blkno == 0)
+ continue;
+ error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
+ blkno, &bp);
+ if (error)
+ return error;
+
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, bp->b_addr);
+
+ bytes = state->args->geo->blksize -
+ (state->args->geo->blksize >> 2) -
+ ichdr.usedbytes - ichdr2.usedbytes -
+ ((ichdr.count + ichdr2.count) *
+ sizeof(xfs_attr_leaf_entry_t)) -
+ xfs_attr3_leaf_hdr_size(leaf);
+
+ xfs_trans_brelse(state->args->trans, bp);
+ if (bytes >= 0)
+ break; /* fits with at least 25% to spare */
+ }
+ if (i >= 2) {
+ *action = 0;
+ return 0;
+ }
+
+ /*
+ * Make altpath point to the block we want to keep (the lower
+ * numbered block) and path point to the block we want to drop.
+ */
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ if (blkno < blk->blkno) {
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
+ 0, &retval);
+ } else {
+ error = xfs_da3_path_shift(state, &state->path, forward,
+ 0, &retval);
+ }
+ if (error)
+ return error;
+ if (retval) {
+ *action = 0;
+ } else {
+ *action = 1;
+ }
+ return 0;
+}
+
+/*
+ * Remove a name from the leaf attribute list structure.
+ *
+ * Return 1 if the leaf is less than 37% full, 0 if it is >= 37% full.
+ * If two leaves are each 37% full, combining them leaves 25% of a block free.
+ */
+int
+xfs_attr3_leaf_remove(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ int before;
+ int after;
+ int smallest;
+ int entsize;
+ int tablesize;
+ int tmp;
+ int i;
+
+ trace_xfs_attr_leaf_remove(args);
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+
+ ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
+ ASSERT(args->index >= 0 && args->index < ichdr.count);
+ ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
+ xfs_attr3_leaf_hdr_size(leaf));
+
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+
+ ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+ ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
+
+ /*
+ * Scan through free region table:
+ * check for adjacency of the freed entry with an existing one,
+ * find the smallest free region in case we need to replace it,
+ * adjust any map that borders the entry table.
+ */
+ tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf);
+ tmp = ichdr.freemap[0].size;
+ before = after = -1;
+ smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
+ entsize = xfs_attr_leaf_entsize(leaf, args->index);
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ ASSERT(ichdr.freemap[i].base < args->geo->blksize);
+ ASSERT(ichdr.freemap[i].size < args->geo->blksize);
+ if (ichdr.freemap[i].base == tablesize) {
+ ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
+ ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
+ }
+
+ if (ichdr.freemap[i].base + ichdr.freemap[i].size ==
+ be16_to_cpu(entry->nameidx)) {
+ before = i;
+ } else if (ichdr.freemap[i].base ==
+ (be16_to_cpu(entry->nameidx) + entsize)) {
+ after = i;
+ } else if (ichdr.freemap[i].size < tmp) {
+ tmp = ichdr.freemap[i].size;
+ smallest = i;
+ }
+ }
+
+ /*
+ * Coalesce adjacent freemap regions,
+ * or replace the smallest region.
+ */
+ if ((before >= 0) || (after >= 0)) {
+ if ((before >= 0) && (after >= 0)) {
+ ichdr.freemap[before].size += entsize;
+ ichdr.freemap[before].size += ichdr.freemap[after].size;
+ ichdr.freemap[after].base = 0;
+ ichdr.freemap[after].size = 0;
+ } else if (before >= 0) {
+ ichdr.freemap[before].size += entsize;
+ } else {
+ ichdr.freemap[after].base = be16_to_cpu(entry->nameidx);
+ ichdr.freemap[after].size += entsize;
+ }
+ } else {
+ /*
+ * Replace the smallest region (if it is smaller than the freed entry)
+ */
+ if (ichdr.freemap[smallest].size < entsize) {
+ ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx);
+ ichdr.freemap[smallest].size = entsize;
+ }
+ }
+
+ /*
+ * Did we remove the first entry?
+ */
+ if (be16_to_cpu(entry->nameidx) == ichdr.firstused)
+ smallest = 1;
+ else
+ smallest = 0;
+
+ /*
+ * Compress the remaining entries and zero out the removed stuff.
+ */
+ memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize);
+ ichdr.usedbytes -= entsize;
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
+ entsize));
+
+ tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t);
+ memmove(entry, entry + 1, tmp);
+ ichdr.count--;
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t)));
+
+ entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count];
+ memset(entry, 0, sizeof(xfs_attr_leaf_entry_t));
+
+ /*
+ * If we removed the first entry, re-find the first used byte
+ * in the name area. Note that if the entry was the "firstused",
+ * then we don't have a "hole" in our block resulting from
+ * removing the name.
+ */
+ if (smallest) {
+ tmp = args->geo->blksize;
+ entry = xfs_attr3_leaf_entryp(leaf);
+ for (i = ichdr.count - 1; i >= 0; entry++, i--) {
+ ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+ ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
+
+ if (be16_to_cpu(entry->nameidx) < tmp)
+ tmp = be16_to_cpu(entry->nameidx);
+ }
+ ichdr.firstused = tmp;
+ ASSERT(ichdr.firstused != 0);
+ } else {
+ ichdr.holes = 1; /* mark as needing compaction */
+ }
+ xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+ xfs_attr3_leaf_hdr_size(leaf)));
+
+ /*
+ * Check if leaf is less than 50% full, caller may want to
+ * "join" the leaf with a sibling if so.
+ */
+ tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
+ ichdr.count * sizeof(xfs_attr_leaf_entry_t);
+
+ return tmp < args->geo->magicpct; /* leaf is < 37% full */
+}
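+
+/*
+ * Illustrative arithmetic for the 37%/25% rule above (assuming 4096-byte
+ * attr blocks; geo->magicpct is set up as roughly 37% of the block size,
+ * i.e. ~1515 bytes here): two leaves each below that threshold together
+ * hold less than ~3030 bytes of header, entries and names, so their
+ * contents fit in a single 4096-byte block with about 25% of it left free.
+ */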
+
+/*
+ * Move all the attribute list entries from drop_leaf into save_leaf.
+ */
+void
+xfs_attr3_leaf_unbalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk)
+{
+ struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr;
+ struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr;
+ struct xfs_attr3_icleaf_hdr drophdr;
+ struct xfs_attr3_icleaf_hdr savehdr;
+ struct xfs_attr_leaf_entry *entry;
+
+ trace_xfs_attr_leaf_unbalance(state->args);
+
+ drop_leaf = drop_blk->bp->b_addr;
+ save_leaf = save_blk->bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf);
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf);
+ entry = xfs_attr3_leaf_entryp(drop_leaf);
+
+ /*
+ * Save last hashval from dying block for later Btree fixup.
+ */
+ drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval);
+
+ /*
+ * Check if we need a temp buffer, or can we do it in place.
+ * Note that we don't check "leaf" for holes because we will
+ * always be dropping it; toosmall() decided that for us already.
+ */
+ if (savehdr.holes == 0) {
+ /*
+ * dest leaf has no holes, so we add there. May need
+ * to make some room in the entry array.
+ */
+ if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+ drop_blk->bp, &drophdr)) {
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ save_leaf, &savehdr, 0,
+ drophdr.count);
+ } else {
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ save_leaf, &savehdr,
+ savehdr.count, drophdr.count);
+ }
+ } else {
+ /*
+ * Destination has holes, so we make a temporary copy
+ * of the leaf and add them both to that.
+ */
+ struct xfs_attr_leafblock *tmp_leaf;
+ struct xfs_attr3_icleaf_hdr tmphdr;
+
+ tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0);
+
+ /*
+ * Copy the header into the temp leaf so that all the stuff
+ * not in the incore header is present and gets copied back in
+ * once we've moved all the entries.
+ */
+ memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
+
+ memset(&tmphdr, 0, sizeof(tmphdr));
+ tmphdr.magic = savehdr.magic;
+ tmphdr.forw = savehdr.forw;
+ tmphdr.back = savehdr.back;
+ tmphdr.firstused = state->args->geo->blksize;
+
+ /* write the header to the temp buffer to initialise it */
+ xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr);
+
+ if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+ drop_blk->bp, &drophdr)) {
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ tmp_leaf, &tmphdr, 0,
+ drophdr.count);
+ xfs_attr3_leaf_moveents(state->args,
+ save_leaf, &savehdr, 0,
+ tmp_leaf, &tmphdr, tmphdr.count,
+ savehdr.count);
+ } else {
+ xfs_attr3_leaf_moveents(state->args,
+ save_leaf, &savehdr, 0,
+ tmp_leaf, &tmphdr, 0,
+ savehdr.count);
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ tmp_leaf, &tmphdr, tmphdr.count,
+ drophdr.count);
+ }
+ memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
+ savehdr = tmphdr; /* struct copy */
+ kmem_free(tmp_leaf);
+ }
+
+ xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
+ xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
+ state->args->geo->blksize - 1);
+
+ /*
+ * Copy out last hashval in each block for B-tree code.
+ */
+ entry = xfs_attr3_leaf_entryp(save_leaf);
+ save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ * This is the internal routine, it uses the caller's buffer.
+ *
+ * Note that duplicate keys are allowed, but we only check within the
+ * current leaf node. The Btree code must check in adjacent leaf nodes.
+ *
+ * Return in args->index the index into the entry[] array of either
+ * the found entry, or where the entry should have been (insert before
+ * that entry).
+ *
+ * Don't change the args->value unless we find the attribute.
+ */
+int
+xfs_attr3_leaf_lookup_int(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ xfs_dahash_t hashval;
+ int probe;
+ int span;
+
+ trace_xfs_attr_leaf_lookup(args);
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if (ichdr.count >= args->geo->blksize / 8) {
+ xfs_buf_mark_corrupt(bp);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Binary search. (note: small blocks will skip this loop)
+ */
+ hashval = args->hashval;
+ probe = span = ichdr.count / 2;
+ for (entry = &entries[probe]; span > 4; entry = &entries[probe]) {
+ span /= 2;
+ if (be32_to_cpu(entry->hashval) < hashval)
+ probe += span;
+ else if (be32_to_cpu(entry->hashval) > hashval)
+ probe -= span;
+ else
+ break;
+ }
+ if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) {
+ xfs_buf_mark_corrupt(bp);
+ return -EFSCORRUPTED;
+ }
+ if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) {
+ xfs_buf_mark_corrupt(bp);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Since we may have duplicate hashvals, find the first matching
+ * hashval in the leaf.
+ */
+ while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) {
+ entry--;
+ probe--;
+ }
+ while (probe < ichdr.count &&
+ be32_to_cpu(entry->hashval) < hashval) {
+ entry++;
+ probe++;
+ }
+ if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
+ args->index = probe;
+ return -ENOATTR;
+ }
+
+ /*
+ * Duplicate keys may be present, so search all of them for a match.
+ */
+ for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval);
+ entry++, probe++) {
+/*
+ * GROT: Add code to remove incomplete entries.
+ */
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf, probe);
+ if (!xfs_attr_match(args, name_loc->namelen,
+ name_loc->nameval, entry->flags))
+ continue;
+ args->index = probe;
+ return -EEXIST;
+ } else {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
+ if (!xfs_attr_match(args, name_rmt->namelen,
+ name_rmt->name, entry->flags))
+ continue;
+ args->index = probe;
+ args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(
+ args->dp->i_mount,
+ args->rmtvaluelen);
+ return -EEXIST;
+ }
+ }
+ args->index = probe;
+ return -ENOATTR;
+}
+
+/*
+ * Get the value associated with an attribute name from a leaf attribute
+ * list structure.
+ *
+ * If args->valuelen is zero, only the length needs to be returned. Unlike a
+ * lookup, we only return an error if the attribute does not exist or we can't
+ * retrieve the value.
+ */
+int
+xfs_attr3_leaf_getvalue(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+ ASSERT(ichdr.count < args->geo->blksize / 8);
+ ASSERT(args->index < ichdr.count);
+
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+ ASSERT(name_loc->namelen == args->namelen);
+ ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
+ return xfs_attr_copy_value(args,
+ &name_loc->nameval[args->namelen],
+ be16_to_cpu(name_loc->valuelen));
+ }
+
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+ ASSERT(name_rmt->namelen == args->namelen);
+ ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
+ args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+ args->rmtvaluelen);
+ return xfs_attr_copy_value(args, NULL, args->rmtvaluelen);
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Move the indicated entries from one leaf to another.
+ * NOTE: this routine modifies both source and destination leaves.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_attr3_leaf_moveents(
+ struct xfs_da_args *args,
+ struct xfs_attr_leafblock *leaf_s,
+ struct xfs_attr3_icleaf_hdr *ichdr_s,
+ int start_s,
+ struct xfs_attr_leafblock *leaf_d,
+ struct xfs_attr3_icleaf_hdr *ichdr_d,
+ int start_d,
+ int count)
+{
+ struct xfs_attr_leaf_entry *entry_s;
+ struct xfs_attr_leaf_entry *entry_d;
+ int desti;
+ int tmp;
+ int i;
+
+ /*
+ * Check for nothing to do.
+ */
+ if (count == 0)
+ return;
+
+ /*
+ * Set up environment.
+ */
+ ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
+ ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
+ ASSERT(ichdr_s->magic == ichdr_d->magic);
+ ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8);
+ ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
+ + xfs_attr3_leaf_hdr_size(leaf_s));
+ ASSERT(ichdr_d->count < args->geo->blksize / 8);
+ ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
+ + xfs_attr3_leaf_hdr_size(leaf_d));
+
+ ASSERT(start_s < ichdr_s->count);
+ ASSERT(start_d <= ichdr_d->count);
+ ASSERT(count <= ichdr_s->count);
+
+
+ /*
+ * Move the entries in the destination leaf up to make a hole?
+ */
+ if (start_d < ichdr_d->count) {
+ tmp = ichdr_d->count - start_d;
+ tmp *= sizeof(xfs_attr_leaf_entry_t);
+ entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+ entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count];
+ memmove(entry_d, entry_s, tmp);
+ }
+
+ /*
+ * Copy all entries in the same (sorted) order,
+ * but allocate attribute info packed and in sequence.
+ */
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+ entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+ desti = start_d;
+ for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
+ ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused);
+ tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
+#ifdef GROT
+ /*
+ * Code to drop INCOMPLETE entries. Difficult to use as we
+ * may also need to change the insertion index. Code turned
+ * off for 6.2, should be revisited later.
+ */
+ if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
+ memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+ ichdr_s->usedbytes -= tmp;
+ ichdr_s->count -= 1;
+ entry_d--; /* to compensate for ++ in loop hdr */
+ desti--;
+ if ((start_s + i) < offset)
+ result++; /* insertion index adjustment */
+ } else {
+#endif /* GROT */
+ ichdr_d->firstused -= tmp;
+ /* both on-disk, don't endian flip twice */
+ entry_d->hashval = entry_s->hashval;
+ entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
+ entry_d->flags = entry_s->flags;
+ ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
+ <= args->geo->blksize);
+ memmove(xfs_attr3_leaf_name(leaf_d, desti),
+ xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
+ ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
+ <= args->geo->blksize);
+ memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+ ichdr_s->usedbytes -= tmp;
+ ichdr_d->usedbytes += tmp;
+ ichdr_s->count -= 1;
+ ichdr_d->count += 1;
+ tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf_d);
+ ASSERT(ichdr_d->firstused >= tmp);
+#ifdef GROT
+ }
+#endif /* GROT */
+ }
+
+ /*
+ * Zero out the entries we just copied.
+ */
+ if (start_s == ichdr_s->count) {
+ tmp = count * sizeof(xfs_attr_leaf_entry_t);
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+ ASSERT(((char *)entry_s + tmp) <=
+ ((char *)leaf_s + args->geo->blksize));
+ memset(entry_s, 0, tmp);
+ } else {
+ /*
+ * Move the remaining entries down to fill the hole,
+ * then zero the entries at the top.
+ */
+ tmp = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t);
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count];
+ entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+ memmove(entry_d, entry_s, tmp);
+
+ tmp = count * sizeof(xfs_attr_leaf_entry_t);
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
+ ASSERT(((char *)entry_s + tmp) <=
+ ((char *)leaf_s + args->geo->blksize));
+ memset(entry_s, 0, tmp);
+ }
+
+ /*
+ * Fill in the freemap information
+ */
+ ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d);
+ ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t);
+ ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
+ ichdr_d->freemap[1].base = 0;
+ ichdr_d->freemap[2].base = 0;
+ ichdr_d->freemap[1].size = 0;
+ ichdr_d->freemap[2].size = 0;
+ ichdr_s->holes = 1; /* leaf may not be compact */
+}
+
+/*
+ * Pick up the last hashvalue from a leaf block.
+ */
+xfs_dahash_t
+xfs_attr_leaf_lasthash(
+ struct xfs_buf *bp,
+ int *count)
+{
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_mount *mp = bp->b_mount;
+
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr);
+ entries = xfs_attr3_leaf_entryp(bp->b_addr);
+ if (count)
+ *count = ichdr.count;
+ if (!ichdr.count)
+ return 0;
+ return be32_to_cpu(entries[ichdr.count - 1].hashval);
+}
+
+/*
+ * Calculate the number of bytes used to store the indicated attribute
+ * (whether local or remote; only count the bytes used in this block).
+ */
+STATIC int
+xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
+{
+ struct xfs_attr_leaf_entry *entries;
+ xfs_attr_leaf_name_local_t *name_loc;
+ xfs_attr_leaf_name_remote_t *name_rmt;
+ int size;
+
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if (entries[index].flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf, index);
+ size = xfs_attr_leaf_entsize_local(name_loc->namelen,
+ be16_to_cpu(name_loc->valuelen));
+ } else {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, index);
+ size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
+ }
+ return size;
+}
+
+/*
+ * Calculate the number of bytes that would be required to store the new
+ * attribute (whether local or remote; only count the bytes used in this block).
+ * This routine decides as a side effect whether the attribute will be
+ * a "local" or a "remote" attribute.
+ */
+int
+xfs_attr_leaf_newentsize(
+ struct xfs_da_args *args,
+ int *local)
+{
+ int size;
+
+ size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen);
+ if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) {
+ if (local)
+ *local = 1;
+ return size;
+ }
+ if (local)
+ *local = 0;
+ return xfs_attr_leaf_entsize_remote(args->namelen);
+}
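+
+/*
+ * For example (illustrative, assuming 4096-byte attr blocks): a short name
+ * with a 100-byte value comes in well under the local-entry maximum and is
+ * stored entirely inside the leaf block, while the same name with a 60k
+ * value cannot fit in a single block, so *local is cleared and only the
+ * name plus the remote-value header consume leaf space; the value itself
+ * is written to dedicated remote blocks later.
+ */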
+
+
+/*========================================================================
+ * Manage the INCOMPLETE flag in a leaf entry
+ *========================================================================*/
+
+/*
+ * Clear the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr3_leaf_clearflag(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_buf *bp;
+ int error;
+#ifdef DEBUG
+ struct xfs_attr3_icleaf_hdr ichdr;
+ xfs_attr_leaf_name_local_t *name_loc;
+ int namelen;
+ char *name;
+#endif /* DEBUG */
+
+ trace_xfs_attr_leaf_clearflag(args);
+ /*
+ * Set up the operation.
+ */
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
+ if (error)
+ return error;
+
+ leaf = bp->b_addr;
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+ ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
+
+#ifdef DEBUG
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+ ASSERT(args->index < ichdr.count);
+ ASSERT(args->index >= 0);
+
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+ namelen = name_loc->namelen;
+ name = (char *)name_loc->nameval;
+ } else {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+ namelen = name_rmt->namelen;
+ name = (char *)name_rmt->name;
+ }
+ ASSERT(be32_to_cpu(entry->hashval) == args->hashval);
+ ASSERT(namelen == args->namelen);
+ ASSERT(memcmp(name, args->name, namelen) == 0);
+#endif /* DEBUG */
+
+ entry->flags &= ~XFS_ATTR_INCOMPLETE;
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+
+ if (args->rmtblkno) {
+ ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+ name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+ name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+ }
+
+ return 0;
+}
+
+/*
+ * Set the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr3_leaf_setflag(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_buf *bp;
+ int error;
+#ifdef DEBUG
+ struct xfs_attr3_icleaf_hdr ichdr;
+#endif
+
+ trace_xfs_attr_leaf_setflag(args);
+
+ /*
+ * Set up the operation.
+ */
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
+ if (error)
+ return error;
+
+ leaf = bp->b_addr;
+#ifdef DEBUG
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+ ASSERT(args->index < ichdr.count);
+ ASSERT(args->index >= 0);
+#endif
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+
+ ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
+ entry->flags |= XFS_ATTR_INCOMPLETE;
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+ if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+ name_rmt->valueblk = 0;
+ name_rmt->valuelen = 0;
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+ }
+
+ return 0;
+}
+
+/*
+ * In a single transaction, clear the INCOMPLETE flag on the leaf entry
+ * given by args->blkno/index and set the INCOMPLETE flag on the leaf
+ * entry given by args->blkno2/index2.
+ *
+ * Note that they could be in different blocks, or in the same block.
+ */
+int
+xfs_attr3_leaf_flipflags(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf1;
+ struct xfs_attr_leafblock *leaf2;
+ struct xfs_attr_leaf_entry *entry1;
+ struct xfs_attr_leaf_entry *entry2;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_buf *bp1;
+ struct xfs_buf *bp2;
+ int error;
+#ifdef DEBUG
+ struct xfs_attr3_icleaf_hdr ichdr1;
+ struct xfs_attr3_icleaf_hdr ichdr2;
+ xfs_attr_leaf_name_local_t *name_loc;
+ int namelen1, namelen2;
+ char *name1, *name2;
+#endif /* DEBUG */
+
+ trace_xfs_attr_leaf_flipflags(args);
+
+ /*
+ * Read the block containing the "old" attr
+ */
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp1);
+ if (error)
+ return error;
+
+ /*
+ * Read the block containing the "new" attr, if it is different
+ */
+ if (args->blkno2 != args->blkno) {
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
+ &bp2);
+ if (error)
+ return error;
+ } else {
+ bp2 = bp1;
+ }
+
+ leaf1 = bp1->b_addr;
+ entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index];
+
+ leaf2 = bp2->b_addr;
+ entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
+
+#ifdef DEBUG
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1);
+ ASSERT(args->index < ichdr1.count);
+ ASSERT(args->index >= 0);
+
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, leaf2);
+ ASSERT(args->index2 < ichdr2.count);
+ ASSERT(args->index2 >= 0);
+
+ if (entry1->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf1, args->index);
+ namelen1 = name_loc->namelen;
+ name1 = (char *)name_loc->nameval;
+ } else {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
+ namelen1 = name_rmt->namelen;
+ name1 = (char *)name_rmt->name;
+ }
+ if (entry2->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2);
+ namelen2 = name_loc->namelen;
+ name2 = (char *)name_loc->nameval;
+ } else {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
+ namelen2 = name_rmt->namelen;
+ name2 = (char *)name_rmt->name;
+ }
+ ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval));
+ ASSERT(namelen1 == namelen2);
+ ASSERT(memcmp(name1, name2, namelen1) == 0);
+#endif /* DEBUG */
+
+ ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
+ ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
+
+ entry1->flags &= ~XFS_ATTR_INCOMPLETE;
+ xfs_trans_log_buf(args->trans, bp1,
+ XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
+ if (args->rmtblkno) {
+ ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
+ name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+ name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
+ xfs_trans_log_buf(args->trans, bp1,
+ XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
+ }
+
+ entry2->flags |= XFS_ATTR_INCOMPLETE;
+ xfs_trans_log_buf(args->trans, bp2,
+ XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
+ if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
+ name_rmt->valueblk = 0;
+ name_rmt->valuelen = 0;
+ xfs_trans_log_buf(args->trans, bp2,
+ XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
new file mode 100644
index 000000000..368f4d9fa
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ATTR_LEAF_H__
+#define __XFS_ATTR_LEAF_H__
+
+struct attrlist;
+struct xfs_attr_list_context;
+struct xfs_da_args;
+struct xfs_da_state;
+struct xfs_da_state_blk;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Incore version of the attribute leaf header.
+ */
+struct xfs_attr3_icleaf_hdr {
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t usedbytes;
+ /*
+ * Firstused is 32-bit here instead of 16-bit like the on-disk variant
+ * to support a maximum fsb size of 64k without overflow issues throughout
+ * the attr code. Instead, the overflow condition is handled on
+ * conversion to/from disk.
+ */
+ uint32_t firstused;
+ __u8 holes;
+ struct {
+ uint16_t base;
+ uint16_t size;
+ } freemap[XFS_ATTR_LEAF_MAPSIZE];
+};
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when attribute fork size < XFS_LITINO(mp).
+ */
+void xfs_attr_shortform_create(struct xfs_da_args *args);
+void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
+int xfs_attr_shortform_lookup(struct xfs_da_args *args);
+int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
+int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
+int xfs_attr_sf_removename(struct xfs_da_args *args);
+int xfs_attr_sf_findname(struct xfs_da_args *args,
+ struct xfs_attr_sf_entry **sfep,
+ unsigned int *basep);
+int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
+int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
+xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip);
+void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
+
+/*
+ * Internal routines when attribute fork size == XFS_LBSIZE(mp).
+ */
+int xfs_attr3_leaf_to_node(struct xfs_da_args *args);
+int xfs_attr3_leaf_to_shortform(struct xfs_buf *bp,
+ struct xfs_da_args *args, int forkoff);
+int xfs_attr3_leaf_clearflag(struct xfs_da_args *args);
+int xfs_attr3_leaf_setflag(struct xfs_da_args *args);
+int xfs_attr3_leaf_flipflags(struct xfs_da_args *args);
+
+/*
+ * Routines used for growing the Btree.
+ */
+int xfs_attr3_leaf_split(struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk);
+int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf,
+ struct xfs_da_args *args);
+int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args);
+int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer,
+ struct xfs_da_args *args);
+int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
+ struct xfs_da_args *args);
+int xfs_attr3_leaf_list_int(struct xfs_buf *bp,
+ struct xfs_attr_list_context *context);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
+void xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk);
+/*
+ * Utility routines.
+ */
+xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count);
+int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
+ struct xfs_buf *leaf2_bp);
+int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
+int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, struct xfs_buf **bpp);
+void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo,
+ struct xfs_attr3_icleaf_hdr *to,
+ struct xfs_attr_leafblock *from);
+void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo,
+ struct xfs_attr_leafblock *to,
+ struct xfs_attr3_icleaf_hdr *from);
+
+#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
new file mode 100644
index 000000000..d440393b4
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -0,0 +1,712 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_remote.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+
+#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
+
+/*
+ * Remote Attribute Values
+ * =======================
+ *
+ * Remote extended attribute values are conceptually simple -- they're written
+ * to data blocks mapped by an inode's attribute fork, and they have an upper
+ * size limit of 64k. Setting a value does not involve the XFS log.
+ *
+ * However, on a v5 filesystem, maximally sized remote attr values require one
+ * block more than 64k worth of space to hold both the remote attribute value
+ * and the remote attribute value header (64 bytes). On a 4k block filesystem
+ * this results in a 68k buffer; on a 64k block filesystem, this would be a
+ * 128k buffer. Note that the log format can only handle a dirty buffer of
+ * XFS_MAX_BLOCKSIZE length (64k). Therefore, we /must/ ensure that remote
+ * attribute value buffers never touch the logging system and hence never
+ * have a log item.
+ */
+
+/*
+ * Each contiguous block has a header, so it is not just a simple attribute
+ * length to FSB conversion.
+ */
+int
+xfs_attr3_rmt_blocks(
+ struct xfs_mount *mp,
+ int attrlen)
+{
+ if (xfs_has_crc(mp)) {
+ int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+ return (attrlen + buflen - 1) / buflen;
+ }
+ return XFS_B_TO_FSB(mp, attrlen);
+}
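+
+/*
+ * Example (illustrative, using the 64 byte header size quoted above): on a
+ * v5 filesystem with 4k blocks each remote block offers
+ * XFS_ATTR3_RMT_BUF_SPACE() = 4096 - 64 = 4032 usable bytes, so a maximally
+ * sized 64k value needs howmany(65536, 4032) = 17 blocks -- the 68k buffer
+ * described above. Without CRCs there are no headers and this collapses to
+ * a plain XFS_B_TO_FSB() conversion.
+ */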
+
+/*
+ * Checking of the remote attribute header is split into two parts. The verifier
+ * does CRC, location and bounds checking; the unpacking function checks the
+ * attribute parameters and owner.
+ */
+static xfs_failaddr_t
+xfs_attr3_rmt_hdr_ok(
+ void *ptr,
+ xfs_ino_t ino,
+ uint32_t offset,
+ uint32_t size,
+ xfs_daddr_t bno)
+{
+ struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+ if (bno != be64_to_cpu(rmt->rm_blkno))
+ return __this_address;
+ if (offset != be32_to_cpu(rmt->rm_offset))
+ return __this_address;
+ if (size != be32_to_cpu(rmt->rm_bytes))
+ return __this_address;
+ if (ino != be64_to_cpu(rmt->rm_owner))
+ return __this_address;
+
+ /* ok */
+ return NULL;
+}
+
+static xfs_failaddr_t
+xfs_attr3_rmt_verify(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ void *ptr,
+ int fsbsize,
+ xfs_daddr_t bno)
+{
+ struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+ if (!xfs_verify_magic(bp, rmt->rm_magic))
+ return __this_address;
+ if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(rmt->rm_blkno) != bno)
+ return __this_address;
+ if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
+ return __this_address;
+ if (be32_to_cpu(rmt->rm_offset) +
+ be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX)
+ return __this_address;
+ if (rmt->rm_owner == 0)
+ return __this_address;
+
+ return NULL;
+}
+
+static int
+__xfs_attr3_rmt_read_verify(
+ struct xfs_buf *bp,
+ bool check_crc,
+ xfs_failaddr_t *failaddr)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ char *ptr;
+ int len;
+ xfs_daddr_t bno;
+ int blksize = mp->m_attr_geo->blksize;
+
+ /* no verification of non-crc buffers */
+ if (!xfs_has_crc(mp))
+ return 0;
+
+ ptr = bp->b_addr;
+ bno = xfs_buf_daddr(bp);
+ len = BBTOB(bp->b_length);
+ ASSERT(len >= blksize);
+
+ while (len > 0) {
+ if (check_crc &&
+ !xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
+ *failaddr = __this_address;
+ return -EFSBADCRC;
+ }
+ *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
+ if (*failaddr)
+ return -EFSCORRUPTED;
+ len -= blksize;
+ ptr += blksize;
+ bno += BTOBB(blksize);
+ }
+
+ if (len != 0) {
+ *failaddr = __this_address;
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+static void
+xfs_attr3_rmt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+ int error;
+
+ error = __xfs_attr3_rmt_read_verify(bp, true, &fa);
+ if (error)
+ xfs_verifier_error(bp, error, fa);
+}
+
+static xfs_failaddr_t
+xfs_attr3_rmt_verify_struct(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+ int error;
+
+ error = __xfs_attr3_rmt_read_verify(bp, false, &fa);
+ return error ? fa : NULL;
+}
+
+static void
+xfs_attr3_rmt_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+ int blksize = mp->m_attr_geo->blksize;
+ char *ptr;
+ int len;
+ xfs_daddr_t bno;
+
+ /* no verification of non-crc buffers */
+ if (!xfs_has_crc(mp))
+ return;
+
+ ptr = bp->b_addr;
+ bno = xfs_buf_daddr(bp);
+ len = BBTOB(bp->b_length);
+ ASSERT(len >= blksize);
+
+ while (len > 0) {
+ struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
+
+ fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ /*
+ * Ensure we aren't writing bogus LSNs to disk. See
+ * xfs_attr3_rmt_hdr_set() for the explanation.
+ */
+ if (rmt->rm_lsn != cpu_to_be64(NULLCOMMITLSN)) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
+ return;
+ }
+ xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF);
+
+ len -= blksize;
+ ptr += blksize;
+ bno += BTOBB(blksize);
+ }
+
+ if (len != 0)
+ xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
+}
+
+const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+ .name = "xfs_attr3_rmt",
+ .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) },
+ .verify_read = xfs_attr3_rmt_read_verify,
+ .verify_write = xfs_attr3_rmt_write_verify,
+ .verify_struct = xfs_attr3_rmt_verify_struct,
+};
+
+STATIC int
+xfs_attr3_rmt_hdr_set(
+ struct xfs_mount *mp,
+ void *ptr,
+ xfs_ino_t ino,
+ uint32_t offset,
+ uint32_t size,
+ xfs_daddr_t bno)
+{
+ struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+ if (!xfs_has_crc(mp))
+ return 0;
+
+ rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
+ rmt->rm_offset = cpu_to_be32(offset);
+ rmt->rm_bytes = cpu_to_be32(size);
+ uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid);
+ rmt->rm_owner = cpu_to_be64(ino);
+ rmt->rm_blkno = cpu_to_be64(bno);
+
+ /*
+ * Remote attribute blocks are written synchronously, so we don't
+ * have an LSN that we can stamp in them that makes any sense to log
+ * recovery. To ensure that log recovery handles overwrites of these
+ * blocks sanely (i.e. once they've been freed and reallocated as some
+ * other type of metadata) we need to ensure that the LSN has a value
+ * that tells log recovery to ignore the LSN and overwrite the buffer
+ * with whatever is in the log. To do this, we use the magic
+ * NULLCOMMITLSN to indicate that the LSN is invalid.
+ */
+ rmt->rm_lsn = cpu_to_be64(NULLCOMMITLSN);
+
+ return sizeof(struct xfs_attr3_rmt_hdr);
+}
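+
+/*
+ * Note that xfs_attr3_rmt_write_verify() above refuses to write out any
+ * remote block whose rm_lsn is not NULLCOMMITLSN, so a header stamped here
+ * can never reach disk carrying a real LSN.
+ */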
+
+/*
+ * Helper functions to copy attribute data in and out of the on-disk extents
+ */
+STATIC int
+xfs_attr_rmtval_copyout(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_ino_t ino,
+ int *offset,
+ int *valuelen,
+ uint8_t **dst)
+{
+ char *src = bp->b_addr;
+ xfs_daddr_t bno = xfs_buf_daddr(bp);
+ int len = BBTOB(bp->b_length);
+ int blksize = mp->m_attr_geo->blksize;
+
+ ASSERT(len >= blksize);
+
+ while (len > 0 && *valuelen > 0) {
+ int hdr_size = 0;
+ int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+
+ byte_cnt = min(*valuelen, byte_cnt);
+
+ if (xfs_has_crc(mp)) {
+ if (xfs_attr3_rmt_hdr_ok(src, ino, *offset,
+ byte_cnt, bno)) {
+ xfs_alert(mp,
+"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
+ bno, *offset, byte_cnt, ino);
+ return -EFSCORRUPTED;
+ }
+ hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
+ }
+
+ memcpy(*dst, src + hdr_size, byte_cnt);
+
+ /* roll buffer forwards */
+ len -= blksize;
+ src += blksize;
+ bno += BTOBB(blksize);
+
+ /* roll attribute data forwards */
+ *valuelen -= byte_cnt;
+ *dst += byte_cnt;
+ *offset += byte_cnt;
+ }
+ return 0;
+}
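+
+/*
+ * For example (illustrative figures, again assuming a 4k block v5 filesystem
+ * with a 64 byte per-block header): copying out a 6000 byte value consumes
+ * min(*valuelen, 4032) payload bytes per block -- 4032 from the first block
+ * and the remaining 1968 from the second -- while *offset, *valuelen and
+ * *dst are rolled forward by the same amounts on each pass.
+ */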
+
+STATIC void
+xfs_attr_rmtval_copyin(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_ino_t ino,
+ int *offset,
+ int *valuelen,
+ uint8_t **src)
+{
+ char *dst = bp->b_addr;
+ xfs_daddr_t bno = xfs_buf_daddr(bp);
+ int len = BBTOB(bp->b_length);
+ int blksize = mp->m_attr_geo->blksize;
+
+ ASSERT(len >= blksize);
+
+ while (len > 0 && *valuelen > 0) {
+ int hdr_size;
+ int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+
+ byte_cnt = min(*valuelen, byte_cnt);
+ hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
+ byte_cnt, bno);
+
+ memcpy(dst + hdr_size, *src, byte_cnt);
+
+ /*
+ * If this is the last block, zero the remainder of it.
+ * Check that we are actually the last block, too.
+ */
+ if (byte_cnt + hdr_size < blksize) {
+ ASSERT(*valuelen - byte_cnt == 0);
+ ASSERT(len == blksize);
+ memset(dst + hdr_size + byte_cnt, 0,
+ blksize - hdr_size - byte_cnt);
+ }
+
+ /* roll buffer forwards */
+ len -= blksize;
+ dst += blksize;
+ bno += BTOBB(blksize);
+
+ /* roll attribute data forwards */
+ *valuelen -= byte_cnt;
+ *src += byte_cnt;
+ *offset += byte_cnt;
+ }
+}
+
+/*
+ * Read the value associated with an attribute from the out-of-line buffer
+ * that we stored it in.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
+ */
+int
+xfs_attr_rmtval_get(
+ struct xfs_da_args *args)
+{
+ struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE];
+ struct xfs_mount *mp = args->dp->i_mount;
+ struct xfs_buf *bp;
+ xfs_dablk_t lblkno = args->rmtblkno;
+ uint8_t *dst = args->value;
+ int valuelen;
+ int nmap;
+ int error;
+ int blkcnt = args->rmtblkcnt;
+ int i;
+ int offset = 0;
+
+ trace_xfs_attr_rmtval_get(args);
+
+ ASSERT(args->valuelen != 0);
+ ASSERT(args->rmtvaluelen == args->valuelen);
+
+ valuelen = args->rmtvaluelen;
+ while (valuelen > 0) {
+ nmap = ATTR_RMTVALUE_MAPSIZE;
+ error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+ blkcnt, map, &nmap,
+ XFS_BMAPI_ATTRFORK);
+ if (error)
+ return error;
+ ASSERT(nmap >= 1);
+
+ for (i = 0; (i < nmap) && (valuelen > 0); i++) {
+ xfs_daddr_t dblkno;
+ int dblkcnt;
+
+ ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
+ (map[i].br_startblock != HOLESTARTBLOCK));
+ dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+ dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+ error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt,
+ 0, &bp, &xfs_attr3_rmt_buf_ops);
+ if (error)
+ return error;
+
+ error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+ &offset, &valuelen,
+ &dst);
+ xfs_buf_relse(bp);
+ if (error)
+ return error;
+
+ /* roll attribute extent map forwards */
+ lblkno += map[i].br_blockcount;
+ blkcnt -= map[i].br_blockcount;
+ }
+ }
+ ASSERT(valuelen == 0);
+ return 0;
+}
+
+/*
+ * Find a "hole" in the attribute address space large enough for us to drop the
+ * new attribute's value into.
+ */
+int
+xfs_attr_rmt_find_hole(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int error;
+ int blkcnt;
+ xfs_fileoff_t lfileoff = 0;
+
+ /*
+ * Because CRC enabled attributes have headers, we can't just do a
+ * straight byte to FSB conversion and have to take the header space
+ * into account.
+ */
+ blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
+ error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
+ XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ args->rmtblkno = (xfs_dablk_t)lfileoff;
+ args->rmtblkcnt = blkcnt;
+
+ return 0;
+}
+
+int
+xfs_attr_rmtval_set_value(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_bmbt_irec map;
+ xfs_dablk_t lblkno;
+ uint8_t *src = args->value;
+ int blkcnt;
+ int valuelen;
+ int nmap;
+ int error;
+ int offset = 0;
+
+ /*
+ * Roll through the "value", copying the attribute value to the
+ * already-allocated blocks. Blocks are written synchronously
+ * so that we can know they are all on disk before we turn off
+ * the INCOMPLETE flag.
+ */
+ lblkno = args->rmtblkno;
+ blkcnt = args->rmtblkcnt;
+ valuelen = args->rmtvaluelen;
+ while (valuelen > 0) {
+ struct xfs_buf *bp;
+ xfs_daddr_t dblkno;
+ int dblkcnt;
+
+ ASSERT(blkcnt > 0);
+
+ nmap = 1;
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
+ blkcnt, &map, &nmap,
+ XFS_BMAPI_ATTRFORK);
+ if (error)
+ return error;
+ ASSERT(nmap == 1);
+ ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+ (map.br_startblock != HOLESTARTBLOCK));
+
+ dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+ dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+ error = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, &bp);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_attr3_rmt_buf_ops;
+
+ xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
+ &valuelen, &src);
+
+ error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
+ xfs_buf_relse(bp);
+ if (error)
+ return error;
+
+
+ /* roll attribute extent map forwards */
+ lblkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
+ }
+ ASSERT(valuelen == 0);
+ return 0;
+}
+
+/* Mark stale any incore buffers for the remote value. */
+int
+xfs_attr_rmtval_stale(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *map,
+ xfs_buf_flags_t incore_flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) ||
+ XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK))
+ return -EFSCORRUPTED;
+
+ error = xfs_buf_incore(mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, map->br_startblock),
+ XFS_FSB_TO_BB(mp, map->br_blockcount),
+ incore_flags, &bp);
+ if (error) {
+ if (error == -ENOENT)
+ return 0;
+ return error;
+ }
+
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
+ return 0;
+}
+
+/*
+ * Find a hole for the attr and store it in the delayed attr context. This
+ * initializes the context to roll through allocating an attr extent for a
+ * delayed attr operation
+ */
+int
+xfs_attr_rmtval_find_space(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_bmbt_irec *map = &attr->xattri_map;
+ int error;
+
+ attr->xattri_lblkno = 0;
+ attr->xattri_blkcnt = 0;
+ args->rmtblkcnt = 0;
+ args->rmtblkno = 0;
+ memset(map, 0, sizeof(struct xfs_bmbt_irec));
+
+ error = xfs_attr_rmt_find_hole(args);
+ if (error)
+ return error;
+
+ attr->xattri_blkcnt = args->rmtblkcnt;
+ attr->xattri_lblkno = args->rmtblkno;
+
+ return 0;
+}
+
+/*
+ * Write one block of the value associated with an attribute into the
+ * out-of-line buffer that we have defined for it. This is similar to a subset
+ * of xfs_attr_rmtval_set, but records the current block to the delayed attr
+ * context, and leaves transaction handling to the caller.
+ */
+int
+xfs_attr_rmtval_set_blk(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_bmbt_irec *map = &attr->xattri_map;
+ int nmap;
+ int error;
+
+ nmap = 1;
+ error = xfs_bmapi_write(args->trans, dp,
+ (xfs_fileoff_t)attr->xattri_lblkno,
+ attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total,
+ map, &nmap);
+ if (error)
+ return error;
+
+ ASSERT(nmap == 1);
+ ASSERT((map->br_startblock != DELAYSTARTBLOCK) &&
+ (map->br_startblock != HOLESTARTBLOCK));
+
+ /* roll attribute extent map forwards */
+ attr->xattri_lblkno += map->br_blockcount;
+ attr->xattri_blkcnt -= map->br_blockcount;
+
+ return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ */
+int
+xfs_attr_rmtval_invalidate(
+ struct xfs_da_args *args)
+{
+ xfs_dablk_t lblkno;
+ int blkcnt;
+ int error;
+
+ /*
+ * Roll through the "value", invalidating the attribute value's blocks.
+ */
+ lblkno = args->rmtblkno;
+ blkcnt = args->rmtblkcnt;
+ while (blkcnt > 0) {
+ struct xfs_bmbt_irec map;
+ int nmap;
+
+ /*
+ * Try to remember where we decided to put the value.
+ */
+ nmap = 1;
+ error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+ blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1))
+ return -EFSCORRUPTED;
+ error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK);
+ if (error)
+ return error;
+
+ lblkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
+ }
+ return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the out-of-line
+ * buffer that it is stored on. Returns -EAGAIN for the caller to refresh the
+ * transaction and re-call the function. Callers should keep calling this
+ * routine until it returns something other than -EAGAIN.
+ */
+int
+xfs_attr_rmtval_remove(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error, done;
+
+ /*
+ * Unmap value blocks for this attr.
+ */
+ error = xfs_bunmapi(args->trans, args->dp, args->rmtblkno,
+ args->rmtblkcnt, XFS_BMAPI_ATTRFORK, 1, &done);
+ if (error)
+ return error;
+
+ /*
+ * We don't need an explicit state here to pick up where we left off. We
+ * can figure it out using the !done return code. The actual value of
+ * attr->xattri_dela_state may be some value reminiscent of the calling
+ * function, but its value is irrelevant within the context of this
+ * function. Once we are done here, the next state is set as needed by
+ * the parent.
+ */
+ if (!done) {
+ trace_xfs_attr_rmtval_remove_return(attr->xattri_dela_state,
+ args->dp);
+ return -EAGAIN;
+ }
+
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
new file mode 100644
index 000000000..d097ec6c4
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ATTR_REMOTE_H__
+#define __XFS_ATTR_REMOTE_H__
+
+int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
+
+int xfs_attr_rmtval_get(struct xfs_da_args *args);
+int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
+ xfs_buf_flags_t incore_flags);
+int xfs_attr_rmtval_invalidate(struct xfs_da_args *args);
+int xfs_attr_rmtval_remove(struct xfs_attr_intent *attr);
+int xfs_attr_rmt_find_hole(struct xfs_da_args *args);
+int xfs_attr_rmtval_set_value(struct xfs_da_args *args);
+int xfs_attr_rmtval_set_blk(struct xfs_attr_intent *attr);
+int xfs_attr_rmtval_find_space(struct xfs_attr_intent *attr);
+#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
new file mode 100644
index 000000000..37578b369
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ATTR_SF_H__
+#define __XFS_ATTR_SF_H__
+
+/*
+ * Attribute storage when stored inside the inode.
+ *
+ * Small attribute lists are packed as tightly as possible so as
+ * to fit into the literal area of the inode.
+ */
+typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
+
+/*
+ * We generate this then sort it; attr_list() must return things in hash order.
+ */
+typedef struct xfs_attr_sf_sort {
+ uint8_t entno; /* entry number in original list */
+ uint8_t namelen; /* length of name value (no null) */
+ uint8_t valuelen; /* length of value */
+ uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
+ xfs_dahash_t hash; /* this entry's hash value */
+ unsigned char *name; /* name value, pointer into buffer */
+} xfs_attr_sf_sort_t;
+
+#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \
+ ((1 << (NBBY*(int)sizeof(uint8_t))) - 1)
+
+/* space name/value uses */
+static inline int xfs_attr_sf_entsize_byname(uint8_t nlen, uint8_t vlen)
+{
+ return sizeof(struct xfs_attr_sf_entry) + nlen + vlen;
+}
+
+/* space an entry uses */
+static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep)
+{
+ return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen);
+}
+
+/* next entry in struct */
+static inline struct xfs_attr_sf_entry *
+xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep)
+{
+ return (void *)sfep + xfs_attr_sf_entsize(sfep);
+}
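+
+/*
+ * A typical walk over the packed entries (sketch only; it assumes the entry
+ * array immediately follows the xfs_attr_sf_hdr in the shortform fork and
+ * that hdr.count entries are present):
+ *
+ *	sfep = first entry following the xfs_attr_sf_hdr;
+ *	for (i = 0; i < hdr.count; i++, sfep = xfs_attr_sf_nextentry(sfep))
+ *		... use sfep->namelen and sfep->valuelen ...
+ */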
+
+#endif /* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c
new file mode 100644
index 000000000..40ce5f309
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bit.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_log_format.h"
+#include "xfs_bit.h"
+
+/*
+ * XFS bit manipulation routines, used in non-realtime code.
+ */
+
+/*
+ * Return whether bitmap is empty.
+ * Size is the number of words in the bitmap, which is padded to a word boundary.
+ * Returns 1 for empty, 0 for non-empty.
+ */
+int
+xfs_bitmap_empty(uint *map, uint size)
+{
+ uint i;
+
+ for (i = 0; i < size; i++) {
+ if (map[i] != 0)
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Count the number of contiguous bits set in the bitmap starting with bit
+ * start_bit. Size is the size of the bitmap in words.
+ */
+int
+xfs_contig_bits(uint *map, uint size, uint start_bit)
+{
+ uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
+ uint result = 0;
+ uint tmp;
+
+ size <<= BIT_TO_WORD_SHIFT;
+
+ ASSERT(start_bit < size);
+ size -= start_bit & ~(NBWORD - 1);
+ start_bit &= (NBWORD - 1);
+ if (start_bit) {
+ tmp = *p++;
+ /* set to one the bits below start_bit so they are ignored */
+ tmp |= (~0U >> (NBWORD-start_bit));
+ if (tmp != ~0U)
+ goto found;
+ result += NBWORD;
+ size -= NBWORD;
+ }
+ while (size) {
+ if ((tmp = *p++) != ~0U)
+ goto found;
+ result += NBWORD;
+ size -= NBWORD;
+ }
+ return result - start_bit;
+found:
+ return result + ffz(tmp) - start_bit;
+}
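+
+/*
+ * Example: for a single word map containing 0x7 (bits 0-2 set),
+ * xfs_contig_bits(map, 1, 0) returns 3 and xfs_contig_bits(map, 1, 1)
+ * returns 2.
+ */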
+
+/*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there. It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ *
+ * Size is the number of words, not bytes, in the bitmap.
+ */
+int xfs_next_bit(uint *map, uint size, uint start_bit)
+{
+ uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
+ uint result = start_bit & ~(NBWORD - 1);
+ uint tmp;
+
+ size <<= BIT_TO_WORD_SHIFT;
+
+ if (start_bit >= size)
+ return -1;
+ size -= result;
+ start_bit &= (NBWORD - 1);
+ if (start_bit) {
+ tmp = *p++;
+ /* clear the bits below start_bit so they are ignored */
+ tmp &= (~0U << start_bit);
+ if (tmp != 0U)
+ goto found;
+ result += NBWORD;
+ size -= NBWORD;
+ }
+ while (size) {
+ if ((tmp = *p++) != 0U)
+ goto found;
+ result += NBWORD;
+ size -= NBWORD;
+ }
+ return -1;
+found:
+ return result + ffs(tmp) - 1;
+}
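+
+/*
+ * Example: for a single word map containing 0x9 (bits 0 and 3 set),
+ * xfs_next_bit(map, 1, 0) returns 0, xfs_next_bit(map, 1, 1) returns 3
+ * and xfs_next_bit(map, 1, 4) returns -1.
+ */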
diff --git a/fs/xfs/libxfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h
new file mode 100644
index 000000000..a04f266ae
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bit.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_BIT_H__
+#define __XFS_BIT_H__
+
+/*
+ * XFS bit manipulation routines.
+ */
+
+/*
+ * masks with n high/low bits set, 64-bit values
+ */
+static inline uint64_t xfs_mask64hi(int n)
+{
+ return (uint64_t)-1 << (64 - (n));
+}
+static inline uint32_t xfs_mask32lo(int n)
+{
+ return ((uint32_t)1 << (n)) - 1;
+}
+static inline uint64_t xfs_mask64lo(int n)
+{
+ return ((uint64_t)1 << (n)) - 1;
+}
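+
+/*
+ * For example, xfs_mask32lo(4) == 0xf, xfs_mask64lo(12) == 0xfff and
+ * xfs_mask64hi(16) == 0xffff000000000000ULL.
+ */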
+
+/* Get high bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_highbit32(uint32_t v)
+{
+ return fls(v) - 1;
+}
+
+/* Get high bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_highbit64(uint64_t v)
+{
+ return fls64(v) - 1;
+}
+
+/* Get low bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_lowbit32(uint32_t v)
+{
+ return ffs(v) - 1;
+}
+
+/* Get low bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_lowbit64(uint64_t v)
+{
+ uint32_t w = (uint32_t)v;
+ int n = 0;
+
+ if (w) { /* lower bits */
+ n = ffs(w);
+ } else { /* upper bits */
+ w = (uint32_t)(v >> 32);
+ if (w) {
+ n = ffs(w);
+ if (n)
+ n += 32;
+ }
+ }
+ return n - 1;
+}
+
+/* Return whether bitmap is empty (1 == empty) */
+extern int xfs_bitmap_empty(uint *map, uint size);
+
+/* Count continuous one bits in map starting with start_bit */
+extern int xfs_contig_bits(uint *map, uint size, uint start_bit);
+
+/* Find next set bit in map */
+extern int xfs_next_bit(uint *map, uint size, uint start_bit);
+
+#endif /* __XFS_BIT_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
new file mode 100644
index 000000000..49d0d4ea6
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -0,0 +1,6230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_dir2.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_buf_item.h"
+#include "xfs_trace.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_filestream.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_refcount.h"
+#include "xfs_icache.h"
+#include "xfs_iomap.h"
+
+struct kmem_cache *xfs_bmap_intent_cache;
+
+/*
+ * Miscellaneous helper functions
+ */
+
+/*
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem. Done once, during mount.
+ */
+void
+xfs_bmap_compute_maxlevels(
+ xfs_mount_t *mp, /* file system mount structure */
+ int whichfork) /* data or attr fork */
+{
+ uint64_t maxblocks; /* max blocks at this level */
+ xfs_extnum_t maxleafents; /* max leaf entries possible */
+ int level; /* btree level */
+ int maxrootrecs; /* max records in root block */
+ int minleafrecs; /* min records in leaf block */
+ int minnoderecs; /* min records in node block */
+ int sz; /* root block size */
+
+ /*
+ * The maximum number of extents in a fork, hence the maximum number of
+ * leaf entries, is controlled by the size of the on-disk extent count.
+ *
+ * Note that we can no longer assume that if we are in ATTR1 the fork
+ * offset of all the inodes will be (xfs_default_attroffset(ip) >> 3)
+ * because we could have mounted with ATTR2 and then mounted back with
+ * ATTR1, keeping the i_forkoff values fixed but probably at various
+ * positions. Therefore, for both ATTR1 and ATTR2 we have to assume the
+ * worst case scenario of a minimum size available.
+ */
+ maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
+ whichfork);
+ if (whichfork == XFS_DATA_FORK)
+ sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+ else
+ sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
+
+ maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
+ minleafrecs = mp->m_bmap_dmnr[0];
+ minnoderecs = mp->m_bmap_dmnr[1];
+ maxblocks = howmany_64(maxleafents, minleafrecs);
+ for (level = 1; maxblocks > 1; level++) {
+ if (maxblocks <= maxrootrecs)
+ maxblocks = 1;
+ else
+ maxblocks = howmany_64(maxblocks, minnoderecs);
+ }
+ mp->m_bm_maxlevels[whichfork] = level;
+ ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk());
+}
+
+unsigned int
+xfs_bmap_compute_attr_offset(
+ struct xfs_mount *mp)
+{
+ if (mp->m_sb.sb_inodesize == 256)
+ return XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ return XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+}
+
+STATIC int /* error */
+xfs_bmbt_lookup_eq(
+ struct xfs_btree_cur *cur,
+ struct xfs_bmbt_irec *irec,
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.b = *irec;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+STATIC int /* error */
+xfs_bmbt_lookup_first(
+ struct xfs_btree_cur *cur,
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.b.br_startoff = 0;
+ cur->bc_rec.b.br_startblock = 0;
+ cur->bc_rec.b.br_blockcount = 0;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Check if the inode needs to be converted to btree format.
+ */
+static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+
+ return whichfork != XFS_COW_FORK &&
+ ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Check if the inode should be converted to extent format.
+ */
+static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+
+ return whichfork != XFS_COW_FORK &&
+ ifp->if_format == XFS_DINODE_FMT_BTREE &&
+ ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Update the record referred to by cur to the value given by irec
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_bmbt_update(
+ struct xfs_btree_cur *cur,
+ struct xfs_bmbt_irec *irec)
+{
+ union xfs_btree_rec rec;
+
+ xfs_bmbt_disk_set_all(&rec.bmbt, irec);
+ return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_filblks_t len) /* delayed extent length */
+{
+ int level; /* btree level number */
+ int maxrecs; /* maximum record count at this level */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_filblks_t rval; /* return value */
+
+ mp = ip->i_mount;
+ maxrecs = mp->m_bmap_dmxr[0];
+ for (level = 0, rval = 0;
+ level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
+ level++) {
+ len += maxrecs - 1;
+ do_div(len, maxrecs);
+ rval += len;
+ if (len == 1)
+ return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+ level - 1;
+ if (level == 0)
+ maxrecs = mp->m_bmap_dmxr[1];
+ }
+ return rval;
+}
+
+/*
+ * Calculate the default attribute fork offset for newly created inodes.
+ */
+uint
+xfs_default_attroffset(
+ struct xfs_inode *ip)
+{
+ if (ip->i_df.if_format == XFS_DINODE_FMT_DEV)
+ return roundup(sizeof(xfs_dev_t), 8);
+ return M_IGEO(ip->i_mount)->attr_fork_offset;
+}
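+
+/*
+ * For a device inode this works out to roundup(sizeof(xfs_dev_t), 8),
+ * i.e. 8 bytes assuming the usual 32-bit xfs_dev_t; all other formats use
+ * the attr_fork_offset precomputed in the inode geometry, presumably via
+ * xfs_bmap_compute_attr_offset() above.
+ */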
+
+/*
+ * Helper routine to reset inode i_forkoff field when switching attribute fork
+ * from local to extent format - we reset it where possible to make space
+ * available for inline data fork extents.
+ */
+STATIC void
+xfs_bmap_forkoff_reset(
+ xfs_inode_t *ip,
+ int whichfork)
+{
+ if (whichfork == XFS_ATTR_FORK &&
+ ip->i_df.if_format != XFS_DINODE_FMT_DEV &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE) {
+ uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+
+ if (dfl_forkoff > ip->i_forkoff)
+ ip->i_forkoff = dfl_forkoff;
+ }
+}
+
+#ifdef DEBUG
+STATIC struct xfs_buf *
+xfs_bmap_get_bp(
+ struct xfs_btree_cur *cur,
+ xfs_fsblock_t bno)
+{
+ struct xfs_log_item *lip;
+ int i;
+
+ if (!cur)
+ return NULL;
+
+ for (i = 0; i < cur->bc_maxlevels; i++) {
+ if (!cur->bc_levels[i].bp)
+ break;
+ if (xfs_buf_daddr(cur->bc_levels[i].bp) == bno)
+ return cur->bc_levels[i].bp;
+ }
+
+ /* Chase down all the log items to see if the bp is there */
+ list_for_each_entry(lip, &cur->bc_tp->t_items, li_trans) {
+ struct xfs_buf_log_item *bip = (struct xfs_buf_log_item *)lip;
+
+ if (bip->bli_item.li_type == XFS_LI_BUF &&
+ xfs_buf_daddr(bip->bli_buf) == bno)
+ return bip->bli_buf;
+ }
+
+ return NULL;
+}
+
+STATIC void
+xfs_check_block(
+ struct xfs_btree_block *block,
+ xfs_mount_t *mp,
+ int root,
+ short sz)
+{
+ int i, j, dmxr;
+ __be64 *pp, *thispa; /* pointer to block address */
+ xfs_bmbt_key_t *prevp, *keyp;
+
+ ASSERT(be16_to_cpu(block->bb_level) > 0);
+
+ prevp = NULL;
+ for (i = 1; i <= xfs_btree_get_numrecs(block); i++) {
+ dmxr = mp->m_bmap_dmxr[0];
+ keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
+
+ if (prevp) {
+ ASSERT(be64_to_cpu(prevp->br_startoff) <
+ be64_to_cpu(keyp->br_startoff));
+ }
+ prevp = keyp;
+
+ /*
+ * Compare the block numbers to see if there are dups.
+ */
+ if (root)
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+ else
+ pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
+
+ for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
+ if (root)
+ thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+ else
+ thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
+ if (*thispa == *pp) {
+ xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld",
+ __func__, j, i,
+ (unsigned long long)be64_to_cpu(*thispa));
+ xfs_err(mp, "%s: ptrs are equal in node\n",
+ __func__);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+ }
+ }
+}
+
+/*
+ * Check that the extents for the inode ip are in the right order in all
+ * btree leaves. This becomes prohibitively expensive for large extent count
+ * files, so don't bother with inodes that have more than 10,000 extents in
+ * them. The btree record ordering checks will still be done, so for such large
+ * bmapbt constructs that is going to catch most corruptions.
+ */
+STATIC void
+xfs_bmap_check_leaf_extents(
+ struct xfs_btree_cur *cur, /* btree cursor or null */
+ xfs_inode_t *ip, /* incore inode pointer */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_block *block; /* current btree block */
+ xfs_fsblock_t bno; /* block # of "block" */
+ struct xfs_buf *bp; /* buffer for "block" */
+ int error; /* error return value */
+ xfs_extnum_t i=0, j; /* index into the extents list */
+ int level; /* btree level, for checking */
+ __be64 *pp; /* pointer to block address */
+ xfs_bmbt_rec_t *ep; /* pointer to current extent */
+ xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */
+ xfs_bmbt_rec_t *nextp; /* pointer to next extent */
+ int bp_release = 0;
+
+ if (ifp->if_format != XFS_DINODE_FMT_BTREE)
+ return;
+
+ /* skip large extent count inodes */
+ if (ip->i_df.if_nextents > 10000)
+ return;
+
+ bno = NULLFSBLOCK;
+ block = ifp->if_broot;
+ /*
+ * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+ */
+ level = be16_to_cpu(block->bb_level);
+ ASSERT(level > 0);
+ xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ bno = be64_to_cpu(*pp);
+
+ ASSERT(bno != NULLFSBLOCK);
+ ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+ /*
+ * Go down the tree until leaf level is reached, following the first
+ * pointer (leftmost) at each level.
+ */
+ while (level-- > 0) {
+ /* See if buf is in cur first */
+ bp_release = 0;
+ bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+ if (!bp) {
+ bp_release = 1;
+ error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ goto error_norelse;
+ }
+ block = XFS_BUF_TO_BLOCK(bp);
+ if (level == 0)
+ break;
+
+ /*
+ * Check this block for basic sanity (increasing keys and
+ * no duplicate blocks).
+ */
+
+ xfs_check_block(block, mp, 0, 0);
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+ bno = be64_to_cpu(*pp);
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ }
+
+ /*
+ * Here with bp and block set to the leftmost leaf node in the tree.
+ */
+ i = 0;
+
+ /*
+ * Loop over all leaf nodes checking that all extents are in the right order.
+ */
+ for (;;) {
+ xfs_fsblock_t nextbno;
+ xfs_extnum_t num_recs;
+
+
+ num_recs = xfs_btree_get_numrecs(block);
+
+ /*
+ * Read-ahead the next leaf block, if any.
+ */
+
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+ /*
+ * Check all the extents to make sure they are OK.
+ * If we had a previous block, the last entry should
+ * conform with the first entry in this one.
+ */
+
+ ep = XFS_BMBT_REC_ADDR(mp, block, 1);
+ if (i) {
+ ASSERT(xfs_bmbt_disk_get_startoff(&last) +
+ xfs_bmbt_disk_get_blockcount(&last) <=
+ xfs_bmbt_disk_get_startoff(ep));
+ }
+ for (j = 1; j < num_recs; j++) {
+ nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+ ASSERT(xfs_bmbt_disk_get_startoff(ep) +
+ xfs_bmbt_disk_get_blockcount(ep) <=
+ xfs_bmbt_disk_get_startoff(nextp));
+ ep = nextp;
+ }
+
+ last = *ep;
+ i += num_recs;
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ bno = nextbno;
+ /*
+ * If we've reached the end, stop.
+ */
+ if (bno == NULLFSBLOCK)
+ break;
+
+ bp_release = 0;
+ bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+ if (!bp) {
+ bp_release = 1;
+ error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ goto error_norelse;
+ }
+ block = XFS_BUF_TO_BLOCK(bp);
+ }
+
+ return;
+
+error0:
+ xfs_warn(mp, "%s: at error0", __func__);
+ if (bp_release)
+ xfs_trans_brelse(NULL, bp);
+error_norelse:
+ xfs_warn(mp, "%s: BAD after btree leaves for %llu extents",
+ __func__, i);
+ xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return;
+}
+
+/*
+ * Validate that the bmbt_irecs being returned from bmapi are valid
+ * given the caller's original parameters. Specifically check the
+ * ranges of the returned irecs to ensure that they only extend beyond
+ * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
+ */
+STATIC void
+xfs_bmap_validate_ret(
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ uint32_t flags,
+ xfs_bmbt_irec_t *mval,
+ int nmap,
+ int ret_nmap)
+{
+ int i; /* index to map values */
+
+ ASSERT(ret_nmap <= nmap);
+
+ for (i = 0; i < ret_nmap; i++) {
+ ASSERT(mval[i].br_blockcount > 0);
+ if (!(flags & XFS_BMAPI_ENTIRE)) {
+ ASSERT(mval[i].br_startoff >= bno);
+ ASSERT(mval[i].br_blockcount <= len);
+ ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
+ bno + len);
+ } else {
+ ASSERT(mval[i].br_startoff < bno + len);
+ ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
+ bno);
+ }
+ ASSERT(i == 0 ||
+ mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
+ mval[i].br_startoff);
+ ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
+ mval[i].br_startblock != HOLESTARTBLOCK);
+ ASSERT(mval[i].br_state == XFS_EXT_NORM ||
+ mval[i].br_state == XFS_EXT_UNWRITTEN);
+ }
+}
+
+#else
+#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
+#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0)
+#endif /* DEBUG */
+
+/*
+ * Inode fork format manipulation functions
+ */
+
+/*
+ * Convert the inode format to extent format if it currently is in btree format,
+ * but the extent list is small enough that it fits into the extent format.
+ *
+ * Since the extents are already in-core, all we have to do is give up the space
+ * for the btree root and pitch the leaf block.
+ */
+STATIC int /* error */
+xfs_bmap_btree_to_extents(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode pointer */
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_block *rblock = ifp->if_broot;
+ struct xfs_btree_block *cblock;/* child btree block */
+ xfs_fsblock_t cbno; /* child block number */
+ struct xfs_buf *cbp; /* child block's buffer */
+ int error; /* error return value */
+ __be64 *pp; /* ptr to block address */
+ struct xfs_owner_info oinfo;
+
+ /* check if we actually need the extent format first: */
+ if (!xfs_bmap_wants_extents(ip, whichfork))
+ return 0;
+
+ ASSERT(cur);
+ ASSERT(whichfork != XFS_COW_FORK);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
+ ASSERT(be16_to_cpu(rblock->bb_level) == 1);
+ ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
+ ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
+ cbno = be64_to_cpu(*pp);
+#ifdef DEBUG
+ if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1)))
+ return -EFSCORRUPTED;
+#endif
+ error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ cblock = XFS_BUF_TO_BLOCK(cbp);
+ if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
+ return error;
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
+ xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo);
+ ip->i_nblocks--;
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+ xfs_trans_binval(tp, cbp);
+ if (cur->bc_levels[0].bp == cbp)
+ cur->bc_levels[0].bp = NULL;
+ xfs_iroot_realloc(ip, -1, whichfork);
+ ASSERT(ifp->if_broot == NULL);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ return 0;
+}
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ */
+STATIC int /* error */
+xfs_bmap_extents_to_btree(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode pointer */
+ struct xfs_btree_cur **curp, /* cursor returned to caller */
+ int wasdel, /* converting a delayed alloc */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_btree_block *ablock; /* allocated (child) bt block */
+ struct xfs_buf *abp; /* buffer for ablock */
+ struct xfs_alloc_arg args; /* allocation arguments */
+ struct xfs_bmbt_rec *arp; /* child record pointer */
+ struct xfs_btree_block *block; /* btree root block */
+ struct xfs_btree_cur *cur; /* bmap btree cursor */
+ int error; /* error return value */
+ struct xfs_ifork *ifp; /* inode fork pointer */
+ struct xfs_bmbt_key *kp; /* root block key pointer */
+ struct xfs_mount *mp; /* mount structure */
+ xfs_bmbt_ptr_t *pp; /* root block address pointer */
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec rec;
+ xfs_extnum_t cnt = 0;
+
+ mp = ip->i_mount;
+ ASSERT(whichfork != XFS_COW_FORK);
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS);
+
+ /*
+ * Make space in the inode incore. This needs to be undone if we fail
+ * to expand the root.
+ */
+ xfs_iroot_realloc(ip, 1, whichfork);
+
+ /*
+ * Fill in the root.
+ */
+ block = ifp->if_broot;
+ xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+ XFS_BTNUM_BMAP, 1, 1, ip->i_ino,
+ XFS_BTREE_LONG_PTRS);
+ /*
+ * Need a cursor. Can't allocate until bb_level is filled in.
+ */
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
+ /*
+ * Convert to a btree with two levels, one record in root.
+ */
+ ifp->if_format = XFS_DINODE_FMT_BTREE;
+ memset(&args, 0, sizeof(args));
+ args.tp = tp;
+ args.mp = mp;
+ xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, whichfork);
+ if (tp->t_firstblock == NULLFSBLOCK) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+ } else if (tp->t_flags & XFS_TRANS_LOWMODE) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = tp->t_firstblock;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.fsbno = tp->t_firstblock;
+ }
+ args.minlen = args.maxlen = args.prod = 1;
+ args.wasdel = wasdel;
+ *logflagsp = 0;
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto out_root_realloc;
+
+ if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
+ error = -ENOSPC;
+ goto out_root_realloc;
+ }
+
+ /*
+ * Allocation can't fail, the space was reserved.
+ */
+ ASSERT(tp->t_firstblock == NULLFSBLOCK ||
+ args.agno >= XFS_FSB_TO_AGNO(mp, tp->t_firstblock));
+ tp->t_firstblock = args.fsbno;
+ cur->bc_ino.allocated++;
+ ip->i_nblocks++;
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, args.fsbno),
+ mp->m_bsize, 0, &abp);
+ if (error)
+ goto out_unreserve_dquot;
+
+ /*
+ * Fill in the child block.
+ */
+ abp->b_ops = &xfs_bmbt_buf_ops;
+ ablock = XFS_BUF_TO_BLOCK(abp);
+ xfs_btree_init_block_int(mp, ablock, xfs_buf_daddr(abp),
+ XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS);
+
+ for_each_xfs_iext(ifp, &icur, &rec) {
+ if (isnullstartblock(rec.br_startblock))
+ continue;
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1 + cnt);
+ xfs_bmbt_disk_set_all(arp, &rec);
+ cnt++;
+ }
+ ASSERT(cnt == ifp->if_nextents);
+ xfs_btree_set_numrecs(ablock, cnt);
+
+ /*
+ * Fill in the root key and pointer.
+ */
+ kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+ kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+ be16_to_cpu(block->bb_level)));
+ *pp = cpu_to_be64(args.fsbno);
+
+ /*
+ * Do all this logging at the end so that
+ * the root is at the right level.
+ */
+ xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+ xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+ ASSERT(*curp == NULL);
+ *curp = cur;
+ *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
+ return 0;
+
+out_unreserve_dquot:
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+out_root_realloc:
+ xfs_iroot_realloc(ip, -1, whichfork);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ ASSERT(ifp->if_broot == NULL);
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+
+ return error;
+}
+
+/*
+ * Convert a local file to an extents file.
+ * This code is out of bounds for data forks of regular files,
+ * since the file data needs to get logged so things will stay consistent.
+ * (The bmap-level manipulations are ok, though).
+ */
+void
+xfs_bmap_local_to_extents_empty(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+
+ ASSERT(whichfork != XFS_COW_FORK);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(ifp->if_bytes == 0);
+ ASSERT(ifp->if_nextents == 0);
+
+ xfs_bmap_forkoff_reset(ip, whichfork);
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+
+STATIC int /* error */
+xfs_bmap_local_to_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extlen_t total, /* total blocks needed by transaction */
+ int *logflagsp, /* inode logging flags */
+ int whichfork,
+ void (*init_fn)(struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp))
+{
+ int error = 0;
+ int flags; /* logging flags returned */
+ struct xfs_ifork *ifp; /* inode fork pointer */
+ xfs_alloc_arg_t args; /* allocation arguments */
+ struct xfs_buf *bp; /* buffer for extent block */
+ struct xfs_bmbt_irec rec;
+ struct xfs_iext_cursor icur;
+
+ /*
+ * We don't want to deal with the case of keeping inode data inline yet.
+ * So sending the data fork of a regular inode is invalid.
+ */
+ ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK));
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+
+ if (!ifp->if_bytes) {
+ xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
+ flags = XFS_ILOG_CORE;
+ goto done;
+ }
+
+ flags = 0;
+ error = 0;
+ memset(&args, 0, sizeof(args));
+ args.tp = tp;
+ args.mp = ip->i_mount;
+ xfs_rmap_ino_owner(&args.oinfo, ip->i_ino, whichfork, 0);
+ /*
+ * Allocate a block. We know we need only one, since the
+ * file currently fits in an inode.
+ */
+ if (tp->t_firstblock == NULLFSBLOCK) {
+ args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ } else {
+ args.fsbno = tp->t_firstblock;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ }
+ args.total = total;
+ args.minlen = args.maxlen = args.prod = 1;
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto done;
+
+ /* Can't fail, the space was reserved. */
+ ASSERT(args.fsbno != NULLFSBLOCK);
+ ASSERT(args.len == 1);
+ tp->t_firstblock = args.fsbno;
+ error = xfs_trans_get_buf(tp, args.mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(args.mp, args.fsbno),
+ args.mp->m_bsize, 0, &bp);
+ if (error)
+ goto done;
+
+ /*
+ * Initialize the block, copy the data and log the remote buffer.
+ *
+ * The callout is responsible for logging because the remote format
+ * might differ from the local format and thus we don't know how much to
+ * log here. Note that init_fn must also set the buffer log item type
+ * correctly.
+ */
+ init_fn(tp, bp, ip, ifp);
+
+ /* account for the change in fork size */
+ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+ xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
+ flags |= XFS_ILOG_CORE;
+
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
+
+ rec.br_startoff = 0;
+ rec.br_startblock = args.fsbno;
+ rec.br_blockcount = 1;
+ rec.br_state = XFS_EXT_NORM;
+ xfs_iext_first(ifp, &icur);
+ xfs_iext_insert(ip, &icur, &rec, 0);
+
+ ifp->if_nextents = 1;
+ ip->i_nblocks = 1;
+ xfs_trans_mod_dquot_byino(tp, ip,
+ XFS_TRANS_DQ_BCOUNT, 1L);
+ flags |= xfs_ilog_fext(whichfork);
+
+done:
+ *logflagsp = flags;
+ return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle btree format files.
+ */
+STATIC int /* error */
+xfs_bmap_add_attrfork_btree(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ int *flags) /* inode logging flags */
+{
+ struct xfs_btree_block *block = ip->i_df.if_broot;
+ struct xfs_btree_cur *cur; /* btree cursor */
+ int error; /* error return value */
+ xfs_mount_t *mp; /* file system mount struct */
+ int stat; /* newroot status */
+
+ mp = ip->i_mount;
+
+ if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip))
+ *flags |= XFS_ILOG_DBROOT;
+ else {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
+ error = xfs_bmbt_lookup_first(cur, &stat);
+ if (error)
+ goto error0;
+ /* must be at least one entry */
+ if (XFS_IS_CORRUPT(mp, stat != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
+ goto error0;
+ if (stat == 0) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return -ENOSPC;
+ }
+ cur->bc_ino.allocated = 0;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ }
+ return 0;
+error0:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle extents format files.
+ */
+STATIC int /* error */
+xfs_bmap_add_attrfork_extents(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode pointer */
+ int *flags) /* inode logging flags */
+{
+ struct xfs_btree_cur *cur; /* bmap btree cursor */
+ int error; /* error return value */
+
+ if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <=
+ xfs_inode_data_fork_size(ip))
+ return 0;
+ cur = NULL;
+ error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags,
+ XFS_DATA_FORK);
+ if (cur) {
+ cur->bc_ino.allocated = 0;
+ xfs_btree_del_cursor(cur, error);
+ }
+ return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle local format files. Each
+ * different data fork content type needs a different callout to do the
+ * conversion. Some are basic and only require special block initialisation
+ * callouts for the data formatting; others (directories) are so specialised they
+ * handle everything themselves.
+ *
+ * XXX (dgc): investigate whether directory conversion can use the generic
+ * formatting callout. It should be possible - it's just a very complex
+ * formatter.
+ */
+STATIC int /* error */
+xfs_bmap_add_attrfork_local(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode pointer */
+ int *flags) /* inode logging flags */
+{
+ struct xfs_da_args dargs; /* args for dir/attr code */
+
+ if (ip->i_df.if_bytes <= xfs_inode_data_fork_size(ip))
+ return 0;
+
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ memset(&dargs, 0, sizeof(dargs));
+ dargs.geo = ip->i_mount->m_dir_geo;
+ dargs.dp = ip;
+ dargs.total = dargs.geo->fsbcount;
+ dargs.whichfork = XFS_DATA_FORK;
+ dargs.trans = tp;
+ return xfs_dir2_sf_to_block(&dargs);
+ }
+
+ if (S_ISLNK(VFS_I(ip)->i_mode))
+ return xfs_bmap_local_to_extents(tp, ip, 1, flags,
+ XFS_DATA_FORK,
+ xfs_symlink_local_to_remote);
+
+ /* should only be called for types that support local format data */
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Set an inode attr fork offset based on the format of the data fork.
+ */
+static int
+xfs_bmap_set_attrforkoff(
+ struct xfs_inode *ip,
+ int size,
+ int *version)
+{
+ int default_size = xfs_default_attroffset(ip) >> 3;
+
+ switch (ip->i_df.if_format) {
+ case XFS_DINODE_FMT_DEV:
+ ip->i_forkoff = default_size;
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+ if (!ip->i_forkoff)
+ ip->i_forkoff = default_size;
+ else if (xfs_has_attr2(ip->i_mount) && version)
+ *version = 2;
+ break;
+ default:
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ */
+int /* error code */
+xfs_bmap_add_attrfork(
+ xfs_inode_t *ip, /* incore inode pointer */
+ int size, /* space new attribute needs */
+ int rsvd) /* xact may use reserved blks */
+{
+ xfs_mount_t *mp; /* mount structure */
+ xfs_trans_t *tp; /* transaction pointer */
+ int blks; /* space reservation */
+ int version = 1; /* superblock attr version */
+ int logflags; /* logging flags */
+ int error; /* error return value */
+
+ ASSERT(xfs_inode_has_attr_fork(ip) == 0);
+
+ mp = ip->i_mount;
+ ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+
+ blks = XFS_ADDAFORK_SPACE_RES(mp);
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0,
+ rsvd, &tp);
+ if (error)
+ return error;
+ if (xfs_inode_has_attr_fork(ip))
+ goto trans_cancel;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ error = xfs_bmap_set_attrforkoff(ip, size, &version);
+ if (error)
+ goto trans_cancel;
+
+ xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
+ logflags = 0;
+ switch (ip->i_df.if_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ error = xfs_bmap_add_attrfork_local(tp, ip, &logflags);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ error = xfs_bmap_add_attrfork_extents(tp, ip, &logflags);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ error = xfs_bmap_add_attrfork_btree(tp, ip, &logflags);
+ break;
+ default:
+ error = 0;
+ break;
+ }
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ if (error)
+ goto trans_cancel;
+ if (!xfs_has_attr(mp) ||
+ (!xfs_has_attr2(mp) && version == 2)) {
+ bool log_sb = false;
+
+ spin_lock(&mp->m_sb_lock);
+ if (!xfs_has_attr(mp)) {
+ xfs_add_attr(mp);
+ log_sb = true;
+ }
+ if (!xfs_has_attr2(mp) && version == 2) {
+ xfs_add_attr2(mp);
+ log_sb = true;
+ }
+ spin_unlock(&mp->m_sb_lock);
+ if (log_sb)
+ xfs_log_sb(tp);
+ }
+
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+trans_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
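+
+/*
+ * Illustrative sketch, not part of this patch: the superblock update in
+ * xfs_bmap_add_attrfork() follows a "test, retest under the lock, log at most
+ * once" pattern.  The fragment below restates that pattern with hypothetical
+ * types and stubbed-out locking, purely as a reading aid.
+ */
+#if 0	/* example only, never compiled */
+struct example_features {
+	int	has_attr;
+	int	has_attr2;
+};
+
+static void
+example_upgrade_features(
+	struct example_features	*f,
+	int			version)
+{
+	int	log_sb = 0;
+
+	/* take the superblock lock here */
+	if (!f->has_attr) {
+		f->has_attr = 1;
+		log_sb = 1;
+	}
+	if (!f->has_attr2 && version == 2) {
+		f->has_attr2 = 1;
+		log_sb = 1;
+	}
+	/* drop the superblock lock here */
+
+	if (log_sb) {
+		/* log the superblock at most once per transaction */
+	}
+}
+#endif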
+
+/*
+ * Internal and external extent tree search functions.
+ */
+
+struct xfs_iread_state {
+ struct xfs_iext_cursor icur;
+ xfs_extnum_t loaded;
+};
+
+/* Stuff every bmbt record from this block into the incore extent map. */
+static int
+xfs_iread_bmbt_block(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *priv)
+{
+ struct xfs_iread_state *ir = priv;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip = cur->bc_ino.ip;
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ struct xfs_bmbt_rec *frp;
+ xfs_extnum_t num_recs;
+ xfs_extnum_t j;
+ int whichfork = cur->bc_ino.whichfork;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ /* Abort if we find more records than nextents. */
+ num_recs = xfs_btree_get_numrecs(block);
+ if (unlikely(ir->loaded + num_recs > ifp->if_nextents)) {
+ xfs_warn(ip->i_mount, "corrupt dinode %llu, (btree extents).",
+ (unsigned long long)ip->i_ino);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block,
+ sizeof(*block), __this_address);
+ return -EFSCORRUPTED;
+ }
+
+ /* Copy records into the incore cache. */
+ frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+ for (j = 0; j < num_recs; j++, frp++, ir->loaded++) {
+ struct xfs_bmbt_irec new;
+ xfs_failaddr_t fa;
+
+ xfs_bmbt_disk_get_all(frp, &new);
+ fa = xfs_bmap_validate_extent(ip, whichfork, &new);
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iread_extents(2)", frp,
+ sizeof(*frp), fa);
+ return -EFSCORRUPTED;
+ }
+ xfs_iext_insert(ip, &ir->icur, &new,
+ xfs_bmap_fork_to_state(whichfork));
+ trace_xfs_read_extent(ip, &ir->icur,
+ xfs_bmap_fork_to_state(whichfork), _THIS_IP_);
+ xfs_iext_next(ifp, &ir->icur);
+ }
+
+ return 0;
+}
+
+/*
+ * Read in extents from a btree-format inode.
+ */
+int
+xfs_iread_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xfs_iread_state ir;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ if (!xfs_need_iread_extents(ifp))
+ return 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ ir.loaded = 0;
+ xfs_iext_first(ifp, &ir.icur);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ error = xfs_btree_visit_blocks(cur, xfs_iread_bmbt_block,
+ XFS_BTREE_VISIT_RECORDS, &ir);
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out;
+
+ if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ ASSERT(ir.loaded == xfs_iext_count(ifp));
+ return 0;
+out:
+ xfs_iext_destroy(ifp);
+ return error;
+}
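+
+/*
+ * Illustrative sketch, not part of this patch: xfs_iread_bmbt_block() keeps a
+ * running "loaded" count and rejects any leaf block that would push it past
+ * the expected record count.  The standalone loop below restates that
+ * overflow check with hypothetical types and names.
+ */
+#if 0	/* example only, never compiled */
+struct example_rec { unsigned long long startoff, startblock, blockcount; };
+
+/* Returns 0 on success, -1 if the block holds more records than expected. */
+static int
+example_load_records(
+	const struct example_rec	*recs,		/* one leaf block */
+	unsigned int			num_recs,
+	unsigned int			*loaded,	/* running total */
+	unsigned int			expected)	/* if_nextents */
+{
+	unsigned int			i;
+
+	if (*loaded + num_recs > expected)
+		return -1;		/* corrupt: too many records */
+	for (i = 0; i < num_recs; i++, (*loaded)++) {
+		/* validate recs[i], then insert it into the incore tree */
+	}
+	return 0;
+}
+#endif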
+
+/*
+ * Returns the relative block number of the first unused block(s) in the given
+ * fork with at least "len" logically contiguous blocks free. This is the
+ * lowest-address hole if the fork has holes, else the first block past the end
+ * of the fork. Returns 0 if the fork is currently local (in-inode).
+ */
+int /* error */
+xfs_bmap_first_unused(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_extlen_t len, /* size of hole to find */
+ xfs_fileoff_t *first_unused, /* unused block */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+ xfs_fileoff_t lastaddr = 0;
+ xfs_fileoff_t lowest, max;
+ int error;
+
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) {
+ *first_unused = 0;
+ return 0;
+ }
+
+ ASSERT(xfs_ifork_has_extents(ifp));
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ lowest = max = *first_unused;
+ for_each_xfs_iext(ifp, &icur, &got) {
+ /*
+ * See if the hole before this extent will work.
+ */
+ if (got.br_startoff >= lowest + len &&
+ got.br_startoff - max >= len)
+ break;
+ lastaddr = got.br_startoff + got.br_blockcount;
+ max = XFS_FILEOFF_MAX(lastaddr, lowest);
+ }
+
+ *first_unused = max;
+ return 0;
+}
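+
+/*
+ * Illustrative sketch, not part of this patch: the loop above walks the
+ * sorted, non-overlapping incore extent list and stops at the first gap of at
+ * least "len" blocks at or after the starting offset.  The standalone version
+ * below restates that scan with hypothetical types and names.
+ */
+#if 0	/* example only, never compiled */
+struct example_ext { unsigned long long startoff, blockcount; };
+
+static unsigned long long
+example_first_unused(
+	const struct example_ext	*ext,	/* sorted by startoff */
+	unsigned int			nexts,
+	unsigned long long		len,
+	unsigned long long		lowest)	/* search at or after here */
+{
+	unsigned long long		lastaddr;
+	unsigned long long		max = lowest;
+	unsigned int			i;
+
+	for (i = 0; i < nexts; i++) {
+		/* is the hole before this extent big enough? */
+		if (ext[i].startoff >= lowest + len &&
+		    ext[i].startoff - max >= len)
+			break;
+		lastaddr = ext[i].startoff + ext[i].blockcount;
+		max = lastaddr > lowest ? lastaddr : lowest;
+	}
+	return max;	/* first offset with at least "len" free blocks */
+}
+#endif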
+
+/*
+ * Returns the file-relative block number of the last block - 1 before
+ * last_block (input value) in the file.
+ * This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int /* error */
+xfs_bmap_last_before(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t *last_block, /* last block */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+ int error;
+
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ *last_block = 0;
+ return 0;
+ case XFS_DINODE_FMT_BTREE:
+ case XFS_DINODE_FMT_EXTENTS:
+ break;
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ if (!xfs_iext_lookup_extent_before(ip, ifp, last_block, &icur, &got))
+ *last_block = 0;
+ return 0;
+}
+
+int
+xfs_bmap_last_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *rec,
+ int *is_empty)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_iext_cursor icur;
+ int error;
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ xfs_iext_last(ifp, &icur);
+ if (!xfs_iext_get_extent(ifp, &icur, rec))
+ *is_empty = 1;
+ else
+ *is_empty = 0;
+ return 0;
+}
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ *
+ * Sets bma->aeof to true if the file (fork) is empty, as any new write will
+ * be at or past EOF.
+ */
+STATIC int
+xfs_bmap_isaeof(
+ struct xfs_bmalloca *bma,
+ int whichfork)
+{
+ struct xfs_bmbt_irec rec;
+ int is_empty;
+ int error;
+
+ bma->aeof = false;
+ error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
+ &is_empty);
+ if (error)
+ return error;
+
+ if (is_empty) {
+ bma->aeof = true;
+ return 0;
+ }
+
+ /*
+ * Check if we are allocating at or past the last extent, or at least
+ * into the last delayed allocated extent.
+ */
+ bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
+ (bma->offset >= rec.br_startoff &&
+ isnullstartblock(rec.br_startblock));
+ return 0;
+}
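+
+/*
+ * Illustrative sketch, not part of this patch: the "append at/past EOF" test
+ * above reduces to a single expression over the last extent in the fork.  The
+ * helper below restates it with hypothetical types; "is_delalloc" stands in
+ * for the isnullstartblock() check on the last extent.
+ */
+#if 0	/* example only, never compiled */
+struct example_last {
+	unsigned long long	startoff;
+	unsigned long long	blockcount;
+	int			is_delalloc;
+};
+
+static int
+example_is_aeof(
+	unsigned long long		offset,	/* offset being allocated */
+	const struct example_last	*last)	/* last extent in the fork */
+{
+	/* an empty fork is handled separately and is always "at EOF" */
+	return offset >= last->startoff + last->blockcount ||
+	       (offset >= last->startoff && last->is_delalloc);
+}
+#endif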
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file. This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int
+xfs_bmap_last_offset(
+ struct xfs_inode *ip,
+ xfs_fileoff_t *last_block,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_bmbt_irec rec;
+ int is_empty;
+ int error;
+
+ *last_block = 0;
+
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
+ return 0;
+
+ if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp)))
+ return -EFSCORRUPTED;
+
+ error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
+ if (error || is_empty)
+ return error;
+
+ *last_block = rec.br_startoff + rec.br_blockcount;
+ return 0;
+}
+
+/*
+ * Extent tree manipulation functions used during allocation.
+ */
+
+/*
+ * Convert a delayed allocation to a real allocation.
+ */
+STATIC int /* error */
+xfs_bmap_add_extent_delay_real(
+ struct xfs_bmalloca *bma,
+ int whichfork)
+{
+ struct xfs_mount *mp = bma->ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
+ struct xfs_bmbt_irec *new = &bma->got;
+ int error; /* error return value */
+ int i; /* temp state */
+ xfs_fileoff_t new_endoff; /* end offset of new entry */
+ xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
+ /* left is 0, right is 1, prev is 2 */
+ int rval=0; /* return value (logging flags) */
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
+ xfs_filblks_t da_new; /* new count del alloc blocks used */
+ xfs_filblks_t da_old; /* old count del alloc blocks used */
+ xfs_filblks_t temp=0; /* value for da_new calculations */
+ int tmp_rval; /* partial logging flags */
+ struct xfs_bmbt_irec old;
+
+ ASSERT(whichfork != XFS_ATTR_FORK);
+ ASSERT(!isnullstartblock(new->br_startblock));
+ ASSERT(!bma->cur ||
+ (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
+
+ XFS_STATS_INC(mp, xs_add_exlist);
+
+#define LEFT r[0]
+#define RIGHT r[1]
+#define PREV r[2]
+
+ /*
+ * Set up a bunch of variables to make the tests simpler.
+ */
+ xfs_iext_get_extent(ifp, &bma->icur, &PREV);
+ new_endoff = new->br_startoff + new->br_blockcount;
+ ASSERT(isnullstartblock(PREV.br_startblock));
+ ASSERT(PREV.br_startoff <= new->br_startoff);
+ ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
+ da_old = startblockval(PREV.br_startblock);
+ da_new = 0;
+
+ /*
+ * Set flags determining what part of the previous delayed allocation
+ * extent is being replaced by a real allocation.
+ */
+ if (PREV.br_startoff == new->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
+ /*
+ * Check and set flags if this segment has a left neighbor.
+ * Don't set contiguous if the combined extent would be too large.
+ */
+ if (xfs_iext_peek_prev_extent(ifp, &bma->icur, &LEFT)) {
+ state |= BMAP_LEFT_VALID;
+ if (isnullstartblock(LEFT.br_startblock))
+ state |= BMAP_LEFT_DELAY;
+ }
+
+ if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+ LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+ LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+ LEFT.br_state == new->br_state &&
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ /*
+ * Check and set flags if this segment has a right neighbor.
+ * Don't set contiguous if the combined extent would be too large.
+ * Also check for all-three-contiguous being too large.
+ */
+ if (xfs_iext_peek_next_extent(ifp, &bma->icur, &RIGHT)) {
+ state |= BMAP_RIGHT_VALID;
+ if (isnullstartblock(RIGHT.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
+ }
+
+ if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+ new_endoff == RIGHT.br_startoff &&
+ new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+ new->br_state == RIGHT.br_state &&
+ new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING)) !=
+ (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING) ||
+ LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+ <= XFS_MAX_BMBT_EXTLEN))
+ state |= BMAP_RIGHT_CONTIG;
+
+ error = 0;
+ /*
+ * Switch out based on the FILLING and CONTIG state bits.
+ */
+ switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Filling in all of a previously delayed allocation extent.
+ * The left and right neighbors are both contiguous with new.
+ */
+ LEFT.br_blockcount += PREV.br_blockcount + RIGHT.br_blockcount;
+
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT);
+ ifp->if_nextents--;
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_delete(bma->cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_decrement(bma->cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(bma->cur, &LEFT);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+ /*
+ * Filling in all of a previously delayed allocation extent.
+ * The left neighbor is contiguous, the right is not.
+ */
+ old = LEFT;
+ LEFT.br_blockcount += PREV.br_blockcount;
+
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT);
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(bma->cur, &LEFT);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Filling in all of a previously delayed allocation extent.
+ * The right neighbor is contiguous, the left is not. Take care
+ * with delay -> unwritten extent allocation here because the
+ * delalloc record we are overwriting is always written.
+ */
+ PREV.br_startblock = new->br_startblock;
+ PREV.br_blockcount += RIGHT.br_blockcount;
+ PREV.br_state = new->br_state;
+
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(bma->cur, &PREV);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+ /*
+ * Filling in all of a previously delayed allocation extent.
+ * Neither the left nor right neighbors are contiguous with
+ * the new one.
+ */
+ PREV.br_startblock = new->br_startblock;
+ PREV.br_state = new->br_state;
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+ ifp->if_nextents++;
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
+ /*
+ * Filling in the first part of a previous delayed allocation.
+ * The left neighbor is contiguous.
+ */
+ old = LEFT;
+ temp = PREV.br_blockcount - new->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+ startblockval(PREV.br_startblock));
+
+ LEFT.br_blockcount += new->br_blockcount;
+
+ PREV.br_blockcount = temp;
+ PREV.br_startoff += new->br_blockcount;
+ PREV.br_startblock = nullstartblock(da_new);
+
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT);
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(bma->cur, &LEFT);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING:
+ /*
+ * Filling in the first part of a previous delayed allocation.
+ * The left neighbor is not contiguous.
+ */
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, new);
+ ifp->if_nextents++;
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ &bma->cur, 1, &tmp_rval, whichfork);
+ rval |= tmp_rval;
+ if (error)
+ goto done;
+ }
+
+ temp = PREV.br_blockcount - new->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+ startblockval(PREV.br_startblock) -
+ (bma->cur ? bma->cur->bc_ino.allocated : 0));
+
+ PREV.br_startoff = new_endoff;
+ PREV.br_blockcount = temp;
+ PREV.br_startblock = nullstartblock(da_new);
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_insert(bma->ip, &bma->icur, &PREV, state);
+ xfs_iext_prev(ifp, &bma->icur);
+ break;
+
+ case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Filling in the last part of a previous delayed allocation.
+ * The right neighbor is contiguous with the new allocation.
+ */
+ old = RIGHT;
+ RIGHT.br_startoff = new->br_startoff;
+ RIGHT.br_startblock = new->br_startblock;
+ RIGHT.br_blockcount += new->br_blockcount;
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(bma->cur, &RIGHT);
+ if (error)
+ goto done;
+ }
+
+ temp = PREV.br_blockcount - new->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+ startblockval(PREV.br_startblock));
+
+ PREV.br_blockcount = temp;
+ PREV.br_startblock = nullstartblock(da_new);
+
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT);
+ break;
+
+ case BMAP_RIGHT_FILLING:
+ /*
+ * Filling in the last part of a previous delayed allocation.
+ * The right neighbor is not contiguous.
+ */
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, new);
+ ifp->if_nextents++;
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ &bma->cur, 1, &tmp_rval, whichfork);
+ rval |= tmp_rval;
+ if (error)
+ goto done;
+ }
+
+ temp = PREV.br_blockcount - new->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+ startblockval(PREV.br_startblock) -
+ (bma->cur ? bma->cur->bc_ino.allocated : 0));
+
+ PREV.br_startblock = nullstartblock(da_new);
+ PREV.br_blockcount = temp;
+ xfs_iext_insert(bma->ip, &bma->icur, &PREV, state);
+ xfs_iext_next(ifp, &bma->icur);
+ break;
+
+ case 0:
+ /*
+ * Filling in the middle part of a previous delayed allocation.
+ * Contiguity is impossible here.
+ * This case is avoided almost all the time.
+ *
+ * We start with a delayed allocation:
+ *
+ * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
+ * PREV @ idx
+ *
+ * and we are allocating:
+ * +rrrrrrrrrrrrrrrrr+
+ * new
+ *
+ * and we set it up for insertion as:
+ * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
+ * new
+ * PREV @ idx LEFT RIGHT
+ * inserted at idx + 1
+ */
+ old = PREV;
+
+ /* LEFT is the new middle */
+ LEFT = *new;
+
+ /* RIGHT is the new right */
+ RIGHT.br_state = PREV.br_state;
+ RIGHT.br_startoff = new_endoff;
+ RIGHT.br_blockcount =
+ PREV.br_startoff + PREV.br_blockcount - new_endoff;
+ RIGHT.br_startblock =
+ nullstartblock(xfs_bmap_worst_indlen(bma->ip,
+ RIGHT.br_blockcount));
+
+ /* truncate PREV */
+ PREV.br_blockcount = new->br_startoff - PREV.br_startoff;
+ PREV.br_startblock =
+ nullstartblock(xfs_bmap_worst_indlen(bma->ip,
+ PREV.br_blockcount));
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_insert(bma->ip, &bma->icur, &RIGHT, state);
+ xfs_iext_insert(bma->ip, &bma->icur, &LEFT, state);
+ ifp->if_nextents++;
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ &bma->cur, 1, &tmp_rval, whichfork);
+ rval |= tmp_rval;
+ if (error)
+ goto done;
+ }
+
+ da_new = startblockval(PREV.br_startblock) +
+ startblockval(RIGHT.br_startblock);
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_CONTIG:
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * These cases are all impossible.
+ */
+ ASSERT(0);
+ }
+
+ /* add reverse mapping unless caller opted out */
+ if (!(bma->flags & XFS_BMAPI_NORMAP))
+ xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
+
+ /* convert to a btree if necessary */
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(bma->cur == NULL);
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ &bma->cur, da_old > 0, &tmp_logflags,
+ whichfork);
+ bma->logflags |= tmp_logflags;
+ if (error)
+ goto done;
+ }
+
+ if (da_new != da_old)
+ xfs_mod_delalloc(mp, (int64_t)da_new - da_old);
+
+ if (bma->cur) {
+ da_new += bma->cur->bc_ino.allocated;
+ bma->cur->bc_ino.allocated = 0;
+ }
+
+ /* adjust for changes in reserved delayed indirect blocks */
+ if (da_new != da_old) {
+ ASSERT(state == 0 || da_new < da_old);
+ error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new),
+ false);
+ }
+
+ xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
+done:
+ if (whichfork != XFS_COW_FORK)
+ bma->logflags |= rval;
+ return error;
+#undef LEFT
+#undef RIGHT
+#undef PREV
+}
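+
+/*
+ * Illustrative sketch, not part of this patch: the large switch above is
+ * driven by four state bits describing how the new real extent overlays the
+ * old delalloc extent and whether it can merge with its neighbours.  The
+ * fragment below restates how those bits are derived, using hypothetical
+ * names and omitting the validity/delay checks, the unwritten-state
+ * comparison and the three-extent combined length check.
+ */
+#if 0	/* example only, never compiled */
+#define EX_LEFT_FILLING		(1u << 0)	/* new starts where PREV starts */
+#define EX_RIGHT_FILLING	(1u << 1)	/* new ends where PREV ends */
+#define EX_LEFT_CONTIG		(1u << 2)	/* new merges with LEFT */
+#define EX_RIGHT_CONTIG		(1u << 3)	/* new merges with RIGHT */
+
+struct example_irec { unsigned long long startoff, startblock, blockcount; };
+
+static unsigned int
+example_delay_real_state(
+	const struct example_irec	*prev,	/* old delalloc extent */
+	const struct example_irec	*left,	/* real left neighbour */
+	const struct example_irec	*right,	/* real right neighbour */
+	const struct example_irec	*new,	/* new real extent */
+	unsigned long long		maxlen)	/* max merged extent length */
+{
+	unsigned long long	new_end = new->startoff + new->blockcount;
+	unsigned int		state = 0;
+
+	if (prev->startoff == new->startoff)
+		state |= EX_LEFT_FILLING;
+	if (prev->startoff + prev->blockcount == new_end)
+		state |= EX_RIGHT_FILLING;
+	if (left->startoff + left->blockcount == new->startoff &&
+	    left->startblock + left->blockcount == new->startblock &&
+	    left->blockcount + new->blockcount <= maxlen)
+		state |= EX_LEFT_CONTIG;
+	if (new_end == right->startoff &&
+	    new->startblock + new->blockcount == right->startblock &&
+	    new->blockcount + right->blockcount <= maxlen)
+		state |= EX_RIGHT_CONTIG;
+	return state;
+}
+#endif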
+
+/*
+ * Convert an unwritten allocation to a real allocation or vice versa.
+ */
+int /* error */
+xfs_bmap_add_extent_unwritten_real(
+ struct xfs_trans *tp,
+ xfs_inode_t *ip, /* incore inode pointer */
+ int whichfork,
+ struct xfs_iext_cursor *icur,
+ struct xfs_btree_cur **curp, /* if *curp is null, not a btree */
+ xfs_bmbt_irec_t *new, /* new data to add to file extents */
+ int *logflagsp) /* inode logging flags */
+{
+ struct xfs_btree_cur *cur; /* btree cursor */
+ int error; /* error return value */
+ int i; /* temp state */
+ struct xfs_ifork *ifp; /* inode fork pointer */
+ xfs_fileoff_t new_endoff; /* end offset of new entry */
+ xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
+ /* left is 0, right is 1, prev is 2 */
+ int rval=0; /* return value (logging flags) */
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec old;
+
+ *logflagsp = 0;
+
+ cur = *curp;
+ ifp = xfs_ifork_ptr(ip, whichfork);
+
+ ASSERT(!isnullstartblock(new->br_startblock));
+
+ XFS_STATS_INC(mp, xs_add_exlist);
+
+#define LEFT r[0]
+#define RIGHT r[1]
+#define PREV r[2]
+
+ /*
+ * Set up a bunch of variables to make the tests simpler.
+ */
+ error = 0;
+ xfs_iext_get_extent(ifp, icur, &PREV);
+ ASSERT(new->br_state != PREV.br_state);
+ new_endoff = new->br_startoff + new->br_blockcount;
+ ASSERT(PREV.br_startoff <= new->br_startoff);
+ ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
+ /*
+ * Set flags determining what part of the previous oldext allocation
+ * extent is being replaced by a newext allocation.
+ */
+ if (PREV.br_startoff == new->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
+ /*
+ * Check and set flags if this segment has a left neighbor.
+ * Don't set contiguous if the combined extent would be too large.
+ */
+ if (xfs_iext_peek_prev_extent(ifp, icur, &LEFT)) {
+ state |= BMAP_LEFT_VALID;
+ if (isnullstartblock(LEFT.br_startblock))
+ state |= BMAP_LEFT_DELAY;
+ }
+
+ if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+ LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+ LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+ LEFT.br_state == new->br_state &&
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ /*
+ * Check and set flags if this segment has a right neighbor.
+ * Don't set contiguous if the combined extent would be too large.
+ * Also check for all-three-contiguous being too large.
+ */
+ if (xfs_iext_peek_next_extent(ifp, icur, &RIGHT)) {
+ state |= BMAP_RIGHT_VALID;
+ if (isnullstartblock(RIGHT.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
+ }
+
+ if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+ new_endoff == RIGHT.br_startoff &&
+ new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+ new->br_state == RIGHT.br_state &&
+ new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING)) !=
+ (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING) ||
+ LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+ <= XFS_MAX_BMBT_EXTLEN))
+ state |= BMAP_RIGHT_CONTIG;
+
+ /*
+ * Switch out based on the FILLING and CONTIG state bits.
+ */
+ switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left and right neighbors are both contiguous with new.
+ */
+ LEFT.br_blockcount += PREV.br_blockcount + RIGHT.br_blockcount;
+
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &LEFT);
+ ifp->if_nextents -= 2;
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &LEFT);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left neighbor is contiguous, the right is not.
+ */
+ LEFT.br_blockcount += PREV.br_blockcount;
+
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &LEFT);
+ ifp->if_nextents--;
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, &PREV, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &LEFT);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The right neighbor is contiguous, the left is not.
+ */
+ PREV.br_blockcount += RIGHT.br_blockcount;
+ PREV.br_state = new->br_state;
+
+ xfs_iext_next(ifp, icur);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ ifp->if_nextents--;
+
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * Neither the left nor right neighbors are contiguous with
+ * the new one.
+ */
+ PREV.br_state = new->br_state;
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+
+ if (cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is contiguous.
+ */
+ LEFT.br_blockcount += new->br_blockcount;
+
+ old = PREV;
+ PREV.br_startoff += new->br_blockcount;
+ PREV.br_startblock += new->br_blockcount;
+ PREV.br_blockcount -= new->br_blockcount;
+
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &LEFT);
+
+ if (cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
+ goto done;
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ error = xfs_bmbt_update(cur, &LEFT);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is not contiguous.
+ */
+ old = PREV;
+ PREV.br_startoff += new->br_blockcount;
+ PREV.br_startblock += new->br_blockcount;
+ PREV.br_blockcount -= new->br_blockcount;
+
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_insert(ip, icur, new, state);
+ ifp->if_nextents++;
+
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
+ goto done;
+ cur->bc_rec.b = *new;
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+ break;
+
+ case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is contiguous with the new allocation.
+ */
+ old = PREV;
+ PREV.br_blockcount -= new->br_blockcount;
+
+ RIGHT.br_startoff = new->br_startoff;
+ RIGHT.br_startblock = new->br_startblock;
+ RIGHT.br_blockcount += new->br_blockcount;
+
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &RIGHT);
+
+ if (cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
+ goto done;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ error = xfs_bmbt_update(cur, &RIGHT);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_RIGHT_FILLING:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is not contiguous.
+ */
+ old = PREV;
+ PREV.br_blockcount -= new->br_blockcount;
+
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, new, state);
+ ifp->if_nextents++;
+
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
+ goto done;
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+ break;
+
+ case 0:
+ /*
+ * Setting the middle part of a previous oldext extent to
+ * newext. Contiguity is impossible here.
+ * One extent becomes three extents.
+ */
+ old = PREV;
+ PREV.br_blockcount = new->br_startoff - PREV.br_startoff;
+
+ r[0] = *new;
+ r[1].br_startoff = new_endoff;
+ r[1].br_blockcount =
+ old.br_startoff + old.br_blockcount - new_endoff;
+ r[1].br_startblock = new->br_startblock + new->br_blockcount;
+ r[1].br_state = PREV.br_state;
+
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &r[1], state);
+ xfs_iext_insert(ip, icur, &r[0], state);
+ ifp->if_nextents += 2;
+
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ /* new right extent - oldext */
+ error = xfs_bmbt_update(cur, &r[1]);
+ if (error)
+ goto done;
+ /* new left extent - oldext */
+ cur->bc_rec.b = PREV;
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ /*
+ * Reset the cursor to the position of the new extent
+ * we are about to insert as we can't trust it after
+ * the previous insert.
+ */
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ /* new middle extent - newext */
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_CONTIG:
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * These cases are all impossible.
+ */
+ ASSERT(0);
+ }
+
+ /* update reverse mappings */
+ xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
+
+ /* convert to a btree if necessary */
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
+ &tmp_logflags, whichfork);
+ *logflagsp |= tmp_logflags;
+ if (error)
+ goto done;
+ }
+
+ /* clear out the allocated field, done with it now in any case. */
+ if (cur) {
+ cur->bc_ino.allocated = 0;
+ *curp = cur;
+ }
+
+ xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
+done:
+ *logflagsp |= rval;
+ return error;
+#undef LEFT
+#undef RIGHT
+#undef PREV
+}
+
+/*
+ * Convert a hole to a delayed allocation.
+ */
+STATIC void
+xfs_bmap_add_extent_hole_delay(
+ xfs_inode_t *ip, /* incore inode pointer */
+ int whichfork,
+ struct xfs_iext_cursor *icur,
+ xfs_bmbt_irec_t *new) /* new data to add to file extents */
+{
+ struct xfs_ifork *ifp; /* inode fork pointer */
+ xfs_bmbt_irec_t left; /* left neighbor extent entry */
+ xfs_filblks_t newlen=0; /* new indirect size */
+ xfs_filblks_t oldlen=0; /* old indirect size */
+ xfs_bmbt_irec_t right; /* right neighbor extent entry */
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
+ xfs_filblks_t temp; /* temp for indirect calculations */
+
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ ASSERT(isnullstartblock(new->br_startblock));
+
+ /*
+ * Check and set flags if this segment has a left neighbor
+ */
+ if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
+ state |= BMAP_LEFT_VALID;
+ if (isnullstartblock(left.br_startblock))
+ state |= BMAP_LEFT_DELAY;
+ }
+
+ /*
+ * Check and set flags if the current (right) segment exists.
+ * If it doesn't exist, we're converting the hole at end-of-file.
+ */
+ if (xfs_iext_get_extent(ifp, icur, &right)) {
+ state |= BMAP_RIGHT_VALID;
+ if (isnullstartblock(right.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
+ }
+
+ /*
+ * Set contiguity flags on the left and right neighbors.
+ * Don't let extents get too large, even if the pieces are contiguous.
+ */
+ if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
+ left.br_startoff + left.br_blockcount == new->br_startoff &&
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
+ new->br_startoff + new->br_blockcount == right.br_startoff &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ (!(state & BMAP_LEFT_CONTIG) ||
+ (left.br_blockcount + new->br_blockcount +
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
+ state |= BMAP_RIGHT_CONTIG;
+
+ /*
+ * Switch out based on the contiguity flags.
+ */
+ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with delayed allocations
+ * on the left and on the right.
+ * Merge all three into a single extent record.
+ */
+ temp = left.br_blockcount + new->br_blockcount +
+ right.br_blockcount;
+
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
+ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ oldlen);
+ left.br_startblock = nullstartblock(newlen);
+ left.br_blockcount = temp;
+
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
+ break;
+
+ case BMAP_LEFT_CONTIG:
+ /*
+ * New allocation is contiguous with a delayed allocation
+ * on the left.
+ * Merge the new allocation with the left neighbor.
+ */
+ temp = left.br_blockcount + new->br_blockcount;
+
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock);
+ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ oldlen);
+ left.br_blockcount = temp;
+ left.br_startblock = nullstartblock(newlen);
+
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
+ break;
+
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with a delayed allocation
+ * on the right.
+ * Merge the new allocation with the right neighbor.
+ */
+ temp = new->br_blockcount + right.br_blockcount;
+ oldlen = startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
+ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ oldlen);
+ right.br_startoff = new->br_startoff;
+ right.br_startblock = nullstartblock(newlen);
+ right.br_blockcount = temp;
+ xfs_iext_update_extent(ip, state, icur, &right);
+ break;
+
+ case 0:
+ /*
+ * New allocation is not contiguous with another
+ * delayed allocation.
+ * Insert a new entry.
+ */
+ oldlen = newlen = 0;
+ xfs_iext_insert(ip, icur, new, state);
+ break;
+ }
+ if (oldlen != newlen) {
+ ASSERT(oldlen > newlen);
+ xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen),
+ false);
+ /*
+ * Nothing to do for disk quota accounting here.
+ */
+ xfs_mod_delalloc(ip->i_mount, (int64_t)newlen - oldlen);
+ }
+}
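+
+/*
+ * Illustrative sketch, not part of this patch: when delalloc extents are
+ * merged above, the indirect-block reservation for the merged extent is
+ * capped at the sum of the old reservations, and any surplus is returned to
+ * the free-block counter.  The fragment below restates that bookkeeping;
+ * example_worst_indlen() is a hypothetical stand-in for the real worst-case
+ * estimate and is only declared here.
+ */
+#if 0	/* example only, never compiled */
+static unsigned long long example_worst_indlen(unsigned long long len);
+
+/* Returns the number of reserved blocks that can be given back. */
+static unsigned long long
+example_merge_indlen(
+	unsigned long long	left_resv,	/* old reservations */
+	unsigned long long	new_resv,
+	unsigned long long	right_resv,
+	unsigned long long	merged_len)	/* merged extent length */
+{
+	unsigned long long	oldlen = left_resv + new_resv + right_resv;
+	unsigned long long	newlen = example_worst_indlen(merged_len);
+
+	if (newlen > oldlen)
+		newlen = oldlen;	/* never reserve more than before */
+	return oldlen - newlen;		/* surplus returned to fdblocks */
+}
+#endif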
+
+/*
+ * Convert a hole to a real allocation.
+ */
+STATIC int /* error */
+xfs_bmap_add_extent_hole_real(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_iext_cursor *icur,
+ struct xfs_btree_cur **curp,
+ struct xfs_bmbt_irec *new,
+ int *logflagsp,
+ uint32_t flags)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_cur *cur = *curp;
+ int error; /* error return value */
+ int i; /* temp state */
+ xfs_bmbt_irec_t left; /* left neighbor extent entry */
+ xfs_bmbt_irec_t right; /* right neighbor extent entry */
+ int rval=0; /* return value (logging flags) */
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_bmbt_irec old;
+
+ ASSERT(!isnullstartblock(new->br_startblock));
+ ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
+
+ XFS_STATS_INC(mp, xs_add_exlist);
+
+ /*
+ * Check and set flags if this segment has a left neighbor.
+ */
+ if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
+ state |= BMAP_LEFT_VALID;
+ if (isnullstartblock(left.br_startblock))
+ state |= BMAP_LEFT_DELAY;
+ }
+
+ /*
+ * Check and set flags if this segment has a current value.
+ * Not true if we're inserting into the "hole" at eof.
+ */
+ if (xfs_iext_get_extent(ifp, icur, &right)) {
+ state |= BMAP_RIGHT_VALID;
+ if (isnullstartblock(right.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
+ }
+
+ /*
+ * We're inserting a real allocation between "left" and "right".
+ * Set the contiguity flags. Don't let extents get too large.
+ */
+ if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+ left.br_startoff + left.br_blockcount == new->br_startoff &&
+ left.br_startblock + left.br_blockcount == new->br_startblock &&
+ left.br_state == new->br_state &&
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+ new->br_startoff + new->br_blockcount == right.br_startoff &&
+ new->br_startblock + new->br_blockcount == right.br_startblock &&
+ new->br_state == right.br_state &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ (!(state & BMAP_LEFT_CONTIG) ||
+ left.br_blockcount + new->br_blockcount +
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))
+ state |= BMAP_RIGHT_CONTIG;
+
+ error = 0;
+ /*
+ * Select which case we're in here, and implement it.
+ */
+ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with real allocations on the
+ * left and on the right.
+ * Merge all three into a single extent record.
+ */
+ left.br_blockcount += new->br_blockcount + right.br_blockcount;
+
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
+ ifp->if_nextents--;
+
+ if (cur == NULL) {
+ rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ } else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, &right, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &left);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_CONTIG:
+ /*
+ * New allocation is contiguous with a real allocation
+ * on the left.
+ * Merge the new allocation with the left neighbor.
+ */
+ old = left;
+ left.br_blockcount += new->br_blockcount;
+
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
+
+ if (cur == NULL) {
+ rval = xfs_ilog_fext(whichfork);
+ } else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &left);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with a real allocation
+ * on the right.
+ * Merge the new allocation with the right neighbor.
+ */
+ old = right;
+
+ right.br_startoff = new->br_startoff;
+ right.br_startblock = new->br_startblock;
+ right.br_blockcount += new->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, &right);
+
+ if (cur == NULL) {
+ rval = xfs_ilog_fext(whichfork);
+ } else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &right);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case 0:
+ /*
+ * New allocation is not contiguous with another
+ * real allocation.
+ * Insert a new entry.
+ */
+ xfs_iext_insert(ip, icur, new, state);
+ ifp->if_nextents++;
+
+ if (cur == NULL) {
+ rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ } else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+ break;
+ }
+
+ /* add reverse mapping unless caller opted out */
+ if (!(flags & XFS_BMAPI_NORMAP))
+ xfs_rmap_map_extent(tp, ip, whichfork, new);
+
+ /* convert to a btree if necessary */
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, curp, 0,
+ &tmp_logflags, whichfork);
+ *logflagsp |= tmp_logflags;
+ cur = *curp;
+ if (error)
+ goto done;
+ }
+
+ /* clear out the allocated field, done with it now in any case. */
+ if (cur)
+ cur->bc_ino.allocated = 0;
+
+ xfs_bmap_check_leaf_extents(cur, ip, whichfork);
+done:
+ *logflagsp |= rval;
+ return error;
+}
+
+/*
+ * Functions used in the extent read, allocate and remove paths
+ */
+
+/*
+ * Adjust the size of the new extent based on i_extsize and rt extsize.
+ */
+int
+xfs_bmap_extsize_align(
+ xfs_mount_t *mp,
+ xfs_bmbt_irec_t *gotp, /* next extent pointer */
+ xfs_bmbt_irec_t *prevp, /* previous extent pointer */
+ xfs_extlen_t extsz, /* align to this extent size */
+ int rt, /* is this a realtime inode? */
+ int eof, /* is extent at end-of-file? */
+ int delay, /* creating delalloc extent? */
+ int convert, /* overwriting unwritten extent? */
+ xfs_fileoff_t *offp, /* in/out: aligned offset */
+ xfs_extlen_t *lenp) /* in/out: aligned length */
+{
+ xfs_fileoff_t orig_off; /* original offset */
+ xfs_extlen_t orig_alen; /* original length */
+ xfs_fileoff_t orig_end; /* original off+len */
+ xfs_fileoff_t nexto; /* next file offset */
+ xfs_fileoff_t prevo; /* previous file offset */
+ xfs_fileoff_t align_off; /* temp for offset */
+ xfs_extlen_t align_alen; /* temp for length */
+ xfs_extlen_t temp; /* temp for calculations */
+
+ if (convert)
+ return 0;
+
+ orig_off = align_off = *offp;
+ orig_alen = align_alen = *lenp;
+ orig_end = orig_off + orig_alen;
+
+ /*
+ * If this request overlaps an existing extent, then don't
+ * attempt to perform any additional alignment.
+ */
+ if (!delay && !eof &&
+ (orig_off >= gotp->br_startoff) &&
+ (orig_end <= gotp->br_startoff + gotp->br_blockcount)) {
+ return 0;
+ }
+
+ /*
+ * If the file offset is unaligned vs. the extent size
+ * we need to align it. This will be possible unless
+ * the file was previously written with a kernel that didn't
+ * perform this alignment, or if a truncate shot us in the
+ * foot.
+ */
+ div_u64_rem(orig_off, extsz, &temp);
+ if (temp) {
+ align_alen += temp;
+ align_off -= temp;
+ }
+
+ /* Same adjustment for the end of the requested area. */
+ temp = (align_alen % extsz);
+ if (temp)
+ align_alen += extsz - temp;
+
+ /*
+ * For large extent hint sizes, the aligned extent might be larger than
+ * XFS_MAX_BMBT_EXTLEN. In that case, reduce the size by an extsz so
+ * that it pulls the length back under XFS_MAX_BMBT_EXTLEN. The outer
+ * allocation loops handle short allocation just fine, so it is safe to
+ * do this. We only want to do it when we are forced to, though, because
+ * it means more allocation operations are required.
+ */
+ while (align_alen > XFS_MAX_BMBT_EXTLEN)
+ align_alen -= extsz;
+ ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN);
+
+ /*
+ * If the previous block overlaps with this proposed allocation
+ * then move the start forward without adjusting the length.
+ */
+ if (prevp->br_startoff != NULLFILEOFF) {
+ if (prevp->br_startblock == HOLESTARTBLOCK)
+ prevo = prevp->br_startoff;
+ else
+ prevo = prevp->br_startoff + prevp->br_blockcount;
+ } else
+ prevo = 0;
+ if (align_off != orig_off && align_off < prevo)
+ align_off = prevo;
+ /*
+ * If the next block overlaps with this proposed allocation
+ * then move the start back without adjusting the length,
+ * but not before offset 0.
+ * This may of course make the start overlap the previous block,
+ * and if we hit the offset 0 limit then the next block
+ * can still overlap too.
+ */
+ if (!eof && gotp->br_startoff != NULLFILEOFF) {
+ if ((delay && gotp->br_startblock == HOLESTARTBLOCK) ||
+ (!delay && gotp->br_startblock == DELAYSTARTBLOCK))
+ nexto = gotp->br_startoff + gotp->br_blockcount;
+ else
+ nexto = gotp->br_startoff;
+ } else
+ nexto = NULLFILEOFF;
+ if (!eof &&
+ align_off + align_alen != orig_end &&
+ align_off + align_alen > nexto)
+ align_off = nexto > align_alen ? nexto - align_alen : 0;
+ /*
+ * If we're now overlapping the next or previous extent that
+ * means we can't fit an extsz piece in this hole. Just move
+ * the start forward to the first valid spot and set
+ * the length so we hit the end.
+ */
+ if (align_off != orig_off && align_off < prevo)
+ align_off = prevo;
+ if (align_off + align_alen != orig_end &&
+ align_off + align_alen > nexto &&
+ nexto != NULLFILEOFF) {
+ ASSERT(nexto > prevo);
+ align_alen = nexto - align_off;
+ }
+
+ /*
+ * If realtime, and the result isn't a multiple of the realtime
+ * extent size we need to remove blocks until it is.
+ */
+ if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) {
+ /*
+ * We're not covering the original request, or
+ * we won't be able to once we fix the length.
+ */
+ if (orig_off < align_off ||
+ orig_end > align_off + align_alen ||
+ align_alen - temp < orig_alen)
+ return -EINVAL;
+ /*
+ * Try to fix it by moving the start up.
+ */
+ if (align_off + temp <= orig_off) {
+ align_alen -= temp;
+ align_off += temp;
+ }
+ /*
+ * Try to fix it by moving the end in.
+ */
+ else if (align_off + align_alen - temp >= orig_end)
+ align_alen -= temp;
+ /*
+ * Set the start to the minimum then trim the length.
+ */
+ else {
+ align_alen -= orig_off - align_off;
+ align_off = orig_off;
+ align_alen -= align_alen % mp->m_sb.sb_rextsize;
+ }
+ /*
+ * Result doesn't cover the request, fail it.
+ */
+ if (orig_off < align_off || orig_end > align_off + align_alen)
+ return -EINVAL;
+ } else {
+ ASSERT(orig_off >= align_off);
+ /* see XFS_MAX_BMBT_EXTLEN handling above */
+ ASSERT(orig_end <= align_off + align_alen ||
+ align_alen + extsz > XFS_MAX_BMBT_EXTLEN);
+ }
+
+#ifdef DEBUG
+ if (!eof && gotp->br_startoff != NULLFILEOFF)
+ ASSERT(align_off + align_alen <= gotp->br_startoff);
+ if (prevp->br_startoff != NULLFILEOFF)
+ ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount);
+#endif
+
+ *lenp = align_alen;
+ *offp = align_off;
+ return 0;
+}
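+
+/*
+ * Illustrative sketch, not part of this patch: before the neighbour and
+ * realtime fixups, the core of the alignment above is "round the start down
+ * and the end up to the extent size".  The standalone helper below restates
+ * that arithmetic with hypothetical names; extsz must be non-zero.
+ */
+#if 0	/* example only, never compiled */
+static void
+example_extsize_align(
+	unsigned long long	extsz,		/* alignment, in blocks */
+	unsigned long long	*offp,		/* in/out: start offset */
+	unsigned long long	*lenp)		/* in/out: length */
+{
+	unsigned long long	off = *offp;
+	unsigned long long	len = *lenp;
+	unsigned long long	temp;
+
+	/* pull the start back to an extsz boundary */
+	temp = off % extsz;
+	off -= temp;
+	len += temp;
+
+	/* push the end out to an extsz boundary */
+	temp = len % extsz;
+	if (temp)
+		len += extsz - temp;
+
+	*offp = off;
+	*lenp = len;
+}
+#endif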
+
+#define XFS_ALLOC_GAP_UNITS 4
+
+void
+xfs_bmap_adjacent(
+ struct xfs_bmalloca *ap) /* bmap alloc argument struct */
+{
+ xfs_fsblock_t adjust; /* adjustment to block numbers */
+ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
+ xfs_mount_t *mp; /* mount point structure */
+ int nullfb; /* true if ap->firstblock isn't set */
+ int rt; /* true if inode is realtime */
+
+#define ISVALID(x,y) \
+ (rt ? \
+ (x) < mp->m_sb.sb_rblocks : \
+ XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \
+ XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
+ XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
+
+ mp = ap->ip->i_mount;
+ nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
+ rt = XFS_IS_REALTIME_INODE(ap->ip) &&
+ (ap->datatype & XFS_ALLOC_USERDATA);
+ fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp,
+ ap->tp->t_firstblock);
+ /*
+ * If allocating at eof, and there's a previous real block,
+ * try to use its last block as our starting point.
+ */
+ if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
+ !isnullstartblock(ap->prev.br_startblock) &&
+ ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
+ ap->prev.br_startblock)) {
+ ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
+ /*
+ * Adjust for the gap between prevp and us.
+ */
+ adjust = ap->offset -
+ (ap->prev.br_startoff + ap->prev.br_blockcount);
+ if (adjust &&
+ ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
+ ap->blkno += adjust;
+ }
+ /*
+ * If not at eof, then compare the two neighbor blocks.
+ * Figure out whether either one gives us a good starting point,
+ * and pick the better one.
+ */
+ else if (!ap->eof) {
+ xfs_fsblock_t gotbno; /* right side block number */
+ xfs_fsblock_t gotdiff=0; /* right side difference */
+ xfs_fsblock_t prevbno; /* left side block number */
+ xfs_fsblock_t prevdiff=0; /* left side difference */
+
+ /*
+ * If there's a previous (left) block, select a requested
+ * start block based on it.
+ */
+ if (ap->prev.br_startoff != NULLFILEOFF &&
+ !isnullstartblock(ap->prev.br_startblock) &&
+ (prevbno = ap->prev.br_startblock +
+ ap->prev.br_blockcount) &&
+ ISVALID(prevbno, ap->prev.br_startblock)) {
+ /*
+ * Calculate gap to end of previous block.
+ */
+ adjust = prevdiff = ap->offset -
+ (ap->prev.br_startoff +
+ ap->prev.br_blockcount);
+ /*
+ * Figure the startblock based on the previous block's
+ * end and the gap size.
+ * Heuristic!
+ * If the gap is large relative to the piece we're
+ * allocating, or using it gives us an invalid block
+ * number, then just use the end of the previous block.
+ */
+ if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
+ ISVALID(prevbno + prevdiff,
+ ap->prev.br_startblock))
+ prevbno += adjust;
+ else
+ prevdiff += adjust;
+ /*
+ * If the firstblock forbids it, can't use it,
+ * must use default.
+ */
+ if (!rt && !nullfb &&
+ XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno)
+ prevbno = NULLFSBLOCK;
+ }
+ /*
+ * No previous block or can't follow it, just default.
+ */
+ else
+ prevbno = NULLFSBLOCK;
+ /*
+ * If there's a following (right) block, select a requested
+ * start block based on it.
+ */
+ if (!isnullstartblock(ap->got.br_startblock)) {
+ /*
+ * Calculate gap to start of next block.
+ */
+ adjust = gotdiff = ap->got.br_startoff - ap->offset;
+ /*
+ * Figure the startblock based on the next block's
+ * start and the gap size.
+ */
+ gotbno = ap->got.br_startblock;
+ /*
+ * Heuristic!
+ * If the gap is large relative to the piece we're
+ * allocating, or using it gives us an invalid block
+ * number, then just use the start of the next block
+ * offset by our length.
+ */
+ if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
+ ISVALID(gotbno - gotdiff, gotbno))
+ gotbno -= adjust;
+ else if (ISVALID(gotbno - ap->length, gotbno)) {
+ gotbno -= ap->length;
+ gotdiff += adjust - ap->length;
+ } else
+ gotdiff += adjust;
+ /*
+ * If the firstblock forbids it, can't use it,
+ * must use default.
+ */
+ if (!rt && !nullfb &&
+ XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno)
+ gotbno = NULLFSBLOCK;
+ }
+ /*
+ * No next block, just default.
+ */
+ else
+ gotbno = NULLFSBLOCK;
+ /*
+ * If both valid, pick the better one, else the only good
+ * one, else ap->blkno is already set (to 0 or the inode block).
+ */
+ if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
+ ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
+ else if (prevbno != NULLFSBLOCK)
+ ap->blkno = prevbno;
+ else if (gotbno != NULLFSBLOCK)
+ ap->blkno = gotbno;
+ }
+#undef ISVALID
+}
+
+static int
+xfs_bmap_longest_free_extent(
+ struct xfs_trans *tp,
+ xfs_agnumber_t ag,
+ xfs_extlen_t *blen,
+ int *notinit)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ xfs_extlen_t longest;
+ int error = 0;
+
+ pag = xfs_perag_get(mp, ag);
+ if (!pag->pagf_init) {
+ error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_TRYLOCK,
+ NULL);
+ if (error) {
+ /* Couldn't lock the AGF, so skip this AG. */
+ if (error == -EAGAIN) {
+ *notinit = 1;
+ error = 0;
+ }
+ goto out;
+ }
+ }
+
+ longest = xfs_alloc_longest_free_extent(pag,
+ xfs_alloc_min_freelist(mp, pag),
+ xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
+ if (*blen < longest)
+ *blen = longest;
+
+out:
+ xfs_perag_put(pag);
+ return error;
+}
+
+static void
+xfs_bmap_select_minlen(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen,
+ int notinit)
+{
+ if (notinit || *blen < ap->minlen) {
+ /*
+		 * The AGF was read with a trylock above, so some AGs may have
+		 * been skipped; there may still be space for this request.
+ */
+ args->minlen = ap->minlen;
+ } else if (*blen < args->maxlen) {
+ /*
+ * If the best seen length is less than the request length,
+ * use the best as the minimum.
+ */
+ args->minlen = *blen;
+ } else {
+ /*
+ * Otherwise we've seen an extent as big as maxlen, use that
+ * as the minimum.
+ */
+ args->minlen = args->maxlen;
+ }
+}
+
+STATIC int
+xfs_bmap_btalloc_nullfb(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ xfs_agnumber_t ag, startag;
+ int notinit = 0;
+ int error;
+
+ args->type = XFS_ALLOCTYPE_START_BNO;
+ args->total = ap->total;
+
+ startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ if (startag == NULLAGNUMBER)
+ startag = ag = 0;
+
+ while (*blen < args->maxlen) {
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+ &notinit);
+ if (error)
+ return error;
+
+ if (++ag == mp->m_sb.sb_agcount)
+ ag = 0;
+ if (ag == startag)
+ break;
+ }
+
+ xfs_bmap_select_minlen(ap, args, blen, notinit);
+ return 0;
+}
+
+STATIC int
+xfs_bmap_btalloc_filestreams(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ xfs_agnumber_t ag;
+ int notinit = 0;
+ int error;
+
+ args->type = XFS_ALLOCTYPE_NEAR_BNO;
+ args->total = ap->total;
+
+ ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ if (ag == NULLAGNUMBER)
+ ag = 0;
+
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen, &notinit);
+ if (error)
+ return error;
+
+ if (*blen < args->maxlen) {
+ error = xfs_filestream_new_ag(ap, &ag);
+ if (error)
+ return error;
+
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+ &notinit);
+ if (error)
+ return error;
+
+ }
+
+ xfs_bmap_select_minlen(ap, args, blen, notinit);
+
+ /*
+ * Set the failure fallback case to look in the selected AG as stream
+ * may have moved.
+ */
+ ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
+ return 0;
+}
+
+/* Update all inode and quota accounting for the allocation we just did. */
+static void
+xfs_bmap_btalloc_accounting(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args)
+{
+ if (ap->flags & XFS_BMAPI_COWFORK) {
+ /*
+ * COW fork blocks are in-core only and thus are treated as
+ * in-core quota reservation (like delalloc blocks) even when
+ * converted to real blocks. The quota reservation is not
+ * accounted to disk until blocks are remapped to the data
+ * fork. So if these blocks were previously delalloc, we
+ * already have quota reservation and there's nothing to do
+ * yet.
+ */
+ if (ap->wasdel) {
+ xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
+ return;
+ }
+
+ /*
+ * Otherwise, we've allocated blocks in a hole. The transaction
+ * has acquired in-core quota reservation for this extent.
+ * Rather than account these as real blocks, however, we reduce
+ * the transaction quota reservation based on the allocation.
+ * This essentially transfers the transaction quota reservation
+ * to that of a delalloc extent.
+ */
+ ap->ip->i_delayed_blks += args->len;
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip, XFS_TRANS_DQ_RES_BLKS,
+ -(long)args->len);
+ return;
+ }
+
+ /* data/attr fork only */
+ ap->ip->i_nblocks += args->len;
+ xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+ if (ap->wasdel) {
+ ap->ip->i_delayed_blks -= args->len;
+ xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
+ }
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+ ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT,
+ args->len);
+}
+
+static int
+xfs_bmap_compute_alignments(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args)
+{
+ struct xfs_mount *mp = args->mp;
+ xfs_extlen_t align = 0; /* minimum allocation alignment */
+ int stripe_align = 0;
+
+ /* stripe alignment for allocation is determined by mount parameters */
+ if (mp->m_swidth && xfs_has_swalloc(mp))
+ stripe_align = mp->m_swidth;
+ else if (mp->m_dalign)
+ stripe_align = mp->m_dalign;
+
+ if (ap->flags & XFS_BMAPI_COWFORK)
+ align = xfs_get_cowextsz_hint(ap->ip);
+ else if (ap->datatype & XFS_ALLOC_USERDATA)
+ align = xfs_get_extsz_hint(ap->ip);
+ if (align) {
+ if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0,
+ ap->eof, 0, ap->conv, &ap->offset,
+ &ap->length))
+ ASSERT(0);
+ ASSERT(ap->length);
+ }
+
+ /* apply extent size hints if obtained earlier */
+ if (align) {
+ args->prod = align;
+ div_u64_rem(ap->offset, args->prod, &args->mod);
+ if (args->mod)
+ args->mod = args->prod - args->mod;
+ } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) {
+ args->prod = 1;
+ args->mod = 0;
+ } else {
+ args->prod = PAGE_SIZE >> mp->m_sb.sb_blocklog;
+ div_u64_rem(ap->offset, args->prod, &args->mod);
+ if (args->mod)
+ args->mod = args->prod - args->mod;
+ }
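+	/*
+	 * Editorial example: with an extent size hint of 16 blocks and
+	 * ap->offset = 21, args->prod = 16 and 21 % 16 = 5, so args->mod
+	 * becomes 16 - 5 = 11.  The allocator then tries to trim the
+	 * returned length so that len % prod == mod, which makes
+	 * offset + length end on a 16-block boundary in the file.
+	 */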
+
+ return stripe_align;
+}
+
+static void
+xfs_bmap_process_allocated_extent(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_fileoff_t orig_offset,
+ xfs_extlen_t orig_length)
+{
+ int nullfb;
+
+ nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
+
+ /*
+	 * Check that the allocation happened in the same or a higher AG than
+	 * the first block that was allocated.
+ */
+ ASSERT(nullfb ||
+ XFS_FSB_TO_AGNO(args->mp, ap->tp->t_firstblock) <=
+ XFS_FSB_TO_AGNO(args->mp, args->fsbno));
+
+ ap->blkno = args->fsbno;
+ if (nullfb)
+ ap->tp->t_firstblock = args->fsbno;
+ ap->length = args->len;
+ /*
+ * If the extent size hint is active, we tried to round the
+ * caller's allocation request offset down to extsz and the
+ * length up to another extsz boundary. If we found a free
+ * extent we mapped it in starting at this new offset. If the
+ * newly mapped space isn't long enough to cover any of the
+ * range of offsets that was originally requested, move the
+ * mapping up so that we can fill as much of the caller's
+ * original request as possible. Free space is apparently
+ * very fragmented so we're unlikely to be able to satisfy the
+ * hints anyway.
+ */
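+	/*
+	 * Editorial example: a request for [100, 110) that alignment widened
+	 * to [96, 112).  If only 8 blocks were found, ap->length (8) is not
+	 * greater than orig_length (10), so the mapping is moved back to
+	 * offset 100 and covers [100, 108).  If 12 blocks were found, the
+	 * mapping at 96 would end at 108, short of 110, so it is shifted up
+	 * to offset 98 and covers [98, 110) instead.
+	 */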
+ if (ap->length <= orig_length)
+ ap->offset = orig_offset;
+ else if (ap->offset + ap->length < orig_offset + orig_length)
+ ap->offset = orig_offset + orig_length - ap->length;
+ xfs_bmap_btalloc_accounting(ap, args);
+}
+
+#ifdef DEBUG
+static int
+xfs_bmap_exact_minlen_extent_alloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp };
+ xfs_fileoff_t orig_offset;
+ xfs_extlen_t orig_length;
+ int error;
+
+ ASSERT(ap->length);
+
+ if (ap->minlen != 1) {
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ return 0;
+ }
+
+ orig_offset = ap->offset;
+ orig_length = ap->length;
+
+ args.alloc_minlen_only = 1;
+
+ xfs_bmap_compute_alignments(ap, &args);
+
+ if (ap->tp->t_firstblock == NULLFSBLOCK) {
+ /*
+ * Unlike the longest extent available in an AG, we don't track
+ * the length of an AG's shortest extent.
+ * XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT is a debug only knob and
+ * hence we can afford to start traversing from the 0th AG since
+ * we need not be concerned about a drop in performance in
+ * "debug only" code paths.
+ */
+ ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0);
+ } else {
+ ap->blkno = ap->tp->t_firstblock;
+ }
+
+ args.fsbno = ap->blkno;
+ args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ args.minlen = args.maxlen = ap->minlen;
+ args.total = ap->total;
+
+ args.alignment = 1;
+ args.minalignslop = 0;
+
+ args.minleft = ap->minleft;
+ args.wasdel = ap->wasdel;
+ args.resv = XFS_AG_RESV_NONE;
+ args.datatype = ap->datatype;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+
+ if (args.fsbno != NULLFSBLOCK) {
+ xfs_bmap_process_allocated_extent(ap, &args, orig_offset,
+ orig_length);
+ } else {
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ }
+
+ return 0;
+}
+#else
+
+#define xfs_bmap_exact_minlen_extent_alloc(bma) (-EFSCORRUPTED)
+
+#endif
+
+STATIC int
+xfs_bmap_btalloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp };
+ xfs_alloctype_t atype = 0;
+ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
+ xfs_agnumber_t ag;
+ xfs_fileoff_t orig_offset;
+ xfs_extlen_t orig_length;
+ xfs_extlen_t blen;
+ xfs_extlen_t nextminlen = 0;
+ int nullfb; /* true if ap->firstblock isn't set */
+ int isaligned;
+ int tryagain;
+ int error;
+ int stripe_align;
+
+ ASSERT(ap->length);
+ orig_offset = ap->offset;
+ orig_length = ap->length;
+
+ stripe_align = xfs_bmap_compute_alignments(ap, &args);
+
+ nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
+ fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp,
+ ap->tp->t_firstblock);
+ if (nullfb) {
+ if ((ap->datatype & XFS_ALLOC_USERDATA) &&
+ xfs_inode_is_filestream(ap->ip)) {
+ ag = xfs_filestream_lookup_ag(ap->ip);
+ ag = (ag != NULLAGNUMBER) ? ag : 0;
+ ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
+ } else {
+ ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
+ }
+ } else
+ ap->blkno = ap->tp->t_firstblock;
+
+ xfs_bmap_adjacent(ap);
+
+ /*
+ * If allowed, use ap->blkno; otherwise must use firstblock since
+ * it's in the right allocation group.
+ */
+	if (!nullfb && XFS_FSB_TO_AGNO(mp, ap->blkno) != fb_agno)
+		ap->blkno = ap->tp->t_firstblock;
+ /*
+ * Normal allocation, done through xfs_alloc_vextent.
+ */
+ tryagain = isaligned = 0;
+ args.fsbno = ap->blkno;
+ args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
+
+ /* Trim the allocation back to the maximum an AG can fit. */
+ args.maxlen = min(ap->length, mp->m_ag_max_usable);
+ blen = 0;
+ if (nullfb) {
+ /*
+ * Search for an allocation group with a single extent large
+ * enough for the request. If one isn't found, then adjust
+ * the minimum allocation size to the largest space found.
+ */
+ if ((ap->datatype & XFS_ALLOC_USERDATA) &&
+ xfs_inode_is_filestream(ap->ip))
+ error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
+ else
+ error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
+ if (error)
+ return error;
+ } else if (ap->tp->t_flags & XFS_TRANS_LOWMODE) {
+ if (xfs_inode_is_filestream(ap->ip))
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ else
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.total = args.minlen = ap->minlen;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.total = ap->total;
+ args.minlen = ap->minlen;
+ }
+
+ /*
+ * If we are not low on available data blocks, and the underlying
+ * logical volume manager is a stripe, and the file offset is zero then
+ * try to allocate data blocks on stripe unit boundary. NOTE: ap->aeof
+ * is only set if the allocation length is >= the stripe unit and the
+ * allocation offset is at the end of file.
+ */
+ if (!(ap->tp->t_flags & XFS_TRANS_LOWMODE) && ap->aeof) {
+ if (!ap->offset) {
+ args.alignment = stripe_align;
+ atype = args.type;
+ isaligned = 1;
+ /*
+ * Adjust minlen to try and preserve alignment if we
+ * can't guarantee an aligned maxlen extent.
+ */
+ if (blen > args.alignment &&
+ blen <= args.maxlen + args.alignment)
+ args.minlen = blen - args.alignment;
+ args.minalignslop = 0;
+ } else {
+ /*
+ * First try an exact bno allocation.
+ * If it fails then do a near or start bno
+ * allocation with alignment turned on.
+ */
+ atype = args.type;
+ tryagain = 1;
+ args.type = XFS_ALLOCTYPE_THIS_BNO;
+ args.alignment = 1;
+ /*
+ * Compute the minlen+alignment for the
+ * next case. Set slop so that the value
+ * of minlen+alignment+slop doesn't go up
+ * between the calls.
+ */
+ if (blen > stripe_align && blen <= args.maxlen)
+ nextminlen = blen - stripe_align;
+ else
+ nextminlen = args.minlen;
+ if (nextminlen + stripe_align > args.minlen + 1)
+ args.minalignslop =
+ nextminlen + stripe_align -
+ args.minlen - 1;
+ else
+ args.minalignslop = 0;
+ }
+ } else {
+ args.alignment = 1;
+ args.minalignslop = 0;
+ }
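+	/*
+	 * Editorial example for the exact-bno path above: with
+	 * stripe_align = 16, args.minlen = 5 and blen = 0, nextminlen is 5
+	 * and 5 + 16 > 5 + 1, so minalignslop = 5 + 16 - 5 - 1 = 15.  The
+	 * unaligned THIS_BNO attempt thus accounts for enough extra space
+	 * that the aligned retry further down (minlen = nextminlen,
+	 * alignment = stripe_align) never needs more than the first attempt
+	 * reserved.
+	 */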
+ args.minleft = ap->minleft;
+ args.wasdel = ap->wasdel;
+ args.resv = XFS_AG_RESV_NONE;
+ args.datatype = ap->datatype;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+
+ if (tryagain && args.fsbno == NULLFSBLOCK) {
+ /*
+ * Exact allocation failed. Now try with alignment
+ * turned on.
+ */
+ args.type = atype;
+ args.fsbno = ap->blkno;
+ args.alignment = stripe_align;
+ args.minlen = nextminlen;
+ args.minalignslop = 0;
+ isaligned = 1;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+ if (isaligned && args.fsbno == NULLFSBLOCK) {
+ /*
+ * allocation failed, so turn off alignment and
+ * try again.
+ */
+ args.type = atype;
+ args.fsbno = ap->blkno;
+ args.alignment = 0;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+ if (args.fsbno == NULLFSBLOCK && nullfb &&
+ args.minlen > ap->minlen) {
+ args.minlen = ap->minlen;
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = ap->blkno;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+ if (args.fsbno == NULLFSBLOCK && nullfb) {
+ args.fsbno = 0;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ args.total = ap->minlen;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ ap->tp->t_flags |= XFS_TRANS_LOWMODE;
+ }
+
+ if (args.fsbno != NULLFSBLOCK) {
+ xfs_bmap_process_allocated_extent(ap, &args, orig_offset,
+ orig_length);
+ } else {
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ }
+ return 0;
+}
+
+/* Trim extent to fit a logical block range. */
+void
+xfs_trim_extent(
+ struct xfs_bmbt_irec *irec,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len)
+{
+ xfs_fileoff_t distance;
+ xfs_fileoff_t end = bno + len;
+
+ if (irec->br_startoff + irec->br_blockcount <= bno ||
+ irec->br_startoff >= end) {
+ irec->br_blockcount = 0;
+ return;
+ }
+
+ if (irec->br_startoff < bno) {
+ distance = bno - irec->br_startoff;
+ if (isnullstartblock(irec->br_startblock))
+ irec->br_startblock = DELAYSTARTBLOCK;
+ if (irec->br_startblock != DELAYSTARTBLOCK &&
+ irec->br_startblock != HOLESTARTBLOCK)
+ irec->br_startblock += distance;
+ irec->br_startoff += distance;
+ irec->br_blockcount -= distance;
+ }
+
+ if (end < irec->br_startoff + irec->br_blockcount) {
+ distance = irec->br_startoff + irec->br_blockcount - end;
+ irec->br_blockcount -= distance;
+ }
+}
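+/*
+ * Editorial example for xfs_trim_extent() above: trimming an extent that
+ * covers file blocks [10, 20) to bno = 12, len = 5 first advances
+ * br_startoff and br_startblock by 2 and shrinks br_blockcount to 8, then
+ * clips another 3 blocks off the tail, leaving a mapping for [12, 17).
+ */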
+
+/*
+ * Trim the returned map to the required bounds
+ */
+STATIC void
+xfs_bmapi_trim_map(
+ struct xfs_bmbt_irec *mval,
+ struct xfs_bmbt_irec *got,
+ xfs_fileoff_t *bno,
+ xfs_filblks_t len,
+ xfs_fileoff_t obno,
+ xfs_fileoff_t end,
+ int n,
+ uint32_t flags)
+{
+ if ((flags & XFS_BMAPI_ENTIRE) ||
+ got->br_startoff + got->br_blockcount <= obno) {
+ *mval = *got;
+ if (isnullstartblock(got->br_startblock))
+ mval->br_startblock = DELAYSTARTBLOCK;
+ return;
+ }
+
+ if (obno > *bno)
+ *bno = obno;
+ ASSERT((*bno >= obno) || (n == 0));
+ ASSERT(*bno < end);
+ mval->br_startoff = *bno;
+ if (isnullstartblock(got->br_startblock))
+ mval->br_startblock = DELAYSTARTBLOCK;
+ else
+ mval->br_startblock = got->br_startblock +
+ (*bno - got->br_startoff);
+ /*
+	 * Return the minimum of what we got and what we asked for as
+ * the length. We can use the len variable here because it is
+ * modified below and we could have been there before coming
+ * here if the first part of the allocation didn't overlap what
+ * was asked for.
+ */
+ mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno,
+ got->br_blockcount - (*bno - got->br_startoff));
+ mval->br_state = got->br_state;
+ ASSERT(mval->br_blockcount <= len);
+ return;
+}
+
+/*
+ * Update and validate the extent map to return
+ */
+STATIC void
+xfs_bmapi_update_map(
+ struct xfs_bmbt_irec **map,
+ xfs_fileoff_t *bno,
+ xfs_filblks_t *len,
+ xfs_fileoff_t obno,
+ xfs_fileoff_t end,
+ int *n,
+ uint32_t flags)
+{
+ xfs_bmbt_irec_t *mval = *map;
+
+ ASSERT((flags & XFS_BMAPI_ENTIRE) ||
+ ((mval->br_startoff + mval->br_blockcount) <= end));
+ ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) ||
+ (mval->br_startoff < obno));
+
+ *bno = mval->br_startoff + mval->br_blockcount;
+ *len = end - *bno;
+ if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) {
+ /* update previous map with new information */
+ ASSERT(mval->br_startblock == mval[-1].br_startblock);
+ ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
+ ASSERT(mval->br_state == mval[-1].br_state);
+ mval[-1].br_blockcount = mval->br_blockcount;
+ mval[-1].br_state = mval->br_state;
+ } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
+ mval[-1].br_startblock != DELAYSTARTBLOCK &&
+ mval[-1].br_startblock != HOLESTARTBLOCK &&
+ mval->br_startblock == mval[-1].br_startblock +
+ mval[-1].br_blockcount &&
+ mval[-1].br_state == mval->br_state) {
+ ASSERT(mval->br_startoff ==
+ mval[-1].br_startoff + mval[-1].br_blockcount);
+ mval[-1].br_blockcount += mval->br_blockcount;
+ } else if (*n > 0 &&
+ mval->br_startblock == DELAYSTARTBLOCK &&
+ mval[-1].br_startblock == DELAYSTARTBLOCK &&
+ mval->br_startoff ==
+ mval[-1].br_startoff + mval[-1].br_blockcount) {
+ mval[-1].br_blockcount += mval->br_blockcount;
+ mval[-1].br_state = mval->br_state;
+ } else if (!((*n == 0) &&
+ ((mval->br_startoff + mval->br_blockcount) <=
+ obno))) {
+ mval++;
+ (*n)++;
+ }
+ *map = mval;
+}
+
+/*
+ * Map file blocks to filesystem blocks without allocation.
+ */
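+/*
+ * Hypothetical caller sketch (editorial): with the inode locked shared,
+ * something like
+ *
+ *	struct xfs_bmbt_irec	map[4];
+ *	int			nmap = 4;
+ *
+ *	error = xfs_bmapi_read(ip, offset_fsb, count_fsb, map, &nmap, 0);
+ *
+ * returns up to four mappings covering [offset_fsb, offset_fsb + count_fsb);
+ * holes come back as records with br_startblock == HOLESTARTBLOCK, delalloc
+ * ranges as DELAYSTARTBLOCK, and *nmap is updated to the number of records
+ * actually filled in.
+ */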
+int
+xfs_bmapi_read(
+ struct xfs_inode *ip,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ struct xfs_bmbt_irec *mval,
+ int *nmap,
+ uint32_t flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int whichfork = xfs_bmapi_whichfork(flags);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ xfs_fileoff_t obno;
+ xfs_fileoff_t end;
+ struct xfs_iext_cursor icur;
+ int error;
+ bool eof = false;
+ int n = 0;
+
+ ASSERT(*nmap >= 1);
+ ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE)));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
+
+ if (WARN_ON_ONCE(!ifp))
+ return -EFSCORRUPTED;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT))
+ return -EFSCORRUPTED;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ XFS_STATS_INC(mp, xs_blk_mapr);
+
+ error = xfs_iread_extents(NULL, ip, whichfork);
+ if (error)
+ return error;
+
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got))
+ eof = true;
+ end = bno + len;
+ obno = bno;
+
+ while (bno < end && n < *nmap) {
+ /* Reading past eof, act as though there's a hole up to end. */
+ if (eof)
+ got.br_startoff = end;
+ if (got.br_startoff > bno) {
+ /* Reading in a hole. */
+ mval->br_startoff = bno;
+ mval->br_startblock = HOLESTARTBLOCK;
+ mval->br_blockcount =
+ XFS_FILBLKS_MIN(len, got.br_startoff - bno);
+ mval->br_state = XFS_EXT_NORM;
+ bno += mval->br_blockcount;
+ len -= mval->br_blockcount;
+ mval++;
+ n++;
+ continue;
+ }
+
+ /* set up the extent map to return. */
+ xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+ xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+ /* If we're done, stop now. */
+ if (bno >= end || n >= *nmap)
+ break;
+
+ /* Else go on to the next record. */
+ if (!xfs_iext_next_extent(ifp, &icur, &got))
+ eof = true;
+ }
+ *nmap = n;
+ return 0;
+}
+
+/*
+ * Add a delayed allocation extent to an inode. Blocks are reserved from the
+ * global pool and the extent inserted into the inode in-core extent tree.
+ *
+ * On entry, got refers to the first extent beyond the offset of the extent to
+ * allocate or eof is specified if no such extent exists. On return, got refers
+ * to the extent record that was inserted to the inode fork.
+ *
+ * Note that the allocated extent may have been merged with contiguous extents
+ * during insertion into the inode fork. Thus, got does not reflect the current
+ * state of the inode fork on return. If necessary, the caller can use lastx to
+ * look up the updated record in the inode fork.
+ */
+int
+xfs_bmapi_reserve_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t off,
+ xfs_filblks_t len,
+ xfs_filblks_t prealloc,
+ struct xfs_bmbt_irec *got,
+ struct xfs_iext_cursor *icur,
+ int eof)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ xfs_extlen_t alen;
+ xfs_extlen_t indlen;
+ int error;
+ xfs_fileoff_t aoff = off;
+
+ /*
+ * Cap the alloc length. Keep track of prealloc so we know whether to
+ * tag the inode before we return.
+ */
+ alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
+ if (!eof)
+ alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
+ if (prealloc && alen >= len)
+ prealloc = alen - len;
+
+ /* Figure out the extent size, adjust alen */
+ if (whichfork == XFS_COW_FORK) {
+ struct xfs_bmbt_irec prev;
+ xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
+
+ if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
+ prev.br_startoff = NULLFILEOFF;
+
+ error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
+ 1, 0, &aoff, &alen);
+ ASSERT(!error);
+ }
+
+ /*
+ * Make a transaction-less quota reservation for delayed allocation
+	 * blocks. This number gets adjusted later. If the reservation fails
+	 * we can return immediately, since nothing has been allocated yet.
+ */
+ error = xfs_quota_reserve_blkres(ip, alen);
+ if (error)
+ return error;
+
+ /*
+ * Split changing sb for alen and indlen since they could be coming
+ * from different places.
+ */
+ indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
+ ASSERT(indlen > 0);
+
+ error = xfs_mod_fdblocks(mp, -((int64_t)alen), false);
+ if (error)
+ goto out_unreserve_quota;
+
+ error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false);
+ if (error)
+ goto out_unreserve_blocks;
+
+ ip->i_delayed_blks += alen;
+ xfs_mod_delalloc(ip->i_mount, alen + indlen);
+
+ got->br_startoff = aoff;
+ got->br_startblock = nullstartblock(indlen);
+ got->br_blockcount = alen;
+ got->br_state = XFS_EXT_NORM;
+
+ xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
+
+ /*
+ * Tag the inode if blocks were preallocated. Note that COW fork
+ * preallocation can occur at the start or end of the extent, even when
+ * prealloc == 0, so we must also check the aligned offset and length.
+ */
+ if (whichfork == XFS_DATA_FORK && prealloc)
+ xfs_inode_set_eofblocks_tag(ip);
+ if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
+ xfs_inode_set_cowblocks_tag(ip);
+
+ return 0;
+
+out_unreserve_blocks:
+ xfs_mod_fdblocks(mp, alen, false);
+out_unreserve_quota:
+ if (XFS_IS_QUOTA_ON(mp))
+ xfs_quota_unreserve_blkres(ip, alen);
+ return error;
+}
+
+static int
+xfs_bmap_alloc_userdata(
+ struct xfs_bmalloca *bma)
+{
+ struct xfs_mount *mp = bma->ip->i_mount;
+ int whichfork = xfs_bmapi_whichfork(bma->flags);
+ int error;
+
+ /*
+ * Set the data type being allocated. For the data fork, the first data
+ * in the file is treated differently to all other allocations. For the
+ * attribute fork, we only need to ensure the allocated range is not on
+ * the busy list.
+ */
+ bma->datatype = XFS_ALLOC_NOBUSY;
+ if (whichfork == XFS_DATA_FORK) {
+ bma->datatype |= XFS_ALLOC_USERDATA;
+ if (bma->offset == 0)
+ bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
+
+ if (mp->m_dalign && bma->length >= mp->m_dalign) {
+ error = xfs_bmap_isaeof(bma, whichfork);
+ if (error)
+ return error;
+ }
+
+ if (XFS_IS_REALTIME_INODE(bma->ip))
+ return xfs_bmap_rtalloc(bma);
+ }
+
+ if (unlikely(XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
+ return xfs_bmap_exact_minlen_extent_alloc(bma);
+
+ return xfs_bmap_btalloc(bma);
+}
+
+static int
+xfs_bmapi_allocate(
+ struct xfs_bmalloca *bma)
+{
+ struct xfs_mount *mp = bma->ip->i_mount;
+ int whichfork = xfs_bmapi_whichfork(bma->flags);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
+ int tmp_logflags = 0;
+ int error;
+
+ ASSERT(bma->length > 0);
+
+ /*
+	 * For the wasdelay case we allocate the whole delayed extent rather
+	 * than just the range asked for in this bmap call; allocating only
+	 * part of it would leave the remainder as delalloc.
+ */
+ if (bma->wasdel) {
+ bma->length = (xfs_extlen_t)bma->got.br_blockcount;
+ bma->offset = bma->got.br_startoff;
+ if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
+ bma->prev.br_startoff = NULLFILEOFF;
+ } else {
+ bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN);
+ if (!bma->eof)
+ bma->length = XFS_FILBLKS_MIN(bma->length,
+ bma->got.br_startoff - bma->offset);
+ }
+
+ if (bma->flags & XFS_BMAPI_CONTIG)
+ bma->minlen = bma->length;
+ else
+ bma->minlen = 1;
+
+ if (bma->flags & XFS_BMAPI_METADATA) {
+ if (unlikely(XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
+ error = xfs_bmap_exact_minlen_extent_alloc(bma);
+ else
+ error = xfs_bmap_btalloc(bma);
+ } else {
+ error = xfs_bmap_alloc_userdata(bma);
+ }
+ if (error || bma->blkno == NULLFSBLOCK)
+ return error;
+
+ if (bma->flags & XFS_BMAPI_ZERO) {
+ error = xfs_zero_extent(bma->ip, bma->blkno, bma->length);
+ if (error)
+ return error;
+ }
+
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE && !bma->cur)
+ bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
+ /*
+ * Bump the number of extents we've allocated
+ * in this call.
+ */
+ bma->nallocs++;
+
+ if (bma->cur)
+ bma->cur->bc_ino.flags =
+ bma->wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
+
+ bma->got.br_startoff = bma->offset;
+ bma->got.br_startblock = bma->blkno;
+ bma->got.br_blockcount = bma->length;
+ bma->got.br_state = XFS_EXT_NORM;
+
+ if (bma->flags & XFS_BMAPI_PREALLOC)
+ bma->got.br_state = XFS_EXT_UNWRITTEN;
+
+ if (bma->wasdel)
+ error = xfs_bmap_add_extent_delay_real(bma, whichfork);
+ else
+ error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
+ whichfork, &bma->icur, &bma->cur, &bma->got,
+ &bma->logflags, bma->flags);
+
+ bma->logflags |= tmp_logflags;
+ if (error)
+ return error;
+
+ /*
+ * Update our extent pointer, given that xfs_bmap_add_extent_delay_real
+ * or xfs_bmap_add_extent_hole_real might have merged it into one of
+ * the neighbouring ones.
+ */
+ xfs_iext_get_extent(ifp, &bma->icur, &bma->got);
+
+ ASSERT(bma->got.br_startoff <= bma->offset);
+ ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
+ bma->offset + bma->length);
+ ASSERT(bma->got.br_state == XFS_EXT_NORM ||
+ bma->got.br_state == XFS_EXT_UNWRITTEN);
+ return 0;
+}
+
+STATIC int
+xfs_bmapi_convert_unwritten(
+ struct xfs_bmalloca *bma,
+ struct xfs_bmbt_irec *mval,
+ xfs_filblks_t len,
+ uint32_t flags)
+{
+ int whichfork = xfs_bmapi_whichfork(flags);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
+ int tmp_logflags = 0;
+ int error;
+
+ /* check if we need to do unwritten->real conversion */
+ if (mval->br_state == XFS_EXT_UNWRITTEN &&
+ (flags & XFS_BMAPI_PREALLOC))
+ return 0;
+
+ /* check if we need to do real->unwritten conversion */
+ if (mval->br_state == XFS_EXT_NORM &&
+ (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) !=
+ (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
+ return 0;
+
+ /*
+ * Modify (by adding) the state flag, if writing.
+ */
+ ASSERT(mval->br_blockcount <= len);
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE && !bma->cur) {
+ bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
+ bma->ip, whichfork);
+ }
+ mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
+ ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+
+ /*
+ * Before insertion into the bmbt, zero the range being converted
+ * if required.
+ */
+ if (flags & XFS_BMAPI_ZERO) {
+ error = xfs_zero_extent(bma->ip, mval->br_startblock,
+ mval->br_blockcount);
+ if (error)
+ return error;
+ }
+
+ error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork,
+ &bma->icur, &bma->cur, mval, &tmp_logflags);
+ /*
+ * Log the inode core unconditionally in the unwritten extent conversion
+ * path because the conversion might not have done so (e.g., if the
+ * extent count hasn't changed). We need to make sure the inode is dirty
+ * in the transaction for the sake of fsync(), even if nothing has
+ * changed, because fsync() will not force the log for this transaction
+ * unless it sees the inode pinned.
+ *
+ * Note: If we're only converting cow fork extents, there aren't
+ * any on-disk updates to make, so we don't need to log anything.
+ */
+ if (whichfork != XFS_COW_FORK)
+ bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
+ if (error)
+ return error;
+
+ /*
+ * Update our extent pointer, given that
+ * xfs_bmap_add_extent_unwritten_real might have merged it into one
+ * of the neighbouring ones.
+ */
+ xfs_iext_get_extent(ifp, &bma->icur, &bma->got);
+
+ /*
+ * We may have combined previously unwritten space with written space,
+ * so generate another request.
+ */
+ if (mval->br_blockcount < len)
+ return -EAGAIN;
+ return 0;
+}
+
+static inline xfs_extlen_t
+xfs_bmapi_minleft(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int fork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, fork);
+
+ if (tp && tp->t_firstblock != NULLFSBLOCK)
+ return 0;
+ if (ifp->if_format != XFS_DINODE_FMT_BTREE)
+ return 1;
+ return be16_to_cpu(ifp->if_broot->bb_level) + 1;
+}
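+/*
+ * Editorial note on xfs_bmapi_minleft() above: when no AG has been committed
+ * to yet (t_firstblock unset), a btree-format fork whose root is at level 2
+ * yields a minleft of 3, i.e. roughly one free AG block per level that a
+ * worst-case bmbt split might need; an extents-format fork needs just one
+ * block to convert to btree format.
+ */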
+
+/*
+ * Log whatever the flags say, even if error. Otherwise we might miss detecting
+ * a case where the data is changed, there's an error, and it's not logged so we
+ * don't shutdown when we should. Don't bother logging extents/btree changes if
+ * we converted to the other format.
+ */
+static void
+xfs_bmapi_finish(
+ struct xfs_bmalloca *bma,
+ int whichfork,
+ int error)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
+
+ if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
+ ifp->if_format != XFS_DINODE_FMT_EXTENTS)
+ bma->logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) &&
+ ifp->if_format != XFS_DINODE_FMT_BTREE)
+ bma->logflags &= ~xfs_ilog_fbroot(whichfork);
+
+ if (bma->logflags)
+ xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags);
+ if (bma->cur)
+ xfs_btree_del_cursor(bma->cur, error);
+}
+
+/*
+ * Map file blocks to filesystem blocks, and allocate blocks or convert the
+ * extent state if necessary. Detailed behaviour is controlled by the flags
+ * parameter. Only allocates blocks from a single allocation group, to avoid
+ * locking problems.
+ */
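+/*
+ * Hypothetical caller sketch (editorial): a caller wanting real blocks
+ * behind [offset_fsb, offset_fsb + count_fsb) typically does something like
+ *
+ *	struct xfs_bmbt_irec	map;
+ *	int			nmap = 1;
+ *
+ *	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+ *			XFS_BMAPI_PREALLOC, resblks, &map, &nmap);
+ *
+ * with the inode joined to the transaction and locked exclusively; on
+ * success map describes the (possibly shorter) extent actually mapped and
+ * nmap says how many records were filled in.
+ */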
+int
+xfs_bmapi_write(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t bno, /* starting file offs. mapped */
+ xfs_filblks_t len, /* length to map in file */
+ uint32_t flags, /* XFS_BMAPI_... */
+ xfs_extlen_t total, /* total blocks needed */
+ struct xfs_bmbt_irec *mval, /* output: map values */
+ int *nmap) /* i/o: mval size/count */
+{
+ struct xfs_bmalloca bma = {
+ .tp = tp,
+ .ip = ip,
+ .total = total,
+ };
+ struct xfs_mount *mp = ip->i_mount;
+ int whichfork = xfs_bmapi_whichfork(flags);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ xfs_fileoff_t end; /* end of mapped file region */
+ bool eof = false; /* after the end of extents */
+ int error; /* error return */
+ int n; /* current extent index */
+ xfs_fileoff_t obno; /* old block number (offset) */
+
+#ifdef DEBUG
+ xfs_fileoff_t orig_bno; /* original block number value */
+ int orig_flags; /* original flags arg value */
+ xfs_filblks_t orig_len; /* original value of len arg */
+ struct xfs_bmbt_irec *orig_mval; /* original value of mval */
+ int orig_nmap; /* original value of *nmap */
+
+ orig_bno = bno;
+ orig_len = len;
+ orig_flags = flags;
+ orig_mval = mval;
+ orig_nmap = *nmap;
+#endif
+
+ ASSERT(*nmap >= 1);
+ ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+ ASSERT(tp != NULL);
+ ASSERT(len > 0);
+ ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!(flags & XFS_BMAPI_REMAP));
+
+	/* zeroing is currently only for data extents, not metadata */
+ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
+ (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO));
+ /*
+ * we can allocate unwritten extents or pre-zero allocated blocks,
+ * but it makes no sense to do both at once. This would result in
+ * zeroing the unwritten extent twice, but it still being an
+ * unwritten extent....
+ */
+ ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) !=
+ (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ return -EFSCORRUPTED;
+ }
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ XFS_STATS_INC(mp, xs_blk_mapw);
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ goto error0;
+
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got))
+ eof = true;
+ if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
+ bma.prev.br_startoff = NULLFILEOFF;
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+
+ n = 0;
+ end = bno + len;
+ obno = bno;
+ while (bno < end && n < *nmap) {
+ bool need_alloc = false, wasdelay = false;
+
+ /* in hole or beyond EOF? */
+ if (eof || bma.got.br_startoff > bno) {
+ /*
+ * CoW fork conversions should /never/ hit EOF or
+ * holes. There should always be something for us
+ * to work on.
+ */
+ ASSERT(!((flags & XFS_BMAPI_CONVERT) &&
+ (flags & XFS_BMAPI_COWFORK)));
+
+ need_alloc = true;
+ } else if (isnullstartblock(bma.got.br_startblock)) {
+ wasdelay = true;
+ }
+
+ /*
+ * First, deal with the hole before the allocated space
+ * that we found, if any.
+ */
+ if (need_alloc || wasdelay) {
+ bma.eof = eof;
+ bma.conv = !!(flags & XFS_BMAPI_CONVERT);
+ bma.wasdel = wasdelay;
+ bma.offset = bno;
+ bma.flags = flags;
+
+ /*
+ * There's a 32/64 bit type mismatch between the
+ * allocation length request (which can be 64 bits in
+ * length) and the bma length request, which is
+ * xfs_extlen_t and therefore 32 bits. Hence we have to
+ * check for 32-bit overflows and handle them here.
+ */
+ if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN)
+ bma.length = XFS_MAX_BMBT_EXTLEN;
+ else
+ bma.length = len;
+
+ ASSERT(len > 0);
+ ASSERT(bma.length > 0);
+ error = xfs_bmapi_allocate(&bma);
+ if (error)
+ goto error0;
+ if (bma.blkno == NULLFSBLOCK)
+ break;
+
+ /*
+ * If this is a CoW allocation, record the data in
+ * the refcount btree for orphan recovery.
+ */
+ if (whichfork == XFS_COW_FORK)
+ xfs_refcount_alloc_cow_extent(tp, bma.blkno,
+ bma.length);
+ }
+
+ /* Deal with the allocated space we found. */
+ xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno,
+ end, n, flags);
+
+ /* Execute unwritten extent conversion if necessary */
+ error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
+ if (error == -EAGAIN)
+ continue;
+ if (error)
+ goto error0;
+
+ /* update the extent map to return */
+ xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+ /*
+ * If we're done, stop now. Stop when we've allocated
+ * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise
+ * the transaction may get too big.
+ */
+ if (bno >= end || n >= *nmap || bma.nallocs >= *nmap)
+ break;
+
+ /* Else go on to the next record. */
+ bma.prev = bma.got;
+ if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got))
+ eof = true;
+ }
+ *nmap = n;
+
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto error0;
+
+ ASSERT(ifp->if_format != XFS_DINODE_FMT_BTREE ||
+ ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork));
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+ orig_nmap, *nmap);
+ return 0;
+error0:
+ xfs_bmapi_finish(&bma, whichfork, error);
+ return error;
+}
+
+/*
+ * Convert an existing delalloc extent to real blocks based on file offset. This
+ * attempts to allocate the entire delalloc extent and may require multiple
+ * invocations to allocate the target offset if a large enough physical extent
+ * is not available.
+ */
+int
+xfs_bmapi_convert_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_off_t offset,
+ struct iomap *iomap,
+ unsigned int *seq)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ struct xfs_bmalloca bma = { NULL };
+ uint16_t flags = 0;
+ struct xfs_trans *tp;
+ int error;
+
+ if (whichfork == XFS_COW_FORK)
+ flags |= IOMAP_F_SHARED;
+
+ /*
+ * Space for the extent and indirect blocks was reserved when the
+ * delalloc extent was created so there's no need to do so here.
+ */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_iext_count_may_overflow(ip, whichfork,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto out_trans_cancel;
+
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
+ bma.got.br_startoff > offset_fsb) {
+ /*
+ * No extent found in the range we are trying to convert. This
+ * should only happen for the COW fork, where another thread
+ * might have moved the extent to the data fork in the meantime.
+ */
+ WARN_ON_ONCE(whichfork != XFS_COW_FORK);
+ error = -EAGAIN;
+ goto out_trans_cancel;
+ }
+
+ /*
+ * If we find a real extent here we raced with another thread converting
+ * the extent. Just return the real extent at this offset.
+ */
+ if (!isnullstartblock(bma.got.br_startblock)) {
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
+ *seq = READ_ONCE(ifp->if_seq);
+ goto out_trans_cancel;
+ }
+
+ bma.tp = tp;
+ bma.ip = ip;
+ bma.wasdel = true;
+ bma.offset = bma.got.br_startoff;
+ bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount,
+ XFS_MAX_BMBT_EXTLEN);
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+
+ /*
+ * When we're converting the delalloc reservations backing dirty pages
+ * in the page cache, we must be careful about how we create the new
+ * extents:
+ *
+ * New CoW fork extents are created unwritten, turned into real extents
+ * when we're about to write the data to disk, and mapped into the data
+ * fork after the write finishes. End of story.
+ *
+ * New data fork extents must be mapped in as unwritten and converted
+ * to real extents after the write succeeds to avoid exposing stale
+ * disk contents if we crash.
+ */
+ bma.flags = XFS_BMAPI_PREALLOC;
+ if (whichfork == XFS_COW_FORK)
+ bma.flags |= XFS_BMAPI_COWFORK;
+
+ if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
+ bma.prev.br_startoff = NULLFILEOFF;
+
+ error = xfs_bmapi_allocate(&bma);
+ if (error)
+ goto out_finish;
+
+ error = -ENOSPC;
+ if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
+ goto out_finish;
+ error = -EFSCORRUPTED;
+ if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
+ goto out_finish;
+
+ XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
+ XFS_STATS_INC(mp, xs_xstrat_quick);
+
+ ASSERT(!isnullstartblock(bma.got.br_startblock));
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
+ *seq = READ_ONCE(ifp->if_seq);
+
+ if (whichfork == XFS_COW_FORK)
+ xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
+
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto out_finish;
+
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+out_finish:
+ xfs_bmapi_finish(&bma, whichfork, error);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+int
+xfs_bmapi_remap(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ xfs_fsblock_t startblock,
+ uint32_t flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp;
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+ int whichfork = xfs_bmapi_whichfork(flags);
+ int logflags = 0, error;
+
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ ASSERT(len > 0);
+ ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
+ XFS_BMAPI_NORMAP)));
+ ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) !=
+ (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC));
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ return -EFSCORRUPTED;
+ }
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ if (xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
+ /* make sure we only reflink into a hole. */
+ ASSERT(got.br_startoff > bno);
+ ASSERT(got.br_startoff - bno >= len);
+ }
+
+ ip->i_nblocks += len;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_ino.flags = 0;
+ }
+
+ got.br_startoff = bno;
+ got.br_startblock = startblock;
+ got.br_blockcount = len;
+ if (flags & XFS_BMAPI_PREALLOC)
+ got.br_state = XFS_EXT_UNWRITTEN;
+ else
+ got.br_state = XFS_EXT_NORM;
+
+ error = xfs_bmap_add_extent_hole_real(tp, ip, whichfork, &icur,
+ &cur, &got, &logflags, flags);
+ if (error)
+ goto error0;
+
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork);
+
+error0:
+ if (ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS)
+ logflags &= ~XFS_ILOG_DEXT;
+ else if (ip->i_df.if_format != XFS_DINODE_FMT_BTREE)
+ logflags &= ~XFS_ILOG_DBROOT;
+
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ if (cur)
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * When a delalloc extent is split (e.g., due to a hole punch), the original
+ * indlen reservation must be shared across the two new extents that are left
+ * behind.
+ *
+ * Given the original reservation and the worst case indlen for the two new
+ * extents (as calculated by xfs_bmap_worst_indlen()), split the original
+ * reservation fairly across the two new extents. If necessary, steal available
+ * blocks from a deleted extent to make up a reservation deficiency (e.g., if
+ * ores == 1). The number of stolen blocks is returned. The availability and
+ * subsequent accounting of stolen blocks is the responsibility of the caller.
+ */
+static xfs_filblks_t
+xfs_bmap_split_indlen(
+ xfs_filblks_t ores, /* original res. */
+ xfs_filblks_t *indlen1, /* ext1 worst indlen */
+ xfs_filblks_t *indlen2, /* ext2 worst indlen */
+ xfs_filblks_t avail) /* stealable blocks */
+{
+ xfs_filblks_t len1 = *indlen1;
+ xfs_filblks_t len2 = *indlen2;
+ xfs_filblks_t nres = len1 + len2; /* new total res. */
+ xfs_filblks_t stolen = 0;
+ xfs_filblks_t resfactor;
+
+ /*
+ * Steal as many blocks as we can to try and satisfy the worst case
+ * indlen for both new extents.
+ */
+ if (ores < nres && avail)
+ stolen = XFS_FILBLKS_MIN(nres - ores, avail);
+ ores += stolen;
+
+ /* nothing else to do if we've satisfied the new reservation */
+ if (ores >= nres)
+ return stolen;
+
+ /*
+ * We can't meet the total required reservation for the two extents.
+ * Calculate the percent of the overall shortage between both extents
+ * and apply this percentage to each of the requested indlen values.
+ * This distributes the shortage fairly and reduces the chances that one
+ * of the two extents is left with nothing when extents are repeatedly
+ * split.
+ */
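+	/*
+	 * Editorial worked example: ores = 7 with worst-case indlens of 5
+	 * and 5 gives resfactor = 7 * 100 / 10 = 70, which scales both
+	 * requests down to 3.  The leftover block (7 - 6 = 1) is handed out
+	 * by the loop below, so the two extents end up with 4 and 3.
+	 */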
+ resfactor = (ores * 100);
+ do_div(resfactor, nres);
+ len1 *= resfactor;
+ do_div(len1, 100);
+ len2 *= resfactor;
+ do_div(len2, 100);
+ ASSERT(len1 + len2 <= ores);
+ ASSERT(len1 < *indlen1 && len2 < *indlen2);
+
+ /*
+ * Hand out the remainder to each extent. If one of the two reservations
+ * is zero, we want to make sure that one gets a block first. The loop
+ * below starts with len1, so hand len2 a block right off the bat if it
+ * is zero.
+ */
+ ores -= (len1 + len2);
+ ASSERT((*indlen1 - len1) + (*indlen2 - len2) >= ores);
+ if (ores && !len2 && *indlen2) {
+ len2++;
+ ores--;
+ }
+ while (ores) {
+ if (len1 < *indlen1) {
+ len1++;
+ ores--;
+ }
+ if (!ores)
+ break;
+ if (len2 < *indlen2) {
+ len2++;
+ ores--;
+ }
+ }
+
+ *indlen1 = len1;
+ *indlen2 = len2;
+
+ return stolen;
+}
+
+int
+xfs_bmap_del_extent_delay(
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_iext_cursor *icur,
+ struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_bmbt_irec new;
+ int64_t da_old, da_new, da_diff = 0;
+ xfs_fileoff_t del_endoff, got_endoff;
+ xfs_filblks_t got_indlen, new_indlen, stolen;
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
+ int error = 0;
+ bool isrt;
+
+ XFS_STATS_INC(mp, xs_del_exlist);
+
+ isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+ del_endoff = del->br_startoff + del->br_blockcount;
+ got_endoff = got->br_startoff + got->br_blockcount;
+ da_old = startblockval(got->br_startblock);
+ da_new = 0;
+
+ ASSERT(del->br_blockcount > 0);
+ ASSERT(got->br_startoff <= del->br_startoff);
+ ASSERT(got_endoff >= del_endoff);
+
+ if (isrt) {
+		uint64_t	rtexts = del->br_blockcount;
+
+ do_div(rtexts, mp->m_sb.sb_rextsize);
+ xfs_mod_frextents(mp, rtexts);
+ }
+
+ /*
+ * Update the inode delalloc counter now and wait to update the
+ * sb counters as we might have to borrow some blocks for the
+ * indirect block accounting.
+ */
+ ASSERT(!isrt);
+ error = xfs_quota_unreserve_blkres(ip, del->br_blockcount);
+ if (error)
+ return error;
+ ip->i_delayed_blks -= del->br_blockcount;
+
+ if (got->br_startoff == del->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (got_endoff == del_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
+ switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) {
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+ /*
+ * Matches the whole extent. Delete the entry.
+ */
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ break;
+ case BMAP_LEFT_FILLING:
+ /*
+ * Deleting the first part of the extent.
+ */
+ got->br_startoff = del_endoff;
+ got->br_blockcount -= del->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip,
+ got->br_blockcount), da_old);
+ got->br_startblock = nullstartblock((int)da_new);
+ xfs_iext_update_extent(ip, state, icur, got);
+ break;
+ case BMAP_RIGHT_FILLING:
+ /*
+ * Deleting the last part of the extent.
+ */
+ got->br_blockcount = got->br_blockcount - del->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip,
+ got->br_blockcount), da_old);
+ got->br_startblock = nullstartblock((int)da_new);
+ xfs_iext_update_extent(ip, state, icur, got);
+ break;
+ case 0:
+ /*
+ * Deleting the middle of the extent.
+ *
+ * Distribute the original indlen reservation across the two new
+ * extents. Steal blocks from the deleted extent if necessary.
+ * Stealing blocks simply fudges the fdblocks accounting below.
+ * Warn if either of the new indlen reservations is zero as this
+ * can lead to delalloc problems.
+ */
+ got->br_blockcount = del->br_startoff - got->br_startoff;
+ got_indlen = xfs_bmap_worst_indlen(ip, got->br_blockcount);
+
+ new.br_blockcount = got_endoff - del_endoff;
+ new_indlen = xfs_bmap_worst_indlen(ip, new.br_blockcount);
+
+ WARN_ON_ONCE(!got_indlen || !new_indlen);
+ stolen = xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen,
+ del->br_blockcount);
+
+ got->br_startblock = nullstartblock((int)got_indlen);
+
+ new.br_startoff = del_endoff;
+ new.br_state = got->br_state;
+ new.br_startblock = nullstartblock((int)new_indlen);
+
+ xfs_iext_update_extent(ip, state, icur, got);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &new, state);
+
+ da_new = got_indlen + new_indlen - stolen;
+ del->br_blockcount -= stolen;
+ break;
+ }
+
+ ASSERT(da_old >= da_new);
+ da_diff = da_old - da_new;
+ if (!isrt)
+ da_diff += del->br_blockcount;
+ if (da_diff) {
+ xfs_mod_fdblocks(mp, da_diff, false);
+ xfs_mod_delalloc(mp, -da_diff);
+ }
+ return error;
+}
+
+void
+xfs_bmap_del_extent_cow(
+ struct xfs_inode *ip,
+ struct xfs_iext_cursor *icur,
+ struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
+ struct xfs_bmbt_irec new;
+ xfs_fileoff_t del_endoff, got_endoff;
+ uint32_t state = BMAP_COWFORK;
+
+ XFS_STATS_INC(mp, xs_del_exlist);
+
+ del_endoff = del->br_startoff + del->br_blockcount;
+ got_endoff = got->br_startoff + got->br_blockcount;
+
+ ASSERT(del->br_blockcount > 0);
+ ASSERT(got->br_startoff <= del->br_startoff);
+ ASSERT(got_endoff >= del_endoff);
+ ASSERT(!isnullstartblock(got->br_startblock));
+
+ if (got->br_startoff == del->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (got_endoff == del_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
+ switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) {
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+ /*
+ * Matches the whole extent. Delete the entry.
+ */
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ break;
+ case BMAP_LEFT_FILLING:
+ /*
+ * Deleting the first part of the extent.
+ */
+ got->br_startoff = del_endoff;
+ got->br_blockcount -= del->br_blockcount;
+ got->br_startblock = del->br_startblock + del->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, got);
+ break;
+ case BMAP_RIGHT_FILLING:
+ /*
+ * Deleting the last part of the extent.
+ */
+ got->br_blockcount -= del->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, got);
+ break;
+ case 0:
+ /*
+ * Deleting the middle of the extent.
+ */
+ got->br_blockcount = del->br_startoff - got->br_startoff;
+
+ new.br_startoff = del_endoff;
+ new.br_blockcount = got_endoff - del_endoff;
+ new.br_state = got->br_state;
+ new.br_startblock = del->br_startblock + del->br_blockcount;
+
+ xfs_iext_update_extent(ip, state, icur, got);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &new, state);
+ break;
+ }
+ ip->i_delayed_blks -= del->br_blockcount;
+}
+
+/*
+ * Called by xfs_bmapi to update file extent records and the btree
+ * after removing space.
+ */
+STATIC int /* error */
+xfs_bmap_del_extent_real(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_trans_t *tp, /* current transaction pointer */
+ struct xfs_iext_cursor *icur,
+ struct xfs_btree_cur *cur, /* if null, not a btree */
+ xfs_bmbt_irec_t *del, /* data to remove from extents */
+ int *logflagsp, /* inode logging flags */
+ int whichfork, /* data or attr fork */
+ uint32_t bflags) /* bmapi flags */
+{
+ xfs_fsblock_t del_endblock=0; /* first block past del */
+ xfs_fileoff_t del_endoff; /* first offset past del */
+ int do_fx; /* free extent at end of routine */
+ int error; /* error return value */
+ int flags = 0;/* inode logging flags */
+ struct xfs_bmbt_irec got; /* current extent entry */
+ xfs_fileoff_t got_endoff; /* first offset past got */
+ int i; /* temp state */
+ struct xfs_ifork *ifp; /* inode fork pointer */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_filblks_t nblks; /* quota/sb block count */
+ xfs_bmbt_irec_t new; /* new record to be inserted */
+ /* REFERENCED */
+ uint qfield; /* quota field to update */
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_bmbt_irec old;
+
+ mp = ip->i_mount;
+ XFS_STATS_INC(mp, xs_del_exlist);
+
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ ASSERT(del->br_blockcount > 0);
+ xfs_iext_get_extent(ifp, icur, &got);
+ ASSERT(got.br_startoff <= del->br_startoff);
+ del_endoff = del->br_startoff + del->br_blockcount;
+ got_endoff = got.br_startoff + got.br_blockcount;
+ ASSERT(got_endoff >= del_endoff);
+ ASSERT(!isnullstartblock(got.br_startblock));
+ qfield = 0;
+ error = 0;
+
+ /*
+ * If it's the case where the directory code is running with no block
+ * reservation, and the deleted block is in the middle of its extent,
+ * and the resulting insert of an extent would cause transformation to
+ * btree format, then reject it. The calling code will then swap blocks
+ * around instead. We have to do this now, rather than waiting for the
+ * conversion to btree format, since the transaction will be dirty then.
+ */
+ if (tp->t_blk_res == 0 &&
+ ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ifp->if_nextents >= XFS_IFORK_MAXEXT(ip, whichfork) &&
+ del->br_startoff > got.br_startoff && del_endoff < got_endoff)
+ return -ENOSPC;
+
+ flags = XFS_ILOG_CORE;
+ if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
+ xfs_filblks_t len;
+ xfs_extlen_t mod;
+
+ len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize,
+ &mod);
+ ASSERT(mod == 0);
+
+ if (!(bflags & XFS_BMAPI_REMAP)) {
+ xfs_fsblock_t bno;
+
+ bno = div_u64_rem(del->br_startblock,
+ mp->m_sb.sb_rextsize, &mod);
+ ASSERT(mod == 0);
+
+ error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+ if (error)
+ goto done;
+ }
+
+ do_fx = 0;
+ nblks = len * mp->m_sb.sb_rextsize;
+ qfield = XFS_TRANS_DQ_RTBCOUNT;
+ } else {
+ do_fx = 1;
+ nblks = del->br_blockcount;
+ qfield = XFS_TRANS_DQ_BCOUNT;
+ }
+
+ del_endblock = del->br_startblock + del->br_blockcount;
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, &got, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+
+ if (got.br_startoff == del->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (got_endoff == del_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
+ switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) {
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+ /*
+ * Matches the whole extent. Delete the entry.
+ */
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ ifp->if_nextents--;
+
+ flags |= XFS_ILOG_CORE;
+ if (!cur) {
+ flags |= xfs_ilog_fext(whichfork);
+ break;
+ }
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ break;
+ case BMAP_LEFT_FILLING:
+ /*
+ * Deleting the first part of the extent.
+ */
+ got.br_startoff = del_endoff;
+ got.br_startblock = del_endblock;
+ got.br_blockcount -= del->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, &got);
+ if (!cur) {
+ flags |= xfs_ilog_fext(whichfork);
+ break;
+ }
+ error = xfs_bmbt_update(cur, &got);
+ if (error)
+ goto done;
+ break;
+ case BMAP_RIGHT_FILLING:
+ /*
+ * Deleting the last part of the extent.
+ */
+ got.br_blockcount -= del->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, &got);
+ if (!cur) {
+ flags |= xfs_ilog_fext(whichfork);
+ break;
+ }
+ error = xfs_bmbt_update(cur, &got);
+ if (error)
+ goto done;
+ break;
+ case 0:
+ /*
+ * Deleting the middle of the extent.
+ */
+
+ old = got;
+
+ got.br_blockcount = del->br_startoff - got.br_startoff;
+ xfs_iext_update_extent(ip, state, icur, &got);
+
+ new.br_startoff = del_endoff;
+ new.br_blockcount = got_endoff - del_endoff;
+ new.br_state = got.br_state;
+ new.br_startblock = del_endblock;
+
+ flags |= XFS_ILOG_CORE;
+ if (cur) {
+ error = xfs_bmbt_update(cur, &got);
+ if (error)
+ goto done;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ cur->bc_rec.b = new;
+ error = xfs_btree_insert(cur, &i);
+ if (error && error != -ENOSPC)
+ goto done;
+ /*
+				 * If we get no-space back from the btree insert, it
+				 * tried a split and we have a zero block reservation.
+				 * Fix up our state and return the error.
+ */
+ if (error == -ENOSPC) {
+ /*
+ * Reset the cursor, don't trust it after any
+ * insert operation.
+ */
+ error = xfs_bmbt_lookup_eq(cur, &got, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ /*
+ * Update the btree record back
+ * to the original value.
+ */
+ error = xfs_bmbt_update(cur, &old);
+ if (error)
+ goto done;
+ /*
+ * Reset the extent record back
+ * to the original value.
+ */
+ xfs_iext_update_extent(ip, state, icur, &old);
+ flags = 0;
+ error = -ENOSPC;
+ goto done;
+ }
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ } else
+ flags |= xfs_ilog_fext(whichfork);
+
+ ifp->if_nextents++;
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &new, state);
+ break;
+ }
+
+ /* remove reverse mapping */
+ xfs_rmap_unmap_extent(tp, ip, whichfork, del);
+
+ /*
+ * If we need to, add to list of extents to delete.
+ */
+ if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
+ if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
+ xfs_refcount_decrease_extent(tp, del);
+ } else {
+ __xfs_free_extent_later(tp, del->br_startblock,
+ del->br_blockcount, NULL,
+ (bflags & XFS_BMAPI_NODISCARD) ||
+ del->br_state == XFS_EXT_UNWRITTEN);
+ }
+ }
+
+ /*
+ * Adjust inode # blocks in the file.
+ */
+ if (nblks)
+ ip->i_nblocks -= nblks;
+ /*
+ * Adjust quota data.
+ */
+ if (qfield && !(bflags & XFS_BMAPI_REMAP))
+ xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
+
+done:
+ *logflagsp = flags;
+ return error;
+}
+
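+/*
+ * Worked example (hypothetical numbers) of how the filling cases above carve
+ * up an extent; triples below are { br_startoff, br_startblock,
+ * br_blockcount } and got = { 10, 100, 8 }:
+ *
+ *	del == got		-> both FILLING flags: the extent is removed
+ *	del = { 10, 100, 3 }	-> LEFT_FILLING:  got becomes { 13, 103, 5 }
+ *	del = { 15, 105, 3 }	-> RIGHT_FILLING: got becomes { 10, 100, 5 }
+ *	del = { 12, 102, 3 }	-> neither:       got becomes { 10, 100, 2 },
+ *				   and new = { 15, 105, 3 } is inserted after it
+ */
+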
+/*
+ * Unmap (remove) blocks from a file.
+ * If nexts is nonzero then the number of extents to remove is limited to
+ * that value. If not all extents in the block range can be removed then
+ * *rlen is set to the length that remains to be unmapped.
+ */
+int /* error */
+__xfs_bunmapi(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t start, /* first file offset deleted */
+ xfs_filblks_t *rlen, /* i/o: amount remaining */
+ uint32_t flags, /* misc flags */
+ xfs_extnum_t nexts) /* number of extents max */
+{
+ struct xfs_btree_cur *cur; /* bmap btree cursor */
+ struct xfs_bmbt_irec del; /* extent being deleted */
+ int error; /* error return value */
+ xfs_extnum_t extno; /* extent number in list */
+ struct xfs_bmbt_irec got; /* current extent record */
+ struct xfs_ifork *ifp; /* inode fork pointer */
+ int isrt; /* freeing in rt area */
+ int logflags; /* transaction logging flags */
+ xfs_extlen_t mod; /* rt extent offset */
+ struct xfs_mount *mp = ip->i_mount;
+ int tmp_logflags; /* partial logging flags */
+ int wasdel; /* was a delayed alloc extent */
+ int whichfork; /* data or attribute fork */
+ xfs_fsblock_t sum;
+ xfs_filblks_t len = *rlen; /* length to unmap in file */
+ xfs_fileoff_t end;
+ struct xfs_iext_cursor icur;
+ bool done = false;
+
+ trace_xfs_bunmap(ip, start, len, flags, _RET_IP_);
+
+ whichfork = xfs_bmapi_whichfork(flags);
+ ASSERT(whichfork != XFS_COW_FORK);
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)))
+ return -EFSCORRUPTED;
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(len > 0);
+ ASSERT(nexts >= 0);
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ if (xfs_iext_count(ifp) == 0) {
+ *rlen = 0;
+ return 0;
+ }
+ XFS_STATS_INC(mp, xs_blk_unmap);
+ isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+ end = start + len;
+
+ if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) {
+ *rlen = 0;
+ return 0;
+ }
+ end--;
+
+ logflags = 0;
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_ino.flags = 0;
+ } else
+ cur = NULL;
+
+ if (isrt) {
+ /*
+ * Synchronize by locking the bitmap inode.
+ */
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
+ xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
+ xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ }
+
+ extno = 0;
+ while (end != (xfs_fileoff_t)-1 && end >= start &&
+ (nexts == 0 || extno < nexts)) {
+ /*
+ * Is the found extent after a hole in which end lives?
+ * Just back up to the previous extent, if so.
+ */
+ if (got.br_startoff > end &&
+ !xfs_iext_prev_extent(ifp, &icur, &got)) {
+ done = true;
+ break;
+ }
+ /*
+ * Is the last block of this extent before the range
+ * we're supposed to delete? If so, we're done.
+ */
+ end = XFS_FILEOFF_MIN(end,
+ got.br_startoff + got.br_blockcount - 1);
+ if (end < start)
+ break;
+ /*
+ * Then deal with the (possibly delayed) allocated space
+ * we found.
+ */
+ del = got;
+ wasdel = isnullstartblock(del.br_startblock);
+
+ if (got.br_startoff < start) {
+ del.br_startoff = start;
+ del.br_blockcount -= start - got.br_startoff;
+ if (!wasdel)
+ del.br_startblock += start - got.br_startoff;
+ }
+ if (del.br_startoff + del.br_blockcount > end + 1)
+ del.br_blockcount = end + 1 - del.br_startoff;
+
+ if (!isrt)
+ goto delete;
+
+ sum = del.br_startblock + del.br_blockcount;
+ div_u64_rem(sum, mp->m_sb.sb_rextsize, &mod);
+ if (mod) {
+ /*
+ * Realtime extent not lined up at the end.
+ * The extent could have been split into written
+ * and unwritten pieces, or we could just be
+ * unmapping part of it. But we can't really
+ * get rid of part of a realtime extent.
+ */
+ if (del.br_state == XFS_EXT_UNWRITTEN) {
+ /*
+ * This piece is unwritten, or we're not
+ * using unwritten extents. Skip over it.
+ */
+ ASSERT(end >= mod);
+ end -= mod > del.br_blockcount ?
+ del.br_blockcount : mod;
+ if (end < got.br_startoff &&
+ !xfs_iext_prev_extent(ifp, &icur, &got)) {
+ done = true;
+ break;
+ }
+ continue;
+ }
+ /*
+ * It's written, turn it unwritten.
+ * This is better than zeroing it.
+ */
+ ASSERT(del.br_state == XFS_EXT_NORM);
+ ASSERT(tp->t_blk_res > 0);
+ /*
+ * If this spans a realtime extent boundary,
+ * chop it back to the start of the one we end at.
+ */
+ if (del.br_blockcount > mod) {
+ del.br_startoff += del.br_blockcount - mod;
+ del.br_startblock += del.br_blockcount - mod;
+ del.br_blockcount = mod;
+ }
+ del.br_state = XFS_EXT_UNWRITTEN;
+ error = xfs_bmap_add_extent_unwritten_real(tp, ip,
+ whichfork, &icur, &cur, &del,
+ &logflags);
+ if (error)
+ goto error0;
+ goto nodelete;
+ }
+ div_u64_rem(del.br_startblock, mp->m_sb.sb_rextsize, &mod);
+ if (mod) {
+ xfs_extlen_t off = mp->m_sb.sb_rextsize - mod;
+
+ /*
+ * Realtime extent is lined up at the end but not
+ * at the front. We'll get rid of full extents if
+ * we can.
+ */
+ if (del.br_blockcount > off) {
+ del.br_blockcount -= off;
+ del.br_startoff += off;
+ del.br_startblock += off;
+ } else if (del.br_startoff == start &&
+ (del.br_state == XFS_EXT_UNWRITTEN ||
+ tp->t_blk_res == 0)) {
+ /*
+ * Can't make it unwritten. There isn't
+ * a full extent here so just skip it.
+ */
+ ASSERT(end >= del.br_blockcount);
+ end -= del.br_blockcount;
+ if (got.br_startoff > end &&
+ !xfs_iext_prev_extent(ifp, &icur, &got)) {
+ done = true;
+ break;
+ }
+ continue;
+ } else if (del.br_state == XFS_EXT_UNWRITTEN) {
+ struct xfs_bmbt_irec prev;
+ xfs_fileoff_t unwrite_start;
+
+ /*
+ * This one is already unwritten.
+ * It must have a written left neighbor.
+ * Unwrite the killed part of that one and
+ * try again.
+ */
+ if (!xfs_iext_prev_extent(ifp, &icur, &prev))
+ ASSERT(0);
+ ASSERT(prev.br_state == XFS_EXT_NORM);
+ ASSERT(!isnullstartblock(prev.br_startblock));
+ ASSERT(del.br_startblock ==
+ prev.br_startblock + prev.br_blockcount);
+ unwrite_start = max3(start,
+ del.br_startoff - mod,
+ prev.br_startoff);
+ mod = unwrite_start - prev.br_startoff;
+ prev.br_startoff = unwrite_start;
+ prev.br_startblock += mod;
+ prev.br_blockcount -= mod;
+ prev.br_state = XFS_EXT_UNWRITTEN;
+ error = xfs_bmap_add_extent_unwritten_real(tp,
+ ip, whichfork, &icur, &cur,
+ &prev, &logflags);
+ if (error)
+ goto error0;
+ goto nodelete;
+ } else {
+ ASSERT(del.br_state == XFS_EXT_NORM);
+ del.br_state = XFS_EXT_UNWRITTEN;
+ error = xfs_bmap_add_extent_unwritten_real(tp,
+ ip, whichfork, &icur, &cur,
+ &del, &logflags);
+ if (error)
+ goto error0;
+ goto nodelete;
+ }
+ }
+
+delete:
+ if (wasdel) {
+ error = xfs_bmap_del_extent_delay(ip, whichfork, &icur,
+ &got, &del);
+ } else {
+ error = xfs_bmap_del_extent_real(ip, tp, &icur, cur,
+ &del, &tmp_logflags, whichfork,
+ flags);
+ logflags |= tmp_logflags;
+ }
+
+ if (error)
+ goto error0;
+
+ end = del.br_startoff - 1;
+nodelete:
+ /*
+ * If not done go on to the next (previous) record.
+ */
+ if (end != (xfs_fileoff_t)-1 && end >= start) {
+ if (!xfs_iext_get_extent(ifp, &icur, &got) ||
+ (got.br_startoff > end &&
+ !xfs_iext_prev_extent(ifp, &icur, &got))) {
+ done = true;
+ break;
+ }
+ extno++;
+ }
+ }
+ if (done || end == (xfs_fileoff_t)-1 || end < start)
+ *rlen = 0;
+ else
+ *rlen = end - start + 1;
+
+ /*
+ * Convert to a btree if necessary.
+ */
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
+ &tmp_logflags, whichfork);
+ logflags |= tmp_logflags;
+ } else {
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags,
+ whichfork);
+ }
+
+error0:
+ /*
+ * Log everything. Do this after conversion, there's no point in
+ * logging the extent records if we've converted to btree format.
+ */
+ if ((logflags & xfs_ilog_fext(whichfork)) &&
+ ifp->if_format != XFS_DINODE_FMT_EXTENTS)
+ logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
+ ifp->if_format != XFS_DINODE_FMT_BTREE)
+ logflags &= ~xfs_ilog_fbroot(whichfork);
+ /*
+ * Log inode even in the error case, if the transaction
+ * is dirty we'll need to shut down the filesystem.
+ */
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ if (cur) {
+ if (!error)
+ cur->bc_ino.allocated = 0;
+ xfs_btree_del_cursor(cur, error);
+ }
+ return error;
+}
+
+/* Unmap a range of a file. */
+int
+xfs_bunmapi(
+ xfs_trans_t *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ uint32_t flags,
+ xfs_extnum_t nexts,
+ int *done)
+{
+ int error;
+
+ error = __xfs_bunmapi(tp, ip, bno, &len, flags, nexts);
+ *done = (len == 0);
+ return error;
+}
+
+/*
+ * Determine whether an extent shift can be accomplished by a merge with the
+ * extent that precedes the target hole of the shift.
+ */
+STATIC bool
+xfs_bmse_can_merge(
+ struct xfs_bmbt_irec *left, /* preceding extent */
+ struct xfs_bmbt_irec *got, /* current extent to shift */
+ xfs_fileoff_t shift) /* shift fsb */
+{
+ xfs_fileoff_t startoff;
+
+ startoff = got->br_startoff - shift;
+
+ /*
+ * The extent, once shifted, must be adjacent in-file and on-disk with
+ * the preceding extent.
+ */
+ if ((left->br_startoff + left->br_blockcount != startoff) ||
+ (left->br_startblock + left->br_blockcount != got->br_startblock) ||
+ (left->br_state != got->br_state) ||
+ (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN))
+ return false;
+
+ return true;
+}
+
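+/*
+ * Worked example for xfs_bmse_can_merge(), with hypothetical numbers: given
+ *
+ *	left = { .br_startoff = 0,  .br_startblock = 100, .br_blockcount = 10 }
+ *	got  = { .br_startoff = 15, .br_startblock = 110, .br_blockcount = 5 }
+ *
+ * and shift = 5, the shifted startoff is 15 - 5 = 10, which equals
+ * left->br_startoff + left->br_blockcount, and got->br_startblock (110)
+ * equals left->br_startblock + left->br_blockcount, so the two extents can
+ * merge into { 0, 100, 15 } provided br_state matches and the combined
+ * length does not exceed XFS_MAX_BMBT_EXTLEN.
+ */
+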
+/*
+ * A bmap extent shift adjusts the file offset of an extent to fill a preceding
+ * hole in the file. If an extent shift would result in the extent being fully
+ * adjacent to the extent that currently precedes the hole, we can merge with
+ * the preceding extent rather than do the shift.
+ *
+ * This function assumes the caller has verified a shift-by-merge is possible
+ * with the provided extents via xfs_bmse_can_merge().
+ */
+STATIC int
+xfs_bmse_merge(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t shift, /* shift fsb */
+ struct xfs_iext_cursor *icur,
+ struct xfs_bmbt_irec *got, /* extent to shift */
+ struct xfs_bmbt_irec *left, /* preceding extent */
+ struct xfs_btree_cur *cur,
+ int *logflags) /* output */
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_bmbt_irec new;
+ xfs_filblks_t blockcount;
+ int error, i;
+ struct xfs_mount *mp = ip->i_mount;
+
+ blockcount = left->br_blockcount + got->br_blockcount;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(xfs_bmse_can_merge(left, got, shift));
+
+ new = *left;
+ new.br_blockcount = blockcount;
+
+ /*
+ * Update the on-disk extent count, the btree if necessary and log the
+ * inode.
+ */
+ ifp->if_nextents--;
+ *logflags |= XFS_ILOG_CORE;
+ if (!cur) {
+ *logflags |= XFS_ILOG_DEXT;
+ goto done;
+ }
+
+ /* lookup and remove the extent to merge */
+ error = xfs_bmbt_lookup_eq(cur, got, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+
+ /* lookup and update size of the previous extent */
+ error = xfs_bmbt_lookup_eq(cur, left, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+
+ error = xfs_bmbt_update(cur, &new);
+ if (error)
+ return error;
+
+ /* change to extent format if required after extent removal */
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, logflags, whichfork);
+ if (error)
+ return error;
+
+done:
+ xfs_iext_remove(ip, icur, 0);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur,
+ &new);
+
+ /* update reverse mapping. rmap functions merge the rmaps for us */
+ xfs_rmap_unmap_extent(tp, ip, whichfork, got);
+ memcpy(&new, got, sizeof(new));
+ new.br_startoff = left->br_startoff + left->br_blockcount;
+ xfs_rmap_map_extent(tp, ip, whichfork, &new);
+ return 0;
+}
+
+static int
+xfs_bmap_shift_update_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_iext_cursor *icur,
+ struct xfs_bmbt_irec *got,
+ struct xfs_btree_cur *cur,
+ int *logflags,
+ xfs_fileoff_t startoff)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec prev = *got;
+ int error, i;
+
+ *logflags |= XFS_ILOG_CORE;
+
+ got->br_startoff = startoff;
+
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, &prev, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+
+ error = xfs_bmbt_update(cur, got);
+ if (error)
+ return error;
+ } else {
+ *logflags |= XFS_ILOG_DEXT;
+ }
+
+ xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur,
+ got);
+
+ /* update reverse mapping */
+ xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
+ xfs_rmap_map_extent(tp, ip, whichfork, got);
+ return 0;
+}
+
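+/*
+ * Illustrative note (hypothetical values): shifting an extent with
+ * br_startoff = 20 down to startoff = 15 first looks up the bmbt record
+ * under its old key (saved in 'prev' above), rewrites it with the new
+ * offset, and then moves the reverse mapping from the old file offset to
+ * the new one via the unmap/map pair of deferred rmap operations.
+ */
+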
+int
+xfs_bmap_collapse_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t *next_fsb,
+ xfs_fileoff_t offset_shift_fsb,
+ bool *done)
+{
+ int whichfork = XFS_DATA_FORK;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_bmbt_irec got, prev;
+ struct xfs_iext_cursor icur;
+ xfs_fileoff_t new_startoff;
+ int error = 0;
+ int logflags = 0;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ return -EFSCORRUPTED;
+ }
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_ino.flags = 0;
+ }
+
+ if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) {
+ *done = true;
+ goto del_cursor;
+ }
+ if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
+
+ new_startoff = got.br_startoff - offset_shift_fsb;
+ if (xfs_iext_peek_prev_extent(ifp, &icur, &prev)) {
+ if (new_startoff < prev.br_startoff + prev.br_blockcount) {
+ error = -EINVAL;
+ goto del_cursor;
+ }
+
+ if (xfs_bmse_can_merge(&prev, &got, offset_shift_fsb)) {
+ error = xfs_bmse_merge(tp, ip, whichfork,
+ offset_shift_fsb, &icur, &got, &prev,
+ cur, &logflags);
+ if (error)
+ goto del_cursor;
+ goto done;
+ }
+ } else {
+ if (got.br_startoff < offset_shift_fsb) {
+ error = -EINVAL;
+ goto del_cursor;
+ }
+ }
+
+ error = xfs_bmap_shift_update_extent(tp, ip, whichfork, &icur, &got,
+ cur, &logflags, new_startoff);
+ if (error)
+ goto del_cursor;
+
+done:
+ if (!xfs_iext_next_extent(ifp, &icur, &got)) {
+ *done = true;
+ goto del_cursor;
+ }
+
+ *next_fsb = got.br_startoff;
+del_cursor:
+ if (cur)
+ xfs_btree_del_cursor(cur, error);
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ return error;
+}
+
+/* Make sure we won't be right-shifting an extent past the maximum bound. */
+int
+xfs_bmap_can_insert_extents(
+ struct xfs_inode *ip,
+ xfs_fileoff_t off,
+ xfs_fileoff_t shift)
+{
+ struct xfs_bmbt_irec got;
+ int is_empty;
+ int error = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+
+ if (xfs_is_shutdown(ip->i_mount))
+ return -EIO;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_last_extent(NULL, ip, XFS_DATA_FORK, &got, &is_empty);
+ if (!error && !is_empty && got.br_startoff >= off &&
+ ((got.br_startoff + shift) & BMBT_STARTOFF_MASK) < got.br_startoff)
+ error = -EINVAL;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ return error;
+}
+
+int
+xfs_bmap_insert_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t *next_fsb,
+ xfs_fileoff_t offset_shift_fsb,
+ bool *done,
+ xfs_fileoff_t stop_fsb)
+{
+ int whichfork = XFS_DATA_FORK;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_bmbt_irec got, next;
+ struct xfs_iext_cursor icur;
+ xfs_fileoff_t new_startoff;
+ int error = 0;
+ int logflags = 0;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ return -EFSCORRUPTED;
+ }
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_ino.flags = 0;
+ }
+
+ if (*next_fsb == NULLFSBLOCK) {
+ xfs_iext_last(ifp, &icur);
+ if (!xfs_iext_get_extent(ifp, &icur, &got) ||
+ stop_fsb > got.br_startoff) {
+ *done = true;
+ goto del_cursor;
+ }
+ } else {
+ if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) {
+ *done = true;
+ goto del_cursor;
+ }
+ }
+ if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
+
+ if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
+
+ new_startoff = got.br_startoff + offset_shift_fsb;
+ if (xfs_iext_peek_next_extent(ifp, &icur, &next)) {
+ if (new_startoff + got.br_blockcount > next.br_startoff) {
+ error = -EINVAL;
+ goto del_cursor;
+ }
+
+ /*
+ * Unlike a left shift (which involves a hole punch), a right
+ * shift does not modify extent neighbors in any way. We should
+	 * never find mergeable extents in this scenario. Check anyway
+	 * and warn if we encounter two extents that could be one.
+ */
+ if (xfs_bmse_can_merge(&got, &next, offset_shift_fsb))
+ WARN_ON_ONCE(1);
+ }
+
+ error = xfs_bmap_shift_update_extent(tp, ip, whichfork, &icur, &got,
+ cur, &logflags, new_startoff);
+ if (error)
+ goto del_cursor;
+
+ if (!xfs_iext_prev_extent(ifp, &icur, &got) ||
+ stop_fsb >= got.br_startoff + got.br_blockcount) {
+ *done = true;
+ goto del_cursor;
+ }
+
+ *next_fsb = got.br_startoff;
+del_cursor:
+ if (cur)
+ xfs_btree_del_cursor(cur, error);
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ return error;
+}
+
+/*
+ * Split an extent into two extents at split_fsb, so that split_fsb becomes
+ * the first block of the new (second) extent. @split_fsb is the block at
+ * which the extent is split. If split_fsb lies in a hole or at the first
+ * block of an extent there is nothing to split, so just return 0.
+ */
+int
+xfs_bmap_split_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t split_fsb)
+{
+ int whichfork = XFS_DATA_FORK;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_bmbt_irec got;
+ struct xfs_bmbt_irec new; /* split extent */
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fsblock_t gotblkcnt; /* new block count for got */
+ struct xfs_iext_cursor icur;
+ int error = 0;
+ int logflags = 0;
+ int i = 0;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ return -EFSCORRUPTED;
+ }
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ /* Read in all the extents */
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ /*
+	 * If there are no extents, or split_fsb lies in a hole, we are done.
+ */
+ if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, &icur, &got) ||
+ got.br_startoff >= split_fsb)
+ return 0;
+
+ gotblkcnt = split_fsb - got.br_startoff;
+ new.br_startoff = split_fsb;
+ new.br_startblock = got.br_startblock + gotblkcnt;
+ new.br_blockcount = got.br_blockcount - gotblkcnt;
+ new.br_state = got.br_state;
+
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_ino.flags = 0;
+ error = xfs_bmbt_lookup_eq(cur, &got, &i);
+ if (error)
+ goto del_cursor;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
+ }
+
+ got.br_blockcount = gotblkcnt;
+ xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), &icur,
+ &got);
+
+ logflags = XFS_ILOG_CORE;
+ if (cur) {
+ error = xfs_bmbt_update(cur, &got);
+ if (error)
+ goto del_cursor;
+ } else
+ logflags |= XFS_ILOG_DEXT;
+
+ /* Add new extent */
+ xfs_iext_next(ifp, &icur);
+ xfs_iext_insert(ip, &icur, &new, 0);
+ ifp->if_nextents++;
+
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, &new, &i);
+ if (error)
+ goto del_cursor;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto del_cursor;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
+ }
+
+ /*
+ * Convert to a btree if necessary.
+ */
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
+ &tmp_logflags, whichfork);
+ logflags |= tmp_logflags;
+ }
+
+del_cursor:
+ if (cur) {
+ cur->bc_ino.allocated = 0;
+ xfs_btree_del_cursor(cur, error);
+ }
+
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ return error;
+}
+
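+/*
+ * Split example with hypothetical numbers: for
+ * got = { .br_startoff = 10, .br_startblock = 100, .br_blockcount = 10 }
+ * and split_fsb = 15, gotblkcnt = 15 - 10 = 5, so got is trimmed to
+ * { 10, 100, 5 } and new = { 15, 105, 5 } is inserted as the following
+ * extent.
+ */
+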
+/* Deferred mapping is only for real extents in the data fork. */
+static bool
+xfs_bmap_is_update_needed(
+ struct xfs_bmbt_irec *bmap)
+{
+ return bmap->br_startblock != HOLESTARTBLOCK &&
+ bmap->br_startblock != DELAYSTARTBLOCK;
+}
+
+/* Record a bmap intent. */
+static int
+__xfs_bmap_add(
+ struct xfs_trans *tp,
+ enum xfs_bmap_intent_type type,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *bmap)
+{
+ struct xfs_bmap_intent *bi;
+
+ trace_xfs_bmap_defer(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
+ type,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
+ ip->i_ino, whichfork,
+ bmap->br_startoff,
+ bmap->br_blockcount,
+ bmap->br_state);
+
+ bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ INIT_LIST_HEAD(&bi->bi_list);
+ bi->bi_type = type;
+ bi->bi_owner = ip;
+ bi->bi_whichfork = whichfork;
+ bi->bi_bmap = *bmap;
+
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list);
+ return 0;
+}
+
+/* Map an extent into a file. */
+void
+xfs_bmap_map_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_bmap_is_update_needed(PREV))
+ return;
+
+ __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
+}
+
+/* Unmap an extent out of a file. */
+void
+xfs_bmap_unmap_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_bmap_is_update_needed(PREV))
+ return;
+
+ __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
+}
+
+/*
+ * Process one of the deferred bmap operations that were queued up earlier
+ * via xfs_bmap_map_extent() or xfs_bmap_unmap_extent().
+ */
+int
+xfs_bmap_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ enum xfs_bmap_intent_type type,
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t *blockcount,
+ xfs_exntst_t state)
+{
+ int error = 0;
+
+ ASSERT(tp->t_firstblock == NULLFSBLOCK);
+
+ trace_xfs_bmap_deferred(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
+ ip->i_ino, whichfork, startoff, *blockcount, state);
+
+ if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK))
+ return -EFSCORRUPTED;
+
+ if (XFS_TEST_ERROR(false, tp->t_mountp,
+ XFS_ERRTAG_BMAP_FINISH_ONE))
+ return -EIO;
+
+ switch (type) {
+ case XFS_BMAP_MAP:
+ error = xfs_bmapi_remap(tp, ip, startoff, *blockcount,
+ startblock, 0);
+ *blockcount = 0;
+ break;
+ case XFS_BMAP_UNMAP:
+ error = __xfs_bunmapi(tp, ip, startoff, blockcount,
+ XFS_BMAPI_REMAP, 1);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ }
+
+ return error;
+}
+
+/* Check that an inode's extent does not have invalid flags or bad ranges. */
+xfs_failaddr_t
+xfs_bmap_validate_extent(
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount))
+ return __this_address;
+
+ if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) {
+ if (!xfs_verify_rtext(mp, irec->br_startblock,
+ irec->br_blockcount))
+ return __this_address;
+ } else {
+ if (!xfs_verify_fsbext(mp, irec->br_startblock,
+ irec->br_blockcount))
+ return __this_address;
+ }
+ if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK)
+ return __this_address;
+ return NULL;
+}
+
+int __init
+xfs_bmap_intent_init_cache(void)
+{
+ xfs_bmap_intent_cache = kmem_cache_create("xfs_bmap_intent",
+ sizeof(struct xfs_bmap_intent),
+ 0, 0, NULL);
+
+ return xfs_bmap_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_bmap_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_bmap_intent_cache);
+ xfs_bmap_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
new file mode 100644
index 000000000..16db95b11
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -0,0 +1,270 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_BMAP_H__
+#define __XFS_BMAP_H__
+
+struct getbmap;
+struct xfs_bmbt_irec;
+struct xfs_ifork;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Argument structure for xfs_bmap_alloc.
+ */
+struct xfs_bmalloca {
+ struct xfs_trans *tp; /* transaction pointer */
+ struct xfs_inode *ip; /* incore inode pointer */
+ struct xfs_bmbt_irec prev; /* extent before the new one */
+ struct xfs_bmbt_irec got; /* extent after, or delayed */
+
+ xfs_fileoff_t offset; /* offset in file filling in */
+ xfs_extlen_t length; /* i/o length asked/allocated */
+ xfs_fsblock_t blkno; /* starting block of new extent */
+
+ struct xfs_btree_cur *cur; /* btree cursor */
+ struct xfs_iext_cursor icur; /* incore extent cursor */
+ int nallocs;/* number of extents alloc'd */
+ int logflags;/* flags for transaction logging */
+
+ xfs_extlen_t total; /* total blocks needed for xaction */
+ xfs_extlen_t minlen; /* minimum allocation size (blocks) */
+ xfs_extlen_t minleft; /* amount must be left after alloc */
+ bool eof; /* set if allocating past last extent */
+ bool wasdel; /* replacing a delayed allocation */
+ bool aeof; /* allocated space at eof */
+ bool conv; /* overwriting unwritten extents */
+ int datatype;/* data type being allocated */
+ uint32_t flags;
+};
+
+#define XFS_BMAP_MAX_NMAP 4
+
+/*
+ * Flags for xfs_bmapi_*
+ */
+#define XFS_BMAPI_ENTIRE (1u << 0) /* return entire extent untrimmed */
+#define XFS_BMAPI_METADATA (1u << 1) /* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK (1u << 2) /* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC (1u << 3) /* preallocating unwritten space */
+#define XFS_BMAPI_CONTIG (1u << 4) /* must allocate only one extent */
+/*
+ * unwritten extent conversion - this needs write cache flushing and no additional
+ * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
+ * from written to unwritten, otherwise it converts from unwritten to written.
+ */
+#define XFS_BMAPI_CONVERT (1u << 5)
+
+/*
+ * allocate zeroed extents - this requires all newly allocated user data extents
+ * to be initialised to zero. It will be ignored if XFS_BMAPI_METADATA is set.
+ * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
+ * during the allocation range to zeroed written extents.
+ */
+#define XFS_BMAPI_ZERO (1u << 6)
+
+/*
+ * Map the inode offset to the block given in ap->firstblock. Primarily
+ * used for reflink. The range must be in a hole, and this flag cannot be
+ * turned on with PREALLOC or CONVERT, and cannot be used on the attr fork.
+ *
+ * For bunmapi, this flag unmaps the range without adjusting quota, reducing
+ * refcount, or freeing the blocks.
+ */
+#define XFS_BMAPI_REMAP (1u << 7)
+
+/* Map something in the CoW fork. */
+#define XFS_BMAPI_COWFORK (1u << 8)
+
+/* Skip online discard of freed extents */
+#define XFS_BMAPI_NODISCARD (1u << 9)
+
+/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
+#define XFS_BMAPI_NORMAP (1u << 10)
+
+#define XFS_BMAPI_FLAGS \
+ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
+ { XFS_BMAPI_METADATA, "METADATA" }, \
+ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
+ { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
+ { XFS_BMAPI_CONTIG, "CONTIG" }, \
+ { XFS_BMAPI_CONVERT, "CONVERT" }, \
+ { XFS_BMAPI_ZERO, "ZERO" }, \
+ { XFS_BMAPI_REMAP, "REMAP" }, \
+ { XFS_BMAPI_COWFORK, "COWFORK" }, \
+ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \
+ { XFS_BMAPI_NORMAP, "NORMAP" }
+
+
+static inline int xfs_bmapi_aflag(int w)
+{
+ return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK :
+ (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0));
+}
+
+static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags)
+{
+ if (bmapi_flags & XFS_BMAPI_COWFORK)
+ return XFS_COW_FORK;
+ else if (bmapi_flags & XFS_BMAPI_ATTRFORK)
+ return XFS_ATTR_FORK;
+ return XFS_DATA_FORK;
+}
+
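+/*
+ * For example (purely illustrative):
+ *
+ *	xfs_bmapi_whichfork(0)				== XFS_DATA_FORK
+ *	xfs_bmapi_whichfork(XFS_BMAPI_ATTRFORK)		== XFS_ATTR_FORK
+ *	xfs_bmapi_whichfork(XFS_BMAPI_COWFORK)		== XFS_COW_FORK
+ *
+ * and xfs_bmapi_aflag() is the inverse mapping from fork to flag.
+ */
+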
+/*
+ * Special values for xfs_bmbt_irec_t br_startblock field.
+ */
+#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL)
+#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL)
+
+/*
+ * Flags for xfs_bmap_add_extent*.
+ */
+#define BMAP_LEFT_CONTIG (1u << 0)
+#define BMAP_RIGHT_CONTIG (1u << 1)
+#define BMAP_LEFT_FILLING (1u << 2)
+#define BMAP_RIGHT_FILLING (1u << 3)
+#define BMAP_LEFT_DELAY (1u << 4)
+#define BMAP_RIGHT_DELAY (1u << 5)
+#define BMAP_LEFT_VALID (1u << 6)
+#define BMAP_RIGHT_VALID (1u << 7)
+#define BMAP_ATTRFORK (1u << 8)
+#define BMAP_COWFORK (1u << 9)
+
+#define XFS_BMAP_EXT_FLAGS \
+ { BMAP_LEFT_CONTIG, "LC" }, \
+ { BMAP_RIGHT_CONTIG, "RC" }, \
+ { BMAP_LEFT_FILLING, "LF" }, \
+ { BMAP_RIGHT_FILLING, "RF" }, \
+ { BMAP_ATTRFORK, "ATTR" }, \
+ { BMAP_COWFORK, "COW" }
+
+/* Return true if the extent is an allocated extent, written or not. */
+static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+{
+ return irec->br_startblock != HOLESTARTBLOCK &&
+ irec->br_startblock != DELAYSTARTBLOCK &&
+ !isnullstartblock(irec->br_startblock);
+}
+
+/*
+ * Return true if the extent is a real, allocated extent, or false if it is a
+ * delayed allocation, an unwritten extent or a hole.
+ */
+static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec)
+{
+ return xfs_bmap_is_real_extent(irec) &&
+ irec->br_state != XFS_EXT_UNWRITTEN;
+}
+
+/*
+ * Check the mapping for obviously garbage allocations that could trash the
+ * filesystem immediately.
+ */
+#define xfs_valid_startblock(ip, startblock) \
+ ((startblock) != 0 || XFS_IS_REALTIME_INODE(ip))
+
+void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
+ xfs_filblks_t len);
+unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp);
+int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork);
+void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
+int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
+int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t *last_block, int whichfork);
+int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
+ int whichfork);
+int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
+ xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+ int *nmap, uint32_t flags);
+int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
+ xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap);
+int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags,
+ xfs_extnum_t nexts);
+int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
+ xfs_extnum_t nexts, int *done);
+int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del);
+void xfs_bmap_del_extent_cow(struct xfs_inode *ip,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del);
+uint xfs_default_attroffset(struct xfs_inode *ip);
+int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
+ bool *done);
+int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off,
+ xfs_fileoff_t shift);
+int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
+ bool *done, xfs_fileoff_t stop_fsb);
+int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t split_offset);
+int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
+ xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
+ struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
+ int eof);
+int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
+ xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
+int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
+ struct xfs_bmbt_irec *new, int *logflagsp);
+
+enum xfs_bmap_intent_type {
+ XFS_BMAP_MAP = 1,
+ XFS_BMAP_UNMAP,
+};
+
+struct xfs_bmap_intent {
+ struct list_head bi_list;
+ enum xfs_bmap_intent_type bi_type;
+ int bi_whichfork;
+ struct xfs_inode *bi_owner;
+ struct xfs_bmbt_irec bi_bmap;
+};
+
+int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip,
+ enum xfs_bmap_intent_type type, int whichfork,
+ xfs_fileoff_t startoff, xfs_fsblock_t startblock,
+ xfs_filblks_t *blockcount, xfs_exntst_t state);
+void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap);
+void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap);
+
+static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
+{
+ switch (whichfork) {
+ case XFS_ATTR_FORK:
+ return BMAP_ATTRFORK;
+ case XFS_COW_FORK:
+ return BMAP_COWFORK;
+ default:
+ return 0;
+ }
+}
+
+xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
+ struct xfs_bmbt_irec *irec);
+
+int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
+ uint32_t flags);
+
+extern struct kmem_cache *xfs_bmap_intent_cache;
+
+int __init xfs_bmap_intent_init_cache(void);
+void xfs_bmap_intent_destroy_cache(void);
+
+#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
new file mode 100644
index 000000000..cfa052d40
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -0,0 +1,703 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_rmap.h"
+
+static struct kmem_cache *xfs_bmbt_cur_cache;
+
+/*
+ * Convert on-disk form of btree root to in-memory form.
+ */
+void
+xfs_bmdr_to_bmbt(
+ struct xfs_inode *ip,
+ xfs_bmdr_block_t *dblock,
+ int dblocklen,
+ struct xfs_btree_block *rblock,
+ int rblocklen)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int dmxr;
+ xfs_bmbt_key_t *fkp;
+ __be64 *fpp;
+ xfs_bmbt_key_t *tkp;
+ __be64 *tpp;
+
+ xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+ XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS);
+ rblock->bb_level = dblock->bb_level;
+ ASSERT(be16_to_cpu(rblock->bb_level) > 0);
+ rblock->bb_numrecs = dblock->bb_numrecs;
+ dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
+ fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+ tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+ fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+ tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+ dmxr = be16_to_cpu(dblock->bb_numrecs);
+ memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
+ memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
+}
+
+void
+xfs_bmbt_disk_get_all(
+ const struct xfs_bmbt_rec *rec,
+ struct xfs_bmbt_irec *irec)
+{
+ uint64_t l0 = get_unaligned_be64(&rec->l0);
+ uint64_t l1 = get_unaligned_be64(&rec->l1);
+
+ irec->br_startoff = (l0 & xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+ irec->br_startblock = ((l0 & xfs_mask64lo(9)) << 43) | (l1 >> 21);
+ irec->br_blockcount = l1 & xfs_mask64lo(21);
+ if (l0 >> (64 - BMBT_EXNTFLAG_BITLEN))
+ irec->br_state = XFS_EXT_UNWRITTEN;
+ else
+ irec->br_state = XFS_EXT_NORM;
+}
+
+/*
+ * Extract the blockcount field from an on disk bmap extent record.
+ */
+xfs_filblks_t
+xfs_bmbt_disk_get_blockcount(
+ const struct xfs_bmbt_rec *r)
+{
+ return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
+}
+
+/*
+ * Extract the startoff field from a disk format bmap extent record.
+ */
+xfs_fileoff_t
+xfs_bmbt_disk_get_startoff(
+ const struct xfs_bmbt_rec *r)
+{
+ return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
+ xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+}
+
+/*
+ * Set all the fields in a bmap extent record from the uncompressed form.
+ */
+void
+xfs_bmbt_disk_set_all(
+ struct xfs_bmbt_rec *r,
+ struct xfs_bmbt_irec *s)
+{
+ int extent_flag = (s->br_state != XFS_EXT_NORM);
+
+ ASSERT(s->br_state == XFS_EXT_NORM || s->br_state == XFS_EXT_UNWRITTEN);
+ ASSERT(!(s->br_startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)));
+ ASSERT(!(s->br_blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)));
+ ASSERT(!(s->br_startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)));
+
+ put_unaligned_be64(
+ ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+ ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
+ ((xfs_bmbt_rec_base_t)s->br_startblock >> 43), &r->l0);
+ put_unaligned_be64(
+ ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
+ ((xfs_bmbt_rec_base_t)s->br_blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21)), &r->l1);
+}
+
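+/*
+ * For reference, the packing done by xfs_bmbt_disk_set_all() above (and
+ * undone by xfs_bmbt_disk_get_all()) lays the fields out as:
+ *
+ *	l0: bit  63	- extent flag (1 = unwritten)
+ *	    bits 62..9	- br_startoff (54 bits)
+ *	    bits 8..0	- br_startblock, high 9 bits
+ *	l1: bits 63..21	- br_startblock, low 43 bits
+ *	    bits 20..0	- br_blockcount (21 bits)
+ */
+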
+/*
+ * Convert in-memory form of btree root to on-disk form.
+ */
+void
+xfs_bmbt_to_bmdr(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *rblock,
+ int rblocklen,
+ xfs_bmdr_block_t *dblock,
+ int dblocklen)
+{
+ int dmxr;
+ xfs_bmbt_key_t *fkp;
+ __be64 *fpp;
+ xfs_bmbt_key_t *tkp;
+ __be64 *tpp;
+
+ if (xfs_has_crc(mp)) {
+ ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
+ ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid,
+ &mp->m_sb.sb_meta_uuid));
+ ASSERT(rblock->bb_u.l.bb_blkno ==
+ cpu_to_be64(XFS_BUF_DADDR_NULL));
+ } else
+ ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
+ ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
+ ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
+ ASSERT(rblock->bb_level != 0);
+ dblock->bb_level = rblock->bb_level;
+ dblock->bb_numrecs = rblock->bb_numrecs;
+ dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
+ fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+ tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+ fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+ tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+ dmxr = be16_to_cpu(dblock->bb_numrecs);
+ memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
+ memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
+}
+
+STATIC struct xfs_btree_cur *
+xfs_bmbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_btree_cur *new;
+
+ new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_ino.ip, cur->bc_ino.whichfork);
+
+ /*
+	 * Copy the flags value, since init cursor doesn't get it; the
+	 * firstblock now lives in the transaction shared by both cursors.
+ */
+ new->bc_ino.flags = cur->bc_ino.flags;
+
+ return new;
+}
+
+STATIC void
+xfs_bmbt_update_cursor(
+ struct xfs_btree_cur *src,
+ struct xfs_btree_cur *dst)
+{
+ ASSERT((dst->bc_tp->t_firstblock != NULLFSBLOCK) ||
+ (dst->bc_ino.ip->i_diflags & XFS_DIFLAG_REALTIME));
+
+ dst->bc_ino.allocated += src->bc_ino.allocated;
+ dst->bc_tp->t_firstblock = src->bc_tp->t_firstblock;
+
+ src->bc_ino.allocated = 0;
+}
+
+STATIC int
+xfs_bmbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ xfs_alloc_arg_t args; /* block allocation args */
+ int error; /* error return value */
+
+ memset(&args, 0, sizeof(args));
+ args.tp = cur->bc_tp;
+ args.mp = cur->bc_mp;
+ args.fsbno = cur->bc_tp->t_firstblock;
+ xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino,
+ cur->bc_ino.whichfork);
+
+ if (args.fsbno == NULLFSBLOCK) {
+ args.fsbno = be64_to_cpu(start->l);
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ /*
+ * Make sure there is sufficient room left in the AG to
+ * complete a full tree split for an extent insert. If
+ * we are converting the middle part of an extent then
+ * we may need space for two tree splits.
+ *
+ * We are relying on the caller to make the correct block
+ * reservation for this operation to succeed. If the
+ * reservation amount is insufficient then we may fail a
+ * block allocation here and corrupt the filesystem.
+ */
+ args.minleft = args.tp->t_blk_res;
+ } else if (cur->bc_tp->t_flags & XFS_TRANS_LOWMODE) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ }
+
+ args.minlen = args.maxlen = args.prod = 1;
+ args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL;
+ if (!args.wasdel && args.tp->t_blk_res == 0) {
+ error = -ENOSPC;
+ goto error0;
+ }
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto error0;
+
+ if (args.fsbno == NULLFSBLOCK && args.minleft) {
+ /*
+ * Could not find an AG with enough free space to satisfy
+		 * a full btree split. Try again and, if successful,
+		 * activate the lowspace algorithm.
+ */
+ args.fsbno = 0;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto error0;
+ cur->bc_tp->t_flags |= XFS_TRANS_LOWMODE;
+ }
+ if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
+ *stat = 0;
+ return 0;
+ }
+
+ ASSERT(args.len == 1);
+ cur->bc_tp->t_firstblock = args.fsbno;
+ cur->bc_ino.allocated++;
+ cur->bc_ino.ip->i_nblocks++;
+ xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE);
+ xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip,
+ XFS_TRANS_DQ_BCOUNT, 1L);
+
+ new->l = cpu_to_be64(args.fsbno);
+
+ *stat = 1;
+ return 0;
+
+ error0:
+ return error;
+}
+
+STATIC int
+xfs_bmbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip = cur->bc_ino.ip;
+ struct xfs_trans *tp = cur->bc_tp;
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ struct xfs_owner_info oinfo;
+
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
+ xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo);
+ ip->i_nblocks--;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+ return 0;
+}
+
+STATIC int
+xfs_bmbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip,
+ cur->bc_ino.whichfork);
+
+ return xfs_bmbt_maxrecs(cur->bc_mp,
+ ifp->if_broot_bytes, level == 0) / 2;
+ }
+
+ return cur->bc_mp->m_bmap_dmnr[level != 0];
+}
+
+int
+xfs_bmbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip,
+ cur->bc_ino.whichfork);
+
+ return xfs_bmbt_maxrecs(cur->bc_mp,
+ ifp->if_broot_bytes, level == 0);
+ }
+
+ return cur->bc_mp->m_bmap_dmxr[level != 0];
+
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork
+ * so that we can resize the in-memory buffer to match it. After a
+ * resize to the maximum size this function returns the same value
+ * as xfs_bmbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_bmbt_get_dmaxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level != cur->bc_nlevels - 1)
+ return cur->bc_mp->m_bmap_dmxr[level != 0];
+ return xfs_bmdr_maxrecs(cur->bc_ino.forksize, level == 0);
+}
+
+STATIC void
+xfs_bmbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->bmbt.br_startoff =
+ cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
+}
+
+STATIC void
+xfs_bmbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->bmbt.br_startoff = cpu_to_be64(
+ xfs_bmbt_disk_get_startoff(&rec->bmbt) +
+ xfs_bmbt_disk_get_blockcount(&rec->bmbt) - 1);
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
+}
+
+STATIC void
+xfs_bmbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ ptr->l = 0;
+}
+
+STATIC int64_t
+xfs_bmbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ return (int64_t)be64_to_cpu(key->bmbt.br_startoff) -
+ cur->bc_rec.b.br_startoff;
+}
+
+STATIC int64_t
+xfs_bmbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ uint64_t a = be64_to_cpu(k1->bmbt.br_startoff);
+ uint64_t b = be64_to_cpu(k2->bmbt.br_startoff);
+
+ /*
+	 * Note: This routine previously cast a and b to int64 and subtracted
+	 * them to generate a result. This led to problems if b was the
+	 * "maximum" key value (all ones), which gets misinterpreted as a
+	 * negative number when signed; hence this somewhat less efficient
+	 * version.
+ */
+ if (a > b)
+ return 1;
+ if (b > a)
+ return -1;
+ return 0;
+}
+
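+/*
+ * Hypothetical example of the problem described above: with a = 5 and
+ * b = ~0ULL (the all-ones "maximum" key), (int64_t)(a - b) evaluates to 6,
+ * a positive value even though a < b as an unsigned comparison, so a
+ * subtraction-based compare would order the keys the wrong way around.
+ */
+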
+static xfs_failaddr_t
+xfs_bmbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ unsigned int level;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_has_crc(mp)) {
+ /*
+ * XXX: need a better way of verifying the owner here. Right now
+ * just make sure there has been one set.
+ */
+ fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+ }
+
+ /*
+ * numrecs and level verification.
+ *
+ * We don't know what fork we belong to, so just verify that the level
+ * is less than the maximum of the two. Later checks will be more
+ * precise.
+ */
+ level = be16_to_cpu(block->bb_level);
+ if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
+ return __this_address;
+
+ return xfs_btree_lblock_verify(bp, mp->m_bmap_dmxr[level != 0]);
+}
+
+static void
+xfs_bmbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_btree_lblock_verify_crc(bp))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_bmbt_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+
+ if (bp->b_error)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_bmbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_bmbt_verify(bp);
+ if (fa) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+ xfs_btree_lblock_calc_crc(bp);
+}
+
+const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+ .name = "xfs_bmbt",
+ .magic = { cpu_to_be32(XFS_BMAP_MAGIC),
+ cpu_to_be32(XFS_BMAP_CRC_MAGIC) },
+ .verify_read = xfs_bmbt_read_verify,
+ .verify_write = xfs_bmbt_write_verify,
+ .verify_struct = xfs_bmbt_verify,
+};
+
+
+STATIC int
+xfs_bmbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return be64_to_cpu(k1->bmbt.br_startoff) <
+ be64_to_cpu(k2->bmbt.br_startoff);
+}
+
+STATIC int
+xfs_bmbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
+ xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
+ xfs_bmbt_disk_get_startoff(&r2->bmbt);
+}
+
+static const struct xfs_btree_ops xfs_bmbt_ops = {
+ .rec_len = sizeof(xfs_bmbt_rec_t),
+ .key_len = sizeof(xfs_bmbt_key_t),
+
+ .dup_cursor = xfs_bmbt_dup_cursor,
+ .update_cursor = xfs_bmbt_update_cursor,
+ .alloc_block = xfs_bmbt_alloc_block,
+ .free_block = xfs_bmbt_free_block,
+ .get_maxrecs = xfs_bmbt_get_maxrecs,
+ .get_minrecs = xfs_bmbt_get_minrecs,
+ .get_dmaxrecs = xfs_bmbt_get_dmaxrecs,
+ .init_key_from_rec = xfs_bmbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
+ .key_diff = xfs_bmbt_key_diff,
+ .diff_two_keys = xfs_bmbt_diff_two_keys,
+ .buf_ops = &xfs_bmbt_buf_ops,
+ .keys_inorder = xfs_bmbt_keys_inorder,
+ .recs_inorder = xfs_bmbt_recs_inorder,
+};
+
+/*
+ * Allocate a new bmap btree cursor.
+ */
+struct xfs_btree_cur * /* new bmap btree cursor */
+xfs_bmbt_init_cursor(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* inode owning the btree */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_cur *cur;
+ ASSERT(whichfork != XFS_COW_FORK);
+
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP,
+ mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache);
+ cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2);
+
+ cur->bc_ops = &xfs_bmbt_ops;
+ cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
+ if (xfs_has_crc(mp))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+ cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
+ cur->bc_ino.ip = ip;
+ cur->bc_ino.allocated = 0;
+ cur->bc_ino.flags = 0;
+ cur->bc_ino.whichfork = whichfork;
+
+ return cur;
+}
+
+/* Calculate number of records in a block mapping btree block. */
+static inline unsigned int
+xfs_bmbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_bmbt_rec_t);
+ return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+}
+
+/*
+ * Calculate number of records in a bmap btree block.
+ */
+int
+xfs_bmbt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
+{
+ blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+ return xfs_bmbt_block_maxrecs(blocklen, leaf);
+}
+
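+/*
+ * Rough worked example (block and header sizes here are assumptions, not
+ * read from any superblock): with 4096-byte blocks and a 72-byte v5
+ * long-format block header, a leaf holds (4096 - 72) / 16 = 251 records,
+ * and an interior node holds (4096 - 72) / (8 + 8) = 251 key/pointer pairs.
+ */
+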
+/*
+ * Calculate the maximum possible height of the btree that the on-disk format
+ * supports. This is used for sizing structures large enough to support every
+ * possible configuration of a filesystem that might get mounted.
+ */
+unsigned int
+xfs_bmbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_bmbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2;
+
+ /* One extra level for the inode root. */
+ return xfs_btree_compute_maxlevels(minrecs,
+ XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1;
+}
+
+/*
+ * Calculate number of records in a bmap btree inode root.
+ */
+int
+xfs_bmdr_maxrecs(
+ int blocklen,
+ int leaf)
+{
+ blocklen -= sizeof(xfs_bmdr_block_t);
+
+ if (leaf)
+ return blocklen / sizeof(xfs_bmdr_rec_t);
+ return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
+}
+
+/*
+ * Change the owner of a btree format fork of the inode passed in. Change it
+ * to the owner that is passed in so that we can change owners before or after
+ * we switch forks between inodes. The operation that the caller is doing will
+ * determine whether it needs to change owner before or after the switch.
+ *
+ * For demand paged transactional modification, the fork switch should be done
+ * after reading in all the blocks, modifying them and pinning them in the
+ * transaction. For modification when the buffers are already pinned in memory,
+ * the fork switch can be done before changing the owner as we won't need to
+ * validate the owner until the btree buffers are unpinned and writes can occur
+ * again.
+ *
+ * For recovery based ownership change, there is no transactional context and
+ * so a buffer list must be supplied so that we can record the buffers that we
+ * modified for the caller to issue IO on.
+ */
+int
+xfs_bmbt_change_owner(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_ino_t new_owner,
+ struct list_head *buffer_list)
+{
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ASSERT(tp || buffer_list);
+ ASSERT(!(tp && buffer_list));
+ ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE);
+
+ cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
+ cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER;
+
+ error = xfs_btree_change_owner(cur, new_owner, buffer_list);
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/* Calculate the bmap btree size for some records. */
+unsigned long long
+xfs_bmbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_bmap_dmnr, len);
+}
+
+int __init
+xfs_bmbt_init_cur_cache(void)
+{
+ xfs_bmbt_cur_cache = kmem_cache_create("xfs_bmbt_cur",
+ xfs_btree_cur_sizeof(xfs_bmbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_bmbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_bmbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_bmbt_cur_cache);
+ xfs_bmbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
new file mode 100644
index 000000000..3e7a40a83
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_BMAP_BTREE_H__
+#define __XFS_BMAP_BTREE_H__
+
+struct xfs_btree_cur;
+struct xfs_btree_block;
+struct xfs_mount;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+#define XFS_BMBT_BLOCK_LEN(mp) \
+ (xfs_has_crc(((mp))) ? \
+ XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
+
+#define XFS_BMBT_REC_ADDR(mp, block, index) \
+ ((xfs_bmbt_rec_t *) \
+ ((char *)(block) + \
+ XFS_BMBT_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
+
+#define XFS_BMBT_KEY_ADDR(mp, block, index) \
+ ((xfs_bmbt_key_t *) \
+ ((char *)(block) + \
+ XFS_BMBT_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_bmbt_key_t)))
+
+#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
+ ((xfs_bmbt_ptr_t *) \
+ ((char *)(block) + \
+ XFS_BMBT_BLOCK_LEN(mp) + \
+ (maxrecs) * sizeof(xfs_bmbt_key_t) + \
+ ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
+
+#define XFS_BMDR_REC_ADDR(block, index) \
+ ((xfs_bmdr_rec_t *) \
+ ((char *)(block) + \
+ sizeof(struct xfs_bmdr_block) + \
+ ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
+
+#define XFS_BMDR_KEY_ADDR(block, index) \
+ ((xfs_bmdr_key_t *) \
+ ((char *)(block) + \
+ sizeof(struct xfs_bmdr_block) + \
+ ((index) - 1) * sizeof(xfs_bmdr_key_t)))
+
+#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
+ ((xfs_bmdr_ptr_t *) \
+ ((char *)(block) + \
+ sizeof(struct xfs_bmdr_block) + \
+ (maxrecs) * sizeof(xfs_bmdr_key_t) + \
+ ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
+
+/*
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
+ XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
+
+#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
+ (int)(XFS_BMBT_BLOCK_LEN(mp) + \
+ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+
+#define XFS_BMAP_BROOT_SPACE(mp, bb) \
+ (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))
+#define XFS_BMDR_SPACE_CALC(nrecs) \
+ (int)(sizeof(xfs_bmdr_block_t) + \
+ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+#define XFS_BMAP_BMDR_SPACE(bb) \
+ (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
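+
+/*
+ * Worked example, as a rough sketch: xfs_bmbt_key_t and xfs_bmbt_ptr_t are
+ * both 8 bytes, and on a v5 (CRC) filesystem XFS_BMBT_BLOCK_LEN() is the
+ * 72-byte long-form CRC header, so
+ *
+ *	XFS_BMAP_BROOT_SPACE_CALC(mp, 10) = 72 + 10 * 16 = 232 bytes
+ *
+ * while the on-disk root in the inode fork replaces the block header with the
+ * small xfs_bmdr_block_t (4 bytes):
+ *
+ *	XFS_BMDR_SPACE_CALC(10) = 4 + 10 * 16 = 164 bytes
+ *
+ * The size figures are stated for illustration only; the macros above are
+ * what the code relies on.
+ */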
+
+/*
+ * Maximum number of bmap btree levels.
+ */
+#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)])
+
+/*
+ * Prototypes for xfs_bmap.c to call.
+ */
+extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
+ struct xfs_btree_block *, int);
+
+void xfs_bmbt_disk_set_all(struct xfs_bmbt_rec *r, struct xfs_bmbt_irec *s);
+extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(const struct xfs_bmbt_rec *r);
+extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(const struct xfs_bmbt_rec *r);
+void xfs_bmbt_disk_get_all(const struct xfs_bmbt_rec *r,
+ struct xfs_bmbt_irec *s);
+
+extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
+ xfs_bmdr_block_t *, int);
+
+extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
+extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
+extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+
+extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, xfs_ino_t new_owner,
+ struct list_head *buffer_list);
+
+extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
+ struct xfs_trans *, struct xfs_inode *, int);
+
+extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+
+unsigned int xfs_bmbt_maxlevels_ondisk(void);
+
+int __init xfs_bmbt_init_cur_cache(void);
+void xfs_bmbt_destroy_cur_cache(void);
+
+#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
new file mode 100644
index 000000000..4c16c8c31
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -0,0 +1,5099 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_btree.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_alloc.h"
+#include "xfs_log.h"
+#include "xfs_btree_staging.h"
+#include "xfs_ag.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
+
+/*
+ * Btree magic numbers.
+ */
+static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
+ { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
+ XFS_FIBT_MAGIC, 0 },
+ { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
+ XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
+ XFS_REFC_CRC_MAGIC }
+};
+
+uint32_t
+xfs_btree_magic(
+ int crc,
+ xfs_btnum_t btnum)
+{
+ uint32_t magic = xfs_magics[crc][btnum];
+
+ /* Ensure we asked for crc for crc-only magics. */
+ ASSERT(magic != 0);
+ return magic;
+}
+
+/*
+ * These sibling pointer checks are optimised for null sibling pointers. This
+ * happens a lot, and we don't need to byte swap at runtime if the sibling
+ * pointer is NULL.
+ *
+ * These are explicitly marked as inline because the cost of calling them as
+ * functions instead of inlining them is about 36 bytes extra code per call site
+ * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these
+ * two sibling check functions reduces the compiled code size by over 300
+ * bytes.
+ */
+static inline xfs_failaddr_t
+xfs_btree_check_lblock_siblings(
+ struct xfs_mount *mp,
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_fsblock_t fsb,
+ __be64 dsibling)
+{
+ xfs_fsblock_t sibling;
+
+ if (dsibling == cpu_to_be64(NULLFSBLOCK))
+ return NULL;
+
+ sibling = be64_to_cpu(dsibling);
+ if (sibling == fsb)
+ return __this_address;
+ if (level >= 0) {
+ if (!xfs_btree_check_lptr(cur, sibling, level + 1))
+ return __this_address;
+ } else {
+ if (!xfs_verify_fsbno(mp, sibling))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+static inline xfs_failaddr_t
+xfs_btree_check_sblock_siblings(
+ struct xfs_perag *pag,
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_agblock_t agbno,
+ __be32 dsibling)
+{
+ xfs_agblock_t sibling;
+
+ if (dsibling == cpu_to_be32(NULLAGBLOCK))
+ return NULL;
+
+ sibling = be32_to_cpu(dsibling);
+ if (sibling == agbno)
+ return __this_address;
+ if (level >= 0) {
+ if (!xfs_btree_check_sptr(cur, sibling, level + 1))
+ return __this_address;
+ } else {
+ if (!xfs_verify_agbno(pag, sibling))
+ return __this_address;
+ }
+ return NULL;
+}
+
+/*
+ * Check a long btree block header. Return the address of the failing check,
+ * or NULL if everything is ok.
+ */
+xfs_failaddr_t
+__xfs_btree_check_lblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_btnum_t btnum = cur->bc_btnum;
+ int crc = xfs_has_crc(mp);
+ xfs_failaddr_t fa;
+ xfs_fsblock_t fsb = NULLFSBLOCK;
+
+ if (crc) {
+ if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (block->bb_u.l.bb_blkno !=
+ cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
+ return __this_address;
+ if (block->bb_u.l.bb_pad != cpu_to_be32(0))
+ return __this_address;
+ }
+
+ if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+ return __this_address;
+ if (be16_to_cpu(block->bb_level) != level)
+ return __this_address;
+ if (be16_to_cpu(block->bb_numrecs) >
+ cur->bc_ops->get_maxrecs(cur, level))
+ return __this_address;
+
+ if (bp)
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+
+ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ block->bb_u.l.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ block->bb_u.l.bb_rightsib);
+ return fa;
+}
+
+/* Check a long btree block header. */
+static int
+xfs_btree_check_lblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_failaddr_t fa;
+
+ fa = __xfs_btree_check_lblock(cur, block, level, bp);
+ if (XFS_IS_CORRUPT(mp, fa != NULL) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) {
+ if (bp)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+/*
+ * Check a short btree block header. Return the address of the failing check,
+ * or NULL if everything is ok.
+ */
+xfs_failaddr_t
+__xfs_btree_check_sblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+ xfs_btnum_t btnum = cur->bc_btnum;
+ int crc = xfs_has_crc(mp);
+ xfs_failaddr_t fa;
+ xfs_agblock_t agbno = NULLAGBLOCK;
+
+ if (crc) {
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (block->bb_u.s.bb_blkno !=
+ cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
+ return __this_address;
+ }
+
+ if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+ return __this_address;
+ if (be16_to_cpu(block->bb_level) != level)
+ return __this_address;
+ if (be16_to_cpu(block->bb_numrecs) >
+ cur->bc_ops->get_maxrecs(cur, level))
+ return __this_address;
+
+ if (bp)
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+
+ fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+ block->bb_u.s.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+ block->bb_u.s.bb_rightsib);
+ return fa;
+}
+
+/* Check a short btree block header. */
+STATIC int
+xfs_btree_check_sblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_failaddr_t fa;
+
+ fa = __xfs_btree_check_sblock(cur, block, level, bp);
+ if (XFS_IS_CORRUPT(mp, fa != NULL) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) {
+ if (bp)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+/*
+ * Debug routine: check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_btree_block *block, /* generic btree block pointer */
+ int level, /* level of the btree block */
+ struct xfs_buf *bp) /* buffer containing block, if any */
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return xfs_btree_check_lblock(cur, block, level, bp);
+ else
+ return xfs_btree_check_sblock(cur, block, level, bp);
+}
+
+/* Check that this long pointer is valid and points within the fs. */
+bool
+xfs_btree_check_lptr(
+ struct xfs_btree_cur *cur,
+ xfs_fsblock_t fsbno,
+ int level)
+{
+ if (level <= 0)
+ return false;
+ return xfs_verify_fsbno(cur->bc_mp, fsbno);
+}
+
+/* Check that this short pointer is valid and points within the AG. */
+bool
+xfs_btree_check_sptr(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ int level)
+{
+ if (level <= 0)
+ return false;
+ return xfs_verify_agbno(cur->bc_ag.pag, agbno);
+}
+
+/*
+ * Check that a given (indexed) btree pointer at a certain level of a
+ * btree is valid and doesn't point past where it should.
+ */
+static int
+xfs_btree_check_ptr(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int index,
+ int level)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
+ level))
+ return 0;
+ xfs_err(cur->bc_mp,
+"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
+ cur->bc_ino.ip->i_ino,
+ cur->bc_ino.whichfork, cur->bc_btnum,
+ level, index);
+ } else {
+ if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
+ level))
+ return 0;
+ xfs_err(cur->bc_mp,
+"AG %u: Corrupt btree %d pointer at level %d index %d.",
+ cur->bc_ag.pag->pag_agno, cur->bc_btnum,
+ level, index);
+ }
+
+ return -EFSCORRUPTED;
+}
+
+#ifdef DEBUG
+# define xfs_btree_debug_check_ptr xfs_btree_check_ptr
+#else
+# define xfs_btree_debug_check_ptr(...) (0)
+#endif
+
+/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * long-form btree header.
+ *
+ * Prior to calculating the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modification was that made
+ * it to disk.
+ */
+void
+xfs_btree_lblock_calc_crc(
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ if (!xfs_has_crc(bp->b_mount))
+ return;
+ if (bip)
+ block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_lblock_verify_crc(
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (xfs_has_crc(mp)) {
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
+ return false;
+ return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+ }
+
+ return true;
+}
+
+/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * short-form btree header.
+ *
+ * Prior to calculating the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modification was that made
+ * it to disk.
+ */
+void
+xfs_btree_sblock_calc_crc(
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ if (!xfs_has_crc(bp->b_mount))
+ return;
+ if (bip)
+ block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_sblock_verify_crc(
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (xfs_has_crc(mp)) {
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
+ return false;
+ return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+ }
+
+ return true;
+}
+
+static int
+xfs_btree_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ int error;
+
+ error = cur->bc_ops->free_block(cur, bp);
+ if (!error) {
+ xfs_trans_binval(cur->bc_tp, bp);
+ XFS_BTREE_STATS_INC(cur, free);
+ }
+ return error;
+}
+
+/*
+ * Delete the btree cursor.
+ */
+void
+xfs_btree_del_cursor(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int error) /* del because of error */
+{
+ int i; /* btree level */
+
+ /*
+ * Clear the buffer pointers and release the buffers. If we're doing
+ * this because of an error, inspect all of the entries in the bc_levels
+ * array for buffers to be unlocked. This is because some of the btree
+ * code works from level n down to 0, and if we get an error along the
+ * way we won't have initialized all the entries down to 0.
+ */
+ for (i = 0; i < cur->bc_nlevels; i++) {
+ if (cur->bc_levels[i].bp)
+ xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp);
+ else if (!error)
+ break;
+ }
+
+ /*
+ * If we are doing a BMBT update, the number of unaccounted blocks
+ * allocated during this cursor lifetime should be zero. If it's not
+ * zero, then we should be shut down or on our way to shutdown due to
+ * cancelling a dirty transaction on error.
+ */
+ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
+ xfs_is_shutdown(cur->bc_mp) || error != 0);
+ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
+ kmem_free(cur->bc_ops);
+ if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
+ xfs_perag_put(cur->bc_ag.pag);
+ kmem_cache_free(cur->bc_cache, cur);
+}
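+
+/*
+ * Typical cursor lifecycle, as a minimal sketch; the inode, transaction and
+ * search key setup are assumed to have been done by the caller:
+ *
+ *	struct xfs_btree_cur	*cur;
+ *	int			stat, error;
+ *
+ *	cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, XFS_DATA_FORK);
+ *	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &stat);
+ *	...
+ *	xfs_btree_del_cursor(cur, error);
+ *
+ * Passing the final error status lets xfs_btree_del_cursor() release any
+ * buffers still held at intermediate levels after a mid-walk failure.
+ */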
+
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int /* error */
+xfs_btree_dup_cursor(
+ struct xfs_btree_cur *cur, /* input cursor */
+ struct xfs_btree_cur **ncur) /* output cursor */
+{
+ struct xfs_buf *bp; /* btree block's buffer pointer */
+ int error; /* error return value */
+ int i; /* level number of btree block */
+ xfs_mount_t *mp; /* mount structure for filesystem */
+ struct xfs_btree_cur *new; /* new cursor value */
+ xfs_trans_t *tp; /* transaction pointer, can be NULL */
+
+ tp = cur->bc_tp;
+ mp = cur->bc_mp;
+
+ /*
+ * Allocate a new cursor like the old one.
+ */
+ new = cur->bc_ops->dup_cursor(cur);
+
+ /*
+ * Copy the record currently in the cursor.
+ */
+ new->bc_rec = cur->bc_rec;
+
+ /*
+ * For each level current, re-get the buffer and copy the ptr value.
+ */
+ for (i = 0; i < new->bc_nlevels; i++) {
+ new->bc_levels[i].ptr = cur->bc_levels[i].ptr;
+ new->bc_levels[i].ra = cur->bc_levels[i].ra;
+ bp = cur->bc_levels[i].bp;
+ if (bp) {
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ xfs_buf_daddr(bp), mp->m_bsize,
+ 0, &bp,
+ cur->bc_ops->buf_ops);
+ if (error) {
+ xfs_btree_del_cursor(new, error);
+ *ncur = NULL;
+ return error;
+ }
+ }
+ new->bc_levels[i].bp = bp;
+ }
+ *ncur = new;
+ return 0;
+}
+
+/*
+ * XFS btree block layout and addressing:
+ *
+ * There are two types of blocks in the btree: leaf and non-leaf blocks.
+ *
+ * A leaf block starts with a header, followed by records containing
+ * the values. A non-leaf block also starts with the same header, and
+ * then first contains lookup keys followed by an equal number of pointers
+ * to the btree blocks at the previous level.
+ *
+ * +--------+-------+-------+-------+-------+-------+-------+
+ * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
+ * +--------+-------+-------+-------+-------+-------+-------+
+ *
+ * +--------+-------+-------+-------+-------+-------+-------+
+ * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
+ * +--------+-------+-------+-------+-------+-------+-------+
+ *
+ * The header is called struct xfs_btree_block for reasons better left unknown
+ * and comes in different versions for short (32bit) and long (64bit) block
+ * pointers. The record and key structures are defined by the btree instances
+ * and opaque to the btree core. The block pointers are simple disk endian
+ * integers, available in a short (32bit) and long (64bit) variant.
+ *
+ * The helpers below calculate the offset of a given record, key or pointer
+ * into a btree block (xfs_btree_*_offset) or return a pointer to the given
+ * record, key or pointer (xfs_btree_*_addr). Note that all addressing
+ * inside the btree block is done using indices starting at one, not zero!
+ *
+ * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
+ * overlapping intervals. In such a tree, records are still sorted lowest to
+ * highest and indexed by the smallest key value that refers to the record.
+ * However, nodes are different: each pointer has two associated keys -- one
+ * indexing the lowest key available in the block(s) below (the same behavior
+ * as the key in a regular btree) and another indexing the highest key
+ * available in the block(s) below. Because records are /not/ sorted by the
+ * highest key, all leaf block updates require us to compute the highest key
+ * that matches any record in the leaf and to recursively update the high keys
+ * in the nodes going further up in the tree, if necessary. Nodes look like
+ * this:
+ *
+ * +--------+-----+-----+-----+-----+-----+-------+-------+-----+
+ * Non-Leaf: | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
+ * +--------+-----+-----+-----+-----+-----+-------+-------+-----+
+ *
+ * To perform an interval query on an overlapped tree, perform the usual
+ * depth-first search and use the low and high keys to decide if we can skip
+ * that particular node. If a leaf node is reached, return the records that
+ * intersect the interval. Note that an interval query may return numerous
+ * entries. For a non-overlapped tree, simply search for the record associated
+ * with the lowest key and iterate forward until a non-matching record is
+ * found. Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
+ * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discusses this in
+ * more detail.
+ *
+ * Why do we care about overlapping intervals? Let's say you have a bunch of
+ * reverse mapping records on a reflink filesystem:
+ *
+ * 1: +- file A startblock B offset C length D -----------+
+ * 2: +- file E startblock F offset G length H --------------+
+ * 3: +- file I startblock F offset J length K --+
+ * 4: +- file L... --+
+ *
+ * Now say we want to map block (B+D) into file A at offset (C+D). Ideally,
+ * we'd simply increment the length of record 1. But how do we find the record
+ * that ends at (B+D-1) (i.e. record 1)? A LE lookup of (B+D-1) would return
+ * record 3 because the keys are ordered first by startblock. An interval
+ * query would return records 1 and 2 because they both overlap (B+D-1), and
+ * from that we can pick out record 1 as the appropriate left neighbor.
+ *
+ * In the non-overlapped case you can do a LE lookup and decrement the cursor
+ * because a record's interval must end before the next record.
+ */
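+
+/*
+ * A small illustrative sketch of the pruning test an interval query performs
+ * at each node of an overlapped tree.  The helper below is hypothetical and
+ * uses plain integers; real keys are per-btree structures compared with the
+ * diff_two_keys method.
+ *
+ *	static bool subtree_may_intersect(uint64_t lo, uint64_t hi,
+ *					  uint64_t qlo, uint64_t qhi)
+ *	{
+ *		return !(hi < qlo || lo > qhi);
+ *	}
+ *
+ * In the reverse mapping example above, a query for the single block (B+D-1)
+ * descends only into subtrees whose [lo, hi] range covers that block, which
+ * is how records 1 and 2 are both returned.
+ */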
+
+/*
+ * Return size of the btree block header for this btree instance.
+ */
+static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ return XFS_BTREE_LBLOCK_CRC_LEN;
+ return XFS_BTREE_LBLOCK_LEN;
+ }
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ return XFS_BTREE_SBLOCK_CRC_LEN;
+ return XFS_BTREE_SBLOCK_LEN;
+}
+
+/*
+ * Return size of btree block pointers for this btree instance.
+ */
+static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
+{
+ return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+ sizeof(__be64) : sizeof(__be32);
+}
+
+/*
+ * Calculate offset of the n-th record in a btree block.
+ */
+STATIC size_t
+xfs_btree_rec_offset(
+ struct xfs_btree_cur *cur,
+ int n)
+{
+ return xfs_btree_block_len(cur) +
+ (n - 1) * cur->bc_ops->rec_len;
+}
+
+/*
+ * Calculate offset of the n-th key in a btree block.
+ */
+STATIC size_t
+xfs_btree_key_offset(
+ struct xfs_btree_cur *cur,
+ int n)
+{
+ return xfs_btree_block_len(cur) +
+ (n - 1) * cur->bc_ops->key_len;
+}
+
+/*
+ * Calculate offset of the n-th high key in a btree block.
+ */
+STATIC size_t
+xfs_btree_high_key_offset(
+ struct xfs_btree_cur *cur,
+ int n)
+{
+ return xfs_btree_block_len(cur) +
+ (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
+}
+
+/*
+ * Calculate offset of the n-th block pointer in a btree block.
+ */
+STATIC size_t
+xfs_btree_ptr_offset(
+ struct xfs_btree_cur *cur,
+ int n,
+ int level)
+{
+ return xfs_btree_block_len(cur) +
+ cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
+ (n - 1) * xfs_btree_ptr_len(cur);
+}
+
+/*
+ * Return a pointer to the n-th record in the btree block.
+ */
+union xfs_btree_rec *
+xfs_btree_rec_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ return (union xfs_btree_rec *)
+ ((char *)block + xfs_btree_rec_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th key in the btree block.
+ */
+union xfs_btree_key *
+xfs_btree_key_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ return (union xfs_btree_key *)
+ ((char *)block + xfs_btree_key_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th high key in the btree block.
+ */
+union xfs_btree_key *
+xfs_btree_high_key_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ return (union xfs_btree_key *)
+ ((char *)block + xfs_btree_high_key_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th block pointer in the btree block.
+ */
+union xfs_btree_ptr *
+xfs_btree_ptr_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ int level = xfs_btree_get_level(block);
+
+ ASSERT(block->bb_level != 0);
+
+ return (union xfs_btree_ptr *)
+ ((char *)block + xfs_btree_ptr_offset(cur, n, level));
+}
+
+struct xfs_ifork *
+xfs_btree_ifork_ptr(
+ struct xfs_btree_cur *cur)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+ if (cur->bc_flags & XFS_BTREE_STAGING)
+ return cur->bc_ino.ifake->if_fork;
+ return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork);
+}
+
+/*
+ * Get the root block which is stored in the inode.
+ *
+ * For now this btree implementation assumes the btree root is always
+ * stored in the if_broot field of an inode fork.
+ */
+STATIC struct xfs_btree_block *
+xfs_btree_get_iroot(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+
+ return (struct xfs_btree_block *)ifp->if_broot;
+}
+
+/*
+ * Retrieve the block pointer from the cursor at the given level.
+ * This may be an inode btree root or from a buffer.
+ */
+struct xfs_btree_block * /* generic btree block pointer */
+xfs_btree_get_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level in btree */
+ struct xfs_buf **bpp) /* buffer containing the block */
+{
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level == cur->bc_nlevels - 1)) {
+ *bpp = NULL;
+ return xfs_btree_get_iroot(cur);
+ }
+
+ *bpp = cur->bc_levels[level].bp;
+ return XFS_BUF_TO_BLOCK(*bpp);
+}
+
+/*
+ * Change the cursor to point to the first record at the given level.
+ * Other levels are unaffected.
+ */
+STATIC int /* success=1, failure=0 */
+xfs_btree_firstrec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level) /* level to change */
+{
+ struct xfs_btree_block *block; /* generic btree block pointer */
+ struct xfs_buf *bp; /* buffer containing block */
+
+ /*
+ * Get the block pointer for this level.
+ */
+ block = xfs_btree_get_block(cur, level, &bp);
+ if (xfs_btree_check_block(cur, block, level, bp))
+ return 0;
+ /*
+ * It's empty, there is no such record.
+ */
+ if (!block->bb_numrecs)
+ return 0;
+ /*
+ * Set the ptr value to 1, that's the first record/key.
+ */
+ cur->bc_levels[level].ptr = 1;
+ return 1;
+}
+
+/*
+ * Change the cursor to point to the last record in the current block
+ * at the given level. Other levels are unaffected.
+ */
+STATIC int /* success=1, failure=0 */
+xfs_btree_lastrec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level) /* level to change */
+{
+ struct xfs_btree_block *block; /* generic btree block pointer */
+ struct xfs_buf *bp; /* buffer containing block */
+
+ /*
+ * Get the block pointer for this level.
+ */
+ block = xfs_btree_get_block(cur, level, &bp);
+ if (xfs_btree_check_block(cur, block, level, bp))
+ return 0;
+ /*
+ * It's empty, there is no such record.
+ */
+ if (!block->bb_numrecs)
+ return 0;
+ /*
+ * Set the ptr value to numrecs, that's the last record/key.
+ */
+ cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs);
+ return 1;
+}
+
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
+ */
+void
+xfs_btree_offsets(
+ uint32_t fields, /* bitmask of fields */
+ const short *offsets, /* table of field offsets */
+ int nbits, /* number of bits to inspect */
+ int *first, /* output: first byte offset */
+ int *last) /* output: last byte offset */
+{
+ int i; /* current bit number */
+ uint32_t imask; /* mask for current bit number */
+
+ ASSERT(fields != 0);
+ /*
+ * Find the lowest bit, which gives the first byte offset.
+ */
+ for (i = 0, imask = 1u; ; i++, imask <<= 1) {
+ if (imask & fields) {
+ *first = offsets[i];
+ break;
+ }
+ }
+ /*
+ * Find the highest bit, which gives the last byte offset.
+ */
+ for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) {
+ if (imask & fields) {
+ *last = offsets[i + 1] - 1;
+ break;
+ }
+ }
+}
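+
+/*
+ * Worked example, as a rough sketch: logging only the level and numrecs
+ * fields of a short-form block via
+ *
+ *	xfs_btree_log_block(cur, bp, XFS_BB_LEVEL | XFS_BB_NUMRECS);
+ *
+ * makes xfs_btree_offsets() return first = offsetof(bb_level) = 4 and
+ * last = offsetof(bb_u.s.bb_leftsib) - 1 = 7, so only bytes 4-7 of the block
+ * header are logged.  The byte values assume the usual 4-byte magic and
+ * 2-byte level/numrecs fields; XFS_BB_LEVEL and XFS_BB_NUMRECS are the field
+ * mask bits from xfs_btree.h.
+ */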
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int
+xfs_btree_read_bufl(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t fsbno, /* file system block number */
+ struct xfs_buf **bpp, /* buffer for fsbno */
+ int refval, /* ref count value for buffer */
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp; /* return value */
+ xfs_daddr_t d; /* real disk block address */
+ int error;
+
+ if (!xfs_verify_fsbno(mp, fsbno))
+ return -EFSCORRUPTED;
+ d = XFS_FSB_TO_DADDR(mp, fsbno);
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+ mp->m_bsize, 0, &bp, ops);
+ if (error)
+ return error;
+ if (bp)
+ xfs_buf_set_ref(bp, refval);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufl(
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_fsblock_t fsbno, /* file system block number */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops)
+{
+ xfs_daddr_t d;
+
+ ASSERT(fsbno != NULLFSBLOCK);
+ d = XFS_FSB_TO_DADDR(mp, fsbno);
+ xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufs(
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_agblock_t agbno, /* allocation group block number */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops)
+{
+ xfs_daddr_t d;
+
+ ASSERT(agno != NULLAGNUMBER);
+ ASSERT(agbno != NULLAGBLOCK);
+ d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
+}
+
+STATIC int
+xfs_btree_readahead_lblock(
+ struct xfs_btree_cur *cur,
+ int lr,
+ struct xfs_btree_block *block)
+{
+ int rval = 0;
+ xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+ xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+ if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
+ xfs_btree_reada_bufl(cur->bc_mp, left, 1,
+ cur->bc_ops->buf_ops);
+ rval++;
+ }
+
+ if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
+ xfs_btree_reada_bufl(cur->bc_mp, right, 1,
+ cur->bc_ops->buf_ops);
+ rval++;
+ }
+
+ return rval;
+}
+
+STATIC int
+xfs_btree_readahead_sblock(
+ struct xfs_btree_cur *cur,
+ int lr,
+ struct xfs_btree_block *block)
+{
+ int rval = 0;
+ xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib);
+ xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib);
+
+ if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ left, 1, cur->bc_ops->buf_ops);
+ rval++;
+ }
+
+ if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ right, 1, cur->bc_ops->buf_ops);
+ rval++;
+ }
+
+ return rval;
+}
+
+/*
+ * Read-ahead btree blocks, at the given level.
+ * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ */
+STATIC int
+xfs_btree_readahead(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int lev, /* level in btree */
+ int lr) /* left/right bits */
+{
+ struct xfs_btree_block *block;
+
+ /*
+ * No readahead needed if we are at the root level and the
+ * btree root is stored in the inode.
+ */
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (lev == cur->bc_nlevels - 1))
+ return 0;
+
+ if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
+ return 0;
+
+ cur->bc_levels[lev].ra |= lr;
+ block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return xfs_btree_readahead_lblock(cur, lr, block);
+ return xfs_btree_readahead_sblock(cur, lr, block);
+}
+
+STATIC int
+xfs_btree_ptr_to_daddr(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ xfs_daddr_t *daddr)
+{
+ xfs_fsblock_t fsbno;
+ xfs_agblock_t agbno;
+ int error;
+
+ error = xfs_btree_check_ptr(cur, ptr, 0, 1);
+ if (error)
+ return error;
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ fsbno = be64_to_cpu(ptr->l);
+ *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
+ } else {
+ agbno = be32_to_cpu(ptr->s);
+ *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ agbno);
+ }
+
+ return 0;
+}
+
+/*
+ * Readahead @count btree blocks at the given @ptr location.
+ *
+ * We don't need to care about long or short form btrees here as we have a
+ * method of converting the ptr directly to a daddr available to us.
+ */
+STATIC void
+xfs_btree_readahead_ptr(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ xfs_extlen_t count)
+{
+ xfs_daddr_t daddr;
+
+ if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
+ return;
+ xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
+ cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+}
+
+/*
+ * Set the buffer for level "lev" in the cursor to bp, releasing
+ * any previous buffer.
+ */
+STATIC void
+xfs_btree_setbuf(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int lev, /* level in btree */
+ struct xfs_buf *bp) /* new buffer to set */
+{
+ struct xfs_btree_block *b; /* btree block */
+
+ if (cur->bc_levels[lev].bp)
+ xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp);
+ cur->bc_levels[lev].bp = bp;
+ cur->bc_levels[lev].ra = 0;
+
+ b = XFS_BUF_TO_BLOCK(bp);
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
+ cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
+ if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
+ cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
+ } else {
+ if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
+ cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
+ if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
+ cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
+ }
+}
+
+bool
+xfs_btree_ptr_is_null(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return ptr->l == cpu_to_be64(NULLFSBLOCK);
+ else
+ return ptr->s == cpu_to_be32(NULLAGBLOCK);
+}
+
+void
+xfs_btree_set_ptr_null(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ ptr->l = cpu_to_be64(NULLFSBLOCK);
+ else
+ ptr->s = cpu_to_be32(NULLAGBLOCK);
+}
+
+/*
+ * Get/set/init sibling pointers
+ */
+void
+xfs_btree_get_sibling(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_ptr *ptr,
+ int lr)
+{
+ ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (lr == XFS_BB_RIGHTSIB)
+ ptr->l = block->bb_u.l.bb_rightsib;
+ else
+ ptr->l = block->bb_u.l.bb_leftsib;
+ } else {
+ if (lr == XFS_BB_RIGHTSIB)
+ ptr->s = block->bb_u.s.bb_rightsib;
+ else
+ ptr->s = block->bb_u.s.bb_leftsib;
+ }
+}
+
+void
+xfs_btree_set_sibling(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ const union xfs_btree_ptr *ptr,
+ int lr)
+{
+ ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (lr == XFS_BB_RIGHTSIB)
+ block->bb_u.l.bb_rightsib = ptr->l;
+ else
+ block->bb_u.l.bb_leftsib = ptr->l;
+ } else {
+ if (lr == XFS_BB_RIGHTSIB)
+ block->bb_u.s.bb_rightsib = ptr->s;
+ else
+ block->bb_u.s.bb_leftsib = ptr->s;
+ }
+}
+
+void
+xfs_btree_init_block_int(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *buf,
+ xfs_daddr_t blkno,
+ xfs_btnum_t btnum,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner,
+ unsigned int flags)
+{
+ int crc = xfs_has_crc(mp);
+ __u32 magic = xfs_btree_magic(crc, btnum);
+
+ buf->bb_magic = cpu_to_be32(magic);
+ buf->bb_level = cpu_to_be16(level);
+ buf->bb_numrecs = cpu_to_be16(numrecs);
+
+ if (flags & XFS_BTREE_LONG_PTRS) {
+ buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
+ buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
+ if (crc) {
+ buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
+ buf->bb_u.l.bb_owner = cpu_to_be64(owner);
+ uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
+ buf->bb_u.l.bb_pad = 0;
+ buf->bb_u.l.bb_lsn = 0;
+ }
+ } else {
+ /* owner is a 32 bit value on short blocks */
+ __u32 __owner = (__u32)owner;
+
+ buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+ buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+ if (crc) {
+ buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
+ buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
+ uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
+ buf->bb_u.s.bb_lsn = 0;
+ }
+ }
+}
+
+void
+xfs_btree_init_block(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_btnum_t btnum,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner)
+{
+ xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp),
+ btnum, level, numrecs, owner, 0);
+}
+
+void
+xfs_btree_init_block_cur(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int level,
+ int numrecs)
+{
+ __u64 owner;
+
+ /*
+ * We can pull the owner from the cursor right now as the different
+ * owners align directly with the pointer size of the btree. This may
+ * change in the future, but is safe for current users of the generic
+ * btree code.
+ */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ owner = cur->bc_ino.ip->i_ino;
+ else
+ owner = cur->bc_ag.pag->pag_agno;
+
+ xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp),
+ xfs_buf_daddr(bp), cur->bc_btnum, level,
+ numrecs, owner, cur->bc_flags);
+}
+
+/*
+ * Return true if ptr is the last record in the btree and
+ * we need to track updates to this record. The decision
+ * will be further refined in the update_lastrec method.
+ */
+STATIC int
+xfs_btree_is_lastrec(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level)
+{
+ union xfs_btree_ptr ptr;
+
+ if (level > 0)
+ return 0;
+ if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
+ return 0;
+
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ if (!xfs_btree_ptr_is_null(cur, &ptr))
+ return 0;
+ return 1;
+}
+
+STATIC void
+xfs_btree_buf_to_ptr(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+ xfs_buf_daddr(bp)));
+ else {
+ ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
+ xfs_buf_daddr(bp)));
+ }
+}
+
+STATIC void
+xfs_btree_set_refs(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ switch (cur->bc_btnum) {
+ case XFS_BTNUM_BNO:
+ case XFS_BTNUM_CNT:
+ xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
+ break;
+ case XFS_BTNUM_INO:
+ case XFS_BTNUM_FINO:
+ xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
+ break;
+ case XFS_BTNUM_BMAP:
+ xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
+ break;
+ case XFS_BTNUM_RMAP:
+ xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
+ break;
+ case XFS_BTNUM_REFC:
+ xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
+ break;
+ default:
+ ASSERT(0);
+ }
+}
+
+int
+xfs_btree_get_buf_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ struct xfs_btree_block **block,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_daddr_t d;
+ int error;
+
+ error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
+ if (error)
+ return error;
+ error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize,
+ 0, bpp);
+ if (error)
+ return error;
+
+ (*bpp)->b_ops = cur->bc_ops->buf_ops;
+ *block = XFS_BUF_TO_BLOCK(*bpp);
+ return 0;
+}
+
+/*
+ * Read in the buffer at the given ptr and return the buffer and
+ * the block pointer within the buffer.
+ */
+STATIC int
+xfs_btree_read_buf_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int flags,
+ struct xfs_btree_block **block,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_daddr_t d;
+ int error;
+
+ /* need to sort out how callers deal with failures first */
+ ASSERT(!(flags & XBF_TRYLOCK));
+
+ error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
+ if (error)
+ return error;
+ error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
+ mp->m_bsize, flags, bpp,
+ cur->bc_ops->buf_ops);
+ if (error)
+ return error;
+
+ xfs_btree_set_refs(cur, *bpp);
+ *block = XFS_BUF_TO_BLOCK(*bpp);
+ return 0;
+}
+
+/*
+ * Copy keys from one btree block to another.
+ */
+void
+xfs_btree_copy_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *dst_key,
+ const union xfs_btree_key *src_key,
+ int numkeys)
+{
+ ASSERT(numkeys >= 0);
+ memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Copy records from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_recs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *dst_rec,
+ union xfs_btree_rec *src_rec,
+ int numrecs)
+{
+ ASSERT(numrecs >= 0);
+ memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Copy block pointers from one btree block to another.
+ */
+void
+xfs_btree_copy_ptrs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *dst_ptr,
+ const union xfs_btree_ptr *src_ptr,
+ int numptrs)
+{
+ ASSERT(numptrs >= 0);
+ memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Shift keys one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key,
+ int dir,
+ int numkeys)
+{
+ char *dst_key;
+
+ ASSERT(numkeys >= 0);
+ ASSERT(dir == 1 || dir == -1);
+
+ dst_key = (char *)key + (dir * cur->bc_ops->key_len);
+ memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Shift records one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_recs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ int dir,
+ int numrecs)
+{
+ char *dst_rec;
+
+ ASSERT(numrecs >= 0);
+ ASSERT(dir == 1 || dir == -1);
+
+ dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
+ memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Shift block pointers one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_ptrs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int dir,
+ int numptrs)
+{
+ char *dst_ptr;
+
+ ASSERT(numptrs >= 0);
+ ASSERT(dir == 1 || dir == -1);
+
+ dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
+ memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Log key values from the btree block.
+ */
+STATIC void
+xfs_btree_log_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int first,
+ int last)
+{
+
+ if (bp) {
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+ xfs_trans_log_buf(cur->bc_tp, bp,
+ xfs_btree_key_offset(cur, first),
+ xfs_btree_key_offset(cur, last + 1) - 1);
+ } else {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
+ }
+}
+
+/*
+ * Log record values from the btree block.
+ */
+void
+xfs_btree_log_recs(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int first,
+ int last)
+{
+
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+ xfs_trans_log_buf(cur->bc_tp, bp,
+ xfs_btree_rec_offset(cur, first),
+ xfs_btree_rec_offset(cur, last + 1) - 1);
+
+}
+
+/*
+ * Log block pointer fields from a btree block (nonleaf).
+ */
+STATIC void
+xfs_btree_log_ptrs(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_buf *bp, /* buffer containing btree block */
+ int first, /* index of first pointer to log */
+ int last) /* index of last pointer to log */
+{
+
+ if (bp) {
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ int level = xfs_btree_get_level(block);
+
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+ xfs_trans_log_buf(cur->bc_tp, bp,
+ xfs_btree_ptr_offset(cur, first, level),
+ xfs_btree_ptr_offset(cur, last + 1, level) - 1);
+ } else {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
+ }
+
+}
+
+/*
+ * Log fields from a btree block header.
+ */
+void
+xfs_btree_log_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_buf *bp, /* buffer containing btree block */
+ uint32_t fields) /* mask of fields: XFS_BB_... */
+{
+ int first; /* first byte offset logged */
+ int last; /* last byte offset logged */
+ static const short soffsets[] = { /* table of offsets (short) */
+ offsetof(struct xfs_btree_block, bb_magic),
+ offsetof(struct xfs_btree_block, bb_level),
+ offsetof(struct xfs_btree_block, bb_numrecs),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
+ XFS_BTREE_SBLOCK_CRC_LEN
+ };
+ static const short loffsets[] = { /* table of offsets (long) */
+ offsetof(struct xfs_btree_block, bb_magic),
+ offsetof(struct xfs_btree_block, bb_level),
+ offsetof(struct xfs_btree_block, bb_numrecs),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
+ XFS_BTREE_LBLOCK_CRC_LEN
+ };
+
+ if (bp) {
+ int nbits;
+
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ /*
+ * We don't log the CRC when updating a btree
+ * block but instead recreate it during log
+ * recovery. As the log buffers have checksums
+ * of their own this is safe and avoids logging a crc
+ * update in a lot of places.
+ */
+ if (fields == XFS_BB_ALL_BITS)
+ fields = XFS_BB_ALL_BITS_CRC;
+ nbits = XFS_BB_NUM_BITS_CRC;
+ } else {
+ nbits = XFS_BB_NUM_BITS;
+ }
+ xfs_btree_offsets(fields,
+ (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+ loffsets : soffsets,
+ nbits, &first, &last);
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+ xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ } else {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
+ }
+}
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int /* error */
+xfs_btree_increment(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block;
+ union xfs_btree_ptr ptr;
+ struct xfs_buf *bp;
+ int error; /* error return value */
+ int lev;
+
+ ASSERT(level < cur->bc_nlevels);
+
+ /* Read-ahead to the right at this level. */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+ /* Get a pointer to the btree block. */
+ block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /* We're done if we remain in the block after the increment. */
+ if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block))
+ goto out1;
+
+ /* Fail if we just went off the right edge of the tree. */
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &ptr))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, increment);
+
+ /*
+ * March up the tree incrementing pointers.
+ * Stop when we don't go off the right edge of a block.
+ */
+ for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+ block = xfs_btree_get_block(cur, lev, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, lev, bp);
+ if (error)
+ goto error0;
+#endif
+
+ if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block))
+ break;
+
+ /* Read-ahead the right block for the next loop. */
+ xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+ }
+
+ /*
+ * If we went off the root then we are either seriously
+ * confused or have the tree root in an inode.
+ */
+ if (lev == cur->bc_nlevels) {
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ goto out0;
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ ASSERT(lev < cur->bc_nlevels);
+
+ /*
+ * Now walk back down the tree, fixing up the cursor's buffer
+ * pointers and key numbers.
+ */
+ for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+ union xfs_btree_ptr *ptrp;
+
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
+ --lev;
+ error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+ if (error)
+ goto error0;
+
+ xfs_btree_setbuf(cur, lev, bp);
+ cur->bc_levels[lev].ptr = 1;
+ }
+out1:
+ *stat = 1;
+ return 0;
+
+out0:
+ *stat = 0;
+ return 0;
+
+error0:
+ return error;
+}
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int /* error */
+xfs_btree_decrement(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ int error; /* error return value */
+ int lev;
+ union xfs_btree_ptr ptr;
+
+ ASSERT(level < cur->bc_nlevels);
+
+ /* Read-ahead to the left at this level. */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+
+ /* We're done if we remain in the block after the decrement. */
+ if (--cur->bc_levels[level].ptr > 0)
+ goto out1;
+
+ /* Get a pointer to the btree block. */
+ block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /* Fail if we just went off the left edge of the tree. */
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+ if (xfs_btree_ptr_is_null(cur, &ptr))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, decrement);
+
+ /*
+ * March up the tree decrementing pointers.
+ * Stop when we don't go off the left edge of a block.
+ */
+ for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+ if (--cur->bc_levels[lev].ptr > 0)
+ break;
+ /* Read-ahead the left block for the next loop. */
+ xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+ }
+
+ /*
+ * If we went off the root then we are either seriously
+ * confused or have the tree root in an inode.
+ */
+ if (lev == cur->bc_nlevels) {
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ goto out0;
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ ASSERT(lev < cur->bc_nlevels);
+
+ /*
+ * Now walk back down the tree, fixing up the cursor's buffer
+ * pointers and key numbers.
+ */
+ for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+ union xfs_btree_ptr *ptrp;
+
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
+ --lev;
+ error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+ if (error)
+ goto error0;
+ xfs_btree_setbuf(cur, lev, bp);
+ cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block);
+ }
+out1:
+ *stat = 1;
+ return 0;
+
+out0:
+ *stat = 0;
+ return 0;
+
+error0:
+ return error;
+}
+
+int
+xfs_btree_lookup_get_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level in the btree */
+ const union xfs_btree_ptr *pp, /* ptr to btree block */
+ struct xfs_btree_block **blkp) /* return btree block */
+{
+ struct xfs_buf *bp; /* buffer pointer for btree block */
+ xfs_daddr_t daddr;
+ int error = 0;
+
+ /* special case the root block if in an inode */
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level == cur->bc_nlevels - 1)) {
+ *blkp = xfs_btree_get_iroot(cur);
+ return 0;
+ }
+
+ /*
+ * If the old buffer at this level is for the disk address we are
+ * looking for, re-use it.
+ *
+ * Otherwise throw it away and get a new one.
+ */
+ bp = cur->bc_levels[level].bp;
+ error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
+ if (error)
+ return error;
+ if (bp && xfs_buf_daddr(bp) == daddr) {
+ *blkp = XFS_BUF_TO_BLOCK(bp);
+ return 0;
+ }
+
+ error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
+ if (error)
+ return error;
+
+ /* Check the inode owner since the verifiers don't. */
+ if (xfs_has_crc(cur->bc_mp) &&
+ !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) &&
+ (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
+ be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
+ cur->bc_ino.ip->i_ino)
+ goto out_bad;
+
+ /* Did we get the level we were looking for? */
+ if (be16_to_cpu((*blkp)->bb_level) != level)
+ goto out_bad;
+
+ /* Check that internal nodes have at least one record. */
+ if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
+ goto out_bad;
+
+ xfs_btree_setbuf(cur, level, bp);
+ return 0;
+
+out_bad:
+ *blkp = NULL;
+ xfs_buf_mark_corrupt(bp);
+ xfs_trans_brelse(cur->bc_tp, bp);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Get current search key. For level 0 we don't actually have a key
+ * structure so we make one up from the record. For all other levels
+ * we just return the right key.
+ */
+STATIC union xfs_btree_key *
+xfs_lookup_get_search_key(
+ struct xfs_btree_cur *cur,
+ int level,
+ int keyno,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *kp)
+{
+ if (level == 0) {
+ cur->bc_ops->init_key_from_rec(kp,
+ xfs_btree_rec_addr(cur, keyno, block));
+ return kp;
+ }
+
+ return xfs_btree_key_addr(cur, keyno, block);
+}
+
+/*
+ * Lookup the record. The cursor is made to point to it, based on dir.
+ * stat is set to 0 if we can't find any such record, 1 for success.
+ */
+int /* error */
+xfs_btree_lookup(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_lookup_t dir, /* <=, ==, or >= */
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block; /* current btree block */
+ int64_t diff; /* difference for the current key */
+ int error; /* error return value */
+ int keyno; /* current key number */
+ int level; /* level in the btree */
+ union xfs_btree_ptr *pp; /* ptr to btree block */
+ union xfs_btree_ptr ptr; /* ptr to btree block */
+
+ XFS_BTREE_STATS_INC(cur, lookup);
+
+ /* No such thing as a zero-level tree. */
+ if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0))
+ return -EFSCORRUPTED;
+
+ block = NULL;
+ keyno = 0;
+
+ /* initialise start pointer from cursor */
+ cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ pp = &ptr;
+
+ /*
+ * Iterate over each level in the btree, starting at the root.
+ * For each level above the leaves, find the key we need, based
+ * on the lookup record, then follow the corresponding block
+ * pointer down to the next level.
+ */
+ for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+ /* Get the block we need to do the lookup on. */
+ error = xfs_btree_lookup_get_block(cur, level, pp, &block);
+ if (error)
+ goto error0;
+
+ if (diff == 0) {
+ /*
+ * If we already had a key match at a higher level, we
+ * know we need to use the first entry in this block.
+ */
+ keyno = 1;
+ } else {
+ /* Otherwise search this block. Do a binary search. */
+
+ int high; /* high entry number */
+ int low; /* low entry number */
+
+ /* Set low and high entry numbers, 1-based. */
+ low = 1;
+ high = xfs_btree_get_numrecs(block);
+ if (!high) {
+ /* Block is empty, must be an empty leaf. */
+ if (level != 0 || cur->bc_nlevels != 1) {
+ XFS_CORRUPTION_ERROR(__func__,
+ XFS_ERRLEVEL_LOW,
+ cur->bc_mp, block,
+ sizeof(*block));
+ return -EFSCORRUPTED;
+ }
+
+ cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE;
+ *stat = 0;
+ return 0;
+ }
+
+ /* Binary search the block. */
+ while (low <= high) {
+ union xfs_btree_key key;
+ union xfs_btree_key *kp;
+
+ XFS_BTREE_STATS_INC(cur, compare);
+
+ /* keyno is average of low and high. */
+ keyno = (low + high) >> 1;
+
+ /* Get current search key */
+ kp = xfs_lookup_get_search_key(cur, level,
+ keyno, block, &key);
+
+ /*
+ * Compute difference to get next direction:
+ * - less than, move right
+ * - greater than, move left
+ * - equal, we're done
+ */
+ diff = cur->bc_ops->key_diff(cur, kp);
+ if (diff < 0)
+ low = keyno + 1;
+ else if (diff > 0)
+ high = keyno - 1;
+ else
+ break;
+ }
+ }
+
+ /*
+ * If there are more levels, set up for the next level
+ * by getting the block number and filling in the cursor.
+ */
+ if (level > 0) {
+ /*
+ * If we moved left, need the previous key number,
+ * unless there isn't one.
+ */
+ if (diff > 0 && --keyno < 1)
+ keyno = 1;
+ pp = xfs_btree_ptr_addr(cur, keyno, block);
+
+ error = xfs_btree_debug_check_ptr(cur, pp, 0, level);
+ if (error)
+ goto error0;
+
+ cur->bc_levels[level].ptr = keyno;
+ }
+ }
+
+ /* Done with the search. See if we need to adjust the results. */
+ if (dir != XFS_LOOKUP_LE && diff < 0) {
+ keyno++;
+ /*
+ * If ge search and we went off the end of the block, but it's
+ * not the last block, we're in the wrong block.
+ */
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ if (dir == XFS_LOOKUP_GE &&
+ keyno > xfs_btree_get_numrecs(block) &&
+ !xfs_btree_ptr_is_null(cur, &ptr)) {
+ int i;
+
+ cur->bc_levels[0].ptr = keyno;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+ *stat = 1;
+ return 0;
+ }
+ } else if (dir == XFS_LOOKUP_LE && diff > 0)
+ keyno--;
+ cur->bc_levels[0].ptr = keyno;
+
+ /* Return if we succeeded or not. */
+ if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
+ *stat = 0;
+ else if (dir != XFS_LOOKUP_EQ || diff == 0)
+ *stat = 1;
+ else
+ *stat = 0;
+ return 0;
+
+error0:
+ return error;
+}
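+
+/*
+ * A worked example of the lookup directions.  Keys are shown as plain
+ * integers for illustration; real search keys are the per-btree structures
+ * placed in cur->bc_rec by the btree-specific lookup wrappers.  For a leaf
+ * holding keys 10, 20 and 30 and a search value of 25:
+ *
+ *	XFS_LOOKUP_LE	cursor at 20, *stat = 1
+ *	XFS_LOOKUP_GE	cursor at 30, *stat = 1
+ *	XFS_LOOKUP_EQ	*stat = 0 (no exact match)
+ *
+ * Searching below the first key with XFS_LOOKUP_LE, or beyond the last key
+ * with XFS_LOOKUP_GE when there is no right sibling, returns *stat = 0.
+ */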
+
+/* Find the high key storage area from a regular key. */
+union xfs_btree_key *
+xfs_btree_high_key_from_key(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+ return (union xfs_btree_key *)((char *)key +
+ (cur->bc_ops->key_len / 2));
+}
+
+/* Determine the low (and high if overlapped) keys of a leaf block */
+STATIC void
+xfs_btree_get_leaf_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *key)
+{
+ union xfs_btree_key max_hkey;
+ union xfs_btree_key hkey;
+ union xfs_btree_rec *rec;
+ union xfs_btree_key *high;
+ int n;
+
+ rec = xfs_btree_rec_addr(cur, 1, block);
+ cur->bc_ops->init_key_from_rec(key, rec);
+
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+
+ cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
+ for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
+ rec = xfs_btree_rec_addr(cur, n, block);
+ cur->bc_ops->init_high_key_from_rec(&hkey, rec);
+ if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
+ > 0)
+ max_hkey = hkey;
+ }
+
+ high = xfs_btree_high_key_from_key(cur, key);
+ memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
+ }
+}
+
+/* Determine the low (and high if overlapped) keys of a node block */
+STATIC void
+xfs_btree_get_node_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *key)
+{
+ union xfs_btree_key *hkey;
+ union xfs_btree_key *max_hkey;
+ union xfs_btree_key *high;
+ int n;
+
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ memcpy(key, xfs_btree_key_addr(cur, 1, block),
+ cur->bc_ops->key_len / 2);
+
+ max_hkey = xfs_btree_high_key_addr(cur, 1, block);
+ for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
+ hkey = xfs_btree_high_key_addr(cur, n, block);
+ if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
+ max_hkey = hkey;
+ }
+
+ high = xfs_btree_high_key_from_key(cur, key);
+ memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
+ } else {
+ memcpy(key, xfs_btree_key_addr(cur, 1, block),
+ cur->bc_ops->key_len);
+ }
+}
+
+/* Derive the keys for any btree block. */
+void
+xfs_btree_get_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *key)
+{
+ if (be16_to_cpu(block->bb_level) == 0)
+ xfs_btree_get_leaf_keys(cur, block, key);
+ else
+ xfs_btree_get_node_keys(cur, block, key);
+}
+
+/*
+ * Decide if we need to update the parent keys of a btree block. For
+ * a standard btree this is only necessary if we're updating the first
+ * record/key. For an overlapping btree, we must always update the
+ * keys because the highest key can be in any of the records or keys
+ * in the block.
+ */
+static inline bool
+xfs_btree_needs_key_update(
+ struct xfs_btree_cur *cur,
+ int ptr)
+{
+ return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
+}
+
+/*
+ * Update the low and high parent keys of the given level, progressing
+ * towards the root. If force_all is false, stop if the keys for a given
+ * level do not need updating.
+ */
+STATIC int
+__xfs_btree_updkeys(
+ struct xfs_btree_cur *cur,
+ int level,
+ struct xfs_btree_block *block,
+ struct xfs_buf *bp0,
+ bool force_all)
+{
+ union xfs_btree_key key; /* keys from current level */
+ union xfs_btree_key *lkey; /* keys from the next level up */
+ union xfs_btree_key *hkey;
+ union xfs_btree_key *nlkey; /* keys from the next level up */
+ union xfs_btree_key *nhkey;
+ struct xfs_buf *bp;
+ int ptr;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+
+ /* Exit if there aren't any parent levels to update. */
+ if (level + 1 >= cur->bc_nlevels)
+ return 0;
+
+ trace_xfs_btree_updkeys(cur, level, bp0);
+
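+ /*
+ * The on-stack key holds both halves: lkey points at the low key,
+ * hkey at the high key within the same union.
+ */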
+ lkey = &key;
+ hkey = xfs_btree_high_key_from_key(cur, lkey);
+ xfs_btree_get_keys(cur, block, lkey);
+ for (level++; level < cur->bc_nlevels; level++) {
+#ifdef DEBUG
+ int error;
+#endif
+ block = xfs_btree_get_block(cur, level, &bp);
+ trace_xfs_btree_updkeys(cur, level, bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ return error;
+#endif
+ ptr = cur->bc_levels[level].ptr;
+ nlkey = xfs_btree_key_addr(cur, ptr, block);
+ nhkey = xfs_btree_high_key_addr(cur, ptr, block);
+ if (!force_all &&
+ !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
+ cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
+ break;
+ xfs_btree_copy_keys(cur, nlkey, lkey, 1);
+ xfs_btree_log_keys(cur, bp, ptr, ptr);
+ if (level + 1 >= cur->bc_nlevels)
+ break;
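+ /* Recompute this node's keys to push up to the next level. */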
+ xfs_btree_get_node_keys(cur, block, lkey);
+ }
+
+ return 0;
+}
+
+/* Update all the keys from some level in cursor back to the root. */
+STATIC int
+xfs_btree_updkeys_force(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ struct xfs_buf *bp;
+ struct xfs_btree_block *block;
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ return __xfs_btree_updkeys(cur, level, block, bp, true);
+}
+
+/*
+ * Update the parent keys of the given level, progressing towards the root.
+ */
+STATIC int
+xfs_btree_update_keys(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ union xfs_btree_key *kp;
+ union xfs_btree_key key;
+ int ptr;
+
+ ASSERT(level >= 0);
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
+ return __xfs_btree_updkeys(cur, level, block, bp, false);
+
+ /*
+ * Go up the tree from this level toward the root.
+ * At each level, update the key value to the value input.
+ * Stop when we reach a level where the cursor isn't pointing
+ * at the first entry in the block.
+ */
+ xfs_btree_get_keys(cur, block, &key);
+ for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+#ifdef DEBUG
+ int error;
+#endif
+ block = xfs_btree_get_block(cur, level, &bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ return error;
+#endif
+ ptr = cur->bc_levels[level].ptr;
+ kp = xfs_btree_key_addr(cur, ptr, block);
+ xfs_btree_copy_keys(cur, kp, &key, 1);
+ xfs_btree_log_keys(cur, bp, ptr, ptr);
+ }
+
+ return 0;
+}
+
+/*
+ * Update the record referred to by cur to the value in the
+ * given record. This either works (return 0) or gets an
+ * EFSCORRUPTED error.
+ */
+int
+xfs_btree_update(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ int error;
+ int ptr;
+ union xfs_btree_rec *rp;
+
+ /* Pick up the current block. */
+ block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, 0, bp);
+ if (error)
+ goto error0;
+#endif
+ /* Get the address of the rec to be updated. */
+ ptr = cur->bc_levels[0].ptr;
+ rp = xfs_btree_rec_addr(cur, ptr, block);
+
+ /* Fill in the new contents and log them. */
+ xfs_btree_copy_recs(cur, rp, rec, 1);
+ xfs_btree_log_recs(cur, bp, ptr, ptr);
+
+ /*
+ * If we are tracking the last record in the tree and
+ * we are at the far right edge of the tree, update it.
+ */
+ if (xfs_btree_is_lastrec(cur, block, 0)) {
+ cur->bc_ops->update_lastrec(cur, block, rec,
+ ptr, LASTREC_UPDATE);
+ }
+
+ /* Pass new key value up to our parent. */
+ if (xfs_btree_needs_key_update(cur, ptr)) {
+ error = xfs_btree_update_keys(cur, 0);
+ if (error)
+ goto error0;
+ }
+
+ return 0;
+
+error0:
+ return error;
+}
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int /* error */
+xfs_btree_lshift(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ int lrecs; /* left record count */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ struct xfs_btree_cur *tcur; /* temporary btree cursor */
+ int rrecs; /* right record count */
+ union xfs_btree_ptr lptr; /* left btree pointer */
+ union xfs_btree_key *rkp = NULL; /* right btree key */
+ union xfs_btree_ptr *rpp = NULL; /* right address pointer */
+ union xfs_btree_rec *rrp = NULL; /* right record pointer */
+ int error; /* error return value */
+ int i;
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 1)
+ goto out0;
+
+ /* Set up variables for this block as "right". */
+ right = xfs_btree_get_block(cur, level, &rbp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, right, level, rbp);
+ if (error)
+ goto error0;
+#endif
+
+ /* If we've got no left sibling then we can't shift an entry left. */
+ xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+ if (xfs_btree_ptr_is_null(cur, &lptr))
+ goto out0;
+
+ /*
+ * If the cursor entry is the one that would be moved, don't
+ * do it... it's too complicated.
+ */
+ if (cur->bc_levels[level].ptr <= 1)
+ goto out0;
+
+ /* Set up the left neighbor as "left". */
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+ if (error)
+ goto error0;
+
+ /* If it's full, it can't take another entry. */
+ lrecs = xfs_btree_get_numrecs(left);
+ if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
+ goto out0;
+
+ rrecs = xfs_btree_get_numrecs(right);
+
+ /*
+ * We add one entry to the left side and remove one for the right side.
+ * Account for it here, the changes will be updated on disk and logged
+ * later.
+ */
+ lrecs++;
+ rrecs--;
+
+ XFS_BTREE_STATS_INC(cur, lshift);
+ XFS_BTREE_STATS_ADD(cur, moves, 1);
+
+ /*
+ * If non-leaf, copy a key and a ptr to the left block.
+ * Log the changes to the left block.
+ */
+ if (level > 0) {
+ /* It's a non-leaf. Move keys and pointers. */
+ union xfs_btree_key *lkp; /* left btree key */
+ union xfs_btree_ptr *lpp; /* left address pointer */
+
+ lkp = xfs_btree_key_addr(cur, lrecs, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+
+ lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+ error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
+ if (error)
+ goto error0;
+
+ xfs_btree_copy_keys(cur, lkp, rkp, 1);
+ xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
+
+ xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
+ xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
+
+ ASSERT(cur->bc_ops->keys_inorder(cur,
+ xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
+ } else {
+ /* It's a leaf. Move records. */
+ union xfs_btree_rec *lrp; /* left record pointer */
+
+ lrp = xfs_btree_rec_addr(cur, lrecs, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_copy_recs(cur, lrp, rrp, 1);
+ xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
+
+ ASSERT(cur->bc_ops->recs_inorder(cur,
+ xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
+ }
+
+ xfs_btree_set_numrecs(left, lrecs);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+ xfs_btree_set_numrecs(right, rrecs);
+ xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+ /*
+ * Slide the contents of right down one entry.
+ */
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
+ if (level > 0) {
+ /* It's a non-leaf. Operate on keys and ptrs. */
+ for (i = 0; i < rrecs; i++) {
+ error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
+ if (error)
+ goto error0;
+ }
+
+ xfs_btree_shift_keys(cur,
+ xfs_btree_key_addr(cur, 2, right),
+ -1, rrecs);
+ xfs_btree_shift_ptrs(cur,
+ xfs_btree_ptr_addr(cur, 2, right),
+ -1, rrecs);
+
+ xfs_btree_log_keys(cur, rbp, 1, rrecs);
+ xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+ } else {
+ /* It's a leaf. Operate on records. */
+ xfs_btree_shift_recs(cur,
+ xfs_btree_rec_addr(cur, 2, right),
+ -1, rrecs);
+ xfs_btree_log_recs(cur, rbp, 1, rrecs);
+ }
+
+ /*
+ * Using a temporary cursor, update the parent key values of the
+ * block on the left.
+ */
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+ i = xfs_btree_firstrec(tcur, level);
+ if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
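+ /* Move the temporary cursor into the left block so we can update its parent keys. */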
+ error = xfs_btree_decrement(tcur, level, &i);
+ if (error)
+ goto error1;
+
+ /* Update the parent high keys of the left block, if needed. */
+ error = xfs_btree_update_keys(tcur, level);
+ if (error)
+ goto error1;
+
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ }
+
+ /* Update the parent keys of the right block. */
+ error = xfs_btree_update_keys(cur, level);
+ if (error)
+ goto error0;
+
+ /* Slide the cursor value left one. */
+ cur->bc_levels[level].ptr--;
+
+ *stat = 1;
+ return 0;
+
+out0:
+ *stat = 0;
+ return 0;
+
+error0:
+ return error;
+
+error1:
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int /* error */
+xfs_btree_rshift(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ struct xfs_btree_cur *tcur; /* temporary btree cursor */
+ union xfs_btree_ptr rptr; /* right block pointer */
+ union xfs_btree_key *rkp; /* right btree key */
+ int rrecs; /* right record count */
+ int lrecs; /* left record count */
+ int error; /* error return value */
+ int i; /* loop counter */
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level == cur->bc_nlevels - 1))
+ goto out0;
+
+ /* Set up variables for this block as "left". */
+ left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, left, level, lbp);
+ if (error)
+ goto error0;
+#endif
+
+ /* If we've got no right sibling then we can't shift an entry right. */
+ xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &rptr))
+ goto out0;
+
+ /*
+ * If the cursor entry is the one that would be moved, don't
+ * do it... it's too complicated.
+ */
+ lrecs = xfs_btree_get_numrecs(left);
+ if (cur->bc_levels[level].ptr >= lrecs)
+ goto out0;
+
+ /* Set up the right neighbor as "right". */
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+ if (error)
+ goto error0;
+
+ /* If it's full, it can't take another entry. */
+ rrecs = xfs_btree_get_numrecs(right);
+ if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, rshift);
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+ /*
+ * Make a hole at the start of the right neighbor block, then
+ * copy the last left block entry to the hole.
+ */
+ if (level > 0) {
+ /* It's a non-leaf. Make a hole in the keys and ptrs. */
+ union xfs_btree_key *lkp;
+ union xfs_btree_ptr *lpp;
+ union xfs_btree_ptr *rpp;
+
+ lkp = xfs_btree_key_addr(cur, lrecs, left);
+ lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+ for (i = rrecs - 1; i >= 0; i--) {
+ error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
+ if (error)
+ goto error0;
+ }
+
+ xfs_btree_shift_keys(cur, rkp, 1, rrecs);
+ xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
+
+ error = xfs_btree_debug_check_ptr(cur, lpp, 0, level);
+ if (error)
+ goto error0;
+
+ /* Now put the new data in, and log it. */
+ xfs_btree_copy_keys(cur, rkp, lkp, 1);
+ xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
+
+ xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
+ xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
+
+ ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
+ xfs_btree_key_addr(cur, 2, right)));
+ } else {
+ /* It's a leaf. Make a hole in the records. */
+ union xfs_btree_rec *lrp;
+ union xfs_btree_rec *rrp;
+
+ lrp = xfs_btree_rec_addr(cur, lrecs, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_shift_recs(cur, rrp, 1, rrecs);
+
+ /* Now put the new data in, and log it. */
+ xfs_btree_copy_recs(cur, rrp, lrp, 1);
+ xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
+ }
+
+ /*
+ * Decrement and log left's numrecs, bump and log right's numrecs.
+ */
+ xfs_btree_set_numrecs(left, --lrecs);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+ xfs_btree_set_numrecs(right, ++rrecs);
+ xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+ /*
+ * Using a temporary cursor, update the parent key values of the
+ * block on the right.
+ */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+ i = xfs_btree_lastrec(tcur, level);
+ if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ error = xfs_btree_increment(tcur, level, &i);
+ if (error)
+ goto error1;
+
+ /* Update the parent high keys of the left block, if needed. */
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ error = xfs_btree_update_keys(cur, level);
+ if (error)
+ goto error1;
+ }
+
+ /* Update the parent keys of the right block. */
+ error = xfs_btree_update_keys(tcur, level);
+ if (error)
+ goto error1;
+
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+ *stat = 1;
+ return 0;
+
+out0:
+ *stat = 0;
+ return 0;
+
+error0:
+ return error;
+
+error1:
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Split cur/level block in half.
+ * Return new block number and the key to its first
+ * record (to be inserted into parent).
+ */
+STATIC int /* error */
+__xfs_btree_split(
+ struct xfs_btree_cur *cur,
+ int level,
+ union xfs_btree_ptr *ptrp,
+ union xfs_btree_key *key,
+ struct xfs_btree_cur **curp,
+ int *stat) /* success/failure */
+{
+ union xfs_btree_ptr lptr; /* left sibling block ptr */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ union xfs_btree_ptr rptr; /* right sibling block ptr */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ union xfs_btree_ptr rrptr; /* right-right sibling ptr */
+ struct xfs_buf *rrbp; /* right-right buffer pointer */
+ struct xfs_btree_block *rrblock; /* right-right btree block */
+ int lrecs;
+ int rrecs;
+ int src_index;
+ int error; /* error return value */
+ int i;
+
+ XFS_BTREE_STATS_INC(cur, split);
+
+ /* Set up left block (current one). */
+ left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, left, level, lbp);
+ if (error)
+ goto error0;
+#endif
+
+ xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+
+ /* Allocate the new block. If we can't do it, we're toast. Give up. */
+ error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
+ if (error)
+ goto error0;
+ if (*stat == 0)
+ goto out0;
+ XFS_BTREE_STATS_INC(cur, alloc);
+
+ /* Set up the new block as "right". */
+ error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
+ if (error)
+ goto error0;
+
+ /* Fill in the btree header for the new right block. */
+ xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
+
+ /*
+ * Split the entries between the old and the new block evenly.
+ * Make sure that if there's an odd number of entries now, each
+ * new block will have the same number of entries.
+ */
+ lrecs = xfs_btree_get_numrecs(left);
+ rrecs = lrecs / 2;
+ if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1)
+ rrecs++;
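+ /* src_index is the first left-block entry that moves to the new right block. */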
+ src_index = (lrecs - rrecs + 1);
+
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+ /* Adjust numrecs for the later get_*_keys() calls. */
+ lrecs -= rrecs;
+ xfs_btree_set_numrecs(left, lrecs);
+ xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
+
+ /*
+ * Copy btree block entries from the left block over to the
+ * new block, the right. Update the right block and log the
+ * changes.
+ */
+ if (level > 0) {
+ /* It's a non-leaf. Move keys and pointers. */
+ union xfs_btree_key *lkp; /* left btree key */
+ union xfs_btree_ptr *lpp; /* left address pointer */
+ union xfs_btree_key *rkp; /* right btree key */
+ union xfs_btree_ptr *rpp; /* right address pointer */
+
+ lkp = xfs_btree_key_addr(cur, src_index, left);
+ lpp = xfs_btree_ptr_addr(cur, src_index, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+ for (i = src_index; i < rrecs; i++) {
+ error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
+ if (error)
+ goto error0;
+ }
+
+ /* Copy the keys & pointers to the new block. */
+ xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
+ xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
+
+ xfs_btree_log_keys(cur, rbp, 1, rrecs);
+ xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+
+ /* Stash the keys of the new block for later insertion. */
+ xfs_btree_get_node_keys(cur, right, key);
+ } else {
+ /* It's a leaf. Move records. */
+ union xfs_btree_rec *lrp; /* left record pointer */
+ union xfs_btree_rec *rrp; /* right record pointer */
+
+ lrp = xfs_btree_rec_addr(cur, src_index, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ /* Copy records to the new block. */
+ xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
+ xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+ /* Stash the keys of the new block for later insertion. */
+ xfs_btree_get_leaf_keys(cur, right, key);
+ }
+
+ /*
+ * Find the left block number by looking in the buffer.
+ * Adjust sibling pointers.
+ */
+ xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
+ xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
+ xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+ xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+
+ xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+ /*
+ * If there's a block to the new block's right, make that block
+ * point back to right instead of to left.
+ */
+ if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
+ error = xfs_btree_read_buf_block(cur, &rrptr,
+ 0, &rrblock, &rrbp);
+ if (error)
+ goto error0;
+ xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
+ xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+ }
+
+ /* Update the parent high keys of the left block, if needed. */
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ error = xfs_btree_update_keys(cur, level);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * If the cursor is really in the right block, move it there.
+ * If it's just pointing past the last entry in left, then we'll
+ * insert there, so don't change anything in that case.
+ */
+ if (cur->bc_levels[level].ptr > lrecs + 1) {
+ xfs_btree_setbuf(cur, level, rbp);
+ cur->bc_levels[level].ptr -= lrecs;
+ }
+ /*
+ * If there are more levels, we'll need another cursor that refers
+ * to the right block, no matter where this cursor was.
+ */
+ if (level + 1 < cur->bc_nlevels) {
+ error = xfs_btree_dup_cursor(cur, curp);
+ if (error)
+ goto error0;
+ (*curp)->bc_levels[level + 1].ptr++;
+ }
+ *ptrp = rptr;
+ *stat = 1;
+ return 0;
+out0:
+ *stat = 0;
+ return 0;
+
+error0:
+ return error;
+}
+
+#ifdef __KERNEL__
+struct xfs_btree_split_args {
+ struct xfs_btree_cur *cur;
+ int level;
+ union xfs_btree_ptr *ptrp;
+ union xfs_btree_key *key;
+ struct xfs_btree_cur **curp;
+ int *stat; /* success/failure */
+ int result;
+ bool kswapd; /* allocation in kswapd context */
+ struct completion *done;
+ struct work_struct work;
+};
+
+/*
+ * Stack switching interfaces for allocation
+ */
+static void
+xfs_btree_split_worker(
+ struct work_struct *work)
+{
+ struct xfs_btree_split_args *args = container_of(work,
+ struct xfs_btree_split_args, work);
+ unsigned long pflags;
+ unsigned long new_pflags = 0;
+
+ /*
+ * we are in a transaction context here, but may also be doing work
+ * in kswapd context, and hence we may need to inherit that state
+ * temporarily to ensure that we don't block waiting for memory reclaim
+ * in any way.
+ */
+ if (args->kswapd)
+ new_pflags |= PF_MEMALLOC | PF_KSWAPD;
+
+ current_set_flags_nested(&pflags, new_pflags);
+ xfs_trans_set_context(args->cur->bc_tp);
+
+ args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
+ args->key, args->curp, args->stat);
+
+ xfs_trans_clear_context(args->cur->bc_tp);
+ current_restore_flags_nested(&pflags, new_pflags);
+
+ /*
+ * Do not access args after complete() has run here. We don't own args
+ * and the owner may run and free args before we return here.
+ */
+ complete(args->done);
+}
+
+/*
+ * BMBT split requests often come in with little stack to work on. Push
+ * them off to a worker thread so there is lots of stack to use. For the other
+ * btree types, just call directly to avoid the context switch overhead here.
+ */
+STATIC int /* error */
+xfs_btree_split(
+ struct xfs_btree_cur *cur,
+ int level,
+ union xfs_btree_ptr *ptrp,
+ union xfs_btree_key *key,
+ struct xfs_btree_cur **curp,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_split_args args;
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ if (cur->bc_btnum != XFS_BTNUM_BMAP)
+ return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
+
+ args.cur = cur;
+ args.level = level;
+ args.ptrp = ptrp;
+ args.key = key;
+ args.curp = curp;
+ args.stat = stat;
+ args.done = &done;
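+ /* Note if we're in kswapd; the worker temporarily inherits that state. */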
+ args.kswapd = current_is_kswapd();
+ INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
+ queue_work(xfs_alloc_wq, &args.work);
+ wait_for_completion(&done);
+ destroy_work_on_stack(&args.work);
+ return args.result;
+}
+#else
+#define xfs_btree_split __xfs_btree_split
+#endif /* __KERNEL__ */
+
+/*
+ * Copy the old inode root contents into a real block and make the
+ * broot point to it.
+ */
+int /* error */
+xfs_btree_new_iroot(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int *logflags, /* logging flags for inode */
+ int *stat) /* return status - 0 fail */
+{
+ struct xfs_buf *cbp; /* buffer for cblock */
+ struct xfs_btree_block *block; /* btree block */
+ struct xfs_btree_block *cblock; /* child btree block */
+ union xfs_btree_key *ckp; /* child key pointer */
+ union xfs_btree_ptr *cpp; /* child ptr pointer */
+ union xfs_btree_key *kp; /* pointer to btree key */
+ union xfs_btree_ptr *pp; /* pointer to block addr */
+ union xfs_btree_ptr nptr; /* new block addr */
+ int level; /* btree level */
+ int error; /* error return code */
+ int i; /* loop counter */
+
+ XFS_BTREE_STATS_INC(cur, newroot);
+
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+ level = cur->bc_nlevels - 1;
+
+ block = xfs_btree_get_iroot(cur);
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+
+ /* Allocate the new block. If we can't do it, we're toast. Give up. */
+ error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
+ if (error)
+ goto error0;
+ if (*stat == 0)
+ return 0;
+
+ XFS_BTREE_STATS_INC(cur, alloc);
+
+ /* Copy the root into a real block. */
+ error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
+ if (error)
+ goto error0;
+
+ /*
+ * We can't just memcpy() the root in for CRC-enabled btree blocks.
+ * In that case we also have to ensure the blkno remains correct.
+ */
+ memcpy(cblock, block, xfs_btree_block_len(cur));
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ cblock->bb_u.l.bb_blkno = bno;
+ else
+ cblock->bb_u.s.bb_blkno = bno;
+ }
+
+ be16_add_cpu(&block->bb_level, 1);
+ xfs_btree_set_numrecs(block, 1);
+ cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
+ cur->bc_levels[level + 1].ptr = 1;
+
+ kp = xfs_btree_key_addr(cur, 1, block);
+ ckp = xfs_btree_key_addr(cur, 1, cblock);
+ xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
+
+ cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+ for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
+ error = xfs_btree_debug_check_ptr(cur, pp, i, level);
+ if (error)
+ goto error0;
+ }
+
+ xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
+
+ error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
+ if (error)
+ goto error0;
+
+ xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
+
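+ /*
+ * Shrink the incore root: it now holds just the single key/ptr
+ * that points to the new child block.
+ */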
+ xfs_iroot_realloc(cur->bc_ino.ip,
+ 1 - xfs_btree_get_numrecs(cblock),
+ cur->bc_ino.whichfork);
+
+ xfs_btree_setbuf(cur, level, cbp);
+
+ /*
+ * Do all this logging at the end so that
+ * the root is at the right level.
+ */
+ xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+ xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+ xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+
+ *logflags |=
+ XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
+ *stat = 1;
+ return 0;
+error0:
+ return error;
+}
+
+/*
+ * Allocate a new root block, fill it in.
+ */
+STATIC int /* error */
+xfs_btree_new_root(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block; /* one half of the old root block */
+ struct xfs_buf *bp; /* buffer containing block */
+ int error; /* error return value */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ struct xfs_buf *nbp; /* new (root) buffer */
+ struct xfs_btree_block *new; /* new (root) btree block */
+ int nptr; /* new value for key index, 1 or 2 */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ union xfs_btree_ptr rptr;
+ union xfs_btree_ptr lptr;
+
+ XFS_BTREE_STATS_INC(cur, newroot);
+
+ /* initialise our start point from the cursor */
+ cur->bc_ops->init_ptr_from_cur(cur, &rptr);
+
+ /* Allocate the new block. If we can't do it, we're toast. Give up. */
+ error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
+ if (error)
+ goto error0;
+ if (*stat == 0)
+ goto out0;
+ XFS_BTREE_STATS_INC(cur, alloc);
+
+ /* Set up the new block. */
+ error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
+ if (error)
+ goto error0;
+
+ /* Set the root in the holding structure increasing the level by 1. */
+ cur->bc_ops->set_root(cur, &lptr, 1);
+
+ /*
+ * At the previous root level there are now two blocks: the old root,
+ * and the new block generated when it was split. We don't know which
+ * one the cursor is pointing at, so we set up variables "left" and
+ * "right" for each case.
+ */
+ block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
+ if (error)
+ goto error0;
+#endif
+
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+ /* Our block is left, pick up the right block. */
+ lbp = bp;
+ xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+ left = block;
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+ if (error)
+ goto error0;
+ bp = rbp;
+ nptr = 1;
+ } else {
+ /* Our block is right, pick up the left block. */
+ rbp = bp;
+ xfs_btree_buf_to_ptr(cur, rbp, &rptr);
+ right = block;
+ xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+ if (error)
+ goto error0;
+ bp = lbp;
+ nptr = 2;
+ }
+
+ /* Fill in the new block's btree header and log it. */
+ xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
+ xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
+ ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
+ !xfs_btree_ptr_is_null(cur, &rptr));
+
+ /* Fill in the key data in the new root. */
+ if (xfs_btree_get_level(left) > 0) {
+ /*
+ * Get the keys for the left block's keys and put them directly
+ * in the parent block. Do the same for the right block.
+ */
+ xfs_btree_get_node_keys(cur, left,
+ xfs_btree_key_addr(cur, 1, new));
+ xfs_btree_get_node_keys(cur, right,
+ xfs_btree_key_addr(cur, 2, new));
+ } else {
+ /*
+ * Get the keys for the left block's records and put them
+ * directly in the parent block. Do the same for the right
+ * block.
+ */
+ xfs_btree_get_leaf_keys(cur, left,
+ xfs_btree_key_addr(cur, 1, new));
+ xfs_btree_get_leaf_keys(cur, right,
+ xfs_btree_key_addr(cur, 2, new));
+ }
+ xfs_btree_log_keys(cur, nbp, 1, 2);
+
+ /* Fill in the pointer data in the new root. */
+ xfs_btree_copy_ptrs(cur,
+ xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
+ xfs_btree_copy_ptrs(cur,
+ xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
+ xfs_btree_log_ptrs(cur, nbp, 1, 2);
+
+ /* Fix up the cursor. */
+ xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
+ cur->bc_levels[cur->bc_nlevels].ptr = nptr;
+ cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
+ *stat = 1;
+ return 0;
+error0:
+ return error;
+out0:
+ *stat = 0;
+ return 0;
+}
+
+STATIC int
+xfs_btree_make_block_unfull(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* btree level */
+ int numrecs,/* # of recs in block */
+ int *oindex,/* old tree index */
+ int *index, /* new tree index */
+ union xfs_btree_ptr *nptr, /* new btree ptr */
+ struct xfs_btree_cur **ncur, /* new btree cursor */
+ union xfs_btree_key *key, /* key of new block */
+ int *stat)
+{
+ int error = 0;
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 1) {
+ struct xfs_inode *ip = cur->bc_ino.ip;
+
+ if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
+ /* A root block that can be made bigger. */
+ xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
+ *stat = 1;
+ } else {
+ /* A root block that needs replacing */
+ int logflags = 0;
+
+ error = xfs_btree_new_iroot(cur, &logflags, stat);
+ if (error || *stat == 0)
+ return error;
+
+ xfs_trans_log_inode(cur->bc_tp, ip, logflags);
+ }
+
+ return 0;
+ }
+
+ /* First, try shifting an entry to the right neighbor. */
+ error = xfs_btree_rshift(cur, level, stat);
+ if (error || *stat)
+ return error;
+
+ /* Next, try shifting an entry to the left neighbor. */
+ error = xfs_btree_lshift(cur, level, stat);
+ if (error)
+ return error;
+
+ if (*stat) {
+ *oindex = *index = cur->bc_levels[level].ptr;
+ return 0;
+ }
+
+ /*
+ * Next, try splitting the current block in half.
+ *
+ * If this works we have to re-set our variables because we
+ * could be in a different block now.
+ */
+ error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
+ if (error || *stat == 0)
+ return error;
+
+ *index = cur->bc_levels[level].ptr;
+ return 0;
+}
+
+/*
+ * Insert one record/level. Return information to the caller
+ * allowing the next level up to proceed if necessary.
+ */
+STATIC int
+xfs_btree_insrec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level to insert record at */
+ union xfs_btree_ptr *ptrp, /* i/o: block number inserted */
+ union xfs_btree_rec *rec, /* record to insert */
+ union xfs_btree_key *key, /* i/o: block key for ptrp */
+ struct xfs_btree_cur **curp, /* output: new cursor replacing cur */
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block; /* btree block */
+ struct xfs_buf *bp; /* buffer for block */
+ union xfs_btree_ptr nptr; /* new block ptr */
+ struct xfs_btree_cur *ncur = NULL; /* new btree cursor */
+ union xfs_btree_key nkey; /* new block key */
+ union xfs_btree_key *lkey;
+ int optr; /* old key/record index */
+ int ptr; /* key/record index */
+ int numrecs;/* number of records */
+ int error; /* error return value */
+ int i;
+ xfs_daddr_t old_bn;
+
+ ncur = NULL;
+ lkey = &nkey;
+
+ /*
+ * If we have an external root pointer, and we've made it to the
+ * root level, allocate a new root block and we're done.
+ */
+ if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level >= cur->bc_nlevels)) {
+ error = xfs_btree_new_root(cur, stat);
+ xfs_btree_set_ptr_null(cur, ptrp);
+
+ return error;
+ }
+
+ /* If we're off the left edge, return failure. */
+ ptr = cur->bc_levels[level].ptr;
+ if (ptr == 0) {
+ *stat = 0;
+ return 0;
+ }
+
+ optr = ptr;
+
+ XFS_BTREE_STATS_INC(cur, insrec);
+
+ /* Get pointers to the btree buffer and block. */
+ block = xfs_btree_get_block(cur, level, &bp);
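+ /*
+ * Remember this block's disk address so we can tell later whether
+ * a split moved the cursor into a brand new block.
+ */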
+ old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL;
+ numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+
+ /* Check that the new entry is being inserted in the right place. */
+ if (ptr <= numrecs) {
+ if (level == 0) {
+ ASSERT(cur->bc_ops->recs_inorder(cur, rec,
+ xfs_btree_rec_addr(cur, ptr, block)));
+ } else {
+ ASSERT(cur->bc_ops->keys_inorder(cur, key,
+ xfs_btree_key_addr(cur, ptr, block)));
+ }
+ }
+#endif
+
+ /*
+ * If the block is full, we can't insert the new entry until we
+ * make the block un-full.
+ */
+ xfs_btree_set_ptr_null(cur, &nptr);
+ if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
+ error = xfs_btree_make_block_unfull(cur, level, numrecs,
+ &optr, &ptr, &nptr, &ncur, lkey, stat);
+ if (error || *stat == 0)
+ goto error0;
+ }
+
+ /*
+ * The current block may have changed if the block was
+ * previously full and we have just made space in it.
+ */
+ block = xfs_btree_get_block(cur, level, &bp);
+ numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /*
+ * At this point we know there's room for our new entry in the block
+ * we're pointing at.
+ */
+ XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
+
+ if (level > 0) {
+ /* It's a non-leaf. Make a hole in the keys and ptrs. */
+ union xfs_btree_key *kp;
+ union xfs_btree_ptr *pp;
+
+ kp = xfs_btree_key_addr(cur, ptr, block);
+ pp = xfs_btree_ptr_addr(cur, ptr, block);
+
+ for (i = numrecs - ptr; i >= 0; i--) {
+ error = xfs_btree_debug_check_ptr(cur, pp, i, level);
+ if (error)
+ goto error0;
+ }
+
+ xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
+ xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
+
+ error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
+ if (error)
+ goto error0;
+
+ /* Now put the new data in, bump numrecs and log it. */
+ xfs_btree_copy_keys(cur, kp, key, 1);
+ xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
+ numrecs++;
+ xfs_btree_set_numrecs(block, numrecs);
+ xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
+ xfs_btree_log_keys(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+ if (ptr < numrecs) {
+ ASSERT(cur->bc_ops->keys_inorder(cur, kp,
+ xfs_btree_key_addr(cur, ptr + 1, block)));
+ }
+#endif
+ } else {
+ /* It's a leaf. Make a hole in the records. */
+ union xfs_btree_rec *rp;
+
+ rp = xfs_btree_rec_addr(cur, ptr, block);
+
+ xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
+
+ /* Now put the new data in, bump numrecs and log it. */
+ xfs_btree_copy_recs(cur, rp, rec, 1);
+ xfs_btree_set_numrecs(block, ++numrecs);
+ xfs_btree_log_recs(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+ if (ptr < numrecs) {
+ ASSERT(cur->bc_ops->recs_inorder(cur, rp,
+ xfs_btree_rec_addr(cur, ptr + 1, block)));
+ }
+#endif
+ }
+
+ /* Log the new number of records in the btree header. */
+ xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+ /*
+ * If we just inserted into a new tree block, we have to
+ * recalculate nkey here because nkey is out of date.
+ *
+ * Otherwise we're just updating an existing block (having shoved
+ * some records into the new tree block), so use the regular key
+ * update mechanism.
+ */
+ if (bp && xfs_buf_daddr(bp) != old_bn) {
+ xfs_btree_get_keys(cur, block, lkey);
+ } else if (xfs_btree_needs_key_update(cur, optr)) {
+ error = xfs_btree_update_keys(cur, level);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * If we are tracking the last record in the tree and
+ * we are at the far right edge of the tree, update it.
+ */
+ if (xfs_btree_is_lastrec(cur, block, level)) {
+ cur->bc_ops->update_lastrec(cur, block, rec,
+ ptr, LASTREC_INSREC);
+ }
+
+ /*
+ * Return the new block number, if any.
+ * If there is one, give back a record value and a cursor too.
+ */
+ *ptrp = nptr;
+ if (!xfs_btree_ptr_is_null(cur, &nptr)) {
+ xfs_btree_copy_keys(cur, key, lkey, 1);
+ *curp = ncur;
+ }
+
+ *stat = 1;
+ return 0;
+
+error0:
+ if (ncur)
+ xfs_btree_del_cursor(ncur, error);
+ return error;
+}
+
+/*
+ * Insert the record at the point referenced by cur.
+ *
+ * A multi-level split of the tree on insert will invalidate the original
+ * cursor. All callers of this function should assume that the cursor is
+ * no longer valid and revalidate it.
+ */
+int
+xfs_btree_insert(
+ struct xfs_btree_cur *cur,
+ int *stat)
+{
+ int error; /* error return value */
+ int i; /* result value, 0 for failure */
+ int level; /* current level number in btree */
+ union xfs_btree_ptr nptr; /* new block number (split result) */
+ struct xfs_btree_cur *ncur; /* new cursor (split result) */
+ struct xfs_btree_cur *pcur; /* previous level's cursor */
+ union xfs_btree_key bkey; /* key of block to insert */
+ union xfs_btree_key *key;
+ union xfs_btree_rec rec; /* record to insert */
+
+ level = 0;
+ ncur = NULL;
+ pcur = cur;
+ key = &bkey;
+
+ xfs_btree_set_ptr_null(cur, &nptr);
+
+ /* Make a key out of the record data to be inserted, and save it. */
+ cur->bc_ops->init_rec_from_cur(cur, &rec);
+ cur->bc_ops->init_key_from_rec(key, &rec);
+
+ /*
+ * Loop going up the tree, starting at the leaf level.
+ * Stop when we don't get a split block, that must mean that
+ * the insert is finished with this level.
+ */
+ do {
+ /*
+ * Insert nrec/nptr into this level of the tree.
+ * Note if we fail, nptr will be null.
+ */
+ error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
+ &ncur, &i);
+ if (error) {
+ if (pcur != cur)
+ xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+ goto error0;
+ }
+
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ level++;
+
+ /*
+ * See if the cursor we just used is trash.
+ * Can't trash the caller's cursor, but otherwise we should
+ * if ncur is a new cursor or we're about to be done.
+ */
+ if (pcur != cur &&
+ (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
+ /* Save the state from the cursor before we trash it */
+ if (cur->bc_ops->update_cursor)
+ cur->bc_ops->update_cursor(pcur, cur);
+ cur->bc_nlevels = pcur->bc_nlevels;
+ xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+ }
+ /* If we got a new cursor, switch to it. */
+ if (ncur) {
+ pcur = ncur;
+ ncur = NULL;
+ }
+ } while (!xfs_btree_ptr_is_null(cur, &nptr));
+
+ *stat = i;
+ return 0;
+error0:
+ return error;
+}
+
+/*
+ * Try to merge a non-leaf block back into the inode root.
+ *
+ * Note: the killroot name comes from the fact that we're effectively
+ * killing the old root block. But because we can't just delete the
+ * inode we have to copy the single block it was pointing to into the
+ * inode.
+ */
+STATIC int
+xfs_btree_kill_iroot(
+ struct xfs_btree_cur *cur)
+{
+ int whichfork = cur->bc_ino.whichfork;
+ struct xfs_inode *ip = cur->bc_ino.ip;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_block *block;
+ struct xfs_btree_block *cblock;
+ union xfs_btree_key *kp;
+ union xfs_btree_key *ckp;
+ union xfs_btree_ptr *pp;
+ union xfs_btree_ptr *cpp;
+ struct xfs_buf *cbp;
+ int level;
+ int index;
+ int numrecs;
+ int error;
+#ifdef DEBUG
+ union xfs_btree_ptr ptr;
+#endif
+ int i;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_nlevels > 1);
+
+ /*
+ * Don't deal with the case where the root block needs to be a leaf.
+ * We're just going to turn the thing back into extents anyway.
+ */
+ level = cur->bc_nlevels - 1;
+ if (level == 1)
+ goto out0;
+
+ /*
+ * Give up if the root has multiple children.
+ */
+ block = xfs_btree_get_iroot(cur);
+ if (xfs_btree_get_numrecs(block) != 1)
+ goto out0;
+
+ cblock = xfs_btree_get_block(cur, level - 1, &cbp);
+ numrecs = xfs_btree_get_numrecs(cblock);
+
+ /*
+ * Only do this if the next level's contents will fit in the inode
+ * root. Then the data gets copied up into the inode, and instead of
+ * freeing the root we free the next level down.
+ */
+ if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, killroot);
+
+#ifdef DEBUG
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+ ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+#endif
+
+ index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
+ if (index) {
+ xfs_iroot_realloc(cur->bc_ino.ip, index,
+ cur->bc_ino.whichfork);
+ block = ifp->if_broot;
+ }
+
+ be16_add_cpu(&block->bb_numrecs, index);
+ ASSERT(block->bb_numrecs == cblock->bb_numrecs);
+
+ kp = xfs_btree_key_addr(cur, 1, block);
+ ckp = xfs_btree_key_addr(cur, 1, cblock);
+ xfs_btree_copy_keys(cur, kp, ckp, numrecs);
+
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+ cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+
+ for (i = 0; i < numrecs; i++) {
+ error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
+ if (error)
+ return error;
+ }
+
+ xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+
+ error = xfs_btree_free_block(cur, cbp);
+ if (error)
+ return error;
+
+ cur->bc_levels[level - 1].bp = NULL;
+ be16_add_cpu(&block->bb_level, -1);
+ xfs_trans_log_inode(cur->bc_tp, ip,
+ XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
+ cur->bc_nlevels--;
+out0:
+ return 0;
+}
+
+/*
+ * Kill the current root node and replace it with its only child node.
+ */
+STATIC int
+xfs_btree_kill_root(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int level,
+ union xfs_btree_ptr *newroot)
+{
+ int error;
+
+ XFS_BTREE_STATS_INC(cur, killroot);
+
+ /*
+ * Update the root pointer, decreasing the level by 1 and then
+ * free the old root.
+ */
+ cur->bc_ops->set_root(cur, newroot, -1);
+
+ error = xfs_btree_free_block(cur, bp);
+ if (error)
+ return error;
+
+ cur->bc_levels[level].bp = NULL;
+ cur->bc_levels[level].ra = 0;
+ cur->bc_nlevels--;
+
+ return 0;
+}
+
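+/*
+ * If we're above the leaf level, step the cursor back one entry at this
+ * level, then report success to the caller.
+ */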
+STATIC int
+xfs_btree_dec_cursor(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat)
+{
+ int error;
+ int i;
+
+ if (level > 0) {
+ error = xfs_btree_decrement(cur, level, &i);
+ if (error)
+ return error;
+ }
+
+ *stat = 1;
+ return 0;
+}
+
+/*
+ * Single level of the btree record deletion routine.
+ * Delete record pointed to by cur/level.
+ * Remove the record from its block then rebalance the tree.
+ * Return 0 for error, 1 for done, 2 to go on to the next level.
+ */
+STATIC int /* error */
+xfs_btree_delrec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level removing record from */
+ int *stat) /* fail/done/go-on */
+{
+ struct xfs_btree_block *block; /* btree block */
+ union xfs_btree_ptr cptr; /* current block ptr */
+ struct xfs_buf *bp; /* buffer for block */
+ int error; /* error return value */
+ int i; /* loop counter */
+ union xfs_btree_ptr lptr; /* left sibling block ptr */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ int lrecs = 0; /* left record count */
+ int ptr; /* key/record index */
+ union xfs_btree_ptr rptr; /* right sibling block ptr */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ struct xfs_btree_block *rrblock; /* right-right btree block */
+ struct xfs_buf *rrbp; /* right-right buffer pointer */
+ int rrecs = 0; /* right record count */
+ struct xfs_btree_cur *tcur; /* temporary btree cursor */
+ int numrecs; /* temporary numrec count */
+
+ tcur = NULL;
+
+ /* Get the index of the entry being deleted, check for nothing there. */
+ ptr = cur->bc_levels[level].ptr;
+ if (ptr == 0) {
+ *stat = 0;
+ return 0;
+ }
+
+ /* Get the buffer & block containing the record or key/ptr. */
+ block = xfs_btree_get_block(cur, level, &bp);
+ numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /* Fail if we're off the end of the block. */
+ if (ptr > numrecs) {
+ *stat = 0;
+ return 0;
+ }
+
+ XFS_BTREE_STATS_INC(cur, delrec);
+ XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
+
+ /* Excise the entries being deleted. */
+ if (level > 0) {
+ /* It's a non-leaf. Operate on keys and ptrs. */
+ union xfs_btree_key *lkp;
+ union xfs_btree_ptr *lpp;
+
+ lkp = xfs_btree_key_addr(cur, ptr + 1, block);
+ lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
+
+ for (i = 0; i < numrecs - ptr; i++) {
+ error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
+ if (error)
+ goto error0;
+ }
+
+ if (ptr < numrecs) {
+ xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
+ xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
+ xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
+ xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
+ }
+ } else {
+ /* It's a leaf. Operate on records. */
+ if (ptr < numrecs) {
+ xfs_btree_shift_recs(cur,
+ xfs_btree_rec_addr(cur, ptr + 1, block),
+ -1, numrecs - ptr);
+ xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
+ }
+ }
+
+ /*
+ * Decrement and log the number of entries in the block.
+ */
+ xfs_btree_set_numrecs(block, --numrecs);
+ xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+ /*
+ * If we are tracking the last record in the tree and
+ * we are at the far right edge of the tree, update it.
+ */
+ if (xfs_btree_is_lastrec(cur, block, level)) {
+ cur->bc_ops->update_lastrec(cur, block, NULL,
+ ptr, LASTREC_DELREC);
+ }
+
+ /*
+ * We're at the root level. First, shrink the root block in-memory.
+ * Try to get rid of the next level down. If we can't then there's
+ * nothing left to do.
+ */
+ if (level == cur->bc_nlevels - 1) {
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ xfs_iroot_realloc(cur->bc_ino.ip, -1,
+ cur->bc_ino.whichfork);
+
+ error = xfs_btree_kill_iroot(cur);
+ if (error)
+ goto error0;
+
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ *stat = 1;
+ return 0;
+ }
+
+ /*
+ * If this is the root level, and there's only one entry left,
+ * and it's NOT the leaf level, then we can get rid of this
+ * level.
+ */
+ if (numrecs == 1 && level > 0) {
+ union xfs_btree_ptr *pp;
+ /*
+ * pp points at the first (and only) pointer in the block.
+ * Make it the new root of the btree.
+ */
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+ error = xfs_btree_kill_root(cur, bp, level, pp);
+ if (error)
+ goto error0;
+ } else if (level > 0) {
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ }
+ *stat = 1;
+ return 0;
+ }
+
+ /*
+ * If we deleted the leftmost entry in the block, update the
+ * key values above us in the tree.
+ */
+ if (xfs_btree_needs_key_update(cur, ptr)) {
+ error = xfs_btree_update_keys(cur, level);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * If the number of records remaining in the block is at least
+ * the minimum, we're done.
+ */
+ if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+
+ /*
+ * Otherwise, we have to move some records around to keep the
+ * tree balanced. Look at the left and right sibling blocks to
+ * see if we can re-balance by moving only one record.
+ */
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
+
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ /*
+ * The root has only one child; we need a chance to copy its
+ * contents into the root and delete it. We can't go up to the
+ * next level, there's nothing to delete there.
+ */
+ if (xfs_btree_ptr_is_null(cur, &rptr) &&
+ xfs_btree_ptr_is_null(cur, &lptr) &&
+ level == cur->bc_nlevels - 2) {
+ error = xfs_btree_kill_iroot(cur);
+ if (!error)
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+ }
+
+ ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
+ !xfs_btree_ptr_is_null(cur, &lptr));
+
+ /*
+ * Duplicate the cursor so our btree manipulations here won't
+ * disrupt the next level up.
+ */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+
+ /*
+ * If there's a right sibling, see if it's ok to shift an entry
+ * out of it.
+ */
+ if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+ /*
+ * Move the temp cursor to the last entry in the next block.
+ * Actually any entry but the first would suffice.
+ */
+ i = xfs_btree_lastrec(tcur, level);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ error = xfs_btree_increment(tcur, level, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ i = xfs_btree_lastrec(tcur, level);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ /* Grab a pointer to the block. */
+ right = xfs_btree_get_block(tcur, level, &rbp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(tcur, right, level, rbp);
+ if (error)
+ goto error0;
+#endif
+ /* Grab the current block number, for future use. */
+ xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
+
+ /*
+ * If right block is full enough so that removing one entry
+ * won't make it too empty, and left-shifting an entry out
+ * of right to us works, we're done.
+ */
+ if (xfs_btree_get_numrecs(right) - 1 >=
+ cur->bc_ops->get_minrecs(tcur, level)) {
+ error = xfs_btree_lshift(tcur, level, &i);
+ if (error)
+ goto error0;
+ if (i) {
+ ASSERT(xfs_btree_get_numrecs(block) >=
+ cur->bc_ops->get_minrecs(tcur, level));
+
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ tcur = NULL;
+
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+ }
+
+ /*
+ * Otherwise, grab the number of records in right for
+ * future reference, and fix up the temp cursor to point
+ * to our block again (last record).
+ */
+ rrecs = xfs_btree_get_numrecs(right);
+ if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+ i = xfs_btree_firstrec(tcur, level);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ error = xfs_btree_decrement(tcur, level, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ }
+ }
+
+ /*
+ * If there's a left sibling, see if it's ok to shift an entry
+ * out of it.
+ */
+ if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+ /*
+ * Move the temp cursor to the first entry in the
+ * previous block.
+ */
+ i = xfs_btree_firstrec(tcur, level);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ error = xfs_btree_decrement(tcur, level, &i);
+ if (error)
+ goto error0;
+ i = xfs_btree_firstrec(tcur, level);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ /* Grab a pointer to the block. */
+ left = xfs_btree_get_block(tcur, level, &lbp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, left, level, lbp);
+ if (error)
+ goto error0;
+#endif
+ /* Grab the current block number, for future use. */
+ xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
+
+ /*
+ * If left block is full enough so that removing one entry
+ * won't make it too empty, and right-shifting an entry out
+ * of left to us works, we're done.
+ */
+ if (xfs_btree_get_numrecs(left) - 1 >=
+ cur->bc_ops->get_minrecs(tcur, level)) {
+ error = xfs_btree_rshift(tcur, level, &i);
+ if (error)
+ goto error0;
+ if (i) {
+ ASSERT(xfs_btree_get_numrecs(block) >=
+ cur->bc_ops->get_minrecs(tcur, level));
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ tcur = NULL;
+ if (level == 0)
+ cur->bc_levels[0].ptr++;
+
+ *stat = 1;
+ return 0;
+ }
+ }
+
+ /*
+ * Otherwise, grab the number of records in left for
+ * future reference.
+ */
+ lrecs = xfs_btree_get_numrecs(left);
+ }
+
+ /* Delete the temp cursor, we're done with it. */
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ tcur = NULL;
+
+ /* If here, we need to do a join to keep the tree balanced. */
+ ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
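+ /* cptr points back at our own block; it becomes one side of the join below. */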
+
+ if (!xfs_btree_ptr_is_null(cur, &lptr) &&
+ lrecs + xfs_btree_get_numrecs(block) <=
+ cur->bc_ops->get_maxrecs(cur, level)) {
+ /*
+ * Set "right" to be the starting block,
+ * "left" to be the left neighbor.
+ */
+ rptr = cptr;
+ right = block;
+ rbp = bp;
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+ if (error)
+ goto error0;
+
+ /*
+ * If that won't work, see if we can join with the right neighbor block.
+ */
+ } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
+ rrecs + xfs_btree_get_numrecs(block) <=
+ cur->bc_ops->get_maxrecs(cur, level)) {
+ /*
+ * Set "left" to be the starting block,
+ * "right" to be the right neighbor.
+ */
+ lptr = cptr;
+ left = block;
+ lbp = bp;
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+ if (error)
+ goto error0;
+
+ /*
+ * Otherwise, we can't fix the imbalance.
+ * Just return. This is probably a logic error, but it's not fatal.
+ */
+ } else {
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+
+ rrecs = xfs_btree_get_numrecs(right);
+ lrecs = xfs_btree_get_numrecs(left);
+
+ /*
+ * We're now going to join "left" and "right" by moving all the stuff
+ * in "right" to "left" and deleting "right".
+ */
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+ if (level > 0) {
+ /* It's a non-leaf. Move keys and pointers. */
+ union xfs_btree_key *lkp; /* left btree key */
+ union xfs_btree_ptr *lpp; /* left address pointer */
+ union xfs_btree_key *rkp; /* right btree key */
+ union xfs_btree_ptr *rpp; /* right address pointer */
+
+ lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
+ lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+ for (i = 1; i < rrecs; i++) {
+ error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
+ if (error)
+ goto error0;
+ }
+
+ xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
+ xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
+
+ xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
+ xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
+ } else {
+ /* It's a leaf. Move records. */
+ union xfs_btree_rec *lrp; /* left record pointer */
+ union xfs_btree_rec *rrp; /* right record pointer */
+
+ lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
+ xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
+ }
+
+ XFS_BTREE_STATS_INC(cur, join);
+
+ /*
+ * Fix up the number of records and right block pointer in the
+ * surviving block, and log it.
+ */
+ xfs_btree_set_numrecs(left, lrecs + rrecs);
+ xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
+ xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+ /* If there is a right sibling, point it to the remaining block. */
+ xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+ if (!xfs_btree_ptr_is_null(cur, &cptr)) {
+ error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
+ if (error)
+ goto error0;
+ xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
+ xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+ }
+
+ /* Free the deleted block. */
+ error = xfs_btree_free_block(cur, rbp);
+ if (error)
+ goto error0;
+
+ /*
+ * If we joined with the left neighbor, set the buffer in the
+ * cursor to the left block, and fix up the index.
+ */
+ if (bp != lbp) {
+ cur->bc_levels[level].bp = lbp;
+ cur->bc_levels[level].ptr += lrecs;
+ cur->bc_levels[level].ra = 0;
+ }
+ /*
+ * If we joined with the right neighbor and there's a level above
+ * us, increment the cursor at that level.
+ */
+ else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
+ (level + 1 < cur->bc_nlevels)) {
+ error = xfs_btree_increment(cur, level + 1, &i);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * Readjust the ptr at this level if it's not a leaf, since it's
+ * still pointing at the deletion point, which makes the cursor
+ * inconsistent. If this makes the ptr 0, the caller fixes it up.
+ * We can't use decrement because it would change the next level up.
+ */
+ if (level > 0)
+ cur->bc_levels[level].ptr--;
+
+ /*
+ * We combined blocks, so we have to update the parent keys if the
+ * btree supports overlapped intervals. However,
+ * bc_levels[level + 1].ptr points to the old block so that the caller
+ * knows which record to delete. Therefore, the caller must be savvy
+ * enough to call updkeys for us if we return stat == 2. The other
+ * exit points from this function don't require deletions further up
+ * the tree, so they can call updkeys directly.
+ */
+
+ /* Return value means the next level up has something to do. */
+ *stat = 2;
+ return 0;
+
+error0:
+ if (tcur)
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ */
+int /* error */
+xfs_btree_delete(
+ struct xfs_btree_cur *cur,
+ int *stat) /* success/failure */
+{
+ int error; /* error return value */
+ int level;
+ int i;
+ bool joined = false;
+
+ /*
+ * Go up the tree, starting at leaf level.
+ *
+ * If 2 is returned then a join was done; go to the next level.
+ * Otherwise we are done.
+ */
+ for (level = 0, i = 2; i == 2; level++) {
+ error = xfs_btree_delrec(cur, level, &i);
+ if (error)
+ goto error0;
+ if (i == 2)
+ joined = true;
+ }
+
+ /*
+ * If we combined blocks as part of deleting the record, delrec won't
+ * have updated the parent high keys so we have to do that here.
+ */
+ if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
+ error = xfs_btree_updkeys_force(cur, 0);
+ if (error)
+ goto error0;
+ }
+
+ if (i == 0) {
+ for (level = 1; level < cur->bc_nlevels; level++) {
+ if (cur->bc_levels[level].ptr == 0) {
+ error = xfs_btree_decrement(cur, level, &i);
+ if (error)
+ goto error0;
+ break;
+ }
+ }
+ }
+
+ *stat = i;
+ return 0;
+error0:
+ return error;
+}
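
/*
 * Editor's illustrative sketch, not part of the patch: callers normally
 * position the cursor with an exact-match lookup before deleting, and both
 * calls report "record not found" through *stat rather than the error code.
 * The function name below is hypothetical.
 */
STATIC int
example_btree_delete_rec(
	struct xfs_btree_cur	*cur,
	int			*stat)
{
	int			error;

	/* cur->bc_rec must already describe the record to remove. */
	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
	if (error || *stat == 0)
		return error;

	/* Remove the record; any block joins propagate up the tree. */
	return xfs_btree_delete(cur, stat);
}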
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int /* error */
+xfs_btree_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ union xfs_btree_rec **recp, /* output: btree record */
+ int *stat) /* output: success/failure */
+{
+ struct xfs_btree_block *block; /* btree block */
+ struct xfs_buf *bp; /* buffer pointer */
+ int ptr; /* record number */
+#ifdef DEBUG
+ int error; /* error return value */
+#endif
+
+ ptr = cur->bc_levels[0].ptr;
+ block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, 0, bp);
+ if (error)
+ return error;
+#endif
+
+ /*
+ * Off the right end or left end, return failure.
+ */
+ if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
+ *stat = 0;
+ return 0;
+ }
+
+ /*
+ * Point to the record and extract its data.
+ */
+ *recp = xfs_btree_rec_addr(cur, ptr, block);
+ *stat = 1;
+ return 0;
+}
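
/*
 * Editor's illustrative sketch, not part of the patch: a typical read path is
 * an LE lookup followed by xfs_btree_get_rec() and decoding of the on-disk
 * record union. The function name is hypothetical; the decoded fields follow
 * the alloc-btree record layout purely as an example.
 */
STATIC int
example_btree_peek_rec(
	struct xfs_btree_cur	*cur,
	xfs_agblock_t		*bno,
	xfs_extlen_t		*len,
	int			*stat)
{
	union xfs_btree_rec	*rec;
	int			error;

	/* Find the last record at or before the key held in cur->bc_rec. */
	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
	if (error || *stat == 0)
		return error;

	/* Decode the record the cursor now points at. */
	error = xfs_btree_get_rec(cur, &rec, stat);
	if (error || *stat == 0)
		return error;

	*bno = be32_to_cpu(rec->alloc.ar_startblock);
	*len = be32_to_cpu(rec->alloc.ar_blockcount);
	return 0;
}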
+
+/* Visit a block in a btree. */
+STATIC int
+xfs_btree_visit_block(
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_btree_visit_blocks_fn fn,
+ void *data)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ union xfs_btree_ptr rptr;
+ int error;
+
+ /* do right sibling readahead */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ /* process the block */
+ error = fn(cur, level, data);
+ if (error)
+ return error;
+
+ /* now read the right-hand sibling block for the next iteration */
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &rptr))
+ return -ENOENT;
+
+ /*
+ * We only visit blocks once in this walk, so we have to avoid the
+ * internal xfs_btree_lookup_get_block() optimisation where it will
+ * return the same block without checking if the right sibling points
+ * back to us and creates a cyclic reference in the btree.
+ */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp,
+ xfs_buf_daddr(bp)))
+ return -EFSCORRUPTED;
+ } else {
+ if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp,
+ xfs_buf_daddr(bp)))
+ return -EFSCORRUPTED;
+ }
+ return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+}
+
+
+/* Visit every block in a btree. */
+int
+xfs_btree_visit_blocks(
+ struct xfs_btree_cur *cur,
+ xfs_btree_visit_blocks_fn fn,
+ unsigned int flags,
+ void *data)
+{
+ union xfs_btree_ptr lptr;
+ int level;
+ struct xfs_btree_block *block = NULL;
+ int error = 0;
+
+ cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+
+ /* for each level */
+ for (level = cur->bc_nlevels - 1; level >= 0; level--) {
+ /* grab the left hand block */
+ error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
+ if (error)
+ return error;
+
+ /* readahead the left most block for the next level down */
+ if (level > 0) {
+ union xfs_btree_ptr *ptr;
+
+ ptr = xfs_btree_ptr_addr(cur, 1, block);
+ xfs_btree_readahead_ptr(cur, ptr, 1);
+
+ /* save for the next iteration of the loop */
+ xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
+
+ if (!(flags & XFS_BTREE_VISIT_LEAVES))
+ continue;
+ } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
+ continue;
+ }
+
+ /* for each buffer in the level */
+ do {
+ error = xfs_btree_visit_block(cur, level, fn, data);
+ } while (!error);
+
+ if (error != -ENOENT)
+ return error;
+ }
+
+ return 0;
+}
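
/*
 * Editor's illustrative sketch, not part of the patch: the walk calls the
 * supplied function once per block with the cursor, the level and an opaque
 * data pointer, and stops on the first non-zero return. Here the callback
 * tallies blocks per level; all example_* names are hypothetical.
 */
struct example_level_tally {
	xfs_extlen_t		*per_level;	/* cur->bc_nlevels counters */
};

STATIC int
example_tally_block(
	struct xfs_btree_cur	*cur,
	int			level,
	void			*data)
{
	struct example_level_tally *tally = data;

	tally->per_level[level]++;
	return 0;
}

STATIC int
example_tally_blocks(
	struct xfs_btree_cur	*cur,
	struct example_level_tally *tally)
{
	/* Visit node and leaf blocks alike. */
	return xfs_btree_visit_blocks(cur, example_tally_block,
			XFS_BTREE_VISIT_ALL, tally);
}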
+
+/*
+ * Change the owner of a btree.
+ *
+ * The mechanism we use here is ordered buffer logging. Because we don't know
+ * how many buffers we are going to need to modify, we don't really want to
+ * have to make transaction reservations for the worst case of every buffer in a
+ * full size btree as that may be more space than we can fit in the log....
+ *
+ * We do the btree walk as efficiently as possible - we have sibling
+ * pointers so we can just walk all the blocks on each level from left to right
+ * in a single pass, and then move to the next level and do the same. We can
+ * also do readahead on the sibling pointers to get IO moving more quickly,
+ * though for slow disks this is unlikely to make much difference to performance
+ * as the amount of CPU work we have to do before moving to the next block is
+ * relatively small.
+ *
+ * For each btree block that we load, modify the owner appropriately, set the
+ * buffer as an ordered buffer and log it appropriately. We need to ensure that
+ * we mark the region we change dirty so that if the buffer is relogged in
+ * a subsequent transaction the changes we make here as an ordered buffer are
+ * correctly relogged in that transaction. If we are in recovery context, then
+ * just queue the modified buffer as delayed write buffer so the transaction
+ * recovery completion writes the changes to disk.
+ */
+struct xfs_btree_block_change_owner_info {
+ uint64_t new_owner;
+ struct list_head *buffer_list;
+};
+
+static int
+xfs_btree_block_change_owner(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *data)
+{
+ struct xfs_btree_block_change_owner_info *bbcoi = data;
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+
+ /* modify the owner */
+ block = xfs_btree_get_block(cur, level, &bp);
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
+ return 0;
+ block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
+ } else {
+ if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
+ return 0;
+ block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
+ }
+
+ /*
+ * If the block is a root block hosted in an inode, we might not have a
+ * buffer pointer here and we shouldn't attempt to log the change as the
+ * information is already held in the inode and discarded when the root
+ * block is formatted into the on-disk inode fork. We still change it,
+ * though, so everything is consistent in memory.
+ */
+ if (!bp) {
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(level == cur->bc_nlevels - 1);
+ return 0;
+ }
+
+ if (cur->bc_tp) {
+ if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
+ xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+ return -EAGAIN;
+ }
+ } else {
+ xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
+ }
+
+ return 0;
+}
+
+int
+xfs_btree_change_owner(
+ struct xfs_btree_cur *cur,
+ uint64_t new_owner,
+ struct list_head *buffer_list)
+{
+ struct xfs_btree_block_change_owner_info bbcoi;
+
+ bbcoi.new_owner = new_owner;
+ bbcoi.buffer_list = buffer_list;
+
+ return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
+ XFS_BTREE_VISIT_ALL, &bbcoi);
+}
+
+/* Verify the v5 fields of a long-format btree block. */
+xfs_failaddr_t
+xfs_btree_lblock_v5hdr_verify(
+ struct xfs_buf *bp,
+ uint64_t owner)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+
+ if (!xfs_has_crc(mp))
+ return __this_address;
+ if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
+ return __this_address;
+ if (owner != XFS_RMAP_OWN_UNKNOWN &&
+ be64_to_cpu(block->bb_u.l.bb_owner) != owner)
+ return __this_address;
+ return NULL;
+}
+
+/* Verify a long-format btree block. */
+xfs_failaddr_t
+xfs_btree_lblock_verify(
+ struct xfs_buf *bp,
+ unsigned int max_recs)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_fsblock_t fsb;
+ xfs_failaddr_t fa;
+
+ /* numrecs verification */
+ if (be16_to_cpu(block->bb_numrecs) > max_recs)
+ return __this_address;
+
+ /* sibling pointer verification */
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ block->bb_u.l.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ block->bb_u.l.bb_rightsib);
+ return fa;
+}
+
+/**
+ * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
+ * btree block
+ *
+ * @bp: buffer containing the btree block
+ */
+xfs_failaddr_t
+xfs_btree_sblock_v5hdr_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+
+ if (!xfs_has_crc(mp))
+ return __this_address;
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
+ return __this_address;
+ if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ return __this_address;
+ return NULL;
+}
+
+/**
+ * xfs_btree_sblock_verify() -- verify a short-format btree block
+ *
+ * @bp: buffer containing the btree block
+ * @max_recs: maximum records allowed in this btree node
+ */
+xfs_failaddr_t
+xfs_btree_sblock_verify(
+ struct xfs_buf *bp,
+ unsigned int max_recs)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_agblock_t agbno;
+ xfs_failaddr_t fa;
+
+ /* numrecs verification */
+ if (be16_to_cpu(block->bb_numrecs) > max_recs)
+ return __this_address;
+
+ /* sibling pointer verification */
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+ block->bb_u.s.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+ block->bb_u.s.bb_rightsib);
+ return fa;
+}
+
+/*
+ * For the given limits on leaf and keyptr records per block, calculate the
+ * height of the tree needed to index the number of leaf records.
+ */
+unsigned int
+xfs_btree_compute_maxlevels(
+ const unsigned int *limits,
+ unsigned long long records)
+{
+ unsigned long long level_blocks = howmany_64(records, limits[0]);
+ unsigned int height = 1;
+
+ while (level_blocks > 1) {
+ level_blocks = howmany_64(level_blocks, limits[1]);
+ height++;
+ }
+
+ return height;
+}
+
+/*
+ * For the given limits on leaf and keyptr records per block, calculate the
+ * number of blocks needed to index the given number of leaf records.
+ */
+unsigned long long
+xfs_btree_calc_size(
+ const unsigned int *limits,
+ unsigned long long records)
+{
+ unsigned long long level_blocks = howmany_64(records, limits[0]);
+ unsigned long long blocks = level_blocks;
+
+ while (level_blocks > 1) {
+ level_blocks = howmany_64(level_blocks, limits[1]);
+ blocks += level_blocks;
+ }
+
+ return blocks;
+}
+
+/*
+ * Given a number of available blocks for the btree to consume with records and
+ * pointers, calculate the height of the tree needed to index all the records
+ * that space can hold based on the number of pointers each interior node
+ * holds.
+ *
+ * We start by assuming a single level tree consumes a single block, then track
+ * the number of blocks each node level consumes until we no longer have space
+ * to store the next node level. At this point, we are indexing all the leaf
+ * blocks in the space, and there's no more free space to split the tree any
+ * further. That's our maximum btree height.
+ */
+unsigned int
+xfs_btree_space_to_height(
+ const unsigned int *limits,
+ unsigned long long leaf_blocks)
+{
+ unsigned long long node_blocks = limits[1];
+ unsigned long long blocks_left = leaf_blocks - 1;
+ unsigned int height = 1;
+
+ if (leaf_blocks < 1)
+ return 0;
+
+ while (node_blocks < blocks_left) {
+ blocks_left -= node_blocks;
+ node_blocks *= limits[1];
+ height++;
+ }
+
+ return height;
+}
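
/*
 * Editor's illustrative sketch, not part of the patch: the limits array holds
 * { leaf maxrecs, node maxrecs }. With hypothetical limits of 500 records per
 * leaf and 250 keyptrs per node, indexing one million records takes
 * ceil(1000000 / 500) = 2000 leaves, ceil(2000 / 250) = 8 nodes above them
 * and a single root: a height of 3 and 2009 blocks in total.
 * xfs_btree_space_to_height() answers the inverse question of how tall a tree
 * confined to a given number of blocks can get.
 */
static inline void
example_btree_geometry(void)
{
	const unsigned int	limits[2] = { 500, 250 };	/* hypothetical */
	unsigned int		height;
	unsigned long long	blocks;

	height = xfs_btree_compute_maxlevels(limits, 1000000ULL);	/* 3 */
	blocks = xfs_btree_calc_size(limits, 1000000ULL);		/* 2009 */
	(void)height;
	(void)blocks;
}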
+
+/*
+ * Query a regular btree for all records overlapping a given interval.
+ * Start with a LE lookup of the key of low_rec and return all records
+ * until we find a record with a key greater than the key of high_rec.
+ */
+STATIC int
+xfs_btree_simple_query_range(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *low_key,
+ const union xfs_btree_key *high_key,
+ xfs_btree_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_rec *recp;
+ union xfs_btree_key rec_key;
+ int64_t diff;
+ int stat;
+ bool firstrec = true;
+ int error;
+
+ ASSERT(cur->bc_ops->init_high_key_from_rec);
+ ASSERT(cur->bc_ops->diff_two_keys);
+
+ /*
+ * Find the leftmost record. The btree cursor must be set
+ * to the low record used to generate low_key.
+ */
+ stat = 0;
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
+ if (error)
+ goto out;
+
+ /* Nothing? See if there's anything to the right. */
+ if (!stat) {
+ error = xfs_btree_increment(cur, 0, &stat);
+ if (error)
+ goto out;
+ }
+
+ while (stat) {
+ /* Find the record. */
+ error = xfs_btree_get_rec(cur, &recp, &stat);
+ if (error || !stat)
+ break;
+
+ /* Skip if high_key(rec) < low_key. */
+ if (firstrec) {
+ cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
+ firstrec = false;
+ diff = cur->bc_ops->diff_two_keys(cur, low_key,
+ &rec_key);
+ if (diff > 0)
+ goto advloop;
+ }
+
+ /* Stop if high_key < low_key(rec). */
+ cur->bc_ops->init_key_from_rec(&rec_key, recp);
+ diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
+ if (diff > 0)
+ break;
+
+ /* Callback */
+ error = fn(cur, recp, priv);
+ if (error)
+ break;
+
+advloop:
+ /* Move on to the next record. */
+ error = xfs_btree_increment(cur, 0, &stat);
+ if (error)
+ break;
+ }
+
+out:
+ return error;
+}
+
+/*
+ * Query an overlapped interval btree for all records overlapping a given
+ * interval. This function roughly follows the algorithm given in
+ * "Interval Trees" of _Introduction to Algorithms_, which is section
+ * 14.3 in the 2nd and 3rd editions.
+ *
+ * First, generate keys for the low and high records passed in.
+ *
+ * For any leaf node, generate the high and low keys for the record.
+ * If the record keys overlap with the query low/high keys, pass the
+ * record to the function iterator.
+ *
+ * For any internal node, compare the low and high keys of each
+ * pointer against the query low/high keys. If there's an overlap,
+ * follow the pointer.
+ *
+ * As an optimization, we stop scanning a block when we find a low key
+ * that is greater than the query's high key.
+ */
+STATIC int
+xfs_btree_overlapped_query_range(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *low_key,
+ const union xfs_btree_key *high_key,
+ xfs_btree_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_ptr ptr;
+ union xfs_btree_ptr *pp;
+ union xfs_btree_key rec_key;
+ union xfs_btree_key rec_hkey;
+ union xfs_btree_key *lkp;
+ union xfs_btree_key *hkp;
+ union xfs_btree_rec *recp;
+ struct xfs_btree_block *block;
+ int64_t ldiff;
+ int64_t hdiff;
+ int level;
+ struct xfs_buf *bp;
+ int i;
+ int error;
+
+ /* Load the root of the btree. */
+ level = cur->bc_nlevels - 1;
+ cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
+ if (error)
+ return error;
+ xfs_btree_get_block(cur, level, &bp);
+ trace_xfs_btree_overlapped_query_range(cur, level, bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto out;
+#endif
+ cur->bc_levels[level].ptr = 1;
+
+ while (level < cur->bc_nlevels) {
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ /* End of node, pop back towards the root. */
+ if (cur->bc_levels[level].ptr >
+ be16_to_cpu(block->bb_numrecs)) {
+pop_up:
+ if (level < cur->bc_nlevels - 1)
+ cur->bc_levels[level + 1].ptr++;
+ level++;
+ continue;
+ }
+
+ if (level == 0) {
+ /* Handle a leaf node. */
+ recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
+ block);
+
+ cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
+ ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
+ low_key);
+
+ cur->bc_ops->init_key_from_rec(&rec_key, recp);
+ hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
+ &rec_key);
+
+ /*
+ * If (record's high key >= query's low key) and
+ * (query's high key >= record's low key), then
+ * this record overlaps the query range; callback.
+ */
+ if (ldiff >= 0 && hdiff >= 0) {
+ error = fn(cur, recp, priv);
+ if (error)
+ break;
+ } else if (hdiff < 0) {
+ /* Record is larger than high key; pop. */
+ goto pop_up;
+ }
+ cur->bc_levels[level].ptr++;
+ continue;
+ }
+
+ /* Handle an internal node. */
+ lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
+ hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr,
+ block);
+ pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
+
+ ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
+ hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
+
+ /*
+ * If (pointer's high key >= query's low key) and
+ * (query's high key >= pointer's low key), then
+ * this record overlaps the query range; follow pointer.
+ */
+ if (ldiff >= 0 && hdiff >= 0) {
+ level--;
+ error = xfs_btree_lookup_get_block(cur, level, pp,
+ &block);
+ if (error)
+ goto out;
+ xfs_btree_get_block(cur, level, &bp);
+ trace_xfs_btree_overlapped_query_range(cur, level, bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto out;
+#endif
+ cur->bc_levels[level].ptr = 1;
+ continue;
+ } else if (hdiff < 0) {
+ /* The low key is larger than the upper range; pop. */
+ goto pop_up;
+ }
+ cur->bc_levels[level].ptr++;
+ }
+
+out:
+ /*
+ * If we don't end this function with the cursor pointing at a record
+ * block, a subsequent non-error cursor deletion will not release
+ * node-level buffers, causing a buffer leak. This is quite possible
+ * with a zero-results range query, so release the buffers if we
+ * failed to return any results.
+ */
+ if (cur->bc_levels[0].bp == NULL) {
+ for (i = 0; i < cur->bc_nlevels; i++) {
+ if (cur->bc_levels[i].bp) {
+ xfs_trans_brelse(cur->bc_tp,
+ cur->bc_levels[i].bp);
+ cur->bc_levels[i].bp = NULL;
+ cur->bc_levels[i].ptr = 0;
+ cur->bc_levels[i].ra = 0;
+ }
+ }
+ }
+
+ return error;
+}
+
+/*
+ * Query a btree for all records overlapping a given interval of keys. The
+ * supplied function will be called with each record found; return one of the
+ * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
+ * code. This function returns -ECANCELED, zero, or a negative error code.
+ */
+int
+xfs_btree_query_range(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_irec *low_rec,
+ const union xfs_btree_irec *high_rec,
+ xfs_btree_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_rec rec;
+ union xfs_btree_key low_key;
+ union xfs_btree_key high_key;
+
+ /* Find the keys of both ends of the interval. */
+ cur->bc_rec = *high_rec;
+ cur->bc_ops->init_rec_from_cur(cur, &rec);
+ cur->bc_ops->init_key_from_rec(&high_key, &rec);
+
+ cur->bc_rec = *low_rec;
+ cur->bc_ops->init_rec_from_cur(cur, &rec);
+ cur->bc_ops->init_key_from_rec(&low_key, &rec);
+
+ /* Enforce low key < high key. */
+ if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
+ return -EINVAL;
+
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return xfs_btree_simple_query_range(cur, &low_key,
+ &high_key, fn, priv);
+ return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
+ fn, priv);
+}
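
/*
 * Editor's illustrative sketch, not part of the patch: a range query callback
 * sees each matching on-disk record and may return -ECANCELED to stop the
 * walk early; xfs_btree_query_range() passes that value straight back to the
 * caller. All example_* names are hypothetical and the record decoding uses
 * the alloc-btree layout purely as an example.
 */
struct example_first_hit {
	xfs_agblock_t		bno;
	bool			found;
};

STATIC int
example_first_hit_fn(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct example_first_hit	*hit = priv;

	hit->bno = be32_to_cpu(rec->alloc.ar_startblock);
	hit->found = true;
	return -ECANCELED;		/* stop after the first overlap */
}

STATIC int
example_query_first(
	struct xfs_btree_cur	*cur,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	struct example_first_hit *hit)
{
	union xfs_btree_irec	low = { .a.ar_startblock = bno };
	union xfs_btree_irec	high = { .a.ar_startblock = bno + len - 1 };
	int			error;

	hit->found = false;
	error = xfs_btree_query_range(cur, &low, &high,
			example_first_hit_fn, hit);
	if (error == -ECANCELED)
		error = 0;		/* early exit, not a failure */
	return error;
}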
+
+/* Query a btree for all records. */
+int
+xfs_btree_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_btree_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_key low_key;
+ union xfs_btree_key high_key;
+
+ memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
+ memset(&low_key, 0, sizeof(low_key));
+ memset(&high_key, 0xFF, sizeof(high_key));
+
+ return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
+}
+
+static int
+xfs_btree_count_blocks_helper(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *data)
+{
+ xfs_extlen_t *blocks = data;
+ (*blocks)++;
+
+ return 0;
+}
+
+/* Count the blocks in a btree and return the result in *blocks. */
+int
+xfs_btree_count_blocks(
+ struct xfs_btree_cur *cur,
+ xfs_extlen_t *blocks)
+{
+ *blocks = 0;
+ return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
+ XFS_BTREE_VISIT_ALL, blocks);
+}
+
+/* Compare two btree pointers. */
+int64_t
+xfs_btree_diff_two_ptrs(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *a,
+ const union xfs_btree_ptr *b)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
+ return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
+}
+
+/* If there's an extent, we're done. */
+STATIC int
+xfs_btree_has_record_helper(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ return -ECANCELED;
+}
+
+/* Is there a record covering a given range of keys? */
+int
+xfs_btree_has_record(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_irec *low,
+ const union xfs_btree_irec *high,
+ bool *exists)
+{
+ int error;
+
+ error = xfs_btree_query_range(cur, low, high,
+ &xfs_btree_has_record_helper, NULL);
+ if (error == -ECANCELED) {
+ *exists = true;
+ return 0;
+ }
+ *exists = false;
+ return error;
+}
+
+/* Are there more records in this btree? */
+bool
+xfs_btree_has_more_records(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+
+ block = xfs_btree_get_block(cur, 0, &bp);
+
+ /* There are still records in this block. */
+ if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block))
+ return true;
+
+ /* There are more record blocks. */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
+ else
+ return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
+}
+
+/* Set up all the btree cursor caches. */
+int __init
+xfs_btree_init_cur_caches(void)
+{
+ int error;
+
+ error = xfs_allocbt_init_cur_cache();
+ if (error)
+ return error;
+ error = xfs_inobt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_bmbt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_rmapbt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_refcountbt_init_cur_cache();
+ if (error)
+ goto err;
+
+ return 0;
+err:
+ xfs_btree_destroy_cur_caches();
+ return error;
+}
+
+/* Destroy all the btree cursor caches, if they've been allocated. */
+void
+xfs_btree_destroy_cur_caches(void)
+{
+ xfs_allocbt_destroy_cur_cache();
+ xfs_inobt_destroy_cur_cache();
+ xfs_bmbt_destroy_cur_cache();
+ xfs_rmapbt_destroy_cur_cache();
+ xfs_refcountbt_destroy_cur_cache();
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
new file mode 100644
index 000000000..eef27858a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -0,0 +1,606 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_BTREE_H__
+#define __XFS_BTREE_H__
+
+struct xfs_buf;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_ifork;
+struct xfs_perag;
+
+/*
+ * Generic key, ptr and record wrapper structures.
+ *
+ * These are disk format structures, and are converted where necessary
+ * by the btree specific code that needs to interpret them.
+ */
+union xfs_btree_ptr {
+ __be32 s; /* short form ptr */
+ __be64 l; /* long form ptr */
+};
+
+/*
+ * The in-core btree key. Overlapping btrees actually store two keys
+ * per pointer, so we reserve enough memory to hold both. The __*bigkey
+ * items should never be accessed directly.
+ */
+union xfs_btree_key {
+ struct xfs_bmbt_key bmbt;
+ xfs_bmdr_key_t bmbr; /* bmbt root block */
+ xfs_alloc_key_t alloc;
+ struct xfs_inobt_key inobt;
+ struct xfs_rmap_key rmap;
+ struct xfs_rmap_key __rmap_bigkey[2];
+ struct xfs_refcount_key refc;
+};
+
+union xfs_btree_rec {
+ struct xfs_bmbt_rec bmbt;
+ xfs_bmdr_rec_t bmbr; /* bmbt root block */
+ struct xfs_alloc_rec alloc;
+ struct xfs_inobt_rec inobt;
+ struct xfs_rmap_rec rmap;
+ struct xfs_refcount_rec refc;
+};
+
+/*
+ * This nonsense is to make -wlint happy.
+ */
+#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi)
+#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi)
+#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi)
+
+#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi)
+#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
+#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
+#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
+#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi)
+#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi)
+#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi)
+
+uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
+
+/*
+ * For logging record fields.
+ */
+#define XFS_BB_MAGIC (1u << 0)
+#define XFS_BB_LEVEL (1u << 1)
+#define XFS_BB_NUMRECS (1u << 2)
+#define XFS_BB_LEFTSIB (1u << 3)
+#define XFS_BB_RIGHTSIB (1u << 4)
+#define XFS_BB_BLKNO (1u << 5)
+#define XFS_BB_LSN (1u << 6)
+#define XFS_BB_UUID (1u << 7)
+#define XFS_BB_OWNER (1u << 8)
+#define XFS_BB_NUM_BITS 5
+#define XFS_BB_ALL_BITS ((1u << XFS_BB_NUM_BITS) - 1)
+#define XFS_BB_NUM_BITS_CRC 9
+#define XFS_BB_ALL_BITS_CRC ((1u << XFS_BB_NUM_BITS_CRC) - 1)
+
+/*
+ * Generic stats interface
+ */
+#define XFS_BTREE_STATS_INC(cur, stat) \
+ XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat)
+#define XFS_BTREE_STATS_ADD(cur, stat, val) \
+ XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
+
+struct xfs_btree_ops {
+ /* size of the key and record structures */
+ size_t key_len;
+ size_t rec_len;
+
+ /* cursor operations */
+ struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
+ void (*update_cursor)(struct xfs_btree_cur *src,
+ struct xfs_btree_cur *dst);
+
+ /* update btree root pointer */
+ void (*set_root)(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *nptr, int level_change);
+
+ /* block allocation / freeing */
+ int (*alloc_block)(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start_bno,
+ union xfs_btree_ptr *new_bno,
+ int *stat);
+ int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+
+ /* update last record information */
+ void (*update_lastrec)(struct xfs_btree_cur *cur,
+ const struct xfs_btree_block *block,
+ const union xfs_btree_rec *rec,
+ int ptr, int reason);
+
+ /* records in block/level */
+ int (*get_minrecs)(struct xfs_btree_cur *cur, int level);
+ int (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
+
+ /* records on disk. Matters for the root-in-inode case. */
+ int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
+
+ /* init values of btree structures */
+ void (*init_key_from_rec)(union xfs_btree_key *key,
+ const union xfs_btree_rec *rec);
+ void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec);
+ void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
+ void (*init_high_key_from_rec)(union xfs_btree_key *key,
+ const union xfs_btree_rec *rec);
+
+ /* difference between key value and cursor value */
+ int64_t (*key_diff)(struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key);
+
+ /*
+ * Difference between key2 and key1 -- positive if key1 > key2,
+ * negative if key1 < key2, and zero if equal.
+ */
+ int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2);
+
+ const struct xfs_buf_ops *buf_ops;
+
+ /* check that k1 is lower than k2 */
+ int (*keys_inorder)(struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2);
+
+ /* check that r1 is lower than r2 */
+ int (*recs_inorder)(struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2);
+};
+
+/*
+ * Reasons for the update_lastrec method to be called.
+ */
+#define LASTREC_UPDATE 0
+#define LASTREC_INSREC 1
+#define LASTREC_DELREC 2
+
+
+union xfs_btree_irec {
+ struct xfs_alloc_rec_incore a;
+ struct xfs_bmbt_irec b;
+ struct xfs_inobt_rec_incore i;
+ struct xfs_rmap_irec r;
+ struct xfs_refcount_irec rc;
+};
+
+/* Per-AG btree information. */
+struct xfs_btree_cur_ag {
+ struct xfs_perag *pag;
+ union {
+ struct xfs_buf *agbp;
+ struct xbtree_afakeroot *afake; /* for staging cursor */
+ };
+ union {
+ struct {
+ unsigned int nr_ops; /* # record updates */
+ unsigned int shape_changes; /* # of extent splits */
+ } refc;
+ struct {
+ bool active; /* allocation cursor state */
+ } abt;
+ };
+};
+
+/* Btree-in-inode cursor information */
+struct xfs_btree_cur_ino {
+ struct xfs_inode *ip;
+ struct xbtree_ifakeroot *ifake; /* for staging cursor */
+ int allocated;
+ short forksize;
+ char whichfork;
+ char flags;
+/* We are converting a delalloc reservation */
+#define XFS_BTCUR_BMBT_WASDEL (1 << 0)
+
+/* For extent swap, ignore owner check in verifier */
+#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1)
+};
+
+struct xfs_btree_level {
+ /* buffer pointer */
+ struct xfs_buf *bp;
+
+ /* key/record number */
+ uint16_t ptr;
+
+ /* readahead info */
+#define XFS_BTCUR_LEFTRA (1 << 0) /* left sibling has been read-ahead */
+#define XFS_BTCUR_RIGHTRA (1 << 1) /* right sibling has been read-ahead */
+ uint16_t ra;
+};
+
+/*
+ * Btree cursor structure.
+ * This collects all information needed by the btree code in one place.
+ */
+struct xfs_btree_cur
+{
+ struct xfs_trans *bc_tp; /* transaction we're in, if any */
+ struct xfs_mount *bc_mp; /* file system mount struct */
+ const struct xfs_btree_ops *bc_ops;
+ struct kmem_cache *bc_cache; /* cursor cache */
+ unsigned int bc_flags; /* btree features - below */
+ xfs_btnum_t bc_btnum; /* identifies which btree type */
+ union xfs_btree_irec bc_rec; /* current insert/search record value */
+ uint8_t bc_nlevels; /* number of levels in the tree */
+ uint8_t bc_maxlevels; /* maximum levels for this btree type */
+ int bc_statoff; /* offset of btree stats array */
+
+ /*
+ * Short btree pointers need an agno to be able to turn the pointers
+ * into physical addresses for IO, so the btree cursor switches between
+ * bc_ino and bc_ag based on whether XFS_BTREE_LONG_PTRS is set for the
+ * cursor.
+ */
+ union {
+ struct xfs_btree_cur_ag bc_ag;
+ struct xfs_btree_cur_ino bc_ino;
+ };
+
+ /* Must be at the end of the struct! */
+ struct xfs_btree_level bc_levels[];
+};
+
+/*
+ * Compute the size of a btree cursor that can handle a btree of a given
+ * height. The bc_levels array handles node and leaf blocks, so its size
+ * is exactly nlevels.
+ */
+static inline size_t
+xfs_btree_cur_sizeof(unsigned int nlevels)
+{
+ return struct_size((struct xfs_btree_cur *)NULL, bc_levels, nlevels);
+}
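
/*
 * Editor's illustrative sketch, not part of the patch: a per-btree-type
 * cursor cache is typically sized for the worst-case on-disk tree height.
 * The cache variable and the maxlevels helper named here are hypothetical.
 */
unsigned int example_foobt_maxlevels_ondisk(void);	/* hypothetical */

static struct kmem_cache *example_foobt_cur_cache;

static inline int
example_foobt_init_cur_cache(void)
{
	example_foobt_cur_cache = kmem_cache_create("example_foobt_cur",
			xfs_btree_cur_sizeof(example_foobt_maxlevels_ondisk()),
			0, 0, NULL);
	return example_foobt_cur_cache ? 0 : -ENOMEM;
}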
+
+/* cursor flags */
+#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
+#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
+#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
+#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
+#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */
+/*
+ * The root of this btree is a fakeroot structure so that we can stage a btree
+ * rebuild without leaving it accessible via primary metadata. The ops struct
+ * is dynamically allocated and must be freed when the cursor is deleted.
+ */
+#define XFS_BTREE_STAGING (1<<5)
+
+#define XFS_BTREE_NOERROR 0
+#define XFS_BTREE_ERROR 1
+
+/*
+ * Convert from buffer to btree block header.
+ */
+#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr))
+
+/*
+ * Internal long and short btree block checks. They return NULL if the
+ * block is ok or the address of the failed check otherwise.
+ */
+xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, int level, struct xfs_buf *bp);
+xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, int level, struct xfs_buf *bp);
+
+/*
+ * Check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_btree_block *block, /* generic btree block pointer */
+ int level, /* level of the btree block */
+ struct xfs_buf *bp); /* buffer containing block, if any */
+
+/*
+ * Check that (long) pointer is ok.
+ */
+bool /* true if the pointer is valid */
+xfs_btree_check_lptr(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_fsblock_t fsbno, /* btree block disk address */
+ int level); /* btree block level */
+
+/*
+ * Check that (short) pointer is ok.
+ */
+bool /* true if the pointer is valid */
+xfs_btree_check_sptr(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t agbno, /* btree block disk address */
+ int level); /* btree block level */
+
+/*
+ * Delete the btree cursor.
+ */
+void
+xfs_btree_del_cursor(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int error); /* del because of error */
+
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int /* error */
+xfs_btree_dup_cursor(
+ struct xfs_btree_cur *cur, /* input cursor */
+ struct xfs_btree_cur **ncur);/* output cursor */
+
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
+ */
+void
+xfs_btree_offsets(
+ uint32_t fields, /* bitmask of fields */
+ const short *offsets,/* table of field offsets */
+ int nbits, /* number of bits to inspect */
+ int *first, /* output: first byte offset */
+ int *last); /* output: last byte offset */
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int /* error */
+xfs_btree_read_bufl(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t fsbno, /* file system block number */
+ struct xfs_buf **bpp, /* buffer for fsbno */
+ int refval, /* ref count value for buffer */
+ const struct xfs_buf_ops *ops);
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+void
+xfs_btree_reada_bufl(
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_fsblock_t fsbno, /* file system block number */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops);
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+void
+xfs_btree_reada_bufs(
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_agblock_t agbno, /* allocation group block number */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops);
+
+/*
+ * Initialise a new btree block header
+ */
+void
+xfs_btree_init_block(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_btnum_t btnum,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner);
+
+void
+xfs_btree_init_block_int(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *buf,
+ xfs_daddr_t blkno,
+ xfs_btnum_t btnum,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner,
+ unsigned int flags);
+
+/*
+ * Common btree core entry points.
+ */
+int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
+int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
+int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
+int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
+int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
+int xfs_btree_insert(struct xfs_btree_cur *, int *);
+int xfs_btree_delete(struct xfs_btree_cur *, int *);
+int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
+int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner,
+ struct list_head *buffer_list);
+
+/*
+ * btree block CRC helpers
+ */
+void xfs_btree_lblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
+void xfs_btree_sblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
+
+/*
+ * Internal btree helpers also used by xfs_bmap.c.
+ */
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t);
+void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
+
+/*
+ * Helpers.
+ */
+static inline int xfs_btree_get_numrecs(const struct xfs_btree_block *block)
+{
+ return be16_to_cpu(block->bb_numrecs);
+}
+
+static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
+ uint16_t numrecs)
+{
+ block->bb_numrecs = cpu_to_be16(numrecs);
+}
+
+static inline int xfs_btree_get_level(const struct xfs_btree_block *block)
+{
+ return be16_to_cpu(block->bb_level);
+}
+
+
+/*
+ * Min and max functions for extlen, agblock, fileoff, and filblks types.
+ */
+#define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b))
+#define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b))
+#define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b))
+#define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b))
+#define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b))
+#define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b))
+#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b))
+#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b))
+
+xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
+xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp,
+ unsigned int max_recs);
+xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp,
+ uint64_t owner);
+xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
+ unsigned int max_recs);
+
+unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits,
+ unsigned long long records);
+unsigned long long xfs_btree_calc_size(const unsigned int *limits,
+ unsigned long long records);
+unsigned int xfs_btree_space_to_height(const unsigned int *limits,
+ unsigned long long blocks);
+
+/*
+ * Return codes for the query range iterator function are 0 to continue
+ * iterating, and non-zero to stop iterating. Any non-zero value will be
+ * passed up to the _query_range caller. The special value -ECANCELED can be
+ * used to stop iteration, because _query_range never generates that error
+ * code on its own.
+ */
+typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec, void *priv);
+
+int xfs_btree_query_range(struct xfs_btree_cur *cur,
+ const union xfs_btree_irec *low_rec,
+ const union xfs_btree_irec *high_rec,
+ xfs_btree_query_range_fn fn, void *priv);
+int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn,
+ void *priv);
+
+typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
+ void *data);
+/* Visit record blocks. */
+#define XFS_BTREE_VISIT_RECORDS (1 << 0)
+/* Visit leaf blocks. */
+#define XFS_BTREE_VISIT_LEAVES (1 << 1)
+/* Visit all blocks. */
+#define XFS_BTREE_VISIT_ALL (XFS_BTREE_VISIT_RECORDS | \
+ XFS_BTREE_VISIT_LEAVES)
+int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
+ xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data);
+
+int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks);
+
+union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n,
+ struct xfs_btree_block *block);
+union xfs_btree_key *xfs_btree_key_addr(struct xfs_btree_cur *cur, int n,
+ struct xfs_btree_block *block);
+union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n,
+ struct xfs_btree_block *block);
+union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n,
+ struct xfs_btree_block *block);
+int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level,
+ const union xfs_btree_ptr *pp, struct xfs_btree_block **blkp);
+struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur,
+ int level, struct xfs_buf **bpp);
+bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr);
+int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *a,
+ const union xfs_btree_ptr *b);
+void xfs_btree_get_sibling(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_ptr *ptr, int lr);
+void xfs_btree_get_keys(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, union xfs_btree_key *key);
+union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur,
+ union xfs_btree_key *key);
+int xfs_btree_has_record(struct xfs_btree_cur *cur,
+ const union xfs_btree_irec *low,
+ const union xfs_btree_irec *high, bool *exists);
+bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
+struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur);
+
+/* Does this cursor point to the last block in the given level? */
+static inline bool
+xfs_btree_islastblock(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ ASSERT(block && xfs_btree_check_block(cur, block, level, bp) == 0);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
+ return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
+}
+
+void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
+int xfs_btree_get_buf_block(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr, struct xfs_btree_block **block,
+ struct xfs_buf **bpp);
+void xfs_btree_set_sibling(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, const union xfs_btree_ptr *ptr,
+ int lr);
+void xfs_btree_init_block_cur(struct xfs_btree_cur *cur,
+ struct xfs_buf *bp, int level, int numrecs);
+void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *dst_ptr,
+ const union xfs_btree_ptr *src_ptr, int numptrs);
+void xfs_btree_copy_keys(struct xfs_btree_cur *cur,
+ union xfs_btree_key *dst_key,
+ const union xfs_btree_key *src_key, int numkeys);
+
+static inline struct xfs_btree_cur *
+xfs_btree_alloc_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_btnum_t btnum,
+ uint8_t maxlevels,
+ struct kmem_cache *cache)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = kmem_cache_zalloc(cache, GFP_NOFS | __GFP_NOFAIL);
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_btnum = btnum;
+ cur->bc_maxlevels = maxlevels;
+ cur->bc_cache = cache;
+
+ return cur;
+}
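
/*
 * Editor's illustrative sketch, not part of the patch: a btree type's cursor
 * constructor typically wraps xfs_btree_alloc_cursor() and then fills in the
 * type-specific pieces. The example_foobt_* names are hypothetical.
 */
static inline struct xfs_btree_cur *
example_foobt_init_cursor(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_btnum_t		btnum,
	uint8_t			maxlevels,
	struct kmem_cache	*cache)
{
	struct xfs_btree_cur	*cur;

	cur = xfs_btree_alloc_cursor(mp, tp, btnum, maxlevels, cache);
	/* hypothetical: point bc_ops at the type's xfs_btree_ops table, */
	/* set bc_flags, bc_statoff and the bc_ag/bc_ino union here.     */
	return cur;
}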
+
+int __init xfs_btree_init_cur_caches(void);
+void xfs_btree_destroy_cur_caches(void);
+
+#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
new file mode 100644
index 000000000..dd75e208b
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -0,0 +1,880 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_btree.h"
+#include "xfs_trace.h"
+#include "xfs_btree_staging.h"
+
+/*
+ * Staging Cursors and Fake Roots for Btrees
+ * =========================================
+ *
+ * A staging btree cursor is a special type of btree cursor that callers must
+ * use to construct a new btree index using the btree bulk loader code. The
+ * bulk loading code uses the staging btree cursor to abstract the details of
+ * initializing new btree blocks and filling them with records or key/ptr
+ * pairs. Regular btree operations (e.g. queries and modifications) are not
+ * supported with staging cursors, and callers must not invoke them.
+ *
+ * Fake root structures contain all the information about a btree that is under
+ * construction by the bulk loading code. Staging btree cursors point to fake
+ * root structures instead of the usual AG header or inode structure.
+ *
+ * Callers are expected to initialize a fake root structure and pass it into
+ * the _stage_cursor function for a specific btree type. When bulk loading is
+ * complete, callers should call the _commit_staged_btree function for that
+ * specific btree type to commit the new btree into the filesystem.
+ */
+
+/*
+ * Don't allow staging cursors to be duplicated because they're supposed to be
+ * kept private to a single thread.
+ */
+STATIC struct xfs_btree_cur *
+xfs_btree_fakeroot_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ ASSERT(0);
+ return NULL;
+}
+
+/*
+ * Don't allow block allocation for a staging cursor, because staging cursors
+ * do not support regular btree modifications.
+ *
+ * Bulk loading uses a separate callback to obtain new blocks from a
+ * preallocated list, which prevents ENOSPC failures during loading.
+ */
+STATIC int
+xfs_btree_fakeroot_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start_bno,
+ union xfs_btree_ptr *new_bno,
+ int *stat)
+{
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Don't allow block freeing for a staging cursor, because staging cursors
+ * do not support regular btree modifications.
+ */
+STATIC int
+xfs_btree_fakeroot_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+/* Initialize a pointer to the root block from the fakeroot. */
+STATIC void
+xfs_btree_fakeroot_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xbtree_afakeroot *afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ afake = cur->bc_ag.afake;
+ ptr->s = cpu_to_be32(afake->af_root);
+}
+
+/*
+ * Bulk Loading for AG Btrees
+ * ==========================
+ *
+ * For a btree rooted in an AG header, pass an xbtree_afakeroot structure to the
+ * staging cursor. Callers should initialize this to zero.
+ *
+ * The _stage_cursor() function for a specific btree type should call
+ * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
+ * cursor. The corresponding _commit_staged_btree() function should log the
+ * new root and call xfs_btree_commit_afakeroot() to transform the staging
+ * cursor into a regular btree cursor.
+ */
+
+/* Update the btree root information for a per-AG fake root. */
+STATIC void
+xfs_btree_afakeroot_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ afake->af_root = be32_to_cpu(ptr->s);
+ afake->af_levels += inc;
+}
+
+/*
+ * Initialize an AG-rooted btree cursor with the given AG btree fake root.
+ * The btree cursor's bc_ops will be overridden as needed to make the staging
+ * functionality work.
+ */
+void
+xfs_btree_stage_afakeroot(
+ struct xfs_btree_cur *cur,
+ struct xbtree_afakeroot *afake)
+{
+ struct xfs_btree_ops *nops;
+
+ ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
+ ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
+ ASSERT(cur->bc_tp == NULL);
+
+ nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
+ memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
+ nops->alloc_block = xfs_btree_fakeroot_alloc_block;
+ nops->free_block = xfs_btree_fakeroot_free_block;
+ nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
+ nops->set_root = xfs_btree_afakeroot_set_root;
+ nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
+
+ cur->bc_ag.afake = afake;
+ cur->bc_nlevels = afake->af_levels;
+ cur->bc_ops = nops;
+ cur->bc_flags |= XFS_BTREE_STAGING;
+}
+
+/*
+ * Transform an AG-rooted staging btree cursor back into a regular cursor by
+ * substituting a real btree root for the fake one and restoring normal btree
+ * cursor ops. The caller must log the btree root change prior to calling
+ * this.
+ */
+void
+xfs_btree_commit_afakeroot(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ const struct xfs_btree_ops *ops)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(cur->bc_tp == NULL);
+
+ trace_xfs_btree_commit_afakeroot(cur);
+
+ kmem_free((void *)cur->bc_ops);
+ cur->bc_ag.agbp = agbp;
+ cur->bc_ops = ops;
+ cur->bc_flags &= ~XFS_BTREE_STAGING;
+ cur->bc_tp = tp;
+}
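
/*
 * Editor's illustrative sketch, not part of the patch: a btree type's staging
 * constructor builds an ordinary cursor without an AG buffer and then swaps
 * in the fake root; after bulk loading, the caller logs the new root and
 * commits with xfs_btree_commit_afakeroot(). All example_* names are
 * hypothetical.
 */
struct xfs_btree_cur *example_foobt_init_common(struct xfs_mount *mp,
		struct xfs_trans *tp, struct xfs_perag *pag);	/* hypothetical */

struct xfs_btree_cur *
example_foobt_stage_cursor(
	struct xfs_mount	*mp,
	struct xbtree_afakeroot	*afake,
	struct xfs_perag	*pag)
{
	struct xfs_btree_cur	*cur;

	/* hypothetical common init that tolerates a NULL transaction/agbp */
	cur = example_foobt_init_common(mp, NULL, pag);
	xfs_btree_stage_afakeroot(cur, afake);
	return cur;
}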
+
+/*
+ * Bulk Loading for Inode-Rooted Btrees
+ * ====================================
+ *
+ * For a btree rooted in an inode fork, pass an xbtree_ifakeroot structure to
+ * the staging cursor. This structure should be initialized as follows:
+ *
+ * - if_fork_size field should be set to the number of bytes available to the
+ * fork in the inode.
+ *
+ * - if_fork should point to a freshly allocated struct xfs_ifork.
+ *
+ * - if_format should be set to the appropriate fork type (e.g.
+ * XFS_DINODE_FMT_BTREE).
+ *
+ * All other fields must be zero.
+ *
+ * The _stage_cursor() function for a specific btree type should call
+ * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
+ * cursor. The corresponding _commit_staged_btree() function should log the
+ * new root and call xfs_btree_commit_ifakeroot() to transform the staging
+ * cursor into a regular btree cursor.
+ */
+
+/*
+ * Initialize an inode-rooted btree cursor with the given inode btree fake
+ * root. The btree cursor's bc_ops will be overridden as needed to make the
+ * staging functionality work. If new_ops is not NULL, these new ops will be
+ * passed out to the caller for further overriding.
+ */
+void
+xfs_btree_stage_ifakeroot(
+ struct xfs_btree_cur *cur,
+ struct xbtree_ifakeroot *ifake,
+ struct xfs_btree_ops **new_ops)
+{
+ struct xfs_btree_ops *nops;
+
+ ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_tp == NULL);
+
+ nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
+ memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
+ nops->alloc_block = xfs_btree_fakeroot_alloc_block;
+ nops->free_block = xfs_btree_fakeroot_free_block;
+ nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
+ nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
+
+ cur->bc_ino.ifake = ifake;
+ cur->bc_nlevels = ifake->if_levels;
+ cur->bc_ops = nops;
+ cur->bc_flags |= XFS_BTREE_STAGING;
+
+ if (new_ops)
+ *new_ops = nops;
+}
+
+/*
+ * Transform an inode-rooted staging btree cursor back into a regular cursor by
+ * substituting a real btree root for the fake one and restoring normal btree
+ * cursor ops. The caller must log the btree root change prior to calling
+ * this.
+ */
+void
+xfs_btree_commit_ifakeroot(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ int whichfork,
+ const struct xfs_btree_ops *ops)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(cur->bc_tp == NULL);
+
+ trace_xfs_btree_commit_ifakeroot(cur);
+
+ kmem_free((void *)cur->bc_ops);
+ cur->bc_ino.ifake = NULL;
+ cur->bc_ino.whichfork = whichfork;
+ cur->bc_ops = ops;
+ cur->bc_flags &= ~XFS_BTREE_STAGING;
+ cur->bc_tp = tp;
+}
+
+/*
+ * Bulk Loading of Staged Btrees
+ * =============================
+ *
+ * This interface is used with a staged btree cursor to create a totally new
+ * btree with a large number of records (i.e. more than what would fit in a
+ * single root block). When the creation is complete, the new root can be
+ * linked atomically into the filesystem by committing the staged cursor.
+ *
+ * Creation of a new btree proceeds roughly as follows:
+ *
+ * The first step is to initialize an appropriate fake btree root structure and
+ * then construct a staged btree cursor. Refer to the block comments about
+ * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
+ * more information about how to do this.
+ *
+ * The second step is to initialize a struct xfs_btree_bload context as
+ * documented in the structure definition.
+ *
+ * The third step is to call xfs_btree_bload_compute_geometry to compute the
+ * height of and the number of blocks needed to construct the btree. See the
+ * section "Computing the Geometry of the New Btree" for details about this
+ * computation.
+ *
+ * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
+ * save them for later use by ->claim_block(). Bulk loading requires all
+ * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
+ * rebuild, and to minimize seek distances of the new btree.
+ *
+ * Step five is to call xfs_btree_bload() to start constructing the btree.
+ *
+ * The final step is to commit the staging btree cursor, which logs the new
+ * btree root and turns the staging cursor into a regular cursor. The caller
+ * is responsible for cleaning up the previous btree blocks, if any.
+ *
+ * Computing the Geometry of the New Btree
+ * =======================================
+ *
+ * The number of items placed in each btree block is computed via the following
+ * algorithm: For leaf levels, the number of items for the level is nr_records
+ * in the bload structure. For node levels, the number of items for the level
+ * is the number of blocks in the next lower level of the tree. For each
+ * level, the desired number of items per block is defined as:
+ *
+ * desired = max(minrecs, maxrecs - slack factor)
+ *
+ * The number of blocks for the level is defined to be:
+ *
+ * blocks = floor(nr_items / desired)
+ *
+ * Note this is rounded down so that the npb calculation below will never fall
+ * below minrecs. The number of items that will actually be loaded into each
+ * btree block is defined as:
+ *
+ * npb = nr_items / blocks
+ *
+ * Some of the leftmost blocks in the level will contain one extra record as
+ * needed to handle uneven division. If the number of records in any block
+ * would exceed maxrecs for that level, blocks is incremented and npb is
+ * recalculated.
+ *
+ * In other words, we compute the number of blocks needed to satisfy a given
+ * loading level, then spread the items as evenly as possible.
+ *
+ * The height and number of fs blocks required to create the btree are computed
+ * and returned via btree_height and nr_blocks.
+ */
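
/*
 * Editor's illustrative sketch, not part of the patch: the per-level fill
 * computation described above, restated as plain arithmetic. With
 * hypothetical values maxrecs = 250, minrecs = 125, slack = 10 and 10000
 * items, desired = 240, so 41 blocks are used and npb = 243; the leftmost 37
 * blocks absorb the remainder with one extra item each.
 */
static void
example_bload_level_geometry(
	unsigned int		maxrecs,
	unsigned int		minrecs,
	unsigned int		slack,
	uint64_t		nr_items,
	uint64_t		*nr_blocks,
	uint64_t		*npb)
{
	unsigned int		desired;

	desired = maxrecs - slack;
	if (desired < minrecs)
		desired = minrecs;

	*nr_blocks = nr_items / desired;	/* rounded down */
	if (*nr_blocks == 0)
		*nr_blocks = 1;			/* everything fits in one block */
	*npb = nr_items / *nr_blocks;

	/* Leftmost blocks take one extra item; bump blocks if that overflows. */
	if (nr_items % *nr_blocks && *npb + 1 > maxrecs) {
		(*nr_blocks)++;
		*npb = nr_items / *nr_blocks;
	}
}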
+
+/*
+ * Put a btree block that we're loading onto the ordered list and release it.
+ * The btree blocks will be written to disk when bulk loading is finished.
+ */
+static void
+xfs_btree_bload_drop_buf(
+ struct list_head *buffers_list,
+ struct xfs_buf **bpp)
+{
+ if (*bpp == NULL)
+ return;
+
+ if (!xfs_buf_delwri_queue(*bpp, buffers_list))
+ ASSERT(0);
+
+ xfs_buf_relse(*bpp);
+ *bpp = NULL;
+}
+
+/*
+ * Allocate and initialize one btree block for bulk loading.
+ *
+ * The new btree block will have its level and numrecs fields set to the values
+ * of the level and nr_this_block parameters, respectively.
+ *
+ * The caller should ensure that ptrp, bpp, and blockp refer to the left
+ * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp
+ * will all point to the new block.
+ */
+STATIC int
+xfs_btree_bload_prep_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ struct list_head *buffers_list,
+ unsigned int level,
+ unsigned int nr_this_block,
+ union xfs_btree_ptr *ptrp, /* in/out */
+ struct xfs_buf **bpp, /* in/out */
+ struct xfs_btree_block **blockp, /* in/out */
+ void *priv)
+{
+ union xfs_btree_ptr new_ptr;
+ struct xfs_buf *new_bp;
+ struct xfs_btree_block *new_block;
+ int ret;
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+ size_t new_size;
+
+ ASSERT(*bpp == NULL);
+
+ /* Allocate a new incore btree root block. */
+ new_size = bbl->iroot_size(cur, nr_this_block, priv);
+ ifp->if_broot = kmem_zalloc(new_size, 0);
+ ifp->if_broot_bytes = (int)new_size;
+
+ /* Initialize it and send it out. */
+ xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
+ XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
+ nr_this_block, cur->bc_ino.ip->i_ino,
+ cur->bc_flags);
+
+ *bpp = NULL;
+ *blockp = ifp->if_broot;
+ xfs_btree_set_ptr_null(cur, ptrp);
+ return 0;
+ }
+
+ /* Claim one of the caller's preallocated blocks. */
+ xfs_btree_set_ptr_null(cur, &new_ptr);
+ ret = bbl->claim_block(cur, &new_ptr, priv);
+ if (ret)
+ return ret;
+
+ ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));
+
+ ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
+ if (ret)
+ return ret;
+
+ /*
+ * The previous block (if any) is the left sibling of the new block,
+ * so set its right sibling pointer to the new block and drop it.
+ */
+ if (*blockp)
+ xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
+ xfs_btree_bload_drop_buf(buffers_list, bpp);
+
+ /* Initialize the new btree block. */
+ xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
+ xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);
+
+ /* Set the out parameters. */
+ *bpp = new_bp;
+ *blockp = new_block;
+ xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
+ return 0;
+}
+
+/* Load one leaf block. */
+STATIC int
+xfs_btree_bload_leaf(
+ struct xfs_btree_cur *cur,
+ unsigned int recs_this_block,
+ xfs_btree_bload_get_record_fn get_record,
+ struct xfs_btree_block *block,
+ void *priv)
+{
+ unsigned int j;
+ int ret;
+
+ /* Fill the leaf block with records. */
+ for (j = 1; j <= recs_this_block; j++) {
+ union xfs_btree_rec *block_rec;
+
+ ret = get_record(cur, priv);
+ if (ret)
+ return ret;
+ block_rec = xfs_btree_rec_addr(cur, j, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return 0;
+}
+
+/*
+ * Load one node block with key/ptr pairs.
+ *
+ * child_ptr must point to a block within the next level down in the tree. A
+ * key/ptr entry will be created in the new node block to the block pointed to
+ * by child_ptr. On exit, child_ptr points to the next block on the child
+ * level that needs processing.
+ */
+STATIC int
+xfs_btree_bload_node(
+ struct xfs_btree_cur *cur,
+ unsigned int recs_this_block,
+ union xfs_btree_ptr *child_ptr,
+ struct xfs_btree_block *block)
+{
+ unsigned int j;
+ int ret;
+
+ /* Fill the node block with keys and pointers. */
+ for (j = 1; j <= recs_this_block; j++) {
+ union xfs_btree_key child_key;
+ union xfs_btree_ptr *block_ptr;
+ union xfs_btree_key *block_key;
+ struct xfs_btree_block *child_block;
+ struct xfs_buf *child_bp;
+
+ ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));
+
+ ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block,
+ &child_bp);
+ if (ret)
+ return ret;
+
+ block_ptr = xfs_btree_ptr_addr(cur, j, block);
+ xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);
+
+ block_key = xfs_btree_key_addr(cur, j, block);
+ xfs_btree_get_keys(cur, child_block, &child_key);
+ xfs_btree_copy_keys(cur, block_key, &child_key, 1);
+
+ xfs_btree_get_sibling(cur, child_block, child_ptr,
+ XFS_BB_RIGHTSIB);
+ xfs_buf_relse(child_bp);
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the maximum number of records (or keyptrs) per block that we want to
+ * install at this level in the btree. Caller is responsible for having set
+ * @cur->bc_ino.forksize to the desired fork size, if appropriate.
+ */
+STATIC unsigned int
+xfs_btree_bload_max_npb(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ unsigned int level)
+{
+ unsigned int ret;
+
+ if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
+ return cur->bc_ops->get_dmaxrecs(cur, level);
+
+ ret = cur->bc_ops->get_maxrecs(cur, level);
+ if (level == 0)
+ ret -= bbl->leaf_slack;
+ else
+ ret -= bbl->node_slack;
+ return ret;
+}
+
+/*
+ * Compute the desired number of records (or keyptrs) per block that we want to
+ * install at this level in the btree, which must be somewhere between minrecs
+ * and max_npb. The caller is free to install fewer records per block.
+ */
+STATIC unsigned int
+xfs_btree_bload_desired_npb(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ unsigned int level)
+{
+ unsigned int npb = xfs_btree_bload_max_npb(cur, bbl, level);
+
+ /* Root blocks are not subject to minrecs rules. */
+ if (level == cur->bc_nlevels - 1)
+ return max(1U, npb);
+
+ return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
+}
+
+/*
+ * Compute the number of records to be stored in each block at this level and
+ * the number of blocks for this level. For leaf levels, we must populate an
+ * empty root block even if there are no records, so we have to have at least
+ * one block.
+ */
+STATIC void
+xfs_btree_bload_level_geometry(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ unsigned int level,
+ uint64_t nr_this_level,
+ unsigned int *avg_per_block,
+ uint64_t *blocks,
+ uint64_t *blocks_with_extra)
+{
+ uint64_t npb;
+ uint64_t dontcare;
+ unsigned int desired_npb;
+ unsigned int maxnr;
+
+ maxnr = cur->bc_ops->get_maxrecs(cur, level);
+
+ /*
+ * Compute the number of blocks we need to fill each block with the
+ * desired number of records/keyptrs per block. Because desired_npb
+ * could be minrecs, we use regular integer division (which rounds
+ * the block count down) so that in the next step the effective # of
+ * items per block will never be less than desired_npb.
+ */
+ desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
+ *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
+ *blocks = max(1ULL, *blocks);
+
+ /*
+ * Compute the number of records that we will actually put in each
+ * block, assuming that we want to spread the records evenly between
+ * the blocks. Take care that the effective # of items per block (npb)
+ * won't exceed maxrecs even for the blocks that get an extra record,
+ * since desired_npb could be maxrecs, and in the previous step we
+ * rounded the block count down.
+ */
+ npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
+ if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
+ (*blocks)++;
+ npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
+ }
+
+ *avg_per_block = min_t(uint64_t, npb, nr_this_level);
+
+ trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
+ *avg_per_block, desired_npb, *blocks,
+ *blocks_with_extra);
+}
+
+/*
+ * Ensure a slack value is appropriate for the btree.
+ *
+ * If the slack value is negative, set slack so that we fill the block to
+ * halfway between minrecs and maxrecs. Make sure the slack is never so large
+ * that we can underflow minrecs.
+ */
+static void
+xfs_btree_bload_ensure_slack(
+ struct xfs_btree_cur *cur,
+ int *slack,
+ int level)
+{
+ int maxr;
+ int minr;
+
+ maxr = cur->bc_ops->get_maxrecs(cur, level);
+ minr = cur->bc_ops->get_minrecs(cur, level);
+
+ /*
+ * If slack is negative, automatically set slack so that we load the
+ * btree block approximately halfway between minrecs and maxrecs.
+ * Generally, this will net us 75% loading.
+ */
+ if (*slack < 0)
+ *slack = maxr - ((maxr + minr) >> 1);
+
+ *slack = min(*slack, maxr - minr);
+}
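+
+/*
+ * Illustrative example (editorial): with maxrecs = 16 and minrecs = 8, a
+ * negative slack becomes 16 - ((16 + 8) >> 1) = 4, so each block is loaded
+ * with 16 - 4 = 12 of its 16 slots, i.e. 75% full.
+ */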
+
+/*
+ * Prepare a btree cursor for a bulk load operation by computing the geometry
+ * fields in bbl. Caller must ensure that the btree cursor is a staging
+ * cursor. This function can be called multiple times.
+ */
+int
+xfs_btree_bload_compute_geometry(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ uint64_t nr_records)
+{
+ uint64_t nr_blocks = 0;
+ uint64_t nr_this_level;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ /*
+ * Make sure that the slack values make sense for traditional leaf and
+ * node blocks. Inode-rooted btrees will return different minrecs and
+ * maxrecs values for the root block (level == bc_nlevels - 1).  We're
+ * checking levels 0 and 1 here, so set bc_nlevels such that the btree
+ * code doesn't interpret either as the root level.
+ */
+ cur->bc_nlevels = cur->bc_maxlevels - 1;
+ xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
+ xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
+
+ bbl->nr_records = nr_this_level = nr_records;
+ for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
+ uint64_t level_blocks;
+ uint64_t dontcare64;
+ unsigned int level = cur->bc_nlevels - 1;
+ unsigned int avg_per_block;
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &level_blocks, &dontcare64);
+
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ /*
+ * If all the items we want to store at this level
+ * would fit in the inode root block, then we have our
+ * btree root and are done.
+ *
+ * Note that bmap btrees forbid records in the root.
+ */
+ if (level != 0 && nr_this_level <= avg_per_block) {
+ nr_blocks++;
+ break;
+ }
+
+ /*
+ * Otherwise, we have to store all the items for this
+ * level in traditional btree blocks and therefore need
+ * another level of btree to point to those blocks.
+ *
+ * We have to re-compute the geometry for each level of
+ * an inode-rooted btree because the geometry differs
+ * between a btree root in an inode fork and a
+ * traditional btree block.
+ *
+ * This distinction is made in the btree code based on
+ * whether level == bc_nlevels - 1. Based on the
+ * previous root block size check against the root
+ * block geometry, we know that we aren't yet ready to
+ * populate the root.  Increment bc_nlevels and
+ * recalculate the geometry for a traditional
+ * block-based btree level.
+ */
+ cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
+ xfs_btree_bload_level_geometry(cur, bbl, level,
+ nr_this_level, &avg_per_block,
+ &level_blocks, &dontcare64);
+ } else {
+ /*
+ * If all the items we want to store at this level
+ * would fit in a single root block, we're done.
+ */
+ if (nr_this_level <= avg_per_block) {
+ nr_blocks++;
+ break;
+ }
+
+ /* Otherwise, we need another level of btree. */
+ cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
+ }
+
+ nr_blocks += level_blocks;
+ nr_this_level = level_blocks;
+ }
+
+ if (cur->bc_nlevels > cur->bc_maxlevels)
+ return -EOVERFLOW;
+
+ bbl->btree_height = cur->bc_nlevels;
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ bbl->nr_blocks = nr_blocks - 1;
+ else
+ bbl->nr_blocks = nr_blocks;
+ return 0;
+}
+
+/* Bulk load a btree given the parameters and geometry established in bbl. */
+int
+xfs_btree_bload(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ void *priv)
+{
+ struct list_head buffers_list;
+ union xfs_btree_ptr child_ptr;
+ union xfs_btree_ptr ptr;
+ struct xfs_buf *bp = NULL;
+ struct xfs_btree_block *block = NULL;
+ uint64_t nr_this_level = bbl->nr_records;
+ uint64_t blocks;
+ uint64_t i;
+ uint64_t blocks_with_extra;
+ uint64_t total_blocks = 0;
+ unsigned int avg_per_block;
+ unsigned int level = 0;
+ int ret;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ INIT_LIST_HEAD(&buffers_list);
+ cur->bc_nlevels = bbl->btree_height;
+ xfs_btree_set_ptr_null(cur, &child_ptr);
+ xfs_btree_set_ptr_null(cur, &ptr);
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &blocks, &blocks_with_extra);
+
+ /* Load each leaf block. */
+ for (i = 0; i < blocks; i++) {
+ unsigned int nr_this_block = avg_per_block;
+
+ /*
+ * Due to rounding, btree blocks will not be evenly populated
+ * in most cases. blocks_with_extra tells us how many blocks
+ * will receive an extra record to distribute the excess across
+ * the current level as evenly as possible.
+ */
+ if (i < blocks_with_extra)
+ nr_this_block++;
+
+ ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
+ nr_this_block, &ptr, &bp, &block, priv);
+ if (ret)
+ goto out;
+
+ trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
+ nr_this_block);
+
+ ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record,
+ block, priv);
+ if (ret)
+ goto out;
+
+ /*
+ * Record the leftmost leaf pointer so we know where to start
+ * with the first node level.
+ */
+ if (i == 0)
+ xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
+ }
+ total_blocks += blocks;
+ xfs_btree_bload_drop_buf(&buffers_list, &bp);
+
+ /* Populate the internal btree nodes. */
+ for (level = 1; level < cur->bc_nlevels; level++) {
+ union xfs_btree_ptr first_ptr;
+
+ nr_this_level = blocks;
+ block = NULL;
+ xfs_btree_set_ptr_null(cur, &ptr);
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &blocks, &blocks_with_extra);
+
+ /* Load each node block. */
+ for (i = 0; i < blocks; i++) {
+ unsigned int nr_this_block = avg_per_block;
+
+ if (i < blocks_with_extra)
+ nr_this_block++;
+
+ ret = xfs_btree_bload_prep_block(cur, bbl,
+ &buffers_list, level, nr_this_block,
+ &ptr, &bp, &block, priv);
+ if (ret)
+ goto out;
+
+ trace_xfs_btree_bload_block(cur, level, i, blocks,
+ &ptr, nr_this_block);
+
+ ret = xfs_btree_bload_node(cur, nr_this_block,
+ &child_ptr, block);
+ if (ret)
+ goto out;
+
+ /*
+ * Record the leftmost node pointer so that we know
+ * where to start the next node level above this one.
+ */
+ if (i == 0)
+ xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
+ }
+ total_blocks += blocks;
+ xfs_btree_bload_drop_buf(&buffers_list, &bp);
+ xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
+ }
+
+ /* Initialize the new root. */
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+ cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
+ cur->bc_ino.ifake->if_blocks = total_blocks - 1;
+ } else {
+ cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
+ cur->bc_ag.afake->af_levels = cur->bc_nlevels;
+ cur->bc_ag.afake->af_blocks = total_blocks;
+ }
+
+ /*
+ * Write the new blocks to disk. If the ordered list isn't empty after
+ * that, then something went wrong and we have to fail. This should
+ * never happen, but we'll check anyway.
+ */
+ ret = xfs_buf_delwri_submit(&buffers_list);
+ if (ret)
+ goto out;
+ if (!list_empty(&buffers_list)) {
+ ASSERT(list_empty(&buffers_list));
+ ret = -EIO;
+ }
+
+out:
+ xfs_buf_delwri_cancel(&buffers_list);
+ if (bp)
+ xfs_buf_relse(bp);
+ return ret;
+}
diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h
new file mode 100644
index 000000000..f0d297605
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_staging.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_BTREE_STAGING_H__
+#define __XFS_BTREE_STAGING_H__
+
+/* Fake root for an AG-rooted btree. */
+struct xbtree_afakeroot {
+ /* AG block number of the new btree root. */
+ xfs_agblock_t af_root;
+
+ /* Height of the new btree. */
+ unsigned int af_levels;
+
+ /* Number of blocks used by the btree. */
+ unsigned int af_blocks;
+};
+
+/* Cursor interactions with fake roots for AG-rooted btrees. */
+void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
+ struct xbtree_afakeroot *afake);
+void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
+ struct xfs_buf *agbp, const struct xfs_btree_ops *ops);
+
+/* Fake root for an inode-rooted btree. */
+struct xbtree_ifakeroot {
+ /* Fake inode fork. */
+ struct xfs_ifork *if_fork;
+
+ /* Number of blocks used by the btree. */
+ int64_t if_blocks;
+
+ /* Height of the new btree. */
+ unsigned int if_levels;
+
+ /* Number of bytes available for this fork in the inode. */
+ unsigned int if_fork_size;
+
+ /* Fork format. */
+ unsigned int if_format;
+
+ /* Number of records. */
+ unsigned int if_extents;
+};
+
+/* Cursor interactions with fake roots for inode-rooted btrees. */
+void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur,
+ struct xbtree_ifakeroot *ifake,
+ struct xfs_btree_ops **new_ops);
+void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
+ int whichfork, const struct xfs_btree_ops *ops);
+
+/* Bulk loading of staged btrees. */
+typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv);
+typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr, void *priv);
+typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur,
+ unsigned int nr_this_level, void *priv);
+
+struct xfs_btree_bload {
+ /*
+ * This function will be called nr_records times to load records into
+ * the btree. The function does this by setting the cursor's bc_rec
+ * field in in-core format. Records must be returned in sort order.
+ */
+ xfs_btree_bload_get_record_fn get_record;
+
+ /*
+ * This function will be called nr_blocks times to obtain a pointer
+ * to a new btree block on disk. Callers must preallocate all space
+ * for the new btree before calling xfs_btree_bload, and this function
+ * is what claims that reservation.
+ */
+ xfs_btree_bload_claim_block_fn claim_block;
+
+ /*
+ * This function should return the size of the in-core btree root
+ * block. It is only necessary for XFS_BTREE_ROOT_IN_INODE btree
+ * types.
+ */
+ xfs_btree_bload_iroot_size_fn iroot_size;
+
+ /*
+ * The caller should set this to the number of records that will be
+ * stored in the new btree.
+ */
+ uint64_t nr_records;
+
+ /*
+ * Number of free records to leave in each leaf block. If the caller
+ * sets this to -1, the slack value will be calculated to be halfway
+ * between maxrecs and minrecs. This typically leaves the block 75%
+ * full. Note that slack values are not enforced on inode root blocks.
+ */
+ int leaf_slack;
+
+ /*
+ * Number of free key/ptr pairs to leave in each node block.  This
+ * field has the same semantics as leaf_slack.
+ */
+ int node_slack;
+
+ /*
+ * The xfs_btree_bload_compute_geometry function will set this to the
+ * number of btree blocks needed to store nr_records records.
+ */
+ uint64_t nr_blocks;
+
+ /*
+ * The xfs_btree_bload_compute_geometry function will set this to the
+ * height of the new btree.
+ */
+ unsigned int btree_height;
+};
+
+int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl, uint64_t nr_records);
+int xfs_btree_bload(struct xfs_btree_cur *cur, struct xfs_btree_bload *bbl,
+ void *priv);
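+
+/*
+ * Usage sketch (editorial; error handling and transaction setup omitted,
+ * and my_get_record, my_claim_block, and the block preallocation step are
+ * caller-supplied placeholders):
+ *
+ *	struct xbtree_afakeroot afake = { 0 };
+ *	struct xfs_btree_bload bbl = {
+ *		.get_record	= my_get_record,
+ *		.claim_block	= my_claim_block,
+ *		.leaf_slack	= -1,
+ *		.node_slack	= -1,
+ *	};
+ *
+ *	xfs_btree_stage_afakeroot(cur, &afake);
+ *	error = xfs_btree_bload_compute_geometry(cur, &bbl, nr_records);
+ *	... preallocate bbl.nr_blocks blocks for my_claim_block() to claim ...
+ *	error = xfs_btree_bload(cur, &bbl, priv);
+ *	xfs_btree_commit_afakeroot(cur, tp, agbp, ops);
+ */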
+
+#endif /* __XFS_BTREE_STAGING_H__ */
diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
new file mode 100644
index 000000000..999a290cf
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_cksum.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XFS_CKSUM_H
+#define _XFS_CKSUM_H 1
+
+#define XFS_CRC_SEED (~(uint32_t)0)
+
+/*
+ * Calculate the intermediate checksum for a buffer that has the CRC field
+ * inside it.  The offset of the 32-bit crc field is passed as the
+ * cksum_offset parameter. We do not modify the buffer during verification,
+ * hence we have to split the CRC calculation across the cksum_offset.
+ */
+static inline uint32_t
+xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ uint32_t zero = 0;
+ uint32_t crc;
+
+ /* Calculate CRC up to the checksum. */
+ crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
+
+ /* Skip checksum field */
+ crc = crc32c(crc, &zero, sizeof(__u32));
+
+ /* Calculate the rest of the CRC. */
+ return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
+ length - (cksum_offset + sizeof(__be32)));
+}
+
+/*
+ * Fast CRC method where the buffer is modified. Callers must have exclusive
+ * access to the buffer while the calculation takes place.
+ */
+static inline uint32_t
+xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ /* zero the CRC field */
+ *(__le32 *)(buffer + cksum_offset) = 0;
+
+ /* single pass CRC calculation for the entire buffer */
+ return crc32c(XFS_CRC_SEED, buffer, length);
+}
+
+/*
+ * Convert the intermediate checksum to the final ondisk format.
+ *
+ * The CRC32c calculation uses LE format even on BE machines, but returns the
+ * result in host endian format. Hence we need to byte swap it back to LE format
+ * so that it is consistent on disk.
+ */
+static inline __le32
+xfs_end_cksum(uint32_t crc)
+{
+ return ~cpu_to_le32(crc);
+}
+
+/*
+ * Helper to generate the checksum for a buffer.
+ *
+ * This modifies the buffer temporarily - callers must have exclusive
+ * access to the buffer while the calculation takes place.
+ */
+static inline void
+xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset);
+
+ *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
+}
+
+/*
+ * Helper to verify the checksum for a buffer.
+ */
+static inline int
+xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset);
+
+ return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
+}
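+
+/*
+ * Typical usage (editorial note): a write verifier finalizes the buffer and
+ * then calls xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), crc_off),
+ * where crc_off is that block type's checksum offset, while the matching
+ * read verifier calls xfs_verify_cksum() over the same range and rejects
+ * the buffer if it returns false.
+ */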
+
+#endif /* _XFS_CKSUM_H */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
new file mode 100644
index 000000000..e576560b4
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -0,0 +1,2698 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+#include "xfs_errortag.h"
+
+/*
+ * xfs_da_btree.c
+ *
+ * Routines to implement directories as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_da3_root_split(xfs_da_state_t *state,
+ xfs_da_state_blk_t *existing_root,
+ xfs_da_state_blk_t *new_child);
+STATIC int xfs_da3_node_split(xfs_da_state_t *state,
+ xfs_da_state_blk_t *existing_blk,
+ xfs_da_state_blk_t *split_blk,
+ xfs_da_state_blk_t *blk_to_add,
+ int treelevel,
+ int *result);
+STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *node_blk_1,
+ xfs_da_state_blk_t *node_blk_2);
+STATIC void xfs_da3_node_add(xfs_da_state_t *state,
+ xfs_da_state_blk_t *old_node_blk,
+ xfs_da_state_blk_t *new_node_blk);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+STATIC int xfs_da3_root_join(xfs_da_state_t *state,
+ xfs_da_state_blk_t *root_blk);
+STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
+STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
+ xfs_da_state_blk_t *drop_blk);
+STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *src_node_blk,
+ xfs_da_state_blk_t *dst_node_blk);
+
+/*
+ * Utility routines.
+ */
+STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state,
+ xfs_da_state_blk_t *drop_blk,
+ xfs_da_state_blk_t *save_blk);
+
+
+struct kmem_cache *xfs_da_state_cache; /* anchor for dir/attr state */
+
+/*
+ * Allocate a dir-state structure.
+ * We don't put them on the stack since they're large.
+ */
+struct xfs_da_state *
+xfs_da_state_alloc(
+ struct xfs_da_args *args)
+{
+ struct xfs_da_state *state;
+
+ state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL);
+ state->args = args;
+ state->mp = args->dp->i_mount;
+ return state;
+}
+
+/*
+ * Kill the altpath contents of a da-state structure.
+ */
+STATIC void
+xfs_da_state_kill_altpath(xfs_da_state_t *state)
+{
+ int i;
+
+ for (i = 0; i < state->altpath.active; i++)
+ state->altpath.blk[i].bp = NULL;
+ state->altpath.active = 0;
+}
+
+/*
+ * Free a da-state structure.
+ */
+void
+xfs_da_state_free(xfs_da_state_t *state)
+{
+ xfs_da_state_kill_altpath(state);
+#ifdef DEBUG
+ memset((char *)state, 0, sizeof(*state));
+#endif /* DEBUG */
+ kmem_cache_free(xfs_da_state_cache, state);
+}
+
+void
+xfs_da_state_reset(
+ struct xfs_da_state *state,
+ struct xfs_da_args *args)
+{
+ xfs_da_state_kill_altpath(state);
+ memset(state, 0, sizeof(struct xfs_da_state));
+ state->args = args;
+ state->mp = state->args->dp->i_mount;
+}
+
+static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork)
+{
+ if (whichfork == XFS_DATA_FORK)
+ return mp->m_dir_geo->fsbcount;
+ return mp->m_attr_geo->fsbcount;
+}
+
+void
+xfs_da3_node_hdr_from_disk(
+ struct xfs_mount *mp,
+ struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from)
+{
+ if (xfs_has_crc(mp)) {
+ struct xfs_da3_intnode *from3 = (struct xfs_da3_intnode *)from;
+
+ to->forw = be32_to_cpu(from3->hdr.info.hdr.forw);
+ to->back = be32_to_cpu(from3->hdr.info.hdr.back);
+ to->magic = be16_to_cpu(from3->hdr.info.hdr.magic);
+ to->count = be16_to_cpu(from3->hdr.__count);
+ to->level = be16_to_cpu(from3->hdr.__level);
+ to->btree = from3->__btree;
+ ASSERT(to->magic == XFS_DA3_NODE_MAGIC);
+ } else {
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.__count);
+ to->level = be16_to_cpu(from->hdr.__level);
+ to->btree = from->__btree;
+ ASSERT(to->magic == XFS_DA_NODE_MAGIC);
+ }
+}
+
+void
+xfs_da3_node_hdr_to_disk(
+ struct xfs_mount *mp,
+ struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from)
+{
+ if (xfs_has_crc(mp)) {
+ struct xfs_da3_intnode *to3 = (struct xfs_da3_intnode *)to;
+
+ ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
+ to3->hdr.info.hdr.forw = cpu_to_be32(from->forw);
+ to3->hdr.info.hdr.back = cpu_to_be32(from->back);
+ to3->hdr.info.hdr.magic = cpu_to_be16(from->magic);
+ to3->hdr.__count = cpu_to_be16(from->count);
+ to3->hdr.__level = cpu_to_be16(from->level);
+ } else {
+ ASSERT(from->magic == XFS_DA_NODE_MAGIC);
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.__count = cpu_to_be16(from->count);
+ to->hdr.__level = cpu_to_be16(from->level);
+ }
+}
+
+/*
+ * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only
+ * accessible on v5 filesystems. This header format is common across da node,
+ * attr leaf and dir leaf blocks.
+ */
+xfs_failaddr_t
+xfs_da3_blkinfo_verify(
+ struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_da_blkinfo *hdr = &hdr3->hdr;
+
+ if (!xfs_verify_magic16(bp, hdr->magic))
+ return __this_address;
+
+ if (xfs_has_crc(mp)) {
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+static xfs_failaddr_t
+xfs_da3_node_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_da_intnode *hdr = bp->b_addr;
+ struct xfs_da3_icnode_hdr ichdr;
+ xfs_failaddr_t fa;
+
+ xfs_da3_node_hdr_from_disk(mp, &ichdr, hdr);
+
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
+
+ if (ichdr.level == 0)
+ return __this_address;
+ if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
+ return __this_address;
+ if (ichdr.count == 0)
+ return __this_address;
+
+ /*
+ * we don't know if the node is for an attribute or directory tree,
+ * so only fail if the count is outside both bounds
+ */
+ if (ichdr.count > mp->m_dir_geo->node_ents &&
+ ichdr.count > mp->m_attr_geo->node_ents)
+ return __this_address;
+
+ /* XXX: hash order check? */
+
+ return NULL;
+}
+
+static void
+xfs_da3_node_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_da3_node_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
+}
+
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
+static void
+xfs_da3_node_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA3_NODE_MAGIC:
+ if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
+ xfs_verifier_error(bp, -EFSBADCRC,
+ __this_address);
+ break;
+ }
+ fallthrough;
+ case XFS_DA_NODE_MAGIC:
+ fa = xfs_da3_node_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ default:
+ xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
+ break;
+ }
+}
+
+/* Verify the structure of a da3 block. */
+static xfs_failaddr_t
+xfs_da3_node_verify_struct(
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA3_NODE_MAGIC:
+ case XFS_DA_NODE_MAGIC:
+ return xfs_da3_node_verify(bp);
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ return bp->b_ops->verify_struct(bp);
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ return bp->b_ops->verify_struct(bp);
+ default:
+ return __this_address;
+ }
+}
+
+const struct xfs_buf_ops xfs_da3_node_buf_ops = {
+ .name = "xfs_da3_node",
+ .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC),
+ cpu_to_be16(XFS_DA3_NODE_MAGIC) },
+ .verify_read = xfs_da3_node_read_verify,
+ .verify_write = xfs_da3_node_write_verify,
+ .verify_struct = xfs_da3_node_verify_struct,
+};
+
+static int
+xfs_da3_node_set_type(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+ return 0;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_ATTR_LEAF_BUF);
+ return 0;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+ return 0;
+ default:
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, tp->t_mountp,
+ info, sizeof(*info));
+ xfs_trans_brelse(tp, bp);
+ return -EFSCORRUPTED;
+ }
+}
+
+int
+xfs_da3_node_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ struct xfs_buf **bpp,
+ int whichfork)
+{
+ int error;
+
+ error = xfs_da_read_buf(tp, dp, bno, 0, bpp, whichfork,
+ &xfs_da3_node_buf_ops);
+ if (error || !*bpp || !tp)
+ return error;
+ return xfs_da3_node_set_type(tp, *bpp);
+}
+
+int
+xfs_da3_node_read_mapped(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp,
+ int whichfork)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ int error;
+
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, mappedbno,
+ XFS_FSB_TO_BB(mp, xfs_dabuf_nfsb(mp, whichfork)), 0,
+ bpp, &xfs_da3_node_buf_ops);
+ if (error || !*bpp)
+ return error;
+
+ if (whichfork == XFS_ATTR_FORK)
+ xfs_buf_set_ref(*bpp, XFS_ATTR_BTREE_REF);
+ else
+ xfs_buf_set_ref(*bpp, XFS_DIR_BTREE_REF);
+
+ if (!tp)
+ return 0;
+ return xfs_da3_node_set_type(tp, *bpp);
+}
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of an intermediate node.
+ */
+int
+xfs_da3_node_create(
+ struct xfs_da_args *args,
+ xfs_dablk_t blkno,
+ int level,
+ struct xfs_buf **bpp,
+ int whichfork)
+{
+ struct xfs_da_intnode *node;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_da3_icnode_hdr ichdr = {0};
+ struct xfs_buf *bp;
+ int error;
+ struct xfs_inode *dp = args->dp;
+
+ trace_xfs_da_node_create(args);
+ ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
+
+ error = xfs_da_get_buf(tp, dp, blkno, &bp, whichfork);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+ node = bp->b_addr;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+ memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr));
+ ichdr.magic = XFS_DA3_NODE_MAGIC;
+ hdr3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
+ uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid);
+ } else {
+ ichdr.magic = XFS_DA_NODE_MAGIC;
+ }
+ ichdr.level = level;
+
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &ichdr);
+ xfs_trans_log_buf(tp, bp,
+ XFS_DA_LOGRANGE(node, &node->hdr, args->geo->node_hdr_size));
+
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Split a leaf node, rebalance, then possibly split
+ * intermediate nodes, rebalance, etc.
+ */
+int /* error */
+xfs_da3_split(
+ struct xfs_da_state *state)
+{
+ struct xfs_da_state_blk *oldblk;
+ struct xfs_da_state_blk *newblk;
+ struct xfs_da_state_blk *addblk;
+ struct xfs_da_intnode *node;
+ int max;
+ int action = 0;
+ int error;
+ int i;
+
+ trace_xfs_da_split(state->args);
+
+ if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
+ return -EIO;
+
+ /*
+ * Walk back up the tree splitting/inserting/adjusting as necessary.
+ * If we need to insert and there isn't room, split the node, then
+ * decide which fragment to insert the new block from below into.
+ * Note that we may split the root this way, but we need more fixup.
+ */
+ max = state->path.active - 1;
+ ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
+ ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
+ state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
+
+ addblk = &state->path.blk[max]; /* initial dummy value */
+ for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
+ oldblk = &state->path.blk[i];
+ newblk = &state->altpath.blk[i];
+
+ /*
+ * If a leaf node then
+ * Allocate a new leaf node, then rebalance across them.
+ * else if an intermediate node then
+ * We split on the last layer, must we split the node?
+ */
+ switch (oldblk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ error = xfs_attr3_leaf_split(state, oldblk, newblk);
+ if ((error != 0) && (error != -ENOSPC)) {
+ return error; /* GROT: attr is inconsistent */
+ }
+ if (!error) {
+ addblk = newblk;
+ break;
+ }
+ /*
+ * Entry wouldn't fit, split the leaf again. The new
+ * extrablk will be consumed by xfs_da3_node_split if
+ * the node is split.
+ */
+ state->extravalid = 1;
+ if (state->inleaf) {
+ state->extraafter = 0; /* before newblk */
+ trace_xfs_attr_leaf_split_before(state->args);
+ error = xfs_attr3_leaf_split(state, oldblk,
+ &state->extrablk);
+ } else {
+ state->extraafter = 1; /* after newblk */
+ trace_xfs_attr_leaf_split_after(state->args);
+ error = xfs_attr3_leaf_split(state, newblk,
+ &state->extrablk);
+ }
+ if (error)
+ return error; /* GROT: attr inconsistent */
+ addblk = newblk;
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ error = xfs_dir2_leafn_split(state, oldblk, newblk);
+ if (error)
+ return error;
+ addblk = newblk;
+ break;
+ case XFS_DA_NODE_MAGIC:
+ error = xfs_da3_node_split(state, oldblk, newblk, addblk,
+ max - i, &action);
+ addblk->bp = NULL;
+ if (error)
+ return error; /* GROT: dir is inconsistent */
+ /*
+ * Record the newly split block for the next time thru?
+ */
+ if (action)
+ addblk = newblk;
+ else
+ addblk = NULL;
+ break;
+ }
+
+ /*
+ * Update the btree to show the new hashval for this child.
+ */
+ xfs_da3_fixhashpath(state, &state->path);
+ }
+ if (!addblk)
+ return 0;
+
+ /*
+ * xfs_da3_node_split() should have consumed any extra blocks we added
+ * during a double leaf split in the attr fork. This is guaranteed as
+ * we can't be here if the attr fork only has a single leaf block.
+ */
+ ASSERT(state->extravalid == 0 ||
+ state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
+
+ /*
+ * Split the root node.
+ */
+ ASSERT(state->path.active == 0);
+ oldblk = &state->path.blk[0];
+ error = xfs_da3_root_split(state, oldblk, addblk);
+ if (error)
+ goto out;
+
+ /*
+ * Update pointers to the node which used to be block 0 and just got
+ * bumped because of the addition of a new root node. Note that the
+ * original block 0 could be at any position in the list of blocks in
+ * the tree.
+ *
+ * Note: the magic numbers and sibling pointers are in the same physical
+ * place for both v2 and v3 headers (by design). Hence it doesn't matter
+ * which version of the xfs_da_intnode structure we use here as the
+ * result will be the same using either structure.
+ */
+ node = oldblk->bp->b_addr;
+ if (node->hdr.info.forw) {
+ if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) {
+ xfs_buf_mark_corrupt(oldblk->bp);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ node = addblk->bp->b_addr;
+ node->hdr.info.back = cpu_to_be32(oldblk->blkno);
+ xfs_trans_log_buf(state->args->trans, addblk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr.info,
+ sizeof(node->hdr.info)));
+ }
+ node = oldblk->bp->b_addr;
+ if (node->hdr.info.back) {
+ if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) {
+ xfs_buf_mark_corrupt(oldblk->bp);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ node = addblk->bp->b_addr;
+ node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
+ xfs_trans_log_buf(state->args->trans, addblk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr.info,
+ sizeof(node->hdr.info)));
+ }
+out:
+ addblk->bp = NULL;
+ return error;
+}
+
+/*
+ * Split the root. We have to create a new root and point to the two
+ * parts (the split old root) that we just created. Copy block zero to
+ * the EOF, extending the inode in the process.
+ */
+STATIC int /* error */
+xfs_da3_root_split(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_da_state_blk *blk2)
+{
+ struct xfs_da_intnode *node;
+ struct xfs_da_intnode *oldroot;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_args *args;
+ struct xfs_buf *bp;
+ struct xfs_inode *dp;
+ struct xfs_trans *tp;
+ struct xfs_dir2_leaf *leaf;
+ xfs_dablk_t blkno;
+ int level;
+ int error;
+ int size;
+
+ trace_xfs_da_root_split(state->args);
+
+ /*
+ * Copy the existing (incorrect) block from the root node position
+ * to a free space somewhere.
+ */
+ args = state->args;
+ error = xfs_da_grow_inode(args, &blkno);
+ if (error)
+ return error;
+
+ dp = args->dp;
+ tp = args->trans;
+ error = xfs_da_get_buf(tp, dp, blkno, &bp, args->whichfork);
+ if (error)
+ return error;
+ node = bp->b_addr;
+ oldroot = blk1->bp->b_addr;
+ if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
+ struct xfs_da3_icnode_hdr icnodehdr;
+
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &icnodehdr, oldroot);
+ btree = icnodehdr.btree;
+ size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
+ level = icnodehdr.level;
+
+ /*
+ * we are about to copy oldroot to bp, so set up the type
+ * of bp while we know exactly what it will be.
+ */
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+ } else {
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ leaf = (xfs_dir2_leaf_t *)oldroot;
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+ size = (int)((char *)&leafhdr.ents[leafhdr.count] -
+ (char *)leaf);
+ level = 0;
+
+ /*
+ * we are about to copy oldroot to bp, so set up the type
+ * of bp while we know exactly what it will be.
+ */
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+ }
+
+ /*
+ * we can copy most of the information in the node from one block to
+ * another, but for CRC enabled headers we have to make sure that the
+ * block specific identifiers are kept intact. We update the buffer
+ * directly for this.
+ */
+ memcpy(node, oldroot, size);
+ if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+ oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+ struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
+
+ node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ }
+ xfs_trans_log_buf(tp, bp, 0, size - 1);
+
+ bp->b_ops = blk1->bp->b_ops;
+ xfs_trans_buf_copy_type(bp, blk1->bp);
+ blk1->bp = bp;
+ blk1->blkno = blkno;
+
+ /*
+ * Set up the new root node.
+ */
+ error = xfs_da3_node_create(args,
+ (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0,
+ level + 1, &bp, args->whichfork);
+ if (error)
+ return error;
+
+ node = bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
+ btree[0].hashval = cpu_to_be32(blk1->hashval);
+ btree[0].before = cpu_to_be32(blk1->blkno);
+ btree[1].hashval = cpu_to_be32(blk2->hashval);
+ btree[1].before = cpu_to_be32(blk2->blkno);
+ nodehdr.count = 2;
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr);
+
+#ifdef DEBUG
+ if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+ ASSERT(blk1->blkno >= args->geo->leafblk &&
+ blk1->blkno < args->geo->freeblk);
+ ASSERT(blk2->blkno >= args->geo->leafblk &&
+ blk2->blkno < args->geo->freeblk);
+ }
+#endif
+
+ /* Header is already logged by xfs_da_node_create */
+ xfs_trans_log_buf(tp, bp,
+ XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
+
+ return 0;
+}
+
+/*
+ * Split the node, rebalance, then add the new entry.
+ */
+STATIC int /* error */
+xfs_da3_node_split(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk,
+ struct xfs_da_state_blk *addblk,
+ int treelevel,
+ int *result)
+{
+ struct xfs_da_intnode *node;
+ struct xfs_da3_icnode_hdr nodehdr;
+ xfs_dablk_t blkno;
+ int newcount;
+ int error;
+ int useextra;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_split(state->args);
+
+ node = oldblk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+
+ /*
+ * With V2 dirs the extra block is data or freespace.
+ */
+ useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
+ newcount = 1 + useextra;
+ /*
+ * Do we have to split the node?
+ */
+ if (nodehdr.count + newcount > state->args->geo->node_ents) {
+ /*
+ * Allocate a new node, add to the doubly linked chain of
+ * nodes, then move some of our excess entries into it.
+ */
+ error = xfs_da_grow_inode(state->args, &blkno);
+ if (error)
+ return error; /* GROT: dir is inconsistent */
+
+ error = xfs_da3_node_create(state->args, blkno, treelevel,
+ &newblk->bp, state->args->whichfork);
+ if (error)
+ return error; /* GROT: dir is inconsistent */
+ newblk->blkno = blkno;
+ newblk->magic = XFS_DA_NODE_MAGIC;
+ xfs_da3_node_rebalance(state, oldblk, newblk);
+ error = xfs_da3_blk_link(state, oldblk, newblk);
+ if (error)
+ return error;
+ *result = 1;
+ } else {
+ *result = 0;
+ }
+
+ /*
+ * Insert the new entry(s) into the correct block
+ * (updating last hashval in the process).
+ *
+ * xfs_da3_node_add() inserts BEFORE the given index,
+ * and as a result of using node_lookup_int() we always
+ * point to a valid entry (not after one), but a split
+ * operation always results in a new block whose hashvals
+ * FOLLOW the current block.
+ *
+ * If we had double-split op below us, then add the extra block too.
+ */
+ node = oldblk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ if (oldblk->index <= nodehdr.count) {
+ oldblk->index++;
+ xfs_da3_node_add(state, oldblk, addblk);
+ if (useextra) {
+ if (state->extraafter)
+ oldblk->index++;
+ xfs_da3_node_add(state, oldblk, &state->extrablk);
+ state->extravalid = 0;
+ }
+ } else {
+ newblk->index++;
+ xfs_da3_node_add(state, newblk, addblk);
+ if (useextra) {
+ if (state->extraafter)
+ newblk->index++;
+ xfs_da3_node_add(state, newblk, &state->extrablk);
+ state->extravalid = 0;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Balance the btree elements between two intermediate nodes,
+ * usually one full and one empty.
+ *
+ * NOTE: if blk2 is empty, then it will get the upper half of blk1.
+ */
+STATIC void
+xfs_da3_node_rebalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_da_state_blk *blk2)
+{
+ struct xfs_da_intnode *node1;
+ struct xfs_da_intnode *node2;
+ struct xfs_da_node_entry *btree1;
+ struct xfs_da_node_entry *btree2;
+ struct xfs_da_node_entry *btree_s;
+ struct xfs_da_node_entry *btree_d;
+ struct xfs_da3_icnode_hdr nodehdr1;
+ struct xfs_da3_icnode_hdr nodehdr2;
+ struct xfs_trans *tp;
+ int count;
+ int tmp;
+ int swap = 0;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_rebalance(state->args);
+
+ node1 = blk1->bp->b_addr;
+ node2 = blk2->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2);
+ btree1 = nodehdr1.btree;
+ btree2 = nodehdr2.btree;
+
+ /*
+ * Figure out how many entries need to move, and in which direction.
+ * Swap the nodes around if that makes it simpler.
+ */
+ if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
+ ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+ (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
+ be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
+ swap(node1, node2);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2);
+ btree1 = nodehdr1.btree;
+ btree2 = nodehdr2.btree;
+ swap = 1;
+ }
+
+ count = (nodehdr1.count - nodehdr2.count) / 2;
+ if (count == 0)
+ return;
+ tp = state->args->trans;
+ /*
+ * Two cases: high-to-low and low-to-high.
+ */
+ if (count > 0) {
+ /*
+ * Move elements in node2 up to make a hole.
+ */
+ tmp = nodehdr2.count;
+ if (tmp > 0) {
+ tmp *= (uint)sizeof(xfs_da_node_entry_t);
+ btree_s = &btree2[0];
+ btree_d = &btree2[count];
+ memmove(btree_d, btree_s, tmp);
+ }
+
+ /*
+ * Move the req'd B-tree elements from high in node1 to
+ * low in node2.
+ */
+ nodehdr2.count += count;
+ tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+ btree_s = &btree1[nodehdr1.count - count];
+ btree_d = &btree2[0];
+ memcpy(btree_d, btree_s, tmp);
+ nodehdr1.count -= count;
+ } else {
+ /*
+ * Move the req'd B-tree elements from low in node2 to
+ * high in node1.
+ */
+ count = -count;
+ tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+ btree_s = &btree2[0];
+ btree_d = &btree1[nodehdr1.count];
+ memcpy(btree_d, btree_s, tmp);
+ nodehdr1.count += count;
+
+ xfs_trans_log_buf(tp, blk1->bp,
+ XFS_DA_LOGRANGE(node1, btree_d, tmp));
+
+ /*
+ * Move elements in node2 down to fill the hole.
+ */
+ tmp = nodehdr2.count - count;
+ tmp *= (uint)sizeof(xfs_da_node_entry_t);
+ btree_s = &btree2[count];
+ btree_d = &btree2[0];
+ memmove(btree_d, btree_s, tmp);
+ nodehdr2.count -= count;
+ }
+
+ /*
+ * Log header of node 1 and all current bits of node 2.
+ */
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node1, &nodehdr1);
+ xfs_trans_log_buf(tp, blk1->bp,
+ XFS_DA_LOGRANGE(node1, &node1->hdr,
+ state->args->geo->node_hdr_size));
+
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node2, &nodehdr2);
+ xfs_trans_log_buf(tp, blk2->bp,
+ XFS_DA_LOGRANGE(node2, &node2->hdr,
+ state->args->geo->node_hdr_size +
+ (sizeof(btree2[0]) * nodehdr2.count)));
+
+ /*
+ * Record the last hashval from each block for upward propagation.
+ * (note: don't use the swapped node pointers)
+ */
+ if (swap) {
+ node1 = blk1->bp->b_addr;
+ node2 = blk2->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2);
+ btree1 = nodehdr1.btree;
+ btree2 = nodehdr2.btree;
+ }
+ blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
+ blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
+
+ /*
+ * Adjust the expected index for insertion.
+ */
+ if (blk1->index >= nodehdr1.count) {
+ blk2->index = blk1->index - nodehdr1.count;
+ blk1->index = nodehdr1.count + 1; /* make it invalid */
+ }
+}
+
+/*
+ * Add a new entry to an intermediate node.
+ */
+STATIC void
+xfs_da3_node_add(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk)
+{
+ struct xfs_da_intnode *node;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_node_entry *btree;
+ int tmp;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_add(state->args);
+
+ node = oldblk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
+
+ ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
+ ASSERT(newblk->blkno != 0);
+ if (state->args->whichfork == XFS_DATA_FORK)
+ ASSERT(newblk->blkno >= state->args->geo->leafblk &&
+ newblk->blkno < state->args->geo->freeblk);
+
+ /*
+ * We may need to make some room before we insert the new node.
+ */
+ tmp = 0;
+ if (oldblk->index < nodehdr.count) {
+ tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree);
+ memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp);
+ }
+ btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval);
+ btree[oldblk->index].before = cpu_to_be32(newblk->blkno);
+ xfs_trans_log_buf(state->args->trans, oldblk->bp,
+ XFS_DA_LOGRANGE(node, &btree[oldblk->index],
+ tmp + sizeof(*btree)));
+
+ nodehdr.count += 1;
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr);
+ xfs_trans_log_buf(state->args->trans, oldblk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr,
+ state->args->geo->node_hdr_size));
+
+ /*
+ * Copy the last hash value from the oldblk to propagate upwards.
+ */
+ oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Deallocate an empty leaf node, remove it from its parent,
+ * possibly deallocating that block, etc...
+ */
+int
+xfs_da3_join(
+ struct xfs_da_state *state)
+{
+ struct xfs_da_state_blk *drop_blk;
+ struct xfs_da_state_blk *save_blk;
+ int action = 0;
+ int error;
+
+ trace_xfs_da_join(state->args);
+
+ drop_blk = &state->path.blk[ state->path.active-1 ];
+ save_blk = &state->altpath.blk[ state->path.active-1 ];
+ ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
+ ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
+ drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+
+ /*
+ * Walk back up the tree joining/deallocating as necessary.
+ * When we stop dropping blocks, break out.
+ */
+ for ( ; state->path.active >= 2; drop_blk--, save_blk--,
+ state->path.active--) {
+ /*
+ * See if we can combine the block with a neighbor.
+ * (action == 0) => no options, just leave
+ * (action == 1) => coalesce, then unlink
+ * (action == 2) => block empty, unlink it
+ */
+ switch (drop_blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ error = xfs_attr3_leaf_toosmall(state, &action);
+ if (error)
+ return error;
+ if (action == 0)
+ return 0;
+ xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ error = xfs_dir2_leafn_toosmall(state, &action);
+ if (error)
+ return error;
+ if (action == 0)
+ return 0;
+ xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
+ break;
+ case XFS_DA_NODE_MAGIC:
+ /*
+ * Remove the offending node, fixup hashvals,
+ * check for a toosmall neighbor.
+ */
+ xfs_da3_node_remove(state, drop_blk);
+ xfs_da3_fixhashpath(state, &state->path);
+ error = xfs_da3_node_toosmall(state, &action);
+ if (error)
+ return error;
+ if (action == 0)
+ return 0;
+ xfs_da3_node_unbalance(state, drop_blk, save_blk);
+ break;
+ }
+ xfs_da3_fixhashpath(state, &state->altpath);
+ error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
+ xfs_da_state_kill_altpath(state);
+ if (error)
+ return error;
+ error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
+ drop_blk->bp);
+ drop_blk->bp = NULL;
+ if (error)
+ return error;
+ }
+ /*
+ * We joined all the way to the top. If it turns out that
+ * we only have one entry in the root, make the child block
+ * the new root.
+ */
+ xfs_da3_node_remove(state, drop_blk);
+ xfs_da3_fixhashpath(state, &state->path);
+ error = xfs_da3_root_join(state, &state->path.blk[0]);
+ return error;
+}
+
+#ifdef DEBUG
+static void
+xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
+{
+ __be16 magic = blkinfo->magic;
+
+ if (level == 1) {
+ ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+ magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+ magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+ } else {
+ ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+ }
+ ASSERT(!blkinfo->forw);
+ ASSERT(!blkinfo->back);
+}
+#else /* !DEBUG */
+#define xfs_da_blkinfo_onlychild_validate(blkinfo, level)
+#endif /* !DEBUG */
+
+/*
+ * We have only one entry in the root. Copy the only remaining child of
+ * the old root to block 0 as the new root node.
+ */
+STATIC int
+xfs_da3_root_join(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *root_blk)
+{
+ struct xfs_da_intnode *oldroot;
+ struct xfs_da_args *args;
+ xfs_dablk_t child;
+ struct xfs_buf *bp;
+ struct xfs_da3_icnode_hdr oldroothdr;
+ int error;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_root_join(state->args);
+
+ ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
+
+ args = state->args;
+ oldroot = root_blk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &oldroothdr, oldroot);
+ ASSERT(oldroothdr.forw == 0);
+ ASSERT(oldroothdr.back == 0);
+
+ /*
+ * If the root has more than one child, then don't do anything.
+ */
+ if (oldroothdr.count > 1)
+ return 0;
+
+ /*
+ * Read in the (only) child block, then copy those bytes into
+ * the root block's buffer and free the original child block.
+ */
+ child = be32_to_cpu(oldroothdr.btree[0].before);
+ ASSERT(child != 0);
+ error = xfs_da3_node_read(args->trans, dp, child, &bp, args->whichfork);
+ if (error)
+ return error;
+ xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
+
+ /*
+ * This could be copying a leaf back into the root block in the case of
+ * there only being a single leaf block left in the tree. Hence we have
+ * to update the b_ops pointer as well to match the buffer type change
+ * that could occur. For dir3 blocks we also need to update the block
+ * number in the buffer header.
+ */
+ memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
+ root_blk->bp->b_ops = bp->b_ops;
+ xfs_trans_buf_copy_type(root_blk->bp, bp);
+ if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
+ struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
+ da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp));
+ }
+ xfs_trans_log_buf(args->trans, root_blk->bp, 0,
+ args->geo->blksize - 1);
+ error = xfs_da_shrink_inode(args, child, bp);
+ return error;
+}
+
+/*
+ * Check a node block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor. Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+STATIC int
+xfs_da3_node_toosmall(
+ struct xfs_da_state *state,
+ int *action)
+{
+ struct xfs_da_intnode *node;
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_blkinfo *info;
+ xfs_dablk_t blkno;
+ struct xfs_buf *bp;
+ struct xfs_da3_icnode_hdr nodehdr;
+ int count;
+ int forward;
+ int error;
+ int retval;
+ int i;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_toosmall(state->args);
+
+ /*
+ * Check for the degenerate case of the block being over 50% full.
+ * If so, it's not worth even looking to see if we might be able
+ * to coalesce with a sibling.
+ */
+ blk = &state->path.blk[ state->path.active-1 ];
+ info = blk->bp->b_addr;
+ node = (xfs_da_intnode_t *)info;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
+ *action = 0; /* blk over 50% full, don't try to join */
+ return 0;
+ }
+
+ /*
+ * Check for the degenerate case of the block being empty.
+ * If the block is empty, we'll simply delete it, no need to
+ * coalesce it with a sibling block. We choose (arbitrarily)
+ * to merge with the forward block unless it is NULL.
+ */
+ if (nodehdr.count == 0) {
+ /*
+ * Make altpath point to the block we want to keep and
+ * path point to the block we want to drop (this one).
+ */
+ forward = (info->forw != 0);
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
+ 0, &retval);
+ if (error)
+ return error;
+ if (retval) {
+ *action = 0;
+ } else {
+ *action = 2;
+ }
+ return 0;
+ }
+
+ /*
+ * Examine each sibling block to see if we can coalesce with
+ * at least 25% free space to spare. We need to figure out
+ * whether to merge with the forward or the backward block.
+ * We prefer coalescing with the lower numbered sibling so as
+ * to shrink a directory over time.
+ */
+ count = state->args->geo->node_ents;
+ count -= state->args->geo->node_ents >> 2;
+ count -= nodehdr.count;
+
+ /* start with smaller blk num */
+ forward = nodehdr.forw < nodehdr.back;
+ for (i = 0; i < 2; forward = !forward, i++) {
+ struct xfs_da3_icnode_hdr thdr;
+ if (forward)
+ blkno = nodehdr.forw;
+ else
+ blkno = nodehdr.back;
+ if (blkno == 0)
+ continue;
+ error = xfs_da3_node_read(state->args->trans, dp, blkno, &bp,
+ state->args->whichfork);
+ if (error)
+ return error;
+
+ node = bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &thdr, node);
+ xfs_trans_brelse(state->args->trans, bp);
+
+ if (count - thdr.count >= 0)
+ break; /* fits with at least 25% to spare */
+ }
+ if (i >= 2) {
+ *action = 0;
+ return 0;
+ }
+
+ /*
+ * Make altpath point to the block we want to keep (the lower
+ * numbered block) and path point to the block we want to drop.
+ */
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ if (blkno < blk->blkno) {
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
+ 0, &retval);
+ } else {
+ error = xfs_da3_path_shift(state, &state->path, forward,
+ 0, &retval);
+ }
+ if (error)
+ return error;
+ if (retval) {
+ *action = 0;
+ return 0;
+ }
+ *action = 1;
+ return 0;
+}
+
+/*
+ * Pick up the last hashvalue from an intermediate node.
+ */
+STATIC uint
+xfs_da3_node_lasthash(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp,
+ int *count)
+{
+ struct xfs_da3_icnode_hdr nodehdr;
+
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, bp->b_addr);
+ if (count)
+ *count = nodehdr.count;
+ if (!nodehdr.count)
+ return 0;
+ return be32_to_cpu(nodehdr.btree[nodehdr.count - 1].hashval);
+}
+
+/*
+ * Walk back up the tree adjusting hash values as necessary,
+ * when we stop making changes, return.
+ */
+void
+xfs_da3_fixhashpath(
+ struct xfs_da_state *state,
+ struct xfs_da_state_path *path)
+{
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ xfs_dahash_t lasthash=0;
+ int level;
+ int count;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_fixhashpath(state->args);
+
+ level = path->active-1;
+ blk = &path->blk[ level ];
+ switch (blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
+ if (count == 0)
+ return;
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ lasthash = xfs_dir2_leaf_lasthash(dp, blk->bp, &count);
+ if (count == 0)
+ return;
+ break;
+ case XFS_DA_NODE_MAGIC:
+ lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count);
+ if (count == 0)
+ return;
+ break;
+ }
+ for (blk--, level--; level >= 0; blk--, level--) {
+ struct xfs_da3_icnode_hdr nodehdr;
+
+ node = blk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
+ if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
+ break;
+ blk->hashval = lasthash;
+ btree[blk->index].hashval = cpu_to_be32(lasthash);
+ xfs_trans_log_buf(state->args->trans, blk->bp,
+ XFS_DA_LOGRANGE(node, &btree[blk->index],
+ sizeof(*btree)));
+
+ lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+ }
+}
+
+/*
+ * Remove an entry from an intermediate node.
+ */
+STATIC void
+xfs_da3_node_remove(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk)
+{
+ struct xfs_da_intnode *node;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_node_entry *btree;
+ int index;
+ int tmp;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_remove(state->args);
+
+ node = drop_blk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ ASSERT(drop_blk->index < nodehdr.count);
+ ASSERT(drop_blk->index >= 0);
+
+ /*
+ * Copy over the offending entry, or just zero it out.
+ */
+ index = drop_blk->index;
+ btree = nodehdr.btree;
+ if (index < nodehdr.count - 1) {
+ tmp = nodehdr.count - index - 1;
+ tmp *= (uint)sizeof(xfs_da_node_entry_t);
+ memmove(&btree[index], &btree[index + 1], tmp);
+ xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+ XFS_DA_LOGRANGE(node, &btree[index], tmp));
+ index = nodehdr.count - 1;
+ }
+ memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
+ xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+ XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
+ nodehdr.count -= 1;
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr);
+ xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr, state->args->geo->node_hdr_size));
+
+ /*
+ * Copy the last hash value from the block to propagate upwards.
+ */
+ drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
+}
+
+/*
+ * Unbalance the elements between two intermediate nodes,
+ * move all Btree elements from one node into another.
+ */
+STATIC void
+xfs_da3_node_unbalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk)
+{
+ struct xfs_da_intnode *drop_node;
+ struct xfs_da_intnode *save_node;
+ struct xfs_da_node_entry *drop_btree;
+ struct xfs_da_node_entry *save_btree;
+ struct xfs_da3_icnode_hdr drop_hdr;
+ struct xfs_da3_icnode_hdr save_hdr;
+ struct xfs_trans *tp;
+ int sindex;
+ int tmp;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_unbalance(state->args);
+
+ drop_node = drop_blk->bp->b_addr;
+ save_node = save_blk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &drop_hdr, drop_node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &save_hdr, save_node);
+ drop_btree = drop_hdr.btree;
+ save_btree = save_hdr.btree;
+ tp = state->args->trans;
+
+ /*
+ * If the dying block has lower hashvals, then move all the
+ * elements in the remaining block up to make a hole.
+ */
+ if ((be32_to_cpu(drop_btree[0].hashval) <
+ be32_to_cpu(save_btree[0].hashval)) ||
+ (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
+ be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) {
+ /* XXX: check this - is memmove dst correct? */
+ tmp = save_hdr.count * sizeof(xfs_da_node_entry_t);
+ memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp);
+
+ sindex = 0;
+ xfs_trans_log_buf(tp, save_blk->bp,
+ XFS_DA_LOGRANGE(save_node, &save_btree[0],
+ (save_hdr.count + drop_hdr.count) *
+ sizeof(xfs_da_node_entry_t)));
+ } else {
+ sindex = save_hdr.count;
+ xfs_trans_log_buf(tp, save_blk->bp,
+ XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
+ drop_hdr.count * sizeof(xfs_da_node_entry_t)));
+ }
+
+ /*
+ * Move all the B-tree elements from drop_blk to save_blk.
+ */
+ tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
+ memcpy(&save_btree[sindex], &drop_btree[0], tmp);
+ save_hdr.count += drop_hdr.count;
+
+ xfs_da3_node_hdr_to_disk(dp->i_mount, save_node, &save_hdr);
+ xfs_trans_log_buf(tp, save_blk->bp,
+ XFS_DA_LOGRANGE(save_node, &save_node->hdr,
+ state->args->geo->node_hdr_size));
+
+ /*
+ * Save the last hashval in the remaining block for upward propagation.
+ */
+ save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Walk down the Btree looking for a particular filename, filling
+ * in the state structure as we go.
+ *
+ * We will set the state structure to point to each of the elements
+ * in each of the nodes where either the hashval is or should be.
+ *
+ * We support duplicate hashval's so for each entry in the current
+ * node that could contain the desired hashval, descend. This is a
+ * pruned depth-first tree search.
+ */
+int /* error */
+xfs_da3_node_lookup_int(
+ struct xfs_da_state *state,
+ int *result)
+{
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_blkinfo *curr;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_args *args;
+ xfs_dablk_t blkno;
+ xfs_dahash_t hashval;
+ xfs_dahash_t btreehashval;
+ int probe;
+ int span;
+ int max;
+ int error;
+ int retval;
+ unsigned int expected_level = 0;
+ uint16_t magic;
+ struct xfs_inode *dp = state->args->dp;
+
+ args = state->args;
+
+ /*
+ * Descend thru the B-tree searching each level for the right
+ * node to use, until the right hashval is found.
+ */
+ blkno = args->geo->leafblk;
+ for (blk = &state->path.blk[0], state->path.active = 1;
+ state->path.active <= XFS_DA_NODE_MAXDEPTH;
+ blk++, state->path.active++) {
+ /*
+ * Read the next node down in the tree.
+ */
+ blk->blkno = blkno;
+ error = xfs_da3_node_read(args->trans, args->dp, blkno,
+ &blk->bp, args->whichfork);
+ if (error) {
+ blk->blkno = 0;
+ state->path.active--;
+ return error;
+ }
+ curr = blk->bp->b_addr;
+ magic = be16_to_cpu(curr->magic);
+
+ if (magic == XFS_ATTR_LEAF_MAGIC ||
+ magic == XFS_ATTR3_LEAF_MAGIC) {
+ blk->magic = XFS_ATTR_LEAF_MAGIC;
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+ break;
+ }
+
+ if (magic == XFS_DIR2_LEAFN_MAGIC ||
+ magic == XFS_DIR3_LEAFN_MAGIC) {
+ blk->magic = XFS_DIR2_LEAFN_MAGIC;
+ blk->hashval = xfs_dir2_leaf_lasthash(args->dp,
+ blk->bp, NULL);
+ break;
+ }
+
+ if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) {
+ xfs_buf_mark_corrupt(blk->bp);
+ return -EFSCORRUPTED;
+ }
+
+ blk->magic = XFS_DA_NODE_MAGIC;
+
+ /*
+ * Search an intermediate node for a match.
+ */
+ node = blk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
+
+ /* Tree taller than we can handle; bail out! */
+ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) {
+ xfs_buf_mark_corrupt(blk->bp);
+ return -EFSCORRUPTED;
+ }
+
+ /* Check the level from the root. */
+ if (blkno == args->geo->leafblk)
+ expected_level = nodehdr.level - 1;
+ else if (expected_level != nodehdr.level) {
+ xfs_buf_mark_corrupt(blk->bp);
+ return -EFSCORRUPTED;
+ } else
+ expected_level--;
+
+ max = nodehdr.count;
+ blk->hashval = be32_to_cpu(btree[max - 1].hashval);
+
+ /*
+ * Binary search. (note: small blocks will skip loop)
+ */
+ probe = span = max / 2;
+ hashval = args->hashval;
+ while (span > 4) {
+ span /= 2;
+ btreehashval = be32_to_cpu(btree[probe].hashval);
+ if (btreehashval < hashval)
+ probe += span;
+ else if (btreehashval > hashval)
+ probe -= span;
+ else
+ break;
+ }
+ ASSERT((probe >= 0) && (probe < max));
+ ASSERT((span <= 4) ||
+ (be32_to_cpu(btree[probe].hashval) == hashval));
+
+ /*
+ * Since we may have duplicate hashval's, find the first
+ * matching hashval in the node.
+ */
+ while (probe > 0 &&
+ be32_to_cpu(btree[probe].hashval) >= hashval) {
+ probe--;
+ }
+ while (probe < max &&
+ be32_to_cpu(btree[probe].hashval) < hashval) {
+ probe++;
+ }
+
+ /*
+ * Pick the right block to descend on.
+ */
+ if (probe == max) {
+ blk->index = max - 1;
+ blkno = be32_to_cpu(btree[max - 1].before);
+ } else {
+ blk->index = probe;
+ blkno = be32_to_cpu(btree[probe].before);
+ }
+
+ /* We can't point back to the root. */
+ if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk))
+ return -EFSCORRUPTED;
+ }
+
+ if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0))
+ return -EFSCORRUPTED;
+
+ /*
+ * A leaf block that ends in the hashval that we are interested in
+ * (final hashval == search hashval) means that the next block may
+ * contain more entries with the same hashval, shift upward to the
+ * next leaf and keep searching.
+ */
+ for (;;) {
+ if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
+ retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
+ &blk->index, state);
+ } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+ retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
+ blk->index = args->index;
+ args->blkno = blk->blkno;
+ } else {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+ if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
+ (blk->hashval == args->hashval)) {
+ error = xfs_da3_path_shift(state, &state->path, 1, 1,
+ &retval);
+ if (error)
+ return error;
+ if (retval == 0) {
+ continue;
+ } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+ /* path_shift() gives ENOENT */
+ retval = -ENOATTR;
+ }
+ }
+ break;
+ }
+ *result = retval;
+ return 0;
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Compare two intermediate nodes for "order".
+ */
+STATIC int
+xfs_da3_node_order(
+ struct xfs_inode *dp,
+ struct xfs_buf *node1_bp,
+ struct xfs_buf *node2_bp)
+{
+ struct xfs_da_intnode *node1;
+ struct xfs_da_intnode *node2;
+ struct xfs_da_node_entry *btree1;
+ struct xfs_da_node_entry *btree2;
+ struct xfs_da3_icnode_hdr node1hdr;
+ struct xfs_da3_icnode_hdr node2hdr;
+
+ node1 = node1_bp->b_addr;
+ node2 = node2_bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &node1hdr, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &node2hdr, node2);
+ btree1 = node1hdr.btree;
+ btree2 = node2hdr.btree;
+
+ if (node1hdr.count > 0 && node2hdr.count > 0 &&
+ ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+ (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
+ be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) {
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Link a new block into a doubly linked list of blocks (of whatever type).
+ */
+int /* error */
+xfs_da3_blk_link(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *old_blk,
+ struct xfs_da_state_blk *new_blk)
+{
+ struct xfs_da_blkinfo *old_info;
+ struct xfs_da_blkinfo *new_info;
+ struct xfs_da_blkinfo *tmp_info;
+ struct xfs_da_args *args;
+ struct xfs_buf *bp;
+ int before = 0;
+ int error;
+ struct xfs_inode *dp = state->args->dp;
+
+ /*
+ * Set up environment.
+ */
+ args = state->args;
+ ASSERT(args != NULL);
+ old_info = old_blk->bp->b_addr;
+ new_info = new_blk->bp->b_addr;
+ ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
+ old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+ old_blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+ switch (old_blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp);
+ break;
+ case XFS_DA_NODE_MAGIC:
+ before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp);
+ break;
+ }
+
+ /*
+ * Link blocks in appropriate order.
+ */
+ if (before) {
+ /*
+ * Link new block in before existing block.
+ */
+ trace_xfs_da_link_before(args);
+ new_info->forw = cpu_to_be32(old_blk->blkno);
+ new_info->back = old_info->back;
+ if (old_info->back) {
+ error = xfs_da3_node_read(args->trans, dp,
+ be32_to_cpu(old_info->back),
+ &bp, args->whichfork);
+ if (error)
+ return error;
+ ASSERT(bp != NULL);
+ tmp_info = bp->b_addr;
+ ASSERT(tmp_info->magic == old_info->magic);
+ ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
+ tmp_info->forw = cpu_to_be32(new_blk->blkno);
+ xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+ }
+ old_info->back = cpu_to_be32(new_blk->blkno);
+ } else {
+ /*
+ * Link new block in after existing block.
+ */
+ trace_xfs_da_link_after(args);
+ new_info->forw = old_info->forw;
+ new_info->back = cpu_to_be32(old_blk->blkno);
+ if (old_info->forw) {
+ error = xfs_da3_node_read(args->trans, dp,
+ be32_to_cpu(old_info->forw),
+ &bp, args->whichfork);
+ if (error)
+ return error;
+ ASSERT(bp != NULL);
+ tmp_info = bp->b_addr;
+ ASSERT(tmp_info->magic == old_info->magic);
+ ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
+ tmp_info->back = cpu_to_be32(new_blk->blkno);
+ xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+ }
+ old_info->forw = cpu_to_be32(new_blk->blkno);
+ }
+
+ xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
+ xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
+ return 0;
+}
+
+/*
+ * Unlink a block from a doubly linked list of blocks.
+ */
+STATIC int /* error */
+xfs_da3_blk_unlink(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk)
+{
+ struct xfs_da_blkinfo *drop_info;
+ struct xfs_da_blkinfo *save_info;
+ struct xfs_da_blkinfo *tmp_info;
+ struct xfs_da_args *args;
+ struct xfs_buf *bp;
+ int error;
+
+ /*
+ * Set up environment.
+ */
+ args = state->args;
+ ASSERT(args != NULL);
+ save_info = save_blk->bp->b_addr;
+ drop_info = drop_blk->bp->b_addr;
+ ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
+ save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+ save_blk->magic == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(save_blk->magic == drop_blk->magic);
+ ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
+ (be32_to_cpu(save_info->back) == drop_blk->blkno));
+ ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) ||
+ (be32_to_cpu(drop_info->back) == save_blk->blkno));
+
+ /*
+ * Unlink the leaf block from the doubly linked chain of leaves.
+ */
+ if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
+ trace_xfs_da_unlink_back(args);
+ save_info->back = drop_info->back;
+ if (drop_info->back) {
+ error = xfs_da3_node_read(args->trans, args->dp,
+ be32_to_cpu(drop_info->back),
+ &bp, args->whichfork);
+ if (error)
+ return error;
+ ASSERT(bp != NULL);
+ tmp_info = bp->b_addr;
+ ASSERT(tmp_info->magic == save_info->magic);
+ ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
+ tmp_info->forw = cpu_to_be32(save_blk->blkno);
+ xfs_trans_log_buf(args->trans, bp, 0,
+ sizeof(*tmp_info) - 1);
+ }
+ } else {
+ trace_xfs_da_unlink_forward(args);
+ save_info->forw = drop_info->forw;
+ if (drop_info->forw) {
+ error = xfs_da3_node_read(args->trans, args->dp,
+ be32_to_cpu(drop_info->forw),
+ &bp, args->whichfork);
+ if (error)
+ return error;
+ ASSERT(bp != NULL);
+ tmp_info = bp->b_addr;
+ ASSERT(tmp_info->magic == save_info->magic);
+ ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
+ tmp_info->back = cpu_to_be32(save_blk->blkno);
+ xfs_trans_log_buf(args->trans, bp, 0,
+ sizeof(*tmp_info) - 1);
+ }
+ }
+
+ xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
+ return 0;
+}
+
+/*
+ * Move a path "forward" or "!forward" one block at the current level.
+ *
+ * This routine will adjust a "path" to point to the next block
+ * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
+ * Btree, including updating pointers to the intermediate nodes between
+ * the new bottom and the root.
+ */
+int /* error */
+xfs_da3_path_shift(
+ struct xfs_da_state *state,
+ struct xfs_da_state_path *path,
+ int forward,
+ int release,
+ int *result)
+{
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_blkinfo *info;
+ struct xfs_da_args *args;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_buf *bp;
+ xfs_dablk_t blkno = 0;
+ int level;
+ int error;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_path_shift(state->args);
+
+ /*
+ * Roll up the Btree looking for the first block where our
+ * current index is not at the edge of the block. Note that
+ * we skip the bottom layer because we want the sibling block.
+ */
+ args = state->args;
+ ASSERT(args != NULL);
+ ASSERT(path != NULL);
+ ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ level = (path->active-1) - 1; /* skip bottom layer in path */
+ for (; level >= 0; level--) {
+ blk = &path->blk[level];
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr,
+ blk->bp->b_addr);
+
+ if (forward && (blk->index < nodehdr.count - 1)) {
+ blk->index++;
+ blkno = be32_to_cpu(nodehdr.btree[blk->index].before);
+ break;
+ } else if (!forward && (blk->index > 0)) {
+ blk->index--;
+ blkno = be32_to_cpu(nodehdr.btree[blk->index].before);
+ break;
+ }
+ }
+ if (level < 0) {
+ *result = -ENOENT; /* we're out of our tree */
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ return 0;
+ }
+
+ /*
+ * Roll down the edge of the subtree until we reach the
+ * same depth we were at originally.
+ */
+ for (blk++, level++; level < path->active; blk++, level++) {
+ /*
+ * Read the next child block into a local buffer.
+ */
+ error = xfs_da3_node_read(args->trans, dp, blkno, &bp,
+ args->whichfork);
+ if (error)
+ return error;
+
+ /*
+ * Release the old block (if it's dirty, the trans doesn't
+ * actually let go) and swap the local buffer into the path
+ * structure. This ensures failure of the above read doesn't set
+ * a NULL buffer in an active slot in the path.
+ */
+ if (release)
+ xfs_trans_brelse(args->trans, blk->bp);
+ blk->blkno = blkno;
+ blk->bp = bp;
+
+ info = blk->bp->b_addr;
+ ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+ /*
+ * Note: we flatten the magic number to a single type so we
+ * don't have to compare against crc/non-crc types elsewhere.
+ */
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ blk->magic = XFS_DA_NODE_MAGIC;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr,
+ bp->b_addr);
+ btree = nodehdr.btree;
+ blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+ if (forward)
+ blk->index = 0;
+ else
+ blk->index = nodehdr.count - 1;
+ blkno = be32_to_cpu(btree[blk->index].before);
+ break;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ blk->magic = XFS_ATTR_LEAF_MAGIC;
+ ASSERT(level == path->active-1);
+ blk->index = 0;
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ blk->magic = XFS_DIR2_LEAFN_MAGIC;
+ ASSERT(level == path->active-1);
+ blk->index = 0;
+ blk->hashval = xfs_dir2_leaf_lasthash(args->dp,
+ blk->bp, NULL);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+ }
+ *result = 0;
+ return 0;
+}
+
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Implement a simple hash on a character string.
+ * Rotate the hash value by 7 bits, then XOR each character in.
+ * This is implemented with some source-level loop unrolling.
+ */
+xfs_dahash_t
+xfs_da_hashname(const uint8_t *name, int namelen)
+{
+ xfs_dahash_t hash;
+
+ /*
+ * Do four characters at a time as long as we can.
+ */
+ for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
+ hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
+ (name[3] << 0) ^ rol32(hash, 7 * 4);
+
+ /*
+ * Now do the rest of the characters.
+ */
+ switch (namelen) {
+ case 3:
+ return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
+ rol32(hash, 7 * 3);
+ case 2:
+ return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
+ case 1:
+ return (name[0] << 0) ^ rol32(hash, 7 * 1);
+ default: /* case 0: */
+ return hash;
+ }
+}
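+
+/*
+ * Usage sketch (illustrative, not a definitive call site): callers seed a
+ * lookup by hashing the name into the lookup arguments, roughly
+ *
+ * args->hashval = xfs_da_hashname(args->name, args->namelen);
+ *
+ * Working the four-byte step above by hand for the name "data":
+ * ('d' << 21) ^ ('a' << 14) ^ ('t' << 7) ^ 'a' ^ rol32(0, 7 * 4)
+ * = 0x0c800000 ^ 0x00184000 ^ 0x00003a00 ^ 0x00000061 = 0x0c987a61
+ */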
+
+enum xfs_dacmp
+xfs_da_compname(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
+ XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
+}
+
+int
+xfs_da_grow_inode_int(
+ struct xfs_da_args *args,
+ xfs_fileoff_t *bno,
+ int count)
+{
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ int w = args->whichfork;
+ xfs_rfsblock_t nblks = dp->i_nblocks;
+ struct xfs_bmbt_irec map, *mapp;
+ int nmap, error, got, i, mapi;
+
+ /*
+ * Find a spot in the file space to put the new block.
+ */
+ error = xfs_bmap_first_unused(tp, dp, count, bno, w);
+ if (error)
+ return error;
+
+ /*
+ * Try mapping it in one filesystem block.
+ */
+ nmap = 1;
+ error = xfs_bmapi_write(tp, dp, *bno, count,
+ xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
+ args->total, &map, &nmap);
+ if (error)
+ return error;
+
+ ASSERT(nmap <= 1);
+ if (nmap == 1) {
+ mapp = &map;
+ mapi = 1;
+ } else if (nmap == 0 && count > 1) {
+ xfs_fileoff_t b;
+ int c;
+
+ /*
+ * If we didn't get it and the block might work if fragmented,
+ * try without the CONTIG flag. Loop until we get it all.
+ */
+ mapp = kmem_alloc(sizeof(*mapp) * count, 0);
+ for (b = *bno, mapi = 0; b < *bno + count; ) {
+ c = (int)(*bno + count - b);
+ nmap = min(XFS_BMAP_MAX_NMAP, c);
+ error = xfs_bmapi_write(tp, dp, b, c,
+ xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+ args->total, &mapp[mapi], &nmap);
+ if (error)
+ goto out_free_map;
+ if (nmap < 1)
+ break;
+ mapi += nmap;
+ b = mapp[mapi - 1].br_startoff +
+ mapp[mapi - 1].br_blockcount;
+ }
+ } else {
+ mapi = 0;
+ mapp = NULL;
+ }
+
+ /*
+ * Count the blocks we got, make sure it matches the total.
+ */
+ for (i = 0, got = 0; i < mapi; i++)
+ got += mapp[i].br_blockcount;
+ if (got != count || mapp[0].br_startoff != *bno ||
+ mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
+ *bno + count) {
+ error = -ENOSPC;
+ goto out_free_map;
+ }
+
+ /* account for newly allocated blocks in reserved blocks total */
+ args->total -= dp->i_nblocks - nblks;
+
+out_free_map:
+ if (mapp != &map)
+ kmem_free(mapp);
+ return error;
+}
+
+/*
+ * Add a block to the btree ahead of the file.
+ * Return the new block number to the caller.
+ */
+int
+xfs_da_grow_inode(
+ struct xfs_da_args *args,
+ xfs_dablk_t *new_blkno)
+{
+ xfs_fileoff_t bno;
+ int error;
+
+ trace_xfs_da_grow_inode(args);
+
+ bno = args->geo->leafblk;
+ error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount);
+ if (!error)
+ *new_blkno = (xfs_dablk_t)bno;
+ return error;
+}
+
+/*
+ * Ick. We need to always be able to remove a btree block, even
+ * if there's no space reservation because the filesystem is full.
+ * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
+ * It swaps the target block with the last block in the file. The
+ * last block in the file can always be removed since removing it
+ * can't cause a bmap btree split.
+ */
+STATIC int
+xfs_da3_swap_lastblock(
+ struct xfs_da_args *args,
+ xfs_dablk_t *dead_blknop,
+ struct xfs_buf **dead_bufp)
+{
+ struct xfs_da_blkinfo *dead_info;
+ struct xfs_da_blkinfo *sib_info;
+ struct xfs_da_intnode *par_node;
+ struct xfs_da_intnode *dead_node;
+ struct xfs_dir2_leaf *dead_leaf2;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr par_hdr;
+ struct xfs_inode *dp;
+ struct xfs_trans *tp;
+ struct xfs_mount *mp;
+ struct xfs_buf *dead_buf;
+ struct xfs_buf *last_buf;
+ struct xfs_buf *sib_buf;
+ struct xfs_buf *par_buf;
+ xfs_dahash_t dead_hash;
+ xfs_fileoff_t lastoff;
+ xfs_dablk_t dead_blkno;
+ xfs_dablk_t last_blkno;
+ xfs_dablk_t sib_blkno;
+ xfs_dablk_t par_blkno;
+ int error;
+ int w;
+ int entno;
+ int level;
+ int dead_level;
+
+ trace_xfs_da_swap_lastblock(args);
+
+ dead_buf = *dead_bufp;
+ dead_blkno = *dead_blknop;
+ tp = args->trans;
+ dp = args->dp;
+ w = args->whichfork;
+ ASSERT(w == XFS_DATA_FORK);
+ mp = dp->i_mount;
+ lastoff = args->geo->freeblk;
+ error = xfs_bmap_last_before(tp, dp, &lastoff, w);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, lastoff == 0))
+ return -EFSCORRUPTED;
+ /*
+ * Read the last block in the btree space.
+ */
+ last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
+ error = xfs_da3_node_read(tp, dp, last_blkno, &last_buf, w);
+ if (error)
+ return error;
+ /*
+ * Copy the last block into the dead buffer and log it.
+ */
+ memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
+ xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
+ dead_info = dead_buf->b_addr;
+ /*
+ * Get values from the moved block.
+ */
+ if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+
+ dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr,
+ dead_leaf2);
+ ents = leafhdr.ents;
+ dead_level = 0;
+ dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
+ } else {
+ struct xfs_da3_icnode_hdr deadhdr;
+
+ dead_node = (xfs_da_intnode_t *)dead_info;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &deadhdr, dead_node);
+ btree = deadhdr.btree;
+ dead_level = deadhdr.level;
+ dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
+ }
+ sib_buf = par_buf = NULL;
+ /*
+ * If the moved block has a left sibling, fix up the pointers.
+ */
+ if ((sib_blkno = be32_to_cpu(dead_info->back))) {
+ error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w);
+ if (error)
+ goto done;
+ sib_info = sib_buf->b_addr;
+ if (XFS_IS_CORRUPT(mp,
+ be32_to_cpu(sib_info->forw) != last_blkno ||
+ sib_info->magic != dead_info->magic)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ sib_info->forw = cpu_to_be32(dead_blkno);
+ xfs_trans_log_buf(tp, sib_buf,
+ XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
+ sizeof(sib_info->forw)));
+ sib_buf = NULL;
+ }
+ /*
+ * If the moved block has a right sibling, fix up the pointers.
+ */
+ if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
+ error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w);
+ if (error)
+ goto done;
+ sib_info = sib_buf->b_addr;
+ if (XFS_IS_CORRUPT(mp,
+ be32_to_cpu(sib_info->back) != last_blkno ||
+ sib_info->magic != dead_info->magic)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ sib_info->back = cpu_to_be32(dead_blkno);
+ xfs_trans_log_buf(tp, sib_buf,
+ XFS_DA_LOGRANGE(sib_info, &sib_info->back,
+ sizeof(sib_info->back)));
+ sib_buf = NULL;
+ }
+ par_blkno = args->geo->leafblk;
+ level = -1;
+ /*
+ * Walk down the tree looking for the parent of the moved block.
+ */
+ for (;;) {
+ error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w);
+ if (error)
+ goto done;
+ par_node = par_buf->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
+ if (XFS_IS_CORRUPT(mp,
+ level >= 0 && level != par_hdr.level + 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ level = par_hdr.level;
+ btree = par_hdr.btree;
+ for (entno = 0;
+ entno < par_hdr.count &&
+ be32_to_cpu(btree[entno].hashval) < dead_hash;
+ entno++)
+ continue;
+ if (XFS_IS_CORRUPT(mp, entno == par_hdr.count)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ par_blkno = be32_to_cpu(btree[entno].before);
+ if (level == dead_level + 1)
+ break;
+ xfs_trans_brelse(tp, par_buf);
+ par_buf = NULL;
+ }
+ /*
+ * We're in the right parent block.
+ * Look for the right entry.
+ */
+ for (;;) {
+ for (;
+ entno < par_hdr.count &&
+ be32_to_cpu(btree[entno].before) != last_blkno;
+ entno++)
+ continue;
+ if (entno < par_hdr.count)
+ break;
+ par_blkno = par_hdr.forw;
+ xfs_trans_brelse(tp, par_buf);
+ par_buf = NULL;
+ if (XFS_IS_CORRUPT(mp, par_blkno == 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w);
+ if (error)
+ goto done;
+ par_node = par_buf->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
+ if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ btree = par_hdr.btree;
+ entno = 0;
+ }
+ /*
+ * Update the parent entry pointing to the moved block.
+ */
+ btree[entno].before = cpu_to_be32(dead_blkno);
+ xfs_trans_log_buf(tp, par_buf,
+ XFS_DA_LOGRANGE(par_node, &btree[entno].before,
+ sizeof(btree[entno].before)));
+ *dead_blknop = last_blkno;
+ *dead_bufp = last_buf;
+ return 0;
+done:
+ if (par_buf)
+ xfs_trans_brelse(tp, par_buf);
+ if (sib_buf)
+ xfs_trans_brelse(tp, sib_buf);
+ xfs_trans_brelse(tp, last_buf);
+ return error;
+}
+
+/*
+ * Remove a btree block from a directory or attribute.
+ */
+int
+xfs_da_shrink_inode(
+ struct xfs_da_args *args,
+ xfs_dablk_t dead_blkno,
+ struct xfs_buf *dead_buf)
+{
+ struct xfs_inode *dp;
+ int done, error, w, count;
+ struct xfs_trans *tp;
+
+ trace_xfs_da_shrink_inode(args);
+
+ dp = args->dp;
+ w = args->whichfork;
+ tp = args->trans;
+ count = args->geo->fsbcount;
+ for (;;) {
+ /*
+ * Remove extents. If we get ENOSPC for a dir we have to move
+ * the last block to the place we want to kill.
+ */
+ error = xfs_bunmapi(tp, dp, dead_blkno, count,
+ xfs_bmapi_aflag(w), 0, &done);
+ if (error == -ENOSPC) {
+ if (w != XFS_DATA_FORK)
+ break;
+ error = xfs_da3_swap_lastblock(args, &dead_blkno,
+ &dead_buf);
+ if (error)
+ break;
+ } else {
+ break;
+ }
+ }
+ xfs_trans_binval(tp, dead_buf);
+ return error;
+}
+
+static int
+xfs_dabuf_map(
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ unsigned int flags,
+ int whichfork,
+ struct xfs_buf_map **mapp,
+ int *nmaps)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ int nfsb = xfs_dabuf_nfsb(mp, whichfork);
+ struct xfs_bmbt_irec irec, *irecs = &irec;
+ struct xfs_buf_map *map = *mapp;
+ xfs_fileoff_t off = bno;
+ int error = 0, nirecs, i;
+
+ if (nfsb > 1)
+ irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_NOFS);
+
+ nirecs = nfsb;
+ error = xfs_bmapi_read(dp, bno, nfsb, irecs, &nirecs,
+ xfs_bmapi_aflag(whichfork));
+ if (error)
+ goto out_free_irecs;
+
+ /*
+ * Use the caller-provided map for the single map case, else allocate a
+ * larger one that needs to be freed by the caller.
+ */
+ if (nirecs > 1) {
+ map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS);
+ if (!map) {
+ error = -ENOMEM;
+ goto out_free_irecs;
+ }
+ *mapp = map;
+ }
+
+ for (i = 0; i < nirecs; i++) {
+ if (irecs[i].br_startblock == HOLESTARTBLOCK ||
+ irecs[i].br_startblock == DELAYSTARTBLOCK)
+ goto invalid_mapping;
+ if (off != irecs[i].br_startoff)
+ goto invalid_mapping;
+
+ map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
+ map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
+ off += irecs[i].br_blockcount;
+ }
+
+ if (off != bno + nfsb)
+ goto invalid_mapping;
+
+ *nmaps = nirecs;
+out_free_irecs:
+ if (irecs != &irec)
+ kmem_free(irecs);
+ return error;
+
+invalid_mapping:
+ /* Caller ok with no mapping. */
+ if (XFS_IS_CORRUPT(mp, !(flags & XFS_DABUF_MAP_HOLE_OK))) {
+ error = -EFSCORRUPTED;
+ if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+ xfs_alert(mp, "%s: bno %u inode %llu",
+ __func__, bno, dp->i_ino);
+
+ for (i = 0; i < nirecs; i++) {
+ xfs_alert(mp,
+"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
+ i, irecs[i].br_startoff,
+ irecs[i].br_startblock,
+ irecs[i].br_blockcount,
+ irecs[i].br_state);
+ }
+ }
+ } else {
+ *nmaps = 0;
+ }
+ goto out_free_irecs;
+}
+
+/*
+ * Get a buffer for the dir/attr block.
+ */
+int
+xfs_da_get_buf(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ struct xfs_buf **bpp,
+ int whichfork)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ struct xfs_buf_map map, *mapp = &map;
+ int nmap = 1;
+ int error;
+
+ *bpp = NULL;
+ error = xfs_dabuf_map(dp, bno, 0, whichfork, &mapp, &nmap);
+ if (error || nmap == 0)
+ goto out_free;
+
+ error = xfs_trans_get_buf_map(tp, mp->m_ddev_targp, mapp, nmap, 0, &bp);
+ if (error)
+ goto out_free;
+
+ *bpp = bp;
+
+out_free:
+ if (mapp != &map)
+ kmem_free(mapp);
+
+ return error;
+}
+
+/*
+ * Get a buffer for the dir/attr block, fill in the contents.
+ */
+int
+xfs_da_read_buf(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ unsigned int flags,
+ struct xfs_buf **bpp,
+ int whichfork,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ struct xfs_buf_map map, *mapp = &map;
+ int nmap = 1;
+ int error;
+
+ *bpp = NULL;
+ error = xfs_dabuf_map(dp, bno, flags, whichfork, &mapp, &nmap);
+ if (error || !nmap)
+ goto out_free;
+
+ error = xfs_trans_read_buf_map(mp, tp, mp->m_ddev_targp, mapp, nmap, 0,
+ &bp, ops);
+ if (error)
+ goto out_free;
+
+ if (whichfork == XFS_ATTR_FORK)
+ xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
+ else
+ xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
+ *bpp = bp;
+out_free:
+ if (mapp != &map)
+ kmem_free(mapp);
+
+ return error;
+}
+
+/*
+ * Readahead the dir/attr block.
+ */
+int
+xfs_da_reada_buf(
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ unsigned int flags,
+ int whichfork,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf_map map;
+ struct xfs_buf_map *mapp;
+ int nmap;
+ int error;
+
+ mapp = &map;
+ nmap = 1;
+ error = xfs_dabuf_map(dp, bno, flags, whichfork, &mapp, &nmap);
+ if (error || !nmap)
+ goto out_free;
+
+ xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
+
+out_free:
+ if (mapp != &map)
+ kmem_free(mapp);
+
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
new file mode 100644
index 000000000..ffa3df5b2
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_DA_BTREE_H__
+#define __XFS_DA_BTREE_H__
+
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Directory/attribute geometry information. There will be one of these for each
+ * data fork type, and it will be passed around via the xfs_da_args. Global
+ * structures will be attached to the xfs_mount.
+ */
+struct xfs_da_geometry {
+ unsigned int blksize; /* da block size in bytes */
+ unsigned int fsbcount; /* da block size in filesystem blocks */
+ uint8_t fsblog; /* log2 of _filesystem_ block size */
+ uint8_t blklog; /* log2 of da block size */
+ unsigned int node_hdr_size; /* danode header size in bytes */
+ unsigned int node_ents; /* # of entries in a danode */
+ unsigned int magicpct; /* 37% of block size in bytes */
+ xfs_dablk_t datablk; /* blockno of dir data v2 */
+ unsigned int leaf_hdr_size; /* dir2 leaf header size */
+ unsigned int leaf_max_ents; /* # of entries in dir2 leaf */
+ xfs_dablk_t leafblk; /* blockno of leaf data v2 */
+ unsigned int free_hdr_size; /* dir2 free header size */
+ unsigned int free_max_bests; /* # of bests entries in dir2 free */
+ xfs_dablk_t freeblk; /* blockno of free data v2 */
+ xfs_extnum_t max_extents; /* Max. extents in corresponding fork */
+
+ xfs_dir2_data_aoff_t data_first_offset;
+ size_t data_entry_offset;
+};
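+
+/*
+ * Rough sketch of how the node fields above relate (numbers are illustrative
+ * and assume a 4096-byte da block with the v3 node header): node_ents is the
+ * payload left after the header divided by the 8-byte entry size,
+ *
+ * node_ents = (blksize - node_hdr_size) / sizeof(xfs_da_node_entry_t)
+ *           = (4096 - 64) / 8 = 504 entries per node
+ *
+ * Smaller directory block sizes give a correspondingly smaller fanout.
+ */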
+
+/*========================================================================
+ * Btree searching and modification structure definitions.
+ *========================================================================*/
+
+/*
+ * Search comparison results
+ */
+enum xfs_dacmp {
+ XFS_CMP_DIFFERENT, /* names are completely different */
+ XFS_CMP_EXACT, /* names are exactly the same */
+ XFS_CMP_CASE /* names are same but differ in case */
+};
+
+/*
+ * Structure to ease passing around component names.
+ */
+typedef struct xfs_da_args {
+ struct xfs_da_geometry *geo; /* da block geometry */
+ const uint8_t *name; /* string (maybe not NULL terminated) */
+ int namelen; /* length of string (maybe no NULL terminator) */
+ uint8_t filetype; /* filetype of inode for directories */
+ void *value; /* set of bytes (maybe contain NULLs) */
+ int valuelen; /* length of value */
+ unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */
+ unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */
+ xfs_dahash_t hashval; /* hash value of name */
+ xfs_ino_t inumber; /* input/output inode number */
+ struct xfs_inode *dp; /* directory inode to manipulate */
+ struct xfs_trans *trans; /* current trans (changes over time) */
+ xfs_extlen_t total; /* total blocks needed, for 1st bmap */
+ int whichfork; /* data or attribute fork */
+ xfs_dablk_t blkno; /* blkno of attr leaf of interest */
+ int index; /* index of attr of interest in blk */
+ xfs_dablk_t rmtblkno; /* remote attr value starting blkno */
+ int rmtblkcnt; /* remote attr value block count */
+ int rmtvaluelen; /* remote attr value length in bytes */
+ xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */
+ int index2; /* index of 2nd attr in blk */
+ xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
+ int rmtblkcnt2; /* remote attr value block count */
+ int rmtvaluelen2; /* remote attr value length in bytes */
+ uint32_t op_flags; /* operation flags */
+ enum xfs_dacmp cmpresult; /* name compare result for lookups */
+} xfs_da_args_t;
+
+/*
+ * Operation flags:
+ */
+#define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */
+#define XFS_DA_OP_REPLACE (1u << 1) /* this is an atomic replace op */
+#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */
+#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */
+#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */
+#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */
+#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */
+#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */
+
+#define XFS_DA_OP_FLAGS \
+ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
+ { XFS_DA_OP_REPLACE, "REPLACE" }, \
+ { XFS_DA_OP_ADDNAME, "ADDNAME" }, \
+ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \
+ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
+ { XFS_DA_OP_NOTIME, "NOTIME" }, \
+ { XFS_DA_OP_REMOVE, "REMOVE" }, \
+ { XFS_DA_OP_RECOVERY, "RECOVERY" }, \
+ { XFS_DA_OP_LOGGED, "LOGGED" }
+
+/*
+ * Storage for holding state during Btree searches and split/join ops.
+ *
+ * Only need space for 5 intermediate nodes. With a minimum of 62-way
+ * fanout to the Btree, we can support over 900 million directory blocks,
+ * which is slightly more than enough.
+ */
+typedef struct xfs_da_state_blk {
+ struct xfs_buf *bp; /* buffer containing block */
+ xfs_dablk_t blkno; /* filesystem blkno of buffer */
+ xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */
+ int index; /* relevant index into block */
+ xfs_dahash_t hashval; /* last hash value in block */
+ int magic; /* blk's magic number, ie: blk type */
+} xfs_da_state_blk_t;
+
+typedef struct xfs_da_state_path {
+ int active; /* number of active levels */
+ xfs_da_state_blk_t blk[XFS_DA_NODE_MAXDEPTH];
+} xfs_da_state_path_t;
+
+typedef struct xfs_da_state {
+ xfs_da_args_t *args; /* filename arguments */
+ struct xfs_mount *mp; /* filesystem mount point */
+ xfs_da_state_path_t path; /* search/split paths */
+ xfs_da_state_path_t altpath; /* alternate path for join */
+ unsigned char inleaf; /* insert into 1->leaf, 0->split leaf */
+ unsigned char extravalid; /* T/F: extrablk is in use */
+ unsigned char extraafter; /* T/F: extrablk is after new */
+ xfs_da_state_blk_t extrablk; /* for double-splits on leaves */
+ /* for dirv2 extrablk is data */
+} xfs_da_state_t;
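+
+/*
+ * Back-of-the-envelope check of the fanout claim above: with
+ * XFS_DA_NODE_MAXDEPTH == 5 levels and a worst-case fanout of 62 entries per
+ * node, the tree can address 62^5 = 916,132,832 leaf blocks, i.e. the "over
+ * 900 million" figure quoted in the comment.
+ */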
+
+/*
+ * In-core version of the node header to abstract the differences in the v2 and
+ * v3 disk format of the headers. Callers need to convert to/from disk format as
+ * appropriate.
+ */
+struct xfs_da3_icnode_hdr {
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t level;
+
+ /*
+ * Pointer to the on-disk format entries, which are behind the
+ * variable size (v4 vs v5) header in the on-disk block.
+ */
+ struct xfs_da_node_entry *btree;
+};
+
+/*
+ * Utility macros to aid in logging changed structure fields.
+ */
+#define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE))
+#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE) \
+ (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
+ (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
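+
+/*
+ * Usage sketch (illustrative): logging a single modified btree entry the way
+ * xfs_da3_node_remove() does it above - the macro pair expands to the first
+ * and last byte offsets of that entry within the block:
+ *
+ * xfs_trans_log_buf(tp, bp,
+ *                   XFS_DA_LOGRANGE(node, &btree[index], sizeof(*btree)));
+ */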
+
+/*========================================================================
+ * Function prototypes.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+int xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno,
+ int level, struct xfs_buf **bpp, int whichfork);
+int xfs_da3_split(xfs_da_state_t *state);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int xfs_da3_join(xfs_da_state_t *state);
+void xfs_da3_fixhashpath(struct xfs_da_state *state,
+ struct xfs_da_state_path *path_to_fix);
+
+/*
+ * Routines used for finding things in the Btree.
+ */
+int xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result);
+int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+ int forward, int release, int *result);
+/*
+ * Utility routines.
+ */
+int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+ xfs_da_state_blk_t *new_blk);
+int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, struct xfs_buf **bpp, int whichfork);
+int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_daddr_t mappedbno, struct xfs_buf **bpp,
+ int whichfork);
+
+/*
+ * Utility routines.
+ */
+
+#define XFS_DABUF_MAP_HOLE_OK (1u << 0)
+
+int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
+int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
+ int count);
+int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+ xfs_dablk_t bno, struct xfs_buf **bp, int whichfork);
+int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+ xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp,
+ int whichfork, const struct xfs_buf_ops *ops);
+int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
+ unsigned int flags, int whichfork,
+ const struct xfs_buf_ops *ops);
+int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+ struct xfs_buf *dead_buf);
+
+uint xfs_da_hashname(const uint8_t *name_string, int name_length);
+enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
+ const unsigned char *name, int len);
+
+
+struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args);
+void xfs_da_state_free(xfs_da_state_t *state);
+void xfs_da_state_reset(struct xfs_da_state *state, struct xfs_da_args *args);
+
+void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
+ struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from);
+void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp,
+ struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from);
+
+extern struct kmem_cache *xfs_da_state_cache;
+
+#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
new file mode 100644
index 000000000..25e284108
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -0,0 +1,805 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_DA_FORMAT_H__
+#define __XFS_DA_FORMAT_H__
+
+/*
+ * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
+ *
+ * It is used to manage a doubly linked list of all blocks at the same
+ * level in the Btree, and to identify which type of block this is.
+ */
+#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
+#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
+#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
+#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
+
+typedef struct xfs_da_blkinfo {
+ __be32 forw; /* next block in list */
+ __be32 back; /* previous block in list */
+ __be16 magic; /* validity check on block */
+ __be16 pad; /* unused */
+} xfs_da_blkinfo_t;
+
+/*
+ * CRC enabled directory structure types
+ *
+ * The headers change size for the additional verification information, but
+ * otherwise the tree layouts and contents are unchanged. Hence the da btree
+ * code can use the struct xfs_da_blkinfo for manipulating the tree links and
+ * magic numbers without modification for both v2 and v3 nodes.
+ */
+#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */
+#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */
+#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v3 dirlf single blks */
+#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v3 dirlf multi blks */
+
+struct xfs_da3_blkinfo {
+ /*
+ * the node link manipulation code relies on the fact that the first
+ * element of this structure is the struct xfs_da_blkinfo so it can
+ * ignore the differences in the rest of the structures.
+ */
+ struct xfs_da_blkinfo hdr;
+ __be32 crc; /* CRC of block */
+ __be64 blkno; /* first block of the buffer */
+ __be64 lsn; /* sequence number of last write */
+ uuid_t uuid; /* filesystem we belong to */
+ __be64 owner; /* inode that owns the block */
+};
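+
+/*
+ * The link-manipulation code relies on the layout guarantee described above;
+ * expressed as a (purely illustrative) compile-time check it would read:
+ *
+ * static_assert(offsetof(struct xfs_da3_blkinfo, hdr) == 0);
+ */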
+
+/*
+ * This is the structure of the root and intermediate nodes in the Btree.
+ * The leaf nodes are defined above.
+ *
+ * Entries are not packed.
+ *
+ * Since we have duplicate keys, use a binary search but always follow
+ * all matches in the block, not just the first match found.
+ */
+#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
+
+typedef struct xfs_da_node_hdr {
+ struct xfs_da_blkinfo info; /* block type, links, etc. */
+ __be16 __count; /* count of active entries */
+ __be16 __level; /* level above leaves (leaf == 0) */
+} xfs_da_node_hdr_t;
+
+struct xfs_da3_node_hdr {
+ struct xfs_da3_blkinfo info; /* block type, links, etc. */
+ __be16 __count; /* count of active entries */
+ __be16 __level; /* level above leaves (leaf == 0) */
+ __be32 __pad32;
+};
+
+#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc))
+
+typedef struct xfs_da_node_entry {
+ __be32 hashval; /* hash value for this descendant */
+ __be32 before; /* Btree block before this key */
+} xfs_da_node_entry_t;
+
+typedef struct xfs_da_intnode {
+ struct xfs_da_node_hdr hdr;
+ struct xfs_da_node_entry __btree[];
+} xfs_da_intnode_t;
+
+struct xfs_da3_intnode {
+ struct xfs_da3_node_hdr hdr;
+ struct xfs_da_node_entry __btree[];
+};
+
+/*
+ * Directory version 2.
+ *
+ * There are 4 possible formats:
+ * - shortform - embedded into the inode
+ * - single block - data with embedded leaf at the end
+ * - multiple data blocks, single leaf+freeindex block
+ * - data blocks, node and leaf blocks (btree), freeindex blocks
+ *
+ * Note: many node blocks structures and constants are shared with the attr
+ * code and defined in xfs_da_btree.h.
+ */
+
+#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: single block dirs */
+#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: multiblock dirs */
+#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */
+
+/*
+ * Directory Version 3 With CRCs.
+ *
+ * The tree formats are the same as for version 2 directories. The difference
+ * is in the block header and dirent formats. In many cases the v3 structures
+ * use v2 definitions as they are no different and this makes code sharing much
+ * easier.
+ *
+ * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the
+ * format is v2 then they switch to the existing v2 code, or if the format is v3
+ * they implement the v3 functionality. This means the existing dir2 code is a mix of
+ * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called
+ * where there is a difference in the formats, otherwise the code is unchanged.
+ *
+ * Where it is possible, the code decides what to do based on the magic numbers
+ * in the blocks rather than feature bits in the superblock. This means the code
+ * is as independent of the external XFS code as possible and doesn't require
+ * passing struct xfs_mount pointers into places where it isn't really
+ * necessary.
+ *
+ * Version 3 includes:
+ *
+ * - a larger block header for CRC and identification purposes and so the
+ * offsets of all the structures inside the blocks are different.
+ *
+ * - new magic numbers to be able to detect the v2/v3 types on the fly.
+ */
+
+#define XFS_DIR3_BLOCK_MAGIC 0x58444233 /* XDB3: single block dirs */
+#define XFS_DIR3_DATA_MAGIC 0x58444433 /* XDD3: multiblock dirs */
+#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */
+
+/*
+ * Dirents in version 3 directories have a file type field. Additions to this
+ * list are an on-disk format change, requiring feature bits. Valid values
+ * are as follows:
+ */
+#define XFS_DIR3_FT_UNKNOWN 0
+#define XFS_DIR3_FT_REG_FILE 1
+#define XFS_DIR3_FT_DIR 2
+#define XFS_DIR3_FT_CHRDEV 3
+#define XFS_DIR3_FT_BLKDEV 4
+#define XFS_DIR3_FT_FIFO 5
+#define XFS_DIR3_FT_SOCK 6
+#define XFS_DIR3_FT_SYMLINK 7
+#define XFS_DIR3_FT_WHT 8
+
+#define XFS_DIR3_FT_MAX 9
+
+/*
+ * Byte offset in data block and shortform entry.
+ */
+typedef uint16_t xfs_dir2_data_off_t;
+#define NULLDATAOFF 0xffffU
+typedef uint xfs_dir2_data_aoff_t; /* argument form */
+
+/*
+ * Offset in data space of a data entry.
+ */
+typedef uint32_t xfs_dir2_dataptr_t;
+#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff)
+#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0)
+
+/*
+ * Byte offset in a directory.
+ */
+typedef xfs_off_t xfs_dir2_off_t;
+
+/*
+ * Directory block number (logical dirblk in file)
+ */
+typedef uint32_t xfs_dir2_db_t;
+
+#define XFS_INO32_SIZE 4
+#define XFS_INO64_SIZE 8
+#define XFS_INO64_DIFF (XFS_INO64_SIZE - XFS_INO32_SIZE)
+
+#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
+
+/*
+ * Directory layout when stored internal to an inode.
+ *
+ * Small directories are packed as tightly as possible so as to fit into the
+ * literal area of the inode. These "shortform" directories consist of a
+ * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry
+ * structures. Due to the different inode number storage size and the variable
+ * length name field in the xfs_dir2_sf_entry, all these structures are
+ * variable length, and the accessors in this file should be used to iterate
+ * over them.
+ */
+typedef struct xfs_dir2_sf_hdr {
+ uint8_t count; /* count of entries */
+ uint8_t i8count; /* count of 8-byte inode #s */
+ uint8_t parent[8]; /* parent dir inode number */
+} __packed xfs_dir2_sf_hdr_t;
+
+typedef struct xfs_dir2_sf_entry {
+ __u8 namelen; /* actual name length */
+ __u8 offset[2]; /* saved offset */
+ __u8 name[]; /* name, variable size */
+ /*
+ * A single byte containing the file type field follows the inode
+ * number for version 3 directory entries.
+ *
+ * A 64-bit or 32-bit inode number follows here, at a variable offset
+ * after the name.
+ */
+} __packed xfs_dir2_sf_entry_t;
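+
+/*
+ * Size sketch for one shortform entry, following the layout notes above and
+ * assuming a 3-character name, the one-byte file type field and a 32-bit
+ * inode number: 1 (namelen) + 2 (offset) + 3 (name) + 1 (ftype) + 4 (inumber)
+ * = 11 bytes, packed with no padding.
+ */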
+
+static inline int xfs_dir2_sf_hdr_size(int i8count)
+{
+ return sizeof(struct xfs_dir2_sf_hdr) -
+ (i8count == 0) * XFS_INO64_DIFF;
+}
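+
+/*
+ * Worked values for the helper above: the packed header is 10 bytes
+ * (count + i8count + parent[8]) and XFS_INO64_DIFF is 4, so a directory whose
+ * entries all fit 32-bit inode numbers (i8count == 0) uses a 6-byte header,
+ * while one containing any 64-bit inode number uses the full 10 bytes.
+ */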
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
+{
+ return get_unaligned_be16(sfep->offset);
+}
+
+static inline void
+xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
+{
+ put_unaligned_be16(off, sfep->offset);
+}
+
+static inline struct xfs_dir2_sf_entry *
+xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
+{
+ return (struct xfs_dir2_sf_entry *)
+ ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
+}
+
+/*
+ * Data block structures.
+ *
+ * A pure data block looks like the following drawing on disk:
+ *
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_hdr_t |
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | ... |
+ * +-------------------------------------------------+
+ * | unused space |
+ * +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ *
+ * In addition to the pure data blocks for the data and node formats,
+ * most structures are also used for the combined data/freespace "block"
+ * format below.
+ */
+
+#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */
+#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG)
+#define XFS_DIR2_DATA_FREE_TAG 0xffff
+#define XFS_DIR2_DATA_FD_COUNT 3
+
+/*
+ * Directory address space divided into sections,
+ * spaces separated by 32GB.
+ */
+#define XFS_DIR2_MAX_SPACES 3
+#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
+#define XFS_DIR2_DATA_SPACE 0
+#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
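+
+/*
+ * Quick check of the "32GB" figure above: XFS_DIR2_SPACE_SIZE is
+ * 1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG) = 2^35 bytes = 32 GiB, so the data
+ * section and the leaf/free index sections defined later in this header each
+ * occupy their own 32 GiB window of the directory's logical address space.
+ */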
+
+/*
+ * Describe a free area in the data block.
+ *
+ * The freespace will be formatted as a xfs_dir2_data_unused_t.
+ */
+typedef struct xfs_dir2_data_free {
+ __be16 offset; /* start of freespace */
+ __be16 length; /* length of freespace */
+} xfs_dir2_data_free_t;
+
+/*
+ * Header for the data blocks.
+ *
+ * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
+ */
+typedef struct xfs_dir2_data_hdr {
+ __be32 magic; /* XFS_DIR2_DATA_MAGIC or */
+ /* XFS_DIR2_BLOCK_MAGIC */
+ xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT];
+} xfs_dir2_data_hdr_t;
+
+/*
+ * Define a structure for all the verification fields we are adding to the
+ * directory block structures. This will be used in several structures.
+ * The magic number must be the first entry to align with all the dir2
+ * structures, so we can determine how to decode them just from the magic
+ * number.
+ */
+struct xfs_dir3_blk_hdr {
+ __be32 magic; /* magic number */
+ __be32 crc; /* CRC of block */
+ __be64 blkno; /* first block of the buffer */
+ __be64 lsn; /* sequence number of last write */
+ uuid_t uuid; /* filesystem we belong to */
+ __be64 owner; /* inode that owns the block */
+};
+
+struct xfs_dir3_data_hdr {
+ struct xfs_dir3_blk_hdr hdr;
+ xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT];
+ __be32 pad; /* 64 bit alignment */
+};
+
+#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc)
+
+/*
+ * Active entry in a data block.
+ *
+ * Aligned to 8 bytes. After the variable length name field there is a
+ * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p.
+ *
+ * For dir3 structures, there is file type field between the name and the tag.
+ * This can only be manipulated by helper functions. It is packed hard against
+ * the end of the name so any padding for rounding is between the file type and
+ * the tag.
+ */
+typedef struct xfs_dir2_data_entry {
+ __be64 inumber; /* inode number */
+ __u8 namelen; /* name length */
+ __u8 name[]; /* name bytes, no null */
+ /* __u8 filetype; */ /* type of inode we point to */
+ /* __be16 tag; */ /* starting offset of us */
+} xfs_dir2_data_entry_t;
+
+/*
+ * Unused entry in a data block.
+ *
+ * Aligned to 8 bytes. Tag appears as the last 2 bytes and must be accessed
+ * using xfs_dir2_data_unused_tag_p.
+ */
+typedef struct xfs_dir2_data_unused {
+ __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */
+ __be16 length; /* total free length */
+ /* variable offset */
+ __be16 tag; /* starting offset of us */
+} xfs_dir2_data_unused_t;
+
+/*
+ * Pointer to a freespace's tag word.
+ */
+static inline __be16 *
+xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
+{
+ return (__be16 *)((char *)dup +
+ be16_to_cpu(dup->length) - sizeof(__be16));
+}
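+
+/*
+ * Usage sketch for the helper above: the tag stored at the end of an unused
+ * region records that region's own starting offset within the data block
+ * ("starting offset of us"), so filling it in looks like this.  "hdr" is
+ * assumed to point at the start of the directory data block.
+ */
+static inline void
+xfs_dir2_data_unused_set_tag_sketch(
+	struct xfs_dir2_data_hdr	*hdr,
+	struct xfs_dir2_data_unused	*dup)
+{
+	*xfs_dir2_data_unused_tag_p(dup) =
+			cpu_to_be16((char *)dup - (char *)hdr);
+}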
+
+/*
+ * Leaf block structures.
+ *
+ * A pure leaf block looks like the following drawing on disk:
+ *
+ * +---------------------------+
+ * | xfs_dir2_leaf_hdr_t |
+ * +---------------------------+
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * | ... |
+ * +---------------------------+
+ * | xfs_dir2_data_off_t |
+ * | xfs_dir2_data_off_t |
+ * | xfs_dir2_data_off_t |
+ * | ... |
+ * +---------------------------+
+ * | xfs_dir2_leaf_tail_t |
+ * +---------------------------+
+ *
+ * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block
+ * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present
+ * for directories with separate leaf nodes and free space blocks
+ * (magic = XFS_DIR2_LEAFN_MAGIC).
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+
+/*
+ * Offset of the leaf/node space. First block in this space
+ * is the btree root.
+ */
+#define XFS_DIR2_LEAF_SPACE 1
+#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
+
+/*
+ * Leaf block header.
+ */
+typedef struct xfs_dir2_leaf_hdr {
+ xfs_da_blkinfo_t info; /* header for da routines */
+ __be16 count; /* count of entries */
+ __be16 stale; /* count of stale entries */
+} xfs_dir2_leaf_hdr_t;
+
+struct xfs_dir3_leaf_hdr {
+ struct xfs_da3_blkinfo info; /* header for da routines */
+ __be16 count; /* count of entries */
+ __be16 stale; /* count of stale entries */
+ __be32 pad; /* 64 bit alignment */
+};
+
+/*
+ * Leaf block entry.
+ */
+typedef struct xfs_dir2_leaf_entry {
+ __be32 hashval; /* hash value of name */
+ __be32 address; /* address of data entry */
+} xfs_dir2_leaf_entry_t;
+
+/*
+ * Leaf block tail.
+ */
+typedef struct xfs_dir2_leaf_tail {
+ __be32 bestcount;
+} xfs_dir2_leaf_tail_t;
+
+/*
+ * Leaf block.
+ */
+typedef struct xfs_dir2_leaf {
+ xfs_dir2_leaf_hdr_t hdr; /* leaf header */
+ xfs_dir2_leaf_entry_t __ents[]; /* entries */
+} xfs_dir2_leaf_t;
+
+struct xfs_dir3_leaf {
+ struct xfs_dir3_leaf_hdr hdr; /* leaf header */
+ struct xfs_dir2_leaf_entry __ents[]; /* entries */
+};
+
+#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc)
+
+/*
+ * Get address of the bests array in the single-leaf block.
+ */
+static inline __be16 *
+xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
+{
+ return (__be16 *)ltp - be32_to_cpu(ltp->bestcount);
+}
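+
+/*
+ * Companion sketch: the leaf tail is the last structure in a single-leaf
+ * block (see the drawing above), so it can be found by stepping back one
+ * tail from the end of the block.  "blksize" stands in for the directory
+ * block size that the real code takes from the da geometry.
+ */
+static inline struct xfs_dir2_leaf_tail *
+xfs_dir2_leaf_tail_sketch(struct xfs_dir2_leaf *lp, unsigned int blksize)
+{
+	return (struct xfs_dir2_leaf_tail *)((char *)lp + blksize) - 1;
+}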
+
+/*
+ * Free space block definitions for the node format.
+ */
+
+/*
+ * Offset of the freespace index.
+ */
+#define XFS_DIR2_FREE_SPACE 2
+#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
+
+typedef struct xfs_dir2_free_hdr {
+ __be32 magic; /* XFS_DIR2_FREE_MAGIC */
+ __be32 firstdb; /* db of first entry */
+ __be32 nvalid; /* count of valid entries */
+ __be32 nused; /* count of used entries */
+} xfs_dir2_free_hdr_t;
+
+typedef struct xfs_dir2_free {
+ xfs_dir2_free_hdr_t hdr; /* block header */
+ __be16 bests[]; /* best free counts */
+ /* unused entries are -1 */
+} xfs_dir2_free_t;
+
+struct xfs_dir3_free_hdr {
+ struct xfs_dir3_blk_hdr hdr;
+ __be32 firstdb; /* db of first entry */
+ __be32 nvalid; /* count of valid entries */
+ __be32 nused; /* count of used entries */
+ __be32 pad; /* 64 bit alignment */
+};
+
+struct xfs_dir3_free {
+ struct xfs_dir3_free_hdr hdr;
+ __be16 bests[]; /* best free counts */
+ /* unused entries are -1 */
+};
+
+#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc)
+
+/*
+ * Single block format.
+ *
+ * The single block format looks like the following drawing on disk:
+ *
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_hdr_t |
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | ... |
+ * +-------------------------------------------------+
+ * | unused space |
+ * +-------------------------------------------------+
+ * | ... |
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * +-------------------------------------------------+
+ * | xfs_dir2_block_tail_t |
+ * +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+
+typedef struct xfs_dir2_block_tail {
+ __be32 count; /* count of leaf entries */
+ __be32 stale; /* count of stale lf entries */
+} xfs_dir2_block_tail_t;
+
+/*
+ * Pointer to the leaf entries embedded in a data block (1-block format)
+ */
+static inline struct xfs_dir2_leaf_entry *
+xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
+{
+ return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count);
+}
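+
+/*
+ * Similar sketch for the single block format: the block tail sits at the
+ * very end of the directory block (see the drawing above), so it is located
+ * by stepping back one tail from the end of the block.  "blksize" again
+ * stands in for the directory block size from the da geometry.
+ */
+static inline struct xfs_dir2_block_tail *
+xfs_dir2_block_tail_sketch(
+	struct xfs_dir2_data_hdr	*hdr,
+	unsigned int			blksize)
+{
+	return ((struct xfs_dir2_block_tail *)((char *)hdr + blksize)) - 1;
+}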
+
+
+/*
+ * Attribute storage layout
+ *
+ * Attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes. Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree. Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys. The
+ * internal links in the Btree are logical block offsets into the file.
+ *
+ * Struct leaf_entry's are packed from the top. Name/values grow from the
+ * bottom but are not packed. The freemap contains run-length-encoded entries
+ * for the free bytes after the leaf_entry's, but only the N largest such;
+ * smaller runs are dropped. When the freemap doesn't show enough space
+ * for an allocation, we compact the name/value area and try again. If we
+ * still don't have enough space, then we have to split the block. The
+ * name/value structs (both local and remote versions) must be 32bit aligned.
+ *
+ * Since we have duplicate hash keys, for each key that matches, compare
+ * the actual name string. The root and intermediate node search always
+ * takes the first-in-the-block key match found, so we should only have
+ * to work "forw"ard. If none matches, continue with the "forw"ard leaf
+ * nodes until the hash key changes or the attribute name is found.
+ *
+ * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
+ * the leaf_entry. The namespaces are independent only because we also look
+ * at the namespace bit when we are looking for a matching attribute name.
+ *
+ * We also store an "incomplete" bit in the leaf_entry. It shows that an
+ * attribute is in the middle of being created and should not be shown to
+ * the user if we crash during the time that the bit is set. We clear the
+ * bit when we have finished setting up the attribute. We do this because
+ * we cannot create some large attributes inside a single transaction, and we
+ * need some indication that we weren't finished if we crash in the middle.
+ */
+#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
+
+/*
+ * Entries are packed toward the top as tightly as possible.
+ */
+struct xfs_attr_shortform {
+ struct xfs_attr_sf_hdr { /* constant-structure header block */
+ __be16 totsize; /* total bytes in shortform list */
+ __u8 count; /* count of active entries */
+ __u8 padding;
+ } hdr;
+ struct xfs_attr_sf_entry {
+ uint8_t namelen; /* actual length of name (no NULL) */
+ uint8_t valuelen; /* actual length of value (no NULL) */
+ uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
+ uint8_t nameval[]; /* name & value bytes concatenated */
+ } list[1]; /* variable sized array */
+};
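+
+/*
+ * Sizing sketch for one shortform attribute entry: the three fixed header
+ * bytes plus the concatenated name and value.  This only makes the layout
+ * above concrete; the equivalent helper in the real code lives with the
+ * other shortform attribute accessors.
+ */
+static inline int
+xfs_attr_sf_entsize_sketch(uint8_t namelen, uint8_t valuelen)
+{
+	return sizeof(struct xfs_attr_sf_entry) + namelen + valuelen;
+}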
+
+typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
+ __be16 base; /* base of free region */
+ __be16 size; /* length of free region */
+} xfs_attr_leaf_map_t;
+
+typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */
+ xfs_da_blkinfo_t info; /* block type, links, etc. */
+ __be16 count; /* count of active leaf_entry's */
+ __be16 usedbytes; /* num bytes of names/values stored */
+ __be16 firstused; /* first used byte in name area */
+ __u8 holes; /* != 0 if blk needs compaction */
+ __u8 pad1;
+ xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
+ /* N largest free regions */
+} xfs_attr_leaf_hdr_t;
+
+typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
+ __be32 hashval; /* hash value of name */
+ __be16 nameidx; /* index into buffer of name/value */
+ __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
+ __u8 pad2; /* unused pad byte */
+} xfs_attr_leaf_entry_t;
+
+typedef struct xfs_attr_leaf_name_local {
+ __be16 valuelen; /* number of bytes in value */
+ __u8 namelen; /* length of name bytes */
+ __u8 nameval[1]; /* name/value bytes */
+} xfs_attr_leaf_name_local_t;
+
+typedef struct xfs_attr_leaf_name_remote {
+ __be32 valueblk; /* block number of value bytes */
+ __be32 valuelen; /* number of bytes in value */
+ __u8 namelen; /* length of name bytes */
+ __u8 name[1]; /* name bytes */
+} xfs_attr_leaf_name_remote_t;
+
+typedef struct xfs_attr_leafblock {
+ xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
+ xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
+ /*
+ * The rest of the block contains the following structures after the
+ * leaf entries, growing from the bottom up. The variables are never
+ * referenced and defining them can actually make gcc optimize away
+ * accesses to the 'entries' array above index 0, so don't do that.
+ *
+ * xfs_attr_leaf_name_local_t namelist;
+ * xfs_attr_leaf_name_remote_t valuelist;
+ */
+} xfs_attr_leafblock_t;
+
+/*
+ * CRC enabled leaf structures. Called "version 3" structures to match the
+ * version number of the directory and dablk structures for this feature, and
+ * attr2 is already taken by the variable inode attribute fork size feature.
+ */
+struct xfs_attr3_leaf_hdr {
+ struct xfs_da3_blkinfo info;
+ __be16 count;
+ __be16 usedbytes;
+ __be16 firstused;
+ __u8 holes;
+ __u8 pad1;
+ struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
+ __be32 pad2; /* 64 bit alignment */
+};
+
+#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc))
+
+struct xfs_attr3_leafblock {
+ struct xfs_attr3_leaf_hdr hdr;
+ struct xfs_attr_leaf_entry entries[1];
+
+ /*
+ * The rest of the block contains the following structures after the
+ * leaf entries, growing from the bottom up. The variables are never
+ * referenced, the locations accessed purely from helper functions.
+ *
+ * struct xfs_attr_leaf_name_local
+ * struct xfs_attr_leaf_name_remote
+ */
+};
+
+/*
+ * Special value to represent fs block size in the leaf header firstused field.
+ * Only used when the block size overflows the 2 bytes available on disk.
+ */
+#define XFS_ATTR3_LEAF_NULLOFF 0
+
+/*
+ * Flags used in the leaf_entry[i].flags field.
+ */
+#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
+#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
+#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
+#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
+#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT)
+#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
+
+/*
+ * Alignment for namelist and valuelist entries (since they are mixed
+ * there can be only one alignment value)
+ */
+#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t))
+
+static inline int
+xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp)
+{
+ if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+ return sizeof(struct xfs_attr3_leaf_hdr);
+ return sizeof(struct xfs_attr_leaf_hdr);
+}
+
+static inline struct xfs_attr_leaf_entry *
+xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp)
+{
+ if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+ return &((struct xfs_attr3_leafblock *)leafp)->entries[0];
+ return &leafp->entries[0];
+}
+
+/*
+ * Cast typed pointers for "local" and "remote" name/value structs.
+ */
+static inline char *
+xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
+{
+ struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
+
+ return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
+}
+
+static inline xfs_attr_leaf_name_remote_t *
+xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
+{
+ return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+static inline xfs_attr_leaf_name_local_t *
+xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
+{
+ return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+/*
+ * Calculate total bytes used (including trailing pad for alignment) for
+ * a "local" name/value structure, a "remote" name/value structure, and
+ * a pointer which might be either.
+ */
+static inline int xfs_attr_leaf_entsize_remote(int nlen)
+{
+ return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 +
+ nlen, XFS_ATTR_LEAF_NAME_ALIGN);
+}
+
+static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
+{
+ return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 +
+ nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN);
+}
+
+static inline int xfs_attr_leaf_entsize_local_max(int bsize)
+{
+ return (((bsize) >> 1) + ((bsize) >> 2));
+}
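+
+/*
+ * Worked example of the sizing above: a local attribute with a 4-byte name
+ * and a 9-byte value takes round_up(4 - 1 + 4 + 9, 4) = 16 bytes of the
+ * name/value area; the "- 1" discards the placeholder nameval[1] byte that
+ * is already counted in sizeof(struct xfs_attr_leaf_name_local).
+ */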
+
+
+
+/*
+ * Remote attribute block format definition
+ *
+ * There is one of these headers per filesystem block in a remote attribute.
+ * This is done to ensure there is a 1:1 mapping between the attribute value
+ * length and the number of blocks needed to store the attribute. This makes the
+ * verification of a buffer a little more complex, but greatly simplifies the
+ * allocation, reading and writing of these attributes as we don't have to guess
+ * the number of blocks needed to store the attribute data.
+ */
+#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */
+
+struct xfs_attr3_rmt_hdr {
+ __be32 rm_magic;
+ __be32 rm_offset;
+ __be32 rm_bytes;
+ __be32 rm_crc;
+ uuid_t rm_uuid;
+ __be64 rm_owner;
+ __be64 rm_blkno;
+ __be64 rm_lsn;
+};
+
+#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
+
+#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \
+ ((bufsize) - (xfs_has_crc((mp)) ? \
+ sizeof(struct xfs_attr3_rmt_hdr) : 0))
+
+/* Number of bytes in a directory block. */
+static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp)
+{
+ return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog);
+}
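+
+/*
+ * Worked example of the sizing above: with 4 KiB filesystem blocks
+ * (sb_blocklog = 12) and sb_dirblklog = 2, a directory block is
+ * 1 << (12 + 2) = 16 KiB, i.e. four filesystem blocks.
+ */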
+
+xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3);
+
+#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
new file mode 100644
index 000000000..5a321b783
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -0,0 +1,930 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "xfs_bmap.h"
+#include "xfs_alloc.h"
+#include "xfs_buf.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+
+static struct kmem_cache *xfs_defer_pending_cache;
+
+/*
+ * Deferred Operations in XFS
+ *
+ * Due to the way locking rules work in XFS, certain transactions (block
+ * mapping and unmapping, typically) have permanent reservations so that
+ * we can roll the transaction to adhere to AG locking order rules and
+ * to unlock buffers between metadata updates. Prior to rmap/reflink,
+ * the mapping code had a mechanism to perform these deferrals for
+ * extents that were going to be freed; this code makes that facility
+ * more generic.
+ *
+ * When adding the reverse mapping and reflink features, it became
+ * necessary to perform complex remapping multi-transactions to comply
+ * with AG locking order rules, and to be able to spread a single
+ * refcount update operation (an operation on an n-block extent can
+ * update as many as n records!) among multiple transactions. XFS can
+ * roll a transaction to facilitate this, but using this facility
+ * requires us to log "intent" items in case log recovery needs to
+ * redo the operation, and to log "done" items to indicate that redo
+ * is not necessary.
+ *
+ * Deferred work is tracked in xfs_defer_pending items. Each pending
+ * item tracks one type of deferred work. Incoming work items (which
+ * have not yet had an intent logged) are attached to a pending item
+ * on the dop_intake list, where they wait for the caller to finish
+ * the deferred operations.
+ *
+ * Finishing a set of deferred operations is an involved process. To
+ * start, we define "rolling a deferred-op transaction" as follows:
+ *
+ * > For each xfs_defer_pending item on the dop_intake list,
+ * - Sort the work items in AG order. XFS locking
+ * order rules require us to lock buffers in AG order.
+ * - Create a log intent item for that type.
+ * - Attach it to the pending item.
+ * - Move the pending item from the dop_intake list to the
+ * dop_pending list.
+ * > Roll the transaction.
+ *
+ * NOTE: To avoid exceeding the transaction reservation, we limit the
+ * number of items that we attach to a given xfs_defer_pending.
+ *
+ * The actual finishing process looks like this:
+ *
+ * > For each xfs_defer_pending in the dop_pending list,
+ * - Roll the deferred-op transaction as above.
+ * - Create a log done item for that type, and attach it to the
+ * log intent item.
+ * - For each work item attached to the log intent item,
+ * * Perform the described action.
+ * * Attach the work item to the log done item.
+ * * If the result of doing the work was -EAGAIN, ->finish work
+ * wants a new transaction. See the "Requesting a Fresh
+ * Transaction while Finishing Deferred Work" section below for
+ * details.
+ *
+ * The key here is that we must log an intent item for all pending
+ * work items every time we roll the transaction, and that we must log
+ * a done item as soon as the work is completed. With this mechanism
+ * we can perform complex remapping operations, chaining intent items
+ * as needed.
+ *
+ * Requesting a Fresh Transaction while Finishing Deferred Work
+ *
+ * If ->finish_item decides that it needs a fresh transaction to
+ * finish the work, it must ask its caller (xfs_defer_finish) for a
+ * continuation. The most likely cause of this circumstance are the
+ * refcount adjust functions deciding that they've logged enough items
+ * to be at risk of exceeding the transaction reservation.
+ *
+ * To get a fresh transaction, we want to log the existing log done
+ * item to prevent the log intent item from replaying, immediately log
+ * a new log intent item with the unfinished work items, roll the
+ * transaction, and re-call ->finish_item wherever it left off. The
+ * log done item and the new log intent item must be in the same
+ * transaction or atomicity cannot be guaranteed; defer_finish ensures
+ * that this happens.
+ *
+ * This requires some coordination between ->finish_item and
+ * defer_finish. Upon deciding to request a new transaction,
+ * ->finish_item should update the current work item to reflect the
+ * unfinished work. Next, it should reset the log done item's list
+ * count to the number of items finished, and return -EAGAIN.
+ * defer_finish sees the -EAGAIN, logs the new log intent item
+ * with the remaining work items, and leaves the xfs_defer_pending
+ * item at the head of the dop_work queue. Then it rolls the
+ * transaction and picks up processing where it left off. Note that
+ * ->finish_item must be careful to leave enough transaction
+ * reservation to fit the new log intent item.
+ *
+ * This is an example of remapping the extent (E, E+B) into file X at
+ * offset A and dealing with the extent (C, C+B) already being mapped
+ * there:
+ * +-------------------------------------------------+
+ * | Unmap file X startblock C offset A length B | t0
+ * | Intent to reduce refcount for extent (C, B) |
+ * | Intent to remove rmap (X, C, A, B) |
+ * | Intent to free extent (D, 1) (bmbt block) |
+ * | Intent to map (X, A, B) at startblock E |
+ * +-------------------------------------------------+
+ * | Map file X startblock E offset A length B | t1
+ * | Done mapping (X, E, A, B) |
+ * | Intent to increase refcount for extent (E, B) |
+ * | Intent to add rmap (X, E, A, B) |
+ * +-------------------------------------------------+
+ * | Reduce refcount for extent (C, B) | t2
+ * | Done reducing refcount for extent (C, 9) |
+ * | Intent to reduce refcount for extent (C+9, B-9) |
+ * | (ran out of space after 9 refcount updates) |
+ * +-------------------------------------------------+
+ * | Reduce refcount for extent (C+9, B-9) | t3
+ * | Done reducing refcount for extent (C+9, B-9) |
+ * | Increase refcount for extent (E, B) |
+ * | Done increasing refcount for extent (E, B) |
+ * | Intent to free extent (C, B) |
+ * | Intent to free extent (F, 1) (refcountbt block) |
+ * | Intent to remove rmap (F, 1, REFC) |
+ * +-------------------------------------------------+
+ * | Remove rmap (X, C, A, B) | t4
+ * | Done removing rmap (X, C, A, B) |
+ * | Add rmap (X, E, A, B) |
+ * | Done adding rmap (X, E, A, B) |
+ * | Remove rmap (F, 1, REFC) |
+ * | Done removing rmap (F, 1, REFC) |
+ * +-------------------------------------------------+
+ * | Free extent (C, B) | t5
+ * | Done freeing extent (C, B) |
+ * | Free extent (D, 1) |
+ * | Done freeing extent (D, 1) |
+ * | Free extent (F, 1) |
+ * | Done freeing extent (F, 1) |
+ * +-------------------------------------------------+
+ *
+ * If we should crash before t2 commits, log recovery replays
+ * the following intent items:
+ *
+ * - Intent to reduce refcount for extent (C, B)
+ * - Intent to remove rmap (X, C, A, B)
+ * - Intent to free extent (D, 1) (bmbt block)
+ * - Intent to increase refcount for extent (E, B)
+ * - Intent to add rmap (X, E, A, B)
+ *
+ * In the process of recovering, it should also generate and take care
+ * of these intent items:
+ *
+ * - Intent to free extent (C, B)
+ * - Intent to free extent (F, 1) (refcountbt block)
+ * - Intent to remove rmap (F, 1, REFC)
+ *
+ * Note that the continuation requested between t2 and t3 is likely to
+ * reoccur.
+ */
+
+static const struct xfs_defer_op_type *defer_op_types[] = {
+ [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type,
+ [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type,
+ [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type,
+ [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type,
+ [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
+ [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type,
+};
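+
+/*
+ * Caller-side sketch of the flow described above: queue a deferred work item
+ * against a permanent transaction, then finish all pending work.  The work
+ * item and the XFS_DEFER_OPS_TYPE_FREE type are only placeholders here (a
+ * real caller would pass e.g. an extent-free item's list head); the point is
+ * the add-then-finish calling pattern, not a particular op type.
+ */
+static inline int
+xfs_defer_usage_sketch(
+	struct xfs_trans	**tpp,
+	struct list_head	*work_item)
+{
+	/* Queue the work; its intent item is logged when the trans rolls. */
+	xfs_defer_add(*tpp, XFS_DEFER_OPS_TYPE_FREE, work_item);
+
+	/* Log intents, roll the transaction, and finish the work items. */
+	return xfs_defer_finish(tpp);
+}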
+
+/*
+ * Ensure there's a log intent item associated with this deferred work item if
+ * the operation must be restarted on crash. Returns 1 if there's a log item;
+ * 0 if there isn't; or a negative errno.
+ */
+static int
+xfs_defer_create_intent(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp,
+ bool sort)
+{
+ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+ struct xfs_log_item *lip;
+
+ if (dfp->dfp_intent)
+ return 1;
+
+ lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort);
+ if (!lip)
+ return 0;
+ if (IS_ERR(lip))
+ return PTR_ERR(lip);
+
+ dfp->dfp_intent = lip;
+ return 1;
+}
+
+/*
+ * For each pending item in the intake list, log its intent item and the
+ * associated extents, then add the entire intake list to the end of
+ * the pending list.
+ *
+ * Returns 1 if at least one log item was associated with the deferred work;
+ * 0 if there are no log items; or a negative errno.
+ */
+static int
+xfs_defer_create_intents(
+ struct xfs_trans *tp)
+{
+ struct xfs_defer_pending *dfp;
+ int ret = 0;
+
+ list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
+ int ret2;
+
+ trace_xfs_defer_create_intent(tp->t_mountp, dfp);
+ ret2 = xfs_defer_create_intent(tp, dfp, true);
+ if (ret2 < 0)
+ return ret2;
+ ret |= ret2;
+ }
+ return ret;
+}
+
+/* Abort all the intents that were committed. */
+STATIC void
+xfs_defer_trans_abort(
+ struct xfs_trans *tp,
+ struct list_head *dop_pending)
+{
+ struct xfs_defer_pending *dfp;
+ const struct xfs_defer_op_type *ops;
+
+ trace_xfs_defer_trans_abort(tp, _RET_IP_);
+
+ /* Abort intent items that don't have a done item. */
+ list_for_each_entry(dfp, dop_pending, dfp_list) {
+ ops = defer_op_types[dfp->dfp_type];
+ trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
+ if (dfp->dfp_intent && !dfp->dfp_done) {
+ ops->abort_intent(dfp->dfp_intent);
+ dfp->dfp_intent = NULL;
+ }
+ }
+}
+
+/*
+ * Capture resources that the caller said not to release ("held") when the
+ * transaction commits. Caller is responsible for zero-initializing @dres.
+ */
+static int
+xfs_defer_save_resources(
+ struct xfs_defer_resources *dres,
+ struct xfs_trans *tp)
+{
+ struct xfs_buf_log_item *bli;
+ struct xfs_inode_log_item *ili;
+ struct xfs_log_item *lip;
+
+ BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS);
+
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
+ switch (lip->li_type) {
+ case XFS_LI_BUF:
+ bli = container_of(lip, struct xfs_buf_log_item,
+ bli_item);
+ if (bli->bli_flags & XFS_BLI_HOLD) {
+ if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+ if (bli->bli_flags & XFS_BLI_ORDERED)
+ dres->dr_ordered |=
+ (1U << dres->dr_bufs);
+ else
+ xfs_trans_dirty_buf(tp, bli->bli_buf);
+ dres->dr_bp[dres->dr_bufs++] = bli->bli_buf;
+ }
+ break;
+ case XFS_LI_INODE:
+ ili = container_of(lip, struct xfs_inode_log_item,
+ ili_item);
+ if (ili->ili_lock_flags == 0) {
+ if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+ xfs_trans_log_inode(tp, ili->ili_inode,
+ XFS_ILOG_CORE);
+ dres->dr_ip[dres->dr_inos++] = ili->ili_inode;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/* Attach the held resources to the transaction. */
+static void
+xfs_defer_restore_resources(
+ struct xfs_trans *tp,
+ struct xfs_defer_resources *dres)
+{
+ unsigned short i;
+
+ /* Rejoin the joined inodes. */
+ for (i = 0; i < dres->dr_inos; i++)
+ xfs_trans_ijoin(tp, dres->dr_ip[i], 0);
+
+ /* Rejoin the buffers and dirty them so the log moves forward. */
+ for (i = 0; i < dres->dr_bufs; i++) {
+ xfs_trans_bjoin(tp, dres->dr_bp[i]);
+ if (dres->dr_ordered & (1U << i))
+ xfs_trans_ordered_buf(tp, dres->dr_bp[i]);
+ xfs_trans_bhold(tp, dres->dr_bp[i]);
+ }
+}
+
+/* Roll a transaction so we can do some deferred op processing. */
+STATIC int
+xfs_defer_trans_roll(
+ struct xfs_trans **tpp)
+{
+ struct xfs_defer_resources dres = { };
+ int error;
+
+ error = xfs_defer_save_resources(&dres, *tpp);
+ if (error)
+ return error;
+
+ trace_xfs_defer_trans_roll(*tpp, _RET_IP_);
+
+ /*
+ * Roll the transaction. Rolling always gives a new transaction (even
+ * if committing the old one fails!) to hand back to the caller, so we
+ * join the held resources to the new transaction and always return
+ * with the held resources joined to @tpp, no matter what happened.
+ */
+ error = xfs_trans_roll(tpp);
+
+ xfs_defer_restore_resources(*tpp, &dres);
+
+ if (error)
+ trace_xfs_defer_trans_roll_error(*tpp, error);
+ return error;
+}
+
+/*
+ * Free up any items left in the list.
+ */
+static void
+xfs_defer_cancel_list(
+ struct xfs_mount *mp,
+ struct list_head *dop_list)
+{
+ struct xfs_defer_pending *dfp;
+ struct xfs_defer_pending *pli;
+ struct list_head *pwi;
+ struct list_head *n;
+ const struct xfs_defer_op_type *ops;
+
+ /*
+ * Free the pending items. Caller should already have arranged
+ * for the intent items to be released.
+ */
+ list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) {
+ ops = defer_op_types[dfp->dfp_type];
+ trace_xfs_defer_cancel_list(mp, dfp);
+ list_del(&dfp->dfp_list);
+ list_for_each_safe(pwi, n, &dfp->dfp_work) {
+ list_del(pwi);
+ dfp->dfp_count--;
+ ops->cancel_item(pwi);
+ }
+ ASSERT(dfp->dfp_count == 0);
+ kmem_cache_free(xfs_defer_pending_cache, dfp);
+ }
+}
+
+/*
+ * Prevent a log intent item from pinning the tail of the log by logging a
+ * done item to release the intent item; and then log a new intent item.
+ * The caller should provide a fresh transaction and roll it after we're done.
+ */
+static int
+xfs_defer_relog(
+ struct xfs_trans **tpp,
+ struct list_head *dfops)
+{
+ struct xlog *log = (*tpp)->t_mountp->m_log;
+ struct xfs_defer_pending *dfp;
+ xfs_lsn_t threshold_lsn = NULLCOMMITLSN;
+
+
+ ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ list_for_each_entry(dfp, dfops, dfp_list) {
+ /*
+ * If the log intent item for this deferred op is not a part of
+ * the current log checkpoint, relog the intent item to keep
+ * the log tail moving forward. We're ok with this being racy
+ * because an incorrect decision means we'll be a little slower
+ * at pushing the tail.
+ */
+ if (dfp->dfp_intent == NULL ||
+ xfs_log_item_in_current_chkpt(dfp->dfp_intent))
+ continue;
+
+ /*
+ * Figure out where we need the tail to be in order to maintain
+ * the minimum required free space in the log. Only sample
+ * the log threshold once per call.
+ */
+ if (threshold_lsn == NULLCOMMITLSN) {
+ threshold_lsn = xlog_grant_push_threshold(log, 0);
+ if (threshold_lsn == NULLCOMMITLSN)
+ break;
+ }
+ if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0)
+ continue;
+
+ trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
+ XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
+ dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);
+ }
+
+ if ((*tpp)->t_flags & XFS_TRANS_DIRTY)
+ return xfs_defer_trans_roll(tpp);
+ return 0;
+}
+
+/*
+ * Log an intent-done item for the first pending intent, and finish the work
+ * items.
+ */
+static int
+xfs_defer_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+ struct xfs_btree_cur *state = NULL;
+ struct list_head *li, *n;
+ int error;
+
+ trace_xfs_defer_pending_finish(tp->t_mountp, dfp);
+
+ dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
+ list_for_each_safe(li, n, &dfp->dfp_work) {
+ list_del(li);
+ dfp->dfp_count--;
+ error = ops->finish_item(tp, dfp->dfp_done, li, &state);
+ if (error == -EAGAIN) {
+ int ret;
+
+ /*
+ * Caller wants a fresh transaction; put the work item
+ * back on the list and log a new log intent item to
+ * replace the old one. See "Requesting a Fresh
+ * Transaction while Finishing Deferred Work" above.
+ */
+ list_add(li, &dfp->dfp_work);
+ dfp->dfp_count++;
+ dfp->dfp_done = NULL;
+ dfp->dfp_intent = NULL;
+ ret = xfs_defer_create_intent(tp, dfp, false);
+ if (ret < 0)
+ error = ret;
+ }
+
+ if (error)
+ goto out;
+ }
+
+ /* Done with the dfp, free it. */
+ list_del(&dfp->dfp_list);
+ kmem_cache_free(xfs_defer_pending_cache, dfp);
+out:
+ if (ops->finish_cleanup)
+ ops->finish_cleanup(tp, state, error);
+ return error;
+}
+
+/*
+ * Finish all the pending work. This involves logging intent items for
+ * any work items that wandered in since the last transaction roll (if
+ * one has even happened), rolling the transaction, and finishing the
+ * work items in the first item on the logged-and-pending list.
+ *
+ * If an inode is provided, relog it to the new transaction.
+ */
+int
+xfs_defer_finish_noroll(
+ struct xfs_trans **tp)
+{
+ struct xfs_defer_pending *dfp = NULL;
+ int error = 0;
+ LIST_HEAD(dop_pending);
+
+ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ trace_xfs_defer_finish(*tp, _RET_IP_);
+
+ /* Until we run out of pending work to finish... */
+ while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
+ /*
+ * Deferred items that are created in the process of finishing
+ * other deferred work items should be queued at the head of
+ * the pending list, which puts them ahead of the deferred work
+ * that was created by the caller. This keeps the number of
+ * pending work items to a minimum, which decreases the amount
+ * of time that any one intent item can stick around in memory,
+ * pinning the log tail.
+ */
+ int has_intents = xfs_defer_create_intents(*tp);
+
+ list_splice_init(&(*tp)->t_dfops, &dop_pending);
+
+ if (has_intents < 0) {
+ error = has_intents;
+ goto out_shutdown;
+ }
+ if (has_intents || dfp) {
+ error = xfs_defer_trans_roll(tp);
+ if (error)
+ goto out_shutdown;
+
+ /* Relog intent items to keep the log moving. */
+ error = xfs_defer_relog(tp, &dop_pending);
+ if (error)
+ goto out_shutdown;
+ }
+
+ dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
+ dfp_list);
+ error = xfs_defer_finish_one(*tp, dfp);
+ if (error && error != -EAGAIN)
+ goto out_shutdown;
+ }
+
+ trace_xfs_defer_finish_done(*tp, _RET_IP_);
+ return 0;
+
+out_shutdown:
+ xfs_defer_trans_abort(*tp, &dop_pending);
+ xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+ trace_xfs_defer_finish_error(*tp, error);
+ xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
+ xfs_defer_cancel(*tp);
+ return error;
+}
+
+int
+xfs_defer_finish(
+ struct xfs_trans **tp)
+{
+ int error;
+
+ /*
+ * Finish and roll the transaction once more to avoid returning to the
+ * caller with a dirty transaction.
+ */
+ error = xfs_defer_finish_noroll(tp);
+ if (error)
+ return error;
+ if ((*tp)->t_flags & XFS_TRANS_DIRTY) {
+ error = xfs_defer_trans_roll(tp);
+ if (error) {
+ xfs_force_shutdown((*tp)->t_mountp,
+ SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+ }
+
+ /* Reset LOWMODE now that we've finished all the dfops. */
+ ASSERT(list_empty(&(*tp)->t_dfops));
+ (*tp)->t_flags &= ~XFS_TRANS_LOWMODE;
+ return 0;
+}
+
+void
+xfs_defer_cancel(
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ trace_xfs_defer_cancel(tp, _RET_IP_);
+ xfs_defer_cancel_list(mp, &tp->t_dfops);
+}
+
+/* Add an item for later deferred processing. */
+void
+xfs_defer_add(
+ struct xfs_trans *tp,
+ enum xfs_defer_ops_type type,
+ struct list_head *li)
+{
+ struct xfs_defer_pending *dfp = NULL;
+ const struct xfs_defer_op_type *ops;
+
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
+
+ /*
+ * Add the item to a pending item at the end of the intake list.
+ * If the last pending item has the same type, reuse it. Else,
+ * create a new pending item at the end of the intake list.
+ */
+ if (!list_empty(&tp->t_dfops)) {
+ dfp = list_last_entry(&tp->t_dfops,
+ struct xfs_defer_pending, dfp_list);
+ ops = defer_op_types[dfp->dfp_type];
+ if (dfp->dfp_type != type ||
+ (ops->max_items && dfp->dfp_count >= ops->max_items))
+ dfp = NULL;
+ }
+ if (!dfp) {
+ dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ dfp->dfp_type = type;
+ dfp->dfp_intent = NULL;
+ dfp->dfp_done = NULL;
+ dfp->dfp_count = 0;
+ INIT_LIST_HEAD(&dfp->dfp_work);
+ list_add_tail(&dfp->dfp_list, &tp->t_dfops);
+ }
+
+ list_add_tail(li, &dfp->dfp_work);
+ dfp->dfp_count++;
+}
+
+/*
+ * Move deferred ops from one transaction to another and reset the source to
+ * initial state. This is primarily used to carry state forward across
+ * transaction rolls with pending dfops.
+ */
+void
+xfs_defer_move(
+ struct xfs_trans *dtp,
+ struct xfs_trans *stp)
+{
+ list_splice_init(&stp->t_dfops, &dtp->t_dfops);
+
+ /*
+ * Low free space mode was historically controlled by a dfops field.
+ * This meant that low mode state potentially carried across multiple
+ * transaction rolls. Transfer low mode on a dfops move to preserve
+ * that behavior.
+ */
+ dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE);
+ stp->t_flags &= ~XFS_TRANS_LOWMODE;
+}
+
+/*
+ * Prepare a chain of fresh deferred ops work items to be completed later. Log
+ * recovery requires the ability to put off until later the actual finishing
+ * work so that it can process unfinished items recovered from the log in
+ * the correct order.
+ *
+ * Create and log intent items for all the work that we're capturing so that we
+ * can be assured that the items will get replayed if the system goes down
+ * before log recovery gets a chance to finish the work it put off. The entire
+ * deferred ops state is transferred to the capture structure and the
+ * transaction is then ready for the caller to commit it. If there are no
+ * intent items to capture, this function returns NULL.
+ *
+ * If capture_ip is not NULL, the capture structure will obtain an extra
+ * reference to the inode.
+ */
+static struct xfs_defer_capture *
+xfs_defer_ops_capture(
+ struct xfs_trans *tp)
+{
+ struct xfs_defer_capture *dfc;
+ unsigned short i;
+ int error;
+
+ if (list_empty(&tp->t_dfops))
+ return NULL;
+
+ error = xfs_defer_create_intents(tp);
+ if (error < 0)
+ return ERR_PTR(error);
+
+ /* Create an object to capture the defer ops. */
+ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
+ INIT_LIST_HEAD(&dfc->dfc_list);
+ INIT_LIST_HEAD(&dfc->dfc_dfops);
+
+ /* Move the dfops chain and transaction state to the capture struct. */
+ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
+ dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
+ tp->t_flags &= ~XFS_TRANS_LOWMODE;
+
+ /* Capture the remaining block reservations along with the dfops. */
+ dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used;
+ dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used;
+
+ /* Preserve the log reservation size. */
+ dfc->dfc_logres = tp->t_log_res;
+
+ error = xfs_defer_save_resources(&dfc->dfc_held, tp);
+ if (error) {
+ /*
+ * Resource capture should never fail, but if it does, we
+ * still have to shut down the log and release things
+ * properly.
+ */
+ xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+ }
+
+ /*
+ * Grab extra references to the inodes and buffers because callers are
+ * expected to release their held references after we commit the
+ * transaction.
+ */
+ for (i = 0; i < dfc->dfc_held.dr_inos; i++) {
+ ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL));
+ ihold(VFS_I(dfc->dfc_held.dr_ip[i]));
+ }
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_hold(dfc->dfc_held.dr_bp[i]);
+
+ return dfc;
+}
+
+/* Release all resources that we used to capture deferred ops. */
+void
+xfs_defer_ops_capture_free(
+ struct xfs_mount *mp,
+ struct xfs_defer_capture *dfc)
+{
+ unsigned short i;
+
+ xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_relse(dfc->dfc_held.dr_bp[i]);
+
+ for (i = 0; i < dfc->dfc_held.dr_inos; i++)
+ xfs_irele(dfc->dfc_held.dr_ip[i]);
+
+ kmem_free(dfc);
+}
+
+/*
+ * Capture any deferred ops and commit the transaction. This is the last step
+ * needed to finish a log intent item that we recovered from the log. If any
+ * of the deferred ops operate on an inode, the caller must pass in that inode
+ * so that the reference can be transferred to the capture structure. The
+ * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling
+ * xfs_defer_ops_continue.
+ */
+int
+xfs_defer_ops_capture_and_commit(
+ struct xfs_trans *tp,
+ struct list_head *capture_list)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_defer_capture *dfc;
+ int error;
+
+ /* If we don't capture anything, commit transaction and exit. */
+ dfc = xfs_defer_ops_capture(tp);
+ if (IS_ERR(dfc)) {
+ xfs_trans_cancel(tp);
+ return PTR_ERR(dfc);
+ }
+ if (!dfc)
+ return xfs_trans_commit(tp);
+
+ /* Commit the transaction and add the capture structure to the list. */
+ error = xfs_trans_commit(tp);
+ if (error) {
+ xfs_defer_ops_capture_free(mp, dfc);
+ return error;
+ }
+
+ list_add_tail(&dfc->dfc_list, capture_list);
+ return 0;
+}
+
+/*
+ * Attach a chain of captured deferred ops to a new transaction and free the
+ * capture structure. If an inode was captured, it will be passed back to the
+ * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0.
+ * The caller now owns the inode reference.
+ */
+void
+xfs_defer_ops_continue(
+ struct xfs_defer_capture *dfc,
+ struct xfs_trans *tp,
+ struct xfs_defer_resources *dres)
+{
+ unsigned int i;
+
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
+
+ /* Lock the captured resources to the new transaction. */
+ if (dfc->dfc_held.dr_inos == 2)
+ xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
+ dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
+ else if (dfc->dfc_held.dr_inos == 1)
+ xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL);
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_lock(dfc->dfc_held.dr_bp[i]);
+
+ /* Join the captured resources to the new transaction. */
+ xfs_defer_restore_resources(tp, &dfc->dfc_held);
+ memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources));
+ dres->dr_bufs = 0;
+
+ /* Move captured dfops chain and state to the transaction. */
+ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
+ tp->t_flags |= dfc->dfc_tpflags;
+
+ kmem_free(dfc);
+}
+
+/* Release the resources captured and continued during recovery. */
+void
+xfs_defer_resources_rele(
+ struct xfs_defer_resources *dres)
+{
+ unsigned short i;
+
+ for (i = 0; i < dres->dr_inos; i++) {
+ xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL);
+ xfs_irele(dres->dr_ip[i]);
+ dres->dr_ip[i] = NULL;
+ }
+
+ for (i = 0; i < dres->dr_bufs; i++) {
+ xfs_buf_relse(dres->dr_bp[i]);
+ dres->dr_bp[i] = NULL;
+ }
+
+ dres->dr_inos = 0;
+ dres->dr_bufs = 0;
+ dres->dr_ordered = 0;
+}
+
+static inline int __init
+xfs_defer_init_cache(void)
+{
+ xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending",
+ sizeof(struct xfs_defer_pending),
+ 0, 0, NULL);
+
+ return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM;
+}
+
+static inline void
+xfs_defer_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_defer_pending_cache);
+ xfs_defer_pending_cache = NULL;
+}
+
+/* Set up caches for deferred work items. */
+int __init
+xfs_defer_init_item_caches(void)
+{
+ int error;
+
+ error = xfs_defer_init_cache();
+ if (error)
+ return error;
+ error = xfs_rmap_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_refcount_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_bmap_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_extfree_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_attr_intent_init_cache();
+ if (error)
+ goto err;
+ return 0;
+err:
+ xfs_defer_destroy_item_caches();
+ return error;
+}
+
+/* Destroy all the deferred work item caches, if they've been allocated. */
+void
+xfs_defer_destroy_item_caches(void)
+{
+ xfs_attr_intent_destroy_cache();
+ xfs_extfree_intent_destroy_cache();
+ xfs_bmap_intent_destroy_cache();
+ xfs_refcount_intent_destroy_cache();
+ xfs_rmap_intent_destroy_cache();
+ xfs_defer_destroy_cache();
+}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
new file mode 100644
index 000000000..114a3a493
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_DEFER_H__
+#define __XFS_DEFER_H__
+
+struct xfs_btree_cur;
+struct xfs_defer_op_type;
+struct xfs_defer_capture;
+
+/*
+ * Header for deferred operation list.
+ */
+enum xfs_defer_ops_type {
+ XFS_DEFER_OPS_TYPE_BMAP,
+ XFS_DEFER_OPS_TYPE_REFCOUNT,
+ XFS_DEFER_OPS_TYPE_RMAP,
+ XFS_DEFER_OPS_TYPE_FREE,
+ XFS_DEFER_OPS_TYPE_AGFL_FREE,
+ XFS_DEFER_OPS_TYPE_ATTR,
+ XFS_DEFER_OPS_TYPE_MAX,
+};
+
+/*
+ * Save a log intent item and a list of extents, so that we can replay
+ * whatever action had to happen to the extent list and file the log done
+ * item.
+ */
+struct xfs_defer_pending {
+ struct list_head dfp_list; /* pending items */
+ struct list_head dfp_work; /* work items */
+ struct xfs_log_item *dfp_intent; /* log intent item */
+ struct xfs_log_item *dfp_done; /* log done item */
+ unsigned int dfp_count; /* # extent items */
+ enum xfs_defer_ops_type dfp_type;
+};
+
+void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type,
+ struct list_head *h);
+int xfs_defer_finish_noroll(struct xfs_trans **tp);
+int xfs_defer_finish(struct xfs_trans **tp);
+void xfs_defer_cancel(struct xfs_trans *);
+void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp);
+
+/* Description of a deferred type. */
+struct xfs_defer_op_type {
+ struct xfs_log_item *(*create_intent)(struct xfs_trans *tp,
+ struct list_head *items, unsigned int count, bool sort);
+ void (*abort_intent)(struct xfs_log_item *intent);
+ struct xfs_log_item *(*create_done)(struct xfs_trans *tp,
+ struct xfs_log_item *intent, unsigned int count);
+ int (*finish_item)(struct xfs_trans *tp, struct xfs_log_item *done,
+ struct list_head *item, struct xfs_btree_cur **state);
+ void (*finish_cleanup)(struct xfs_trans *tp,
+ struct xfs_btree_cur *state, int error);
+ void (*cancel_item)(struct list_head *item);
+ unsigned int max_items;
+};
+
+extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
+extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
+extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
+extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
+extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+extern const struct xfs_defer_op_type xfs_attr_defer_type;
+
+
+/*
+ * Deferred operation item relogging limits.
+ */
+#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
+#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
+
+/* Resources that must be held across a transaction roll. */
+struct xfs_defer_resources {
+ /* held buffers */
+ struct xfs_buf *dr_bp[XFS_DEFER_OPS_NR_BUFS];
+
+ /* inodes with no unlock flags */
+ struct xfs_inode *dr_ip[XFS_DEFER_OPS_NR_INODES];
+
+ /* number of held buffers */
+ unsigned short dr_bufs;
+
+ /* bitmap of ordered buffers */
+ unsigned short dr_ordered;
+
+ /* number of held inodes */
+ unsigned short dr_inos;
+};
+
+/*
+ * This structure enables a dfops user to detach the chain of deferred
+ * operations from a transaction so that they can be continued later.
+ */
+struct xfs_defer_capture {
+ /* List of other capture structures. */
+ struct list_head dfc_list;
+
+ /* Deferred ops state saved from the transaction. */
+ struct list_head dfc_dfops;
+ unsigned int dfc_tpflags;
+
+ /* Block reservations for the data and rt devices. */
+ unsigned int dfc_blkres;
+ unsigned int dfc_rtxres;
+
+ /* Log reservation saved from the transaction. */
+ unsigned int dfc_logres;
+
+ struct xfs_defer_resources dfc_held;
+};
+
+/*
+ * Functions to capture a chain of deferred operations and continue them later.
+ * This doesn't normally happen except during log recovery.
+ */
+int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
+ struct list_head *capture_list);
+void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
+ struct xfs_defer_resources *dres);
+void xfs_defer_ops_capture_free(struct xfs_mount *mp,
+ struct xfs_defer_capture *d);
+void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
+
+int __init xfs_defer_init_item_caches(void);
+void xfs_defer_destroy_item_caches(void);
+
+#endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
new file mode 100644
index 000000000..92bac3373
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -0,0 +1,769 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+
+const struct xfs_name xfs_name_dotdot = {
+ .name = (const unsigned char *)"..",
+ .len = 2,
+ .type = XFS_DIR3_FT_DIR,
+};
+
+/*
+ * Convert inode mode to directory entry filetype
+ */
+unsigned char
+xfs_mode_to_ftype(
+ int mode)
+{
+ switch (mode & S_IFMT) {
+ case S_IFREG:
+ return XFS_DIR3_FT_REG_FILE;
+ case S_IFDIR:
+ return XFS_DIR3_FT_DIR;
+ case S_IFCHR:
+ return XFS_DIR3_FT_CHRDEV;
+ case S_IFBLK:
+ return XFS_DIR3_FT_BLKDEV;
+ case S_IFIFO:
+ return XFS_DIR3_FT_FIFO;
+ case S_IFSOCK:
+ return XFS_DIR3_FT_SOCK;
+ case S_IFLNK:
+ return XFS_DIR3_FT_SYMLINK;
+ default:
+ return XFS_DIR3_FT_UNKNOWN;
+ }
+}
+
+/*
+ * ASCII case-insensitive (i.e. A-Z) support for directories that was
+ * used in IRIX.
+ */
+xfs_dahash_t
+xfs_ascii_ci_hashname(
+ const struct xfs_name *name)
+{
+ xfs_dahash_t hash;
+ int i;
+
+ for (i = 0, hash = 0; i < name->len; i++)
+ hash = tolower(name->name[i]) ^ rol32(hash, 7);
+
+ return hash;
+}
+
+enum xfs_dacmp
+xfs_ascii_ci_compname(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ enum xfs_dacmp result;
+ int i;
+
+ if (args->namelen != len)
+ return XFS_CMP_DIFFERENT;
+
+ result = XFS_CMP_EXACT;
+ for (i = 0; i < len; i++) {
+ if (args->name[i] == name[i])
+ continue;
+ if (tolower(args->name[i]) != tolower(name[i]))
+ return XFS_CMP_DIFFERENT;
+ result = XFS_CMP_CASE;
+ }
+
+ return result;
+}
+
+int
+xfs_da_mount(
+ struct xfs_mount *mp)
+{
+ struct xfs_da_geometry *dageo;
+
+
+ ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
+ ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE);
+
+ mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+ KM_MAYFAIL);
+ mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+ KM_MAYFAIL);
+ if (!mp->m_dir_geo || !mp->m_attr_geo) {
+ kmem_free(mp->m_dir_geo);
+ kmem_free(mp->m_attr_geo);
+ return -ENOMEM;
+ }
+
+ /* set up directory geometry */
+ dageo = mp->m_dir_geo;
+ dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog;
+ dageo->fsblog = mp->m_sb.sb_blocklog;
+ dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb);
+ dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
+ if (xfs_has_crc(mp)) {
+ dageo->node_hdr_size = sizeof(struct xfs_da3_node_hdr);
+ dageo->leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr);
+ dageo->free_hdr_size = sizeof(struct xfs_dir3_free_hdr);
+ dageo->data_entry_offset =
+ sizeof(struct xfs_dir3_data_hdr);
+ } else {
+ dageo->node_hdr_size = sizeof(struct xfs_da_node_hdr);
+ dageo->leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr);
+ dageo->free_hdr_size = sizeof(struct xfs_dir2_free_hdr);
+ dageo->data_entry_offset =
+ sizeof(struct xfs_dir2_data_hdr);
+ }
+ dageo->leaf_max_ents = (dageo->blksize - dageo->leaf_hdr_size) /
+ sizeof(struct xfs_dir2_leaf_entry);
+ dageo->free_max_bests = (dageo->blksize - dageo->free_hdr_size) /
+ sizeof(xfs_dir2_data_off_t);
+
+ dageo->data_first_offset = dageo->data_entry_offset +
+ xfs_dir2_data_entsize(mp, 1) +
+ xfs_dir2_data_entsize(mp, 2);
+
+ /*
+ * Now that we've set up the block conversion variables, we can
+ * calculate the segment block constants using the geometry structure.
+ */
+ dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET);
+ dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET);
+ dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
+ dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
+ (uint)sizeof(xfs_da_node_entry_t);
+ dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >>
+ mp->m_sb.sb_blocklog;
+ dageo->magicpct = (dageo->blksize * 37) / 100;
+
+ /* set up attribute geometry - single fsb only */
+ dageo = mp->m_attr_geo;
+ dageo->blklog = mp->m_sb.sb_blocklog;
+ dageo->fsblog = mp->m_sb.sb_blocklog;
+ dageo->blksize = 1 << dageo->blklog;
+ dageo->fsbcount = 1;
+ dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size;
+ dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
+ (uint)sizeof(xfs_da_node_entry_t);
+
+ if (xfs_has_large_extent_counts(mp))
+ dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+ else
+ dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
+ dageo->magicpct = (dageo->blksize * 37) / 100;
+ return 0;
+}
+
+void
+xfs_da_unmount(
+ struct xfs_mount *mp)
+{
+ kmem_free(mp->m_dir_geo);
+ kmem_free(mp->m_attr_geo);
+}
+
+/*
+ * Return 1 if directory contains only "." and "..".
+ */
+int
+xfs_dir_isempty(
+ xfs_inode_t *dp)
+{
+ xfs_dir2_sf_hdr_t *sfp;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ if (dp->i_disk_size == 0) /* might happen during shutdown. */
+ return 1;
+ if (dp->i_disk_size > xfs_inode_data_fork_size(dp))
+ return 0;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ return !sfp->count;
+}
+
+/*
+ * Validate a given inode number.
+ */
+int
+xfs_dir_ino_validate(
+ xfs_mount_t *mp,
+ xfs_ino_t ino)
+{
+ bool ino_ok = xfs_verify_dir_ino(mp, ino);
+
+ if (XFS_IS_CORRUPT(mp, !ino_ok) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) {
+ xfs_warn(mp, "Invalid inode number 0x%Lx",
+ (unsigned long long) ino);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+/*
+ * Initialize a directory with its "." and ".." entries.
+ */
+int
+xfs_dir_init(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ xfs_inode_t *pdp)
+{
+ struct xfs_da_args *args;
+ int error;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
+ if (error)
+ return error;
+
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ if (!args)
+ return -ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->dp = dp;
+ args->trans = tp;
+ error = xfs_dir2_sf_create(args, pdp->i_ino);
+ kmem_free(args);
+ return error;
+}
+
+/*
+ * Enter a name in a directory, or check for available space.
+ * If inum is 0, only the available space test is performed.
+ */
+int
+xfs_dir_createname(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ xfs_ino_t inum, /* new entry inode number */
+ xfs_extlen_t total) /* bmap's total block count */
+{
+ struct xfs_da_args *args;
+ int rval;
+ bool v;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+ if (inum) {
+ rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+ if (rval)
+ return rval;
+ XFS_STATS_INC(dp->i_mount, xs_dir_create);
+ }
+
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ if (!args)
+ return -ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
+ args->inumber = inum;
+ args->dp = dp;
+ args->total = total;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+ args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+ if (!inum)
+ args->op_flags |= XFS_DA_OP_JUSTCHECK;
+
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_addname(args);
+ else
+ rval = xfs_dir2_node_addname(args);
+
+out_free:
+ kmem_free(args);
+ return rval;
+}
+
+/*
+ * If this is a CI lookup that found a case-insensitive match, copy the
+ * actual on-disk name into args->value.  Return -EEXIST for success
+ * (i.e. the name was found) or an error.
+ */
+int
+xfs_dir_cilookup_result(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ if (args->cmpresult == XFS_CMP_DIFFERENT)
+ return -ENOENT;
+ if (args->cmpresult != XFS_CMP_CASE ||
+ !(args->op_flags & XFS_DA_OP_CILOOKUP))
+ return -EEXIST;
+
+ args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
+ if (!args->value)
+ return -ENOMEM;
+
+ memcpy(args->value, name, len);
+ args->valuelen = len;
+ return -EEXIST;
+}
+
+/*
+ * Look up a name in a directory and give back the inode number.
+ * If ci_name is not NULL, return the actual on-disk name in ci_name when
+ * it differs from name; for an exact match ci_name->name is set to NULL.
+ */
+
+int
+xfs_dir_lookup(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ xfs_ino_t *inum, /* out: inode number */
+ struct xfs_name *ci_name) /* out: actual name if CI match */
+{
+ struct xfs_da_args *args;
+ int rval;
+ bool v;
+ int lock_mode;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
+
+ /*
+ * We need to use KM_NOFS here so that lockdep will not throw false
+ * positive deadlock warnings on a non-transactional lookup path. It is
+	 * safe to recurse into inode reclaim in that case, but lockdep can't
+	 * easily be taught about it.  Hence using KM_NOFS avoids having to add
+	 * a bunch of lockdep class annotations into the reclaim path for the
+	 * ilock.
+ */
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
+ args->dp = dp;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+ args->op_flags = XFS_DA_OP_OKNOENT;
+ if (ci_name)
+ args->op_flags |= XFS_DA_OP_CILOOKUP;
+
+ lock_mode = xfs_ilock_data_map_shared(dp);
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_lookup(args);
+ goto out_check_rval;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_lookup(args);
+ goto out_check_rval;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_lookup(args);
+ else
+ rval = xfs_dir2_node_lookup(args);
+
+out_check_rval:
+ if (rval == -EEXIST)
+ rval = 0;
+ if (!rval) {
+ *inum = args->inumber;
+ if (ci_name) {
+ ci_name->name = args->value;
+ ci_name->len = args->valuelen;
+ }
+ }
+out_free:
+ xfs_iunlock(dp, lock_mode);
+ kmem_free(args);
+ return rval;
+}
+
+/*
+ * Remove an entry from a directory.
+ */
+int
+xfs_dir_removename(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ struct xfs_name *name,
+ xfs_ino_t ino,
+ xfs_extlen_t total) /* bmap's total block count */
+{
+ struct xfs_da_args *args;
+ int rval;
+ bool v;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ XFS_STATS_INC(dp->i_mount, xs_dir_remove);
+
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ if (!args)
+ return -ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
+ args->inumber = ino;
+ args->dp = dp;
+ args->total = total;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_removename(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_removename(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_removename(args);
+ else
+ rval = xfs_dir2_node_removename(args);
+out_free:
+ kmem_free(args);
+ return rval;
+}
+
+/*
+ * Replace the inode number of a directory entry.
+ */
+int
+xfs_dir_replace(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ const struct xfs_name *name, /* name of entry to replace */
+ xfs_ino_t inum, /* new inode number */
+ xfs_extlen_t total) /* bmap's total block count */
+{
+ struct xfs_da_args *args;
+ int rval;
+ bool v;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+ rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+ if (rval)
+ return rval;
+
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ if (!args)
+ return -ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
+ args->inumber = inum;
+ args->dp = dp;
+ args->total = total;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_replace(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_replace(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_replace(args);
+ else
+ rval = xfs_dir2_node_replace(args);
+out_free:
+ kmem_free(args);
+ return rval;
+}
+
+/*
+ * See if this entry can be added to the directory without allocating space.
+ */
+int
+xfs_dir_canenter(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ struct xfs_name *name) /* name of entry to add */
+{
+ return xfs_dir_createname(tp, dp, name, 0, 0);
+}
+
+/*
+ * Utility routines.
+ */
+
+/*
+ * Add a block to the directory.
+ *
+ * This routine is for data and free blocks, not leaf/node blocks, which are
+ * handled by xfs_da_grow_inode.
+ */
+int
+xfs_dir2_grow_inode(
+ struct xfs_da_args *args,
+ int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */
+ xfs_dir2_db_t *dbp) /* out: block number added */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ xfs_fileoff_t bno; /* directory offset of new block */
+ int count; /* count of filesystem blocks */
+ int error;
+
+ trace_xfs_dir2_grow_inode(args, space);
+
+ /*
+ * Set lowest possible block in the space requested.
+ */
+ bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
+ count = args->geo->fsbcount;
+
+ error = xfs_da_grow_inode_int(args, &bno, count);
+ if (error)
+ return error;
+
+ *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno);
+
+ /*
+ * Update file's size if this is the data space and it grew.
+ */
+ if (space == XFS_DIR2_DATA_SPACE) {
+ xfs_fsize_t size; /* directory file (data) size */
+
+ size = XFS_FSB_TO_B(mp, bno + count);
+ if (size > dp->i_disk_size) {
+ dp->i_disk_size = size;
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+ }
+ }
+ return 0;
+}
+
+/*
+ * See if the directory is a single-block form directory.
+ */
+int
+xfs_dir2_isblock(
+ struct xfs_da_args *args,
+ bool *isblock)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+ xfs_fileoff_t eof;
+ int error;
+
+ error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ *isblock = false;
+ if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize)
+ return 0;
+
+ *isblock = true;
+ if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize))
+ return -EFSCORRUPTED;
+ return 0;
+}
+
+/*
+ * See if the directory is a single-leaf form directory.
+ */
+int
+xfs_dir2_isleaf(
+ struct xfs_da_args *args,
+ bool *isleaf)
+{
+ xfs_fileoff_t eof;
+ int error;
+
+ error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ *isleaf = false;
+ if (eof != args->geo->leafblk + args->geo->fsbcount)
+ return 0;
+
+ *isleaf = true;
+ return 0;
+}
+
+/*
+ * Remove the given block from the directory.
+ * This routine is used for data and free blocks; leaf/node blocks are
+ * handled by xfs_da_shrink_inode.
+ */
+int
+xfs_dir2_shrink_inode(
+ struct xfs_da_args *args,
+ xfs_dir2_db_t db,
+ struct xfs_buf *bp)
+{
+ xfs_fileoff_t bno; /* directory file offset */
+ xfs_dablk_t da; /* directory file offset */
+ int done; /* bunmap is finished */
+ struct xfs_inode *dp;
+ int error;
+ struct xfs_mount *mp;
+ struct xfs_trans *tp;
+
+ trace_xfs_dir2_shrink_inode(args, db);
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ tp = args->trans;
+ da = xfs_dir2_db_to_da(args->geo, db);
+
+ /* Unmap the fsblock(s). */
+ error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0, &done);
+ if (error) {
+ /*
+ * ENOSPC actually can happen if we're in a removename with no
+ * space reservation, and the resulting block removal would
+ * cause a bmap btree split or conversion from extents to btree.
+ * This can only happen for un-fragmented directory blocks,
+ * since you need to be punching out the middle of an extent.
+ * In this case we need to leave the block in the file, and not
+ * binval it. So the block has to be in a consistent empty
+ * state and appropriately logged. We don't free up the buffer,
+ * the caller can tell it hasn't happened since it got an error
+ * back.
+ */
+ return error;
+ }
+ ASSERT(done);
+ /*
+ * Invalidate the buffer from the transaction.
+ */
+ xfs_trans_binval(tp, bp);
+ /*
+ * If it's not a data block, we're done.
+ */
+ if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET))
+ return 0;
+ /*
+ * If the block isn't the last one in the directory, we're done.
+ */
+ if (dp->i_disk_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
+ return 0;
+ bno = da;
+ if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
+ /*
+ * This can't really happen unless there's kernel corruption.
+ */
+ return error;
+ }
+ if (db == args->geo->datablk)
+ ASSERT(bno == 0);
+ else
+ ASSERT(bno > 0);
+ /*
+ * Set the size to the new last block.
+ */
+ dp->i_disk_size = XFS_FSB_TO_B(mp, bno);
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+ return 0;
+}
+
+/* Returns true if the directory entry name is valid. */
+bool
+xfs_dir2_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any slashes or nulls here */
+ return !memchr(name, '/', length) && !memchr(name, 0, length);
+}
+
+xfs_dahash_t
+xfs_dir2_hashname(
+ struct xfs_mount *mp,
+ const struct xfs_name *name)
+{
+ if (unlikely(xfs_has_asciici(mp)))
+ return xfs_ascii_ci_hashname(name);
+ return xfs_da_hashname(name->name, name->len);
+}
+
+enum xfs_dacmp
+xfs_dir2_compname(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ if (unlikely(xfs_has_asciici(args->dp->i_mount)))
+ return xfs_ascii_ci_compname(args, name, len);
+ return xfs_da_compname(args, name, len);
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
new file mode 100644
index 000000000..dd39f17dd
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -0,0 +1,251 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_DIR2_H__
+#define __XFS_DIR2_H__
+
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+
+struct xfs_da_args;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dir2_sf_hdr;
+struct xfs_dir2_sf_entry;
+struct xfs_dir2_data_hdr;
+struct xfs_dir2_data_entry;
+struct xfs_dir2_data_unused;
+struct xfs_dir3_icfree_hdr;
+struct xfs_dir3_icleaf_hdr;
+
+extern const struct xfs_name xfs_name_dotdot;
+
+/*
+ * Convert inode mode to directory entry filetype
+ */
+extern unsigned char xfs_mode_to_ftype(int mode);
+
+/*
+ * Generic directory interface routines
+ */
+extern void xfs_dir_startup(void);
+extern int xfs_da_mount(struct xfs_mount *mp);
+extern void xfs_da_unmount(struct xfs_mount *mp);
+
+extern int xfs_dir_isempty(struct xfs_inode *dp);
+extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_inode *pdp);
+extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
+ const struct xfs_name *name, xfs_ino_t inum,
+ xfs_extlen_t tot);
+extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
+ const struct xfs_name *name, xfs_ino_t *inum,
+ struct xfs_name *ci_name);
+extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_name *name, xfs_ino_t ino,
+ xfs_extlen_t tot);
+extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
+ const struct xfs_name *name, xfs_ino_t inum,
+ xfs_extlen_t tot);
+extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_name *name);
+
+/*
+ * Direct call from the bmap code, bypassing the generic directory layer.
+ */
+extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
+
+/*
+ * Interface routines used by userspace utilities
+ */
+extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock);
+extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf);
+extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
+ struct xfs_buf *bp);
+
+extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr, int *loghead);
+extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
+extern void xfs_dir2_data_log_header(struct xfs_da_args *args,
+ struct xfs_buf *bp);
+extern void xfs_dir2_data_log_unused(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_unused *dup);
+extern void xfs_dir2_data_make_free(struct xfs_da_args *args,
+ struct xfs_buf *bp, xfs_dir2_data_aoff_t offset,
+ xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
+extern int xfs_dir2_data_use_free(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_unused *dup,
+ xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
+ int *needlogp, int *needscanp);
+
+extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
+ struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
+ struct xfs_dir2_data_unused *dup);
+
+extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+
+extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
+
+/*
+ * Directory offset/block conversion functions.
+ *
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */
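+
+/*
+ * A dataptr counts 8-byte (XFS_DIR2_DATA_ALIGN) units from the start of the
+ * directory's logical byte address space.  For example, with 4k directory
+ * blocks a dataptr of 0x201 names byte 0x1008, i.e. logical block 1, offset 8.
+ */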
+
+/*
+ * Convert dataptr to byte in file space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
+{
+ return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
+}
+
+/*
+ * Convert byte in file space to dataptr. It had better be aligned.
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
+{
+ return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
+}
+
+/*
+ * Convert byte in space to (DB) block
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_db_t)(by >> geo->blklog);
+}
+
+/*
+ * Convert dataptr to a block number
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert byte in space to offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
+}
+
+/*
+ * Convert dataptr to a byte offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert block and offset to byte in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return ((xfs_dir2_off_t)db << geo->blklog) + o;
+}
+
+/*
+ * Convert block (DB) to block (dablk)
+ */
+static inline xfs_dablk_t
+xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert byte in space to (DA) block
+ */
+static inline xfs_dablk_t
+xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
+}
+
+/*
+ * Convert block and offset to dataptr
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
+}
+
+/*
+ * Convert block (dablk) to block (DB)
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+ return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert block (dablk) to byte offset in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+ return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
+}
+
+/*
+ * Directory tail pointer accessor functions. Based on block geometry.
+ */
+static inline struct xfs_dir2_block_tail *
+xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
+{
+ return ((struct xfs_dir2_block_tail *)
+ ((char *)hdr + geo->blksize)) - 1;
+}
+
+static inline struct xfs_dir2_leaf_tail *
+xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
+{
+ return (struct xfs_dir2_leaf_tail *)
+ ((char *)lp + geo->blksize -
+ sizeof(struct xfs_dir2_leaf_tail));
+}
+
+/*
+ * The Linux API doesn't pass the total size of the buffer we read into
+ * down to the filesystem.  With the filldir concept it's not needed for
+ * correctness, but the XFS dir2 leaf code wants an estimate of the
+ * buffer size to calculate its readahead window and to size the buffers
+ * used for mapping to physical blocks.
+ *
+ * Try to give it an estimate that's good enough, maybe at some
+ * point we can change the ->readdir prototype to include the
+ * buffer size. For now we use the current glibc buffer size.
+ * musl libc hardcodes 2k and dietlibc uses PAGE_SIZE.
+ */
+#define XFS_READDIR_BUFSIZE (32768)
+
+unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
+unsigned int xfs_dir3_data_end_offset(struct xfs_da_geometry *geo,
+ struct xfs_dir2_data_hdr *hdr);
+bool xfs_dir2_namecheck(const void *name, size_t length);
+
+#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
new file mode 100644
index 000000000..00f960a70
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -0,0 +1,1275 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_buf_item.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+
+/*
+ * Local function prototypes.
+ */
+static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp,
+ int first, int last);
+static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp);
+static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp,
+ int *entno);
+static int xfs_dir2_block_sort(const void *a, const void *b);
+
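+/* Hash values of "." and "..", computed once at startup by xfs_dir_startup(). */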
+static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
+
+/*
+ * One-time startup routine called from xfs_init().
+ */
+void
+xfs_dir_startup(void)
+{
+ xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
+ xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
+}
+
+static xfs_failaddr_t
+xfs_dir3_block_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
+ if (xfs_has_crc(mp)) {
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return __this_address;
+ }
+ return __xfs_dir3_data_check(NULL, bp);
+}
+
+static void
+xfs_dir3_block_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (xfs_has_crc(mp) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_dir3_block_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+static void
+xfs_dir3_block_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_dir3_block_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+ .name = "xfs_dir3_block",
+ .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC),
+ cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) },
+ .verify_read = xfs_dir3_block_read_verify,
+ .verify_write = xfs_dir3_block_write_verify,
+ .verify_struct = xfs_dir3_block_verify,
+};
+
+static xfs_failaddr_t
+xfs_dir3_block_header_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (be64_to_cpu(hdr3->owner) != dp->i_ino)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+int
+xfs_dir3_block_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ xfs_failaddr_t fa;
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, 0, bpp,
+ XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
+ if (err || !*bpp)
+ return err;
+
+ /* Check things that we can't do in the verifier. */
+ fa = xfs_dir3_block_header_check(dp, *bpp);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+ return err;
+}
+
+static void
+xfs_dir3_block_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *dp)
+{
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ bp->b_ops = &xfs_dir3_block_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
+
+ if (xfs_has_crc(mp)) {
+ memset(hdr3, 0, sizeof(*hdr3));
+ hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ hdr3->owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
+ return;
+
+ }
+ hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+}
+
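+/*
+ * Figure out where a new entry of length "len" can be placed in the block.
+ * On return, *dupp points at the unused space to carve the entry from (NULL
+ * if there is no room), *enddupp at the unused space at the end of the block
+ * that will hold the new leaf slot when no stale leaf entries exist, and
+ * *compact is set if the leaf entries must be compacted first.
+ */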
+static void
+xfs_dir2_block_need_space(
+ struct xfs_inode *dp,
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_block_tail *btp,
+ struct xfs_dir2_leaf_entry *blp,
+ __be16 **tagpp,
+ struct xfs_dir2_data_unused **dupp,
+ struct xfs_dir2_data_unused **enddupp,
+ int *compact,
+ int len)
+{
+ struct xfs_dir2_data_free *bf;
+ __be16 *tagp = NULL;
+ struct xfs_dir2_data_unused *dup = NULL;
+ struct xfs_dir2_data_unused *enddup = NULL;
+
+ *compact = 0;
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+
+ /*
+ * If there are stale entries we'll use one for the leaf.
+ */
+ if (btp->stale) {
+ if (be16_to_cpu(bf[0].length) >= len) {
+ /*
+			 * The biggest entry is big enough, so no compaction is needed.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+ goto out;
+ }
+
+ /*
+ * Will need to compact to make this work.
+ * Tag just before the first leaf entry.
+ */
+ *compact = 1;
+ tagp = (__be16 *)blp - 1;
+
+ /* Data object just before the first leaf entry. */
+ dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+ /*
+ * If it's not free then the data will go where the
+ * leaf data starts now, if it works at all.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
+ (uint)sizeof(*blp) < len)
+ dup = NULL;
+ } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
+ dup = NULL;
+ else
+ dup = (xfs_dir2_data_unused_t *)blp;
+ goto out;
+ }
+
+ /*
+	 * No stale entries, so just use free space.
+ * Tag just before the first leaf entry.
+ */
+ tagp = (__be16 *)blp - 1;
+
+ /* Data object just before the first leaf entry. */
+ enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+ /*
+	 * If it's not free then we can't do this add without cleaning up:
+ * the space before the first leaf entry needs to be free so it
+ * can be expanded to hold the pointer to the new entry.
+ */
+ if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ /*
+ * Check out the biggest freespace and see if it's the same one.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+ if (dup != enddup) {
+ /*
+ * Not the same free entry, just check its length.
+ */
+ if (be16_to_cpu(dup->length) < len)
+ dup = NULL;
+ goto out;
+ }
+
+ /*
+		 * It is the biggest freespace; is it too small to hold both
+		 * the new entry and its leaf slot?
+ */
+ if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
+ /*
+ * Yes, use the second-largest entry instead if it works.
+ */
+ if (be16_to_cpu(bf[1].length) >= len)
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[1].offset));
+ else
+ dup = NULL;
+ }
+ }
+out:
+ *tagpp = tagp;
+ *dupp = dup;
+ *enddupp = enddup;
+}
+
+/*
+ * Compact the leaf entries.
+ * Leave the highest-numbered stale entry stale.
+ * XXX should be the one closest to mid but mid is not yet computed.
+ */
+static void
+xfs_dir2_block_compact(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_block_tail *btp,
+ struct xfs_dir2_leaf_entry *blp,
+ int *needlog,
+ int *lfloghigh,
+ int *lfloglow)
+{
+ int fromidx; /* source leaf index */
+ int toidx; /* target leaf index */
+ int needscan = 0;
+ int highstale; /* high stale index */
+
+ fromidx = toidx = be32_to_cpu(btp->count) - 1;
+ highstale = *lfloghigh = -1;
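+	/*
+	 * Walk from the highest-numbered entry down, copying live entries over
+	 * the stale ones; the first stale entry seen (the highest-numbered one)
+	 * is kept, all others are squeezed out.
+	 */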
+ for (; fromidx >= 0; fromidx--) {
+ if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+ if (highstale == -1)
+ highstale = toidx;
+ else {
+ if (*lfloghigh == -1)
+ *lfloghigh = toidx;
+ continue;
+ }
+ }
+ if (fromidx < toidx)
+ blp[toidx] = blp[fromidx];
+ toidx--;
+ }
+ *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
+ *lfloghigh -= be32_to_cpu(btp->stale) - 1;
+ be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
+ xfs_dir2_data_make_free(args, bp,
+ (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+ (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
+ needlog, &needscan);
+ btp->stale = cpu_to_be32(1);
+ /*
+ * If we now need to rebuild the bestfree map, do so.
+ * This needs to happen before the next call to use_free.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(args->dp->i_mount, hdr, needlog);
+}
+
+/*
+ * Add an entry to a block directory.
+ */
+int /* error */
+xfs_dir2_block_addname(
+ xfs_da_args_t *args) /* directory op arguments */
+{
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ struct xfs_buf *bp; /* buffer for block */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ int compact; /* need to compact leaf ents */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_inode_t *dp; /* directory inode */
+ xfs_dir2_data_unused_t *dup; /* block unused entry */
+ int error; /* error return value */
+	xfs_dir2_data_unused_t *enddup = NULL; /* unused at end of data */
+ xfs_dahash_t hash; /* hash value of found entry */
+ int high; /* high index for binary srch */
+ int highstale; /* high stale index */
+	int			lfloghigh = 0;	/* last final leaf to log */
+	int			lfloglow = 0;	/* first final leaf to log */
+ int len; /* length of the new entry */
+ int low; /* low index for binary srch */
+ int lowstale; /* low stale index */
+	int			mid = 0;	/* midpoint for binary srch */
+ int needlog; /* need to log header */
+ int needscan; /* need to rescan freespace */
+ __be16 *tagp; /* pointer to tag value */
+ xfs_trans_t *tp; /* transaction structure */
+
+ trace_xfs_dir2_block_addname(args);
+
+ dp = args->dp;
+ tp = args->trans;
+
+ /* Read the (one and only) directory block into bp. */
+ error = xfs_dir3_block_read(tp, dp, &bp);
+ if (error)
+ return error;
+
+ len = xfs_dir2_data_entsize(dp->i_mount, args->namelen);
+
+ /*
+ * Set up pointers to parts of the block.
+ */
+ hdr = bp->b_addr;
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+
+ /*
+ * Find out if we can reuse stale entries or whether we need extra
+ * space for entry and new leaf.
+ */
+ xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup,
+ &enddup, &compact, len);
+
+ /*
+ * Done everything we need for a space check now.
+ */
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+ xfs_trans_brelse(tp, bp);
+ if (!dup)
+ return -ENOSPC;
+ return 0;
+ }
+
+ /*
+ * If we don't have space for the new entry & leaf ...
+ */
+ if (!dup) {
+ /* Don't have a space reservation: return no-space. */
+ if (args->total == 0)
+ return -ENOSPC;
+ /*
+ * Convert to the next larger format.
+ * Then add the new entry in that format.
+ */
+ error = xfs_dir2_block_to_leaf(args, bp);
+ if (error)
+ return error;
+ return xfs_dir2_leaf_addname(args);
+ }
+
+ needlog = needscan = 0;
+
+ /*
+ * If need to compact the leaf entries, do it now.
+ */
+ if (compact) {
+ xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog,
+ &lfloghigh, &lfloglow);
+ /* recalculate blp post-compaction */
+ blp = xfs_dir2_block_leaf_p(btp);
+ } else if (btp->stale) {
+ /*
+ * Set leaf logging boundaries to impossible state.
+ * For the no-stale case they're set explicitly.
+ */
+ lfloglow = be32_to_cpu(btp->count);
+ lfloghigh = -1;
+ }
+
+ /*
+ * Find the slot that's first lower than our hash value, -1 if none.
+ */
+ for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) {
+ mid = (low + high) >> 1;
+ if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+ break;
+ if (hash < args->hashval)
+ low = mid + 1;
+ else
+ high = mid - 1;
+ }
+ while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) {
+ mid--;
+ }
+ /*
+ * No stale entries, will use enddup space to hold new leaf.
+ */
+ if (!btp->stale) {
+ xfs_dir2_data_aoff_t aoff;
+
+ /*
+ * Mark the space needed for the new leaf entry, now in use.
+ */
+ aoff = (xfs_dir2_data_aoff_t)((char *)enddup - (char *)hdr +
+ be16_to_cpu(enddup->length) - sizeof(*blp));
+ error = xfs_dir2_data_use_free(args, bp, enddup, aoff,
+ (xfs_dir2_data_aoff_t)sizeof(*blp), &needlog,
+ &needscan);
+ if (error)
+ return error;
+
+ /*
+ * Update the tail (entry count).
+ */
+ be32_add_cpu(&btp->count, 1);
+ /*
+ * If we now need to rebuild the bestfree map, do so.
+ * This needs to happen before the next call to use_free.
+ */
+ if (needscan) {
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ needscan = 0;
+ }
+ /*
+ * Adjust pointer to the first leaf entry, we're about to move
+ * the table up one to open up space for the new leaf entry.
+ * Then adjust our index to match.
+ */
+ blp--;
+ mid++;
+ if (mid)
+ memmove(blp, &blp[1], mid * sizeof(*blp));
+ lfloglow = 0;
+ lfloghigh = mid;
+ }
+ /*
+ * Use a stale leaf for our new entry.
+ */
+ else {
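+		/*
+		 * Find the stale leaf slots nearest the insertion point on
+		 * either side, then shift entries toward whichever one is
+		 * closer so the memmove is as small as possible.
+		 */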
+ for (lowstale = mid;
+ lowstale >= 0 &&
+ blp[lowstale].address !=
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+ lowstale--)
+ continue;
+ for (highstale = mid + 1;
+ highstale < be32_to_cpu(btp->count) &&
+ blp[highstale].address !=
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR) &&
+ (lowstale < 0 || mid - lowstale > highstale - mid);
+ highstale++)
+ continue;
+ /*
+ * Move entries toward the low-numbered stale entry.
+ */
+ if (lowstale >= 0 &&
+ (highstale == be32_to_cpu(btp->count) ||
+ mid - lowstale <= highstale - mid)) {
+ if (mid - lowstale)
+ memmove(&blp[lowstale], &blp[lowstale + 1],
+ (mid - lowstale) * sizeof(*blp));
+ lfloglow = min(lowstale, lfloglow);
+ lfloghigh = max(mid, lfloghigh);
+ }
+ /*
+ * Move entries toward the high-numbered stale entry.
+ */
+ else {
+ ASSERT(highstale < be32_to_cpu(btp->count));
+ mid++;
+ if (highstale - mid)
+ memmove(&blp[mid + 1], &blp[mid],
+ (highstale - mid) * sizeof(*blp));
+ lfloglow = min(mid, lfloglow);
+ lfloghigh = max(highstale, lfloghigh);
+ }
+ be32_add_cpu(&btp->stale, -1);
+ }
+ /*
+ * Point to the new data entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)dup;
+ /*
+ * Fill in the leaf entry.
+ */
+ blp[mid].hashval = cpu_to_be32(args->hashval);
+ blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+ (char *)dep - (char *)hdr));
+ xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
+ /*
+ * Mark space for the data entry used.
+ */
+ error = xfs_dir2_data_use_free(args, bp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+ (xfs_dir2_data_aoff_t)len, &needlog, &needscan);
+ if (error)
+ return error;
+ /*
+ * Create the new data entry.
+ */
+ dep->inumber = cpu_to_be64(args->inumber);
+ dep->namelen = args->namelen;
+ memcpy(dep->name, args->name, args->namelen);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ /*
+ * Clean up the bestfree array and log the header, tail, and entry.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(args, bp);
+ xfs_dir2_block_log_tail(tp, bp);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ xfs_dir3_data_check(dp, bp);
+ return 0;
+}
+
+/*
+ * Log leaf entries from the block.
+ */
+static void
+xfs_dir2_block_log_leaf(
+ xfs_trans_t *tp, /* transaction structure */
+ struct xfs_buf *bp, /* block buffer */
+ int first, /* index of first logged leaf */
+ int last) /* index of last logged leaf */
+{
+ xfs_dir2_data_hdr_t *hdr = bp->b_addr;
+ xfs_dir2_leaf_entry_t *blp;
+ xfs_dir2_block_tail_t *btp;
+
+ btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+ xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr),
+ (uint)((char *)&blp[last + 1] - (char *)hdr - 1));
+}
+
+/*
+ * Log the block tail.
+ */
+static void
+xfs_dir2_block_log_tail(
+ xfs_trans_t *tp, /* transaction structure */
+ struct xfs_buf *bp) /* block buffer */
+{
+ xfs_dir2_data_hdr_t *hdr = bp->b_addr;
+ xfs_dir2_block_tail_t *btp;
+
+ btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
+ xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr),
+ (uint)((char *)(btp + 1) - (char *)hdr - 1));
+}
+
+/*
+ * Look up an entry in the block. This is the external routine,
+ * xfs_dir2_block_lookup_int does the real work.
+ */
+int /* error */
+xfs_dir2_block_lookup(
+ xfs_da_args_t *args) /* dir lookup arguments */
+{
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ struct xfs_buf *bp; /* block buffer */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_inode_t *dp; /* incore inode */
+ int ent; /* entry index */
+ int error; /* error return value */
+
+ trace_xfs_dir2_block_lookup(args);
+
+ /*
+ * Get the buffer, look up the entry.
+	 * If not found (ENOENT) then return; we hold no buffer.
+ */
+ if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
+ return error;
+ dp = args->dp;
+ hdr = bp->b_addr;
+ xfs_dir3_data_check(dp, bp);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+ /*
+ * Get the offset from the leaf entry, to point to the data.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
+ /*
+ * Fill in inode number, CI name if appropriate, release the block.
+ */
+ args->inumber = be64_to_cpu(dep->inumber);
+ args->filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep);
+ error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+ xfs_trans_brelse(args->trans, bp);
+ return error;
+}
+
+/*
+ * Internal block lookup routine.
+ */
+static int /* error */
+xfs_dir2_block_lookup_int(
+ xfs_da_args_t *args, /* dir lookup arguments */
+ struct xfs_buf **bpp, /* returned block buffer */
+ int *entno) /* returned entry number */
+{
+ xfs_dir2_dataptr_t addr; /* data entry address */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ struct xfs_buf *bp; /* block buffer */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_inode_t *dp; /* incore inode */
+ int error; /* error return value */
+ xfs_dahash_t hash; /* found hash value */
+ int high; /* binary search high index */
+ int low; /* binary search low index */
+ int mid; /* binary search current idx */
+ xfs_trans_t *tp; /* transaction pointer */
+ enum xfs_dacmp cmp; /* comparison result */
+
+ dp = args->dp;
+ tp = args->trans;
+
+ error = xfs_dir3_block_read(tp, dp, &bp);
+ if (error)
+ return error;
+
+ hdr = bp->b_addr;
+ xfs_dir3_data_check(dp, bp);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+ /*
+ * Loop doing a binary search for our hash value.
+ * Find our entry, ENOENT if it's not there.
+ */
+ for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) {
+ ASSERT(low <= high);
+ mid = (low + high) >> 1;
+ if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+ break;
+ if (hash < args->hashval)
+ low = mid + 1;
+ else
+ high = mid - 1;
+ if (low > high) {
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ xfs_trans_brelse(tp, bp);
+ return -ENOENT;
+ }
+ }
+ /*
+ * Back up to the first one with the right hash value.
+ */
+ while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) {
+ mid--;
+ }
+ /*
+ * Now loop forward through all the entries with the
+ * right hash value looking for our name.
+ */
+ do {
+ if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Get pointer to the entry from the leaf.
+ */
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr));
+ /*
+ * Compare name and if it's an exact match, return the index
+ * and buffer. If it's the first case-insensitive match, store
+ * the index and buffer and continue looking for an exact match.
+ */
+ cmp = xfs_dir2_compname(args, dep->name, dep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ args->cmpresult = cmp;
+ *bpp = bp;
+ *entno = mid;
+ if (cmp == XFS_CMP_EXACT)
+ return 0;
+ }
+ } while (++mid < be32_to_cpu(btp->count) &&
+ be32_to_cpu(blp[mid].hashval) == hash);
+
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ /*
+ * Here, we can only be doing a lookup (not a rename or replace).
+ * If a case-insensitive match was found earlier, return success.
+ */
+ if (args->cmpresult == XFS_CMP_CASE)
+ return 0;
+ /*
+ * No match, release the buffer and return ENOENT.
+ */
+ xfs_trans_brelse(tp, bp);
+ return -ENOENT;
+}
+
+/*
+ * Remove an entry from a block format directory.
+ * If that makes the block small enough to fit in shortform, transform it.
+ */
+int /* error */
+xfs_dir2_block_removename(
+ xfs_da_args_t *args) /* directory operation args */
+{
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */
+ struct xfs_buf *bp; /* block buffer */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_inode_t *dp; /* incore inode */
+ int ent; /* block leaf entry index */
+ int error; /* error return value */
+ int needlog; /* need to log block header */
+ int needscan; /* need to fixup bestfree */
+ xfs_dir2_sf_hdr_t sfh; /* shortform header */
+ int size; /* shortform size */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ trace_xfs_dir2_block_removename(args);
+
+ /*
+ * Look up the entry in the block. Gets the buffer and entry index.
+	 * It will always be there; the vnodeops level does a lookup first.
+ */
+ if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+ return error;
+ }
+ dp = args->dp;
+ tp = args->trans;
+ hdr = bp->b_addr;
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+ /*
+ * Point to the data entry using the leaf entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
+ /*
+ * Mark the data entry's space free.
+ */
+ needlog = needscan = 0;
+ xfs_dir2_data_make_free(args, bp,
+ (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog,
+ &needscan);
+ /*
+ * Fix up the block tail.
+ */
+ be32_add_cpu(&btp->stale, 1);
+ xfs_dir2_block_log_tail(tp, bp);
+ /*
+ * Remove the leaf entry by marking it stale.
+ */
+ blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+ xfs_dir2_block_log_leaf(tp, bp, ent, ent);
+ /*
+ * Fix up bestfree, log the header if necessary.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(args, bp);
+ xfs_dir3_data_check(dp, bp);
+ /*
+ * See if the size as a shortform is good enough.
+ */
+ size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+ if (size > xfs_inode_data_fork_size(dp))
+ return 0;
+
+ /*
+ * If it works, do the conversion.
+ */
+ return xfs_dir2_block_to_sf(args, bp, size, &sfh);
+}
+
+/*
+ * Replace an entry in a V2 block directory.
+ * Change the inode number to the new value.
+ */
+int /* error */
+xfs_dir2_block_replace(
+ xfs_da_args_t *args) /* directory operation args */
+{
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ struct xfs_buf *bp; /* block buffer */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_inode_t *dp; /* incore inode */
+ int ent; /* leaf entry index */
+ int error; /* error return value */
+
+ trace_xfs_dir2_block_replace(args);
+
+ /*
+ * Lookup the entry in the directory. Get buffer and entry index.
+ * This will always succeed since the caller has already done a lookup.
+ */
+ if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+ return error;
+ }
+ dp = args->dp;
+ hdr = bp->b_addr;
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+ /*
+ * Point to the data entry we need to change.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
+ ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
+ /*
+ * Change the inode number to the new value.
+ */
+ dep->inumber = cpu_to_be64(args->inumber);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ xfs_dir3_data_check(dp, bp);
+ return 0;
+}
+
+/*
+ * Qsort comparison routine for the block leaf entries.
+ */
+static int /* sort order */
+xfs_dir2_block_sort(
+ const void *a, /* first leaf entry */
+ const void *b) /* second leaf entry */
+{
+ const xfs_dir2_leaf_entry_t *la; /* first leaf entry */
+ const xfs_dir2_leaf_entry_t *lb; /* second leaf entry */
+
+ la = a;
+ lb = b;
+ return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 :
+ (be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0);
+}
+
+/*
+ * Convert a V2 leaf directory to a V2 block directory if possible.
+ */
+int /* error */
+xfs_dir2_leaf_to_block(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *lbp, /* leaf buffer */
+ struct xfs_buf *dbp) /* data buffer */
+{
+ __be16 *bestsp; /* leaf bests table */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_inode_t *dp; /* incore directory inode */
+ xfs_dir2_data_unused_t *dup; /* unused data entry */
+ int error; /* error return value */
+ int from; /* leaf from index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+ xfs_mount_t *mp; /* file system mount point */
+ int needlog; /* need to log data header */
+ int needscan; /* need to scan for bestfree */
+ xfs_dir2_sf_hdr_t sfh; /* shortform header */
+ int size; /* bytes used */
+ __be16 *tagp; /* end of entry (tag) */
+ int to; /* block/leaf to index */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_leaf_to_block(args);
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ leaf = lbp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
+ /*
+ * If there are data blocks other than the first one, take this
+ * opportunity to remove trailing empty data blocks that may have
+ * been left behind during no-space-reservation operations.
+ * These will show up in the leaf bests table.
+ */
+ while (dp->i_disk_size > args->geo->blksize) {
+ int hdrsz;
+
+ hdrsz = args->geo->data_entry_offset;
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
+ args->geo->blksize - hdrsz) {
+ if ((error =
+ xfs_dir2_leaf_trim_data(args, lbp,
+ (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
+ return error;
+ } else
+ return 0;
+ }
+ /*
+ * Read the data block if we don't already have it, give up if it fails.
+ */
+ if (!dbp) {
+ error = xfs_dir3_data_read(tp, dp, args->geo->datablk, 0, &dbp);
+ if (error)
+ return error;
+ }
+ hdr = dbp->b_addr;
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+
+ /*
+ * Size of the "leaf" area in the block.
+ */
+ size = (uint)sizeof(xfs_dir2_block_tail_t) +
+ (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale);
+ /*
+ * Look at the last data entry.
+ */
+ tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1;
+ dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+ /*
+ * If it's not free or is too short we can't do it.
+ */
+ if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG ||
+ be16_to_cpu(dup->length) < size)
+ return 0;
+
+ /*
+ * Start converting it to block form.
+ */
+ xfs_dir3_block_init(mp, tp, dbp, dp);
+
+ needlog = 1;
+ needscan = 0;
+ /*
+ * Use up the space at the end of the block (blp/btp).
+ */
+ error = xfs_dir2_data_use_free(args, dbp, dup,
+ args->geo->blksize - size, size, &needlog, &needscan);
+ if (error)
+ return error;
+ /*
+ * Initialize the block tail.
+ */
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
+ btp->stale = 0;
+ xfs_dir2_block_log_tail(tp, dbp);
+ /*
+ * Initialize the block leaf area. We compact out stale entries.
+ */
+ lep = xfs_dir2_block_leaf_p(btp);
+ for (from = to = 0; from < leafhdr.count; from++) {
+ if (leafhdr.ents[from].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ continue;
+ lep[to++] = leafhdr.ents[from];
+ }
+ ASSERT(to == be32_to_cpu(btp->count));
+ xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
+ /*
+ * Scan the bestfree if we need it and log the data block header.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(args, dbp);
+ /*
+ * Pitch the old leaf block.
+ */
+ error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp);
+ if (error)
+ return error;
+
+ /*
+ * Now see if the resulting block can be shrunken to shortform.
+ */
+ size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+ if (size > xfs_inode_data_fork_size(dp))
+ return 0;
+
+ return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
+}
+
+/*
+ * Convert the shortform directory to block form.
+ */
+int /* error */
+xfs_dir2_sf_to_block(
+ struct xfs_da_args *args)
+{
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
+ struct xfs_da_geometry *geo = args->geo;
+ xfs_dir2_db_t blkno; /* dir-relative block # (0) */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ struct xfs_buf *bp; /* block buffer */
+ xfs_dir2_block_tail_t *btp; /* block tail pointer */
+ xfs_dir2_data_entry_t *dep; /* data entry pointer */
+ int dummy; /* trash */
+ xfs_dir2_data_unused_t *dup; /* unused entry pointer */
+ int endoffset; /* end of data objects */
+ int error; /* error return value */
+ int i; /* index */
+ int needlog; /* need to log block header */
+ int needscan; /* need to scan block freespc */
+ int newoffset; /* offset from current entry */
+ unsigned int offset = geo->data_entry_offset;
+ xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */
+ xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform header */
+ __be16 *tagp; /* end of data entry */
+ struct xfs_name name;
+
+ trace_xfs_dir2_sf_to_block(args);
+
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
+
+ oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
+
+ ASSERT(ifp->if_bytes == dp->i_disk_size);
+ ASSERT(ifp->if_u1.if_data != NULL);
+ ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
+ ASSERT(dp->i_df.if_nextents == 0);
+
+ /*
+ * Copy the directory into a temporary buffer.
+ * Then pitch the incore inode data so we can make extents.
+ */
+ sfp = kmem_alloc(ifp->if_bytes, 0);
+ memcpy(sfp, oldsfp, ifp->if_bytes);
+
+ xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
+ xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK);
+ dp->i_disk_size = 0;
+
+ /*
+ * Add block 0 to the inode.
+ */
+ error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
+ if (error)
+ goto out_free;
+ /*
+ * Initialize the data block, then convert it to block format.
+ */
+ error = xfs_dir3_data_init(args, blkno, &bp);
+ if (error)
+ goto out_free;
+ xfs_dir3_block_init(mp, tp, bp, dp);
+ hdr = bp->b_addr;
+
+ /*
+ * Compute size of block "tail" area.
+ */
+ i = (uint)sizeof(*btp) +
+ (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
+ /*
+ * The whole thing is initialized to free by the init routine.
+ * Say we're using the leaf and tail area.
+ */
+ dup = bp->b_addr + offset;
+ needlog = needscan = 0;
+ error = xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
+ i, &needlog, &needscan);
+ if (error)
+ goto out_free;
+ ASSERT(needscan == 0);
+ /*
+ * Fill in the tail.
+ */
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ btp->count = cpu_to_be32(sfp->count + 2); /* ., .. */
+ btp->stale = 0;
+ blp = xfs_dir2_block_leaf_p(btp);
+ endoffset = (uint)((char *)blp - (char *)hdr);
+ /*
+ * Remove the freespace, we'll manage it.
+ */
+ error = xfs_dir2_data_use_free(args, bp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+ be16_to_cpu(dup->length), &needlog, &needscan);
+ if (error)
+ goto out_free;
+
+ /*
+ * Create entry for .
+ */
+ dep = bp->b_addr + offset;
+ dep->inumber = cpu_to_be64(dp->i_ino);
+ dep->namelen = 1;
+ dep->name[0] = '.';
+ xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
+ tagp = xfs_dir2_data_entry_tag_p(mp, dep);
+ *tagp = cpu_to_be16(offset);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
+ blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset));
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
+
+ /*
+ * Create entry for ..
+ */
+ dep = bp->b_addr + offset;
+ dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
+ dep->namelen = 2;
+ dep->name[0] = dep->name[1] = '.';
+ xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
+ tagp = xfs_dir2_data_entry_tag_p(mp, dep);
+ *tagp = cpu_to_be16(offset);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
+ blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset));
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
+
+ /*
+ * Loop over existing entries, stuff them in.
+ */
+ i = 0;
+ if (!sfp->count)
+ sfep = NULL;
+ else
+ sfep = xfs_dir2_sf_firstentry(sfp);
+
+ /*
+ * Need to preserve the existing offset values in the sf directory.
+ * Insert holes (unused entries) where necessary.
+ */
+ while (offset < endoffset) {
+ /*
+ * sfep is null when we reach the end of the list.
+ */
+ if (sfep == NULL)
+ newoffset = endoffset;
+ else
+ newoffset = xfs_dir2_sf_get_offset(sfep);
+ /*
+ * There should be a hole here, make one.
+ */
+ if (offset < newoffset) {
+ dup = bp->b_addr + offset;
+ dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ dup->length = cpu_to_be16(newoffset - offset);
+ *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(offset);
+ xfs_dir2_data_log_unused(args, bp, dup);
+ xfs_dir2_data_freeinsert(hdr,
+ xfs_dir2_data_bestfree_p(mp, hdr),
+ dup, &dummy);
+ offset += be16_to_cpu(dup->length);
+ continue;
+ }
+ /*
+ * Copy a real entry.
+ */
+ dep = bp->b_addr + newoffset;
+ dep->inumber = cpu_to_be64(xfs_dir2_sf_get_ino(mp, sfp, sfep));
+ dep->namelen = sfep->namelen;
+ xfs_dir2_data_put_ftype(mp, dep,
+ xfs_dir2_sf_get_ftype(mp, sfep));
+ memcpy(dep->name, sfep->name, dep->namelen);
+ tagp = xfs_dir2_data_entry_tag_p(mp, dep);
+ *tagp = cpu_to_be16(newoffset);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ name.name = sfep->name;
+ name.len = sfep->namelen;
+ blp[2 + i].hashval = cpu_to_be32(xfs_dir2_hashname(mp, &name));
+ blp[2 + i].address =
+ cpu_to_be32(xfs_dir2_byte_to_dataptr(newoffset));
+ offset = (int)((char *)(tagp + 1) - (char *)hdr);
+ if (++i == sfp->count)
+ sfep = NULL;
+ else
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
+ }
+ /* Done with the temporary buffer */
+ kmem_free(sfp);
+ /*
+ * Sort the leaf entries by hash value.
+ */
+ xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort);
+ /*
+ * Log the leaf entry area and tail.
+ * Already logged the header in data_init, ignore needlog.
+ */
+ ASSERT(needscan == 0);
+ xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
+ xfs_dir2_block_log_tail(tp, bp);
+ xfs_dir3_data_check(dp, bp);
+ return 0;
+out_free:
+ kmem_free(sfp);
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
new file mode 100644
index 000000000..dbcf58979
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -0,0 +1,1223 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+
+static xfs_failaddr_t xfs_dir2_data_freefind_verify(
+ struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
+ struct xfs_dir2_data_unused *dup,
+ struct xfs_dir2_data_free **bf_ent);
+
+struct xfs_dir2_data_free *
+xfs_dir2_data_bestfree_p(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr)
+{
+ if (xfs_has_crc(mp))
+ return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
+ return hdr->bestfree;
+}
+
+/*
+ * Pointer to an entry's tag word.
+ */
+__be16 *
+xfs_dir2_data_entry_tag_p(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep)
+{
+ return (__be16 *)((char *)dep +
+ xfs_dir2_data_entsize(mp, dep->namelen) - sizeof(__be16));
+}
+
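+/*
+ * Return the file type stored immediately after the name in a data entry,
+ * or XFS_DIR3_FT_UNKNOWN if the ftype feature is disabled or the stored
+ * value is out of range.
+ */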
+uint8_t
+xfs_dir2_data_get_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep)
+{
+ if (xfs_has_ftype(mp)) {
+ uint8_t ftype = dep->name[dep->namelen];
+
+ if (likely(ftype < XFS_DIR3_FT_MAX))
+ return ftype;
+ }
+
+ return XFS_DIR3_FT_UNKNOWN;
+}
+
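+/*
+ * Store the file type immediately after the name in a data entry, if the
+ * ftype feature is enabled.
+ */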
+void
+xfs_dir2_data_put_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep,
+ uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+ ASSERT(dep->namelen != 0);
+
+ if (xfs_has_ftype(mp))
+ dep->name[dep->namelen] = ftype;
+}
+
+/*
+ * The number of leaf entries is limited by the size of the block and the amount
+ * of space used by the data entries. We don't know how much space is used by
+ * the data entries yet, so just ensure that the count falls somewhere inside
+ * the block right now.
+ */
+static inline unsigned int
+xfs_dir2_data_max_leaf_entries(
+ struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir2_block_tail) -
+ geo->data_entry_offset) /
+ sizeof(struct xfs_dir2_leaf_entry);
+}
+
+/*
+ * Check the consistency of the data block.
+ * The input can also be a block-format directory.
+ * Return NULL if the buffer is good, otherwise the address of the error.
+ */
+xfs_failaddr_t
+__xfs_dir3_data_check(
+ struct xfs_inode *dp, /* incore inode pointer */
+ struct xfs_buf *bp) /* data block's buffer */
+{
+ xfs_dir2_dataptr_t addr; /* addr for leaf lookup */
+ xfs_dir2_data_free_t *bf; /* bestfree table */
+ xfs_dir2_block_tail_t *btp=NULL; /* block tail */
+ int count; /* count of entries found */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
+ xfs_dir2_data_free_t *dfp; /* bestfree entry */
+ int freeseen; /* mask of bestfrees seen */
+ xfs_dahash_t hash; /* hash of current name */
+ int i; /* leaf index */
+ int lastfree; /* last entry was unused */
+ xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */
+ struct xfs_mount *mp = bp->b_mount;
+ int stale; /* count of stale leaves */
+ struct xfs_name name;
+ unsigned int offset;
+ unsigned int end;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+
+ /*
+ * If this isn't a directory, something is seriously wrong. Bail out.
+ */
+ if (dp && !S_ISDIR(VFS_I(dp)->i_mode))
+ return __this_address;
+
+ hdr = bp->b_addr;
+ offset = geo->data_entry_offset;
+
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+ btp = xfs_dir2_block_tail_p(geo, hdr);
+ lep = xfs_dir2_block_leaf_p(btp);
+
+ if (be32_to_cpu(btp->count) >=
+ xfs_dir2_data_max_leaf_entries(geo))
+ return __this_address;
+ break;
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+ break;
+ default:
+ return __this_address;
+ }
+ end = xfs_dir3_data_end_offset(geo, hdr);
+ if (!end)
+ return __this_address;
+
+ /*
+ * Account for zero bestfree entries.
+ */
+ bf = xfs_dir2_data_bestfree_p(mp, hdr);
+ count = lastfree = freeseen = 0;
+ if (!bf[0].length) {
+ if (bf[0].offset)
+ return __this_address;
+ freeseen |= 1 << 0;
+ }
+ if (!bf[1].length) {
+ if (bf[1].offset)
+ return __this_address;
+ freeseen |= 1 << 1;
+ }
+ if (!bf[2].length) {
+ if (bf[2].offset)
+ return __this_address;
+ freeseen |= 1 << 2;
+ }
+
+ if (be16_to_cpu(bf[0].length) < be16_to_cpu(bf[1].length))
+ return __this_address;
+ if (be16_to_cpu(bf[1].length) < be16_to_cpu(bf[2].length))
+ return __this_address;
+ /*
+ * Loop over the data/unused entries.
+ */
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
+
+ /*
+ * If it's unused, look for the space in the bestfree table.
+ * If we find it, account for that, else make sure it
+ * doesn't need to be there.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ xfs_failaddr_t fa;
+
+ if (lastfree != 0)
+ return __this_address;
+ if (offset + be16_to_cpu(dup->length) > end)
+ return __this_address;
+ if (be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) !=
+ offset)
+ return __this_address;
+ fa = xfs_dir2_data_freefind_verify(hdr, bf, dup, &dfp);
+ if (fa)
+ return fa;
+ if (dfp) {
+ i = (int)(dfp - bf);
+ if ((freeseen & (1 << i)) != 0)
+ return __this_address;
+ freeseen |= 1 << i;
+ } else {
+ if (be16_to_cpu(dup->length) >
+ be16_to_cpu(bf[2].length))
+ return __this_address;
+ }
+ offset += be16_to_cpu(dup->length);
+ lastfree = 1;
+ continue;
+ }
+ /*
+ * It's a real entry. Validate the fields.
+ * If this is a block directory then make sure it's
+ * in the leaf section of the block.
+ * The linear search is crude but this is DEBUG code.
+ */
+ if (dep->namelen == 0)
+ return __this_address;
+ if (!xfs_verify_dir_ino(mp, be64_to_cpu(dep->inumber)))
+ return __this_address;
+ if (offset + xfs_dir2_data_entsize(mp, dep->namelen) > end)
+ return __this_address;
+ if (be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)) != offset)
+ return __this_address;
+ if (xfs_dir2_data_get_ftype(mp, dep) >= XFS_DIR3_FT_MAX)
+ return __this_address;
+ count++;
+ lastfree = 0;
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+ addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ (xfs_dir2_data_aoff_t)
+ ((char *)dep - (char *)hdr));
+ name.name = dep->name;
+ name.len = dep->namelen;
+ hash = xfs_dir2_hashname(mp, &name);
+ for (i = 0; i < be32_to_cpu(btp->count); i++) {
+ if (be32_to_cpu(lep[i].address) == addr &&
+ be32_to_cpu(lep[i].hashval) == hash)
+ break;
+ }
+ if (i >= be32_to_cpu(btp->count))
+ return __this_address;
+ }
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
+ }
+ /*
+ * Need to have seen all the entries and all the bestfree slots.
+ */
+ if (freeseen != 7)
+ return __this_address;
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+ for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
+ if (lep[i].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ stale++;
+ if (i > 0 && be32_to_cpu(lep[i].hashval) <
+ be32_to_cpu(lep[i - 1].hashval))
+ return __this_address;
+ }
+ if (count != be32_to_cpu(btp->count) - be32_to_cpu(btp->stale))
+ return __this_address;
+ if (stale != be32_to_cpu(btp->stale))
+ return __this_address;
+ }
+ return NULL;
+}
+
+#ifdef DEBUG
+void
+xfs_dir3_data_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = __xfs_dir3_data_check(dp, bp);
+ if (!fa)
+ return;
+ xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, dp->i_mount,
+ bp->b_addr, BBTOB(bp->b_length), __FILE__, __LINE__,
+ fa);
+ ASSERT(0);
+}
+#endif
+
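+/*
+ * Structural verifier for data blocks: check the magic number and, on v5
+ * filesystems, the metadata UUID, block number and LSN, then walk the
+ * block contents.
+ */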
+static xfs_failaddr_t
+xfs_dir3_data_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
+ if (xfs_has_crc(mp)) {
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return __this_address;
+ }
+ return __xfs_dir3_data_check(NULL, bp);
+}
+
+/*
+ * Readahead of the first block of the directory when it is opened is completely
+ * oblivious to the format of the directory. Hence we can either get a block
+ * format buffer or a data format buffer on readahead.
+ */
+static void
+xfs_dir3_data_reada_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+ bp->b_ops = &xfs_dir3_block_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ default:
+ xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
+ break;
+ }
+}
+
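+/*
+ * Verify a data block after reading it from disk: check the CRC on v5
+ * filesystems, then run the structural verifier.
+ */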
+static void
+xfs_dir3_data_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (xfs_has_crc(mp) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_dir3_data_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
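+/*
+ * Verify a data block before it is written: run the structural verifier,
+ * then stamp the LSN and recompute the CRC on v5 filesystems.
+ */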
+static void
+xfs_dir3_data_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_dir3_data_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
+ .name = "xfs_dir3_data",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
+ .verify_read = xfs_dir3_data_read_verify,
+ .verify_write = xfs_dir3_data_write_verify,
+ .verify_struct = xfs_dir3_data_verify,
+};
+
+static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
+ .name = "xfs_dir3_data_reada",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
+ .verify_read = xfs_dir3_data_reada_verify,
+ .verify_write = xfs_dir3_data_write_verify,
+};
+
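+/*
+ * Check that a v5 data block is owned by the directory inode we are
+ * reading it for.
+ */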
+static xfs_failaddr_t
+xfs_dir3_data_header_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_data_hdr *hdr3 = bp->b_addr;
+
+ if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
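+/*
+ * Read a directory data block, check that it belongs to this directory,
+ * and set the buffer type for transaction logging.
+ */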
+int
+xfs_dir3_data_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ unsigned int flags,
+ struct xfs_buf **bpp)
+{
+ xfs_failaddr_t fa;
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, bno, flags, bpp, XFS_DATA_FORK,
+ &xfs_dir3_data_buf_ops);
+ if (err || !*bpp)
+ return err;
+
+ /* Check things that we can't do in the verifier. */
+ fa = xfs_dir3_data_header_check(dp, *bpp);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
+ return err;
+}
+
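+/* Start readahead for a directory data block. */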
+int
+xfs_dir3_data_readahead(
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ unsigned int flags)
+{
+ return xfs_da_reada_buf(dp, bno, flags, XFS_DATA_FORK,
+ &xfs_dir3_data_reada_buf_ops);
+}
+
+/*
+ * Find the bestfree entry that exactly coincides with this unused directory
+ * space, or return a verifier error if the bestfree data are inconsistent.
+ */
+static xfs_failaddr_t
+xfs_dir2_data_freefind_verify(
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_data_free *bf,
+ struct xfs_dir2_data_unused *dup,
+ struct xfs_dir2_data_free **bf_ent)
+{
+ struct xfs_dir2_data_free *dfp;
+ xfs_dir2_data_aoff_t off;
+ bool matched = false;
+ bool seenzero = false;
+
+ *bf_ent = NULL;
+ off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+
+ /*
+ * Validate the consistency of the bestfree table: check ordering and
+ * non-overlapping entries, and if we find the entry we're looking for,
+ * it has to match exactly.
+ */
+ for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
+ if (!dfp->offset) {
+ if (dfp->length)
+ return __this_address;
+ seenzero = true;
+ continue;
+ }
+ if (seenzero)
+ return __this_address;
+ if (be16_to_cpu(dfp->offset) == off) {
+ matched = true;
+ if (dfp->length != dup->length)
+ return __this_address;
+ } else if (be16_to_cpu(dfp->offset) > off) {
+ if (off + be16_to_cpu(dup->length) >
+ be16_to_cpu(dfp->offset))
+ return __this_address;
+ } else {
+ if (be16_to_cpu(dfp->offset) +
+ be16_to_cpu(dfp->length) > off)
+ return __this_address;
+ }
+ if (!matched &&
+ be16_to_cpu(dfp->length) < be16_to_cpu(dup->length))
+ return __this_address;
+ if (dfp > &bf[0] &&
+ be16_to_cpu(dfp[-1].length) < be16_to_cpu(dfp[0].length))
+ return __this_address;
+ }
+
+ /* Looks ok so far; now try to match up with a bestfree entry. */
+ *bf_ent = xfs_dir2_data_freefind(hdr, bf, dup);
+ return NULL;
+}
+
+/*
+ * Given a data block and an unused entry from that block,
+ * return the bestfree entry, if any, that corresponds to it.
+ */
+xfs_dir2_data_free_t *
+xfs_dir2_data_freefind(
+ struct xfs_dir2_data_hdr *hdr, /* data block header */
+ struct xfs_dir2_data_free *bf, /* bestfree table pointer */
+ struct xfs_dir2_data_unused *dup) /* unused space */
+{
+ xfs_dir2_data_free_t *dfp; /* bestfree entry */
+ xfs_dir2_data_aoff_t off; /* offset value needed */
+
+ off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+
+ /*
+ * If this is smaller than the smallest bestfree entry,
+ * it can't be there since they're sorted.
+ */
+ if (be16_to_cpu(dup->length) <
+ be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
+ return NULL;
+ /*
+ * Look at the three bestfree entries for our guy.
+ */
+ for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
+ if (!dfp->offset)
+ return NULL;
+ if (be16_to_cpu(dfp->offset) == off)
+ return dfp;
+ }
+ /*
+ * Didn't find it. This only happens if there are duplicate lengths.
+ */
+ return NULL;
+}
+
+/*
+ * Insert an unused-space entry into the bestfree table.
+ */
+xfs_dir2_data_free_t * /* entry inserted */
+xfs_dir2_data_freeinsert(
+ struct xfs_dir2_data_hdr *hdr, /* data block pointer */
+ struct xfs_dir2_data_free *dfp, /* bestfree table pointer */
+ struct xfs_dir2_data_unused *dup, /* unused space */
+ int *loghead) /* log the data header (out) */
+{
+ xfs_dir2_data_free_t new; /* new bestfree entry */
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+ new.length = dup->length;
+ new.offset = cpu_to_be16((char *)dup - (char *)hdr);
+
+ /*
+ * Insert at position 0, 1, or 2; or not at all.
+ */
+ if (be16_to_cpu(new.length) > be16_to_cpu(dfp[0].length)) {
+ dfp[2] = dfp[1];
+ dfp[1] = dfp[0];
+ dfp[0] = new;
+ *loghead = 1;
+ return &dfp[0];
+ }
+ if (be16_to_cpu(new.length) > be16_to_cpu(dfp[1].length)) {
+ dfp[2] = dfp[1];
+ dfp[1] = new;
+ *loghead = 1;
+ return &dfp[1];
+ }
+ if (be16_to_cpu(new.length) > be16_to_cpu(dfp[2].length)) {
+ dfp[2] = new;
+ *loghead = 1;
+ return &dfp[2];
+ }
+ return NULL;
+}
+
+/*
+ * Remove a bestfree entry from the table.
+ */
+STATIC void
+xfs_dir2_data_freeremove(
+ struct xfs_dir2_data_hdr *hdr, /* data block header */
+ struct xfs_dir2_data_free *bf, /* bestfree table pointer */
+ struct xfs_dir2_data_free *dfp, /* bestfree entry pointer */
+ int *loghead) /* out: log data header */
+{
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+ /*
+ * It's the first entry, slide the next 2 up.
+ */
+ if (dfp == &bf[0]) {
+ bf[0] = bf[1];
+ bf[1] = bf[2];
+ }
+ /*
+ * It's the second entry, slide the 3rd entry up.
+ */
+ else if (dfp == &bf[1])
+ bf[1] = bf[2];
+ /*
+ * Must be the last entry.
+ */
+ else
+ ASSERT(dfp == &bf[2]);
+ /*
+ * Clear the 3rd entry, must be zero now.
+ */
+ bf[2].length = 0;
+ bf[2].offset = 0;
+ *loghead = 1;
+}
+
+/*
+ * Given a data block, reconstruct its bestfree map.
+ */
+void
+xfs_dir2_data_freescan(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr,
+ int *loghead)
+{
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ struct xfs_dir2_data_free *bf = xfs_dir2_data_bestfree_p(mp, hdr);
+ void *addr = hdr;
+ unsigned int offset = geo->data_entry_offset;
+ unsigned int end;
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+ /*
+ * Start by clearing the table.
+ */
+ memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
+ *loghead = 1;
+
+ end = xfs_dir3_data_end_offset(geo, addr);
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = addr + offset;
+ struct xfs_dir2_data_entry *dep = addr + offset;
+
+ /*
+ * If it's a free entry, insert it.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ ASSERT(offset ==
+ be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+ xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
+ offset += be16_to_cpu(dup->length);
+ continue;
+ }
+
+ /*
+ * For active entries, check their tags and skip them.
+ */
+ ASSERT(offset ==
+ be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)));
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
+ }
+}
+
+/*
+ * Initialize a data block at the given block number in the directory.
+ * Give back the buffer for the created block.
+ */
+int /* error */
+xfs_dir3_data_init(
+ struct xfs_da_args *args, /* directory operation args */
+ xfs_dir2_db_t blkno, /* logical dir block number */
+ struct xfs_buf **bpp) /* output block buffer */
+{
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_geometry *geo = args->geo;
+ struct xfs_buf *bp;
+ struct xfs_dir2_data_hdr *hdr;
+ struct xfs_dir2_data_unused *dup;
+ struct xfs_dir2_data_free *bf;
+ int error;
+ int i;
+
+ /*
+ * Get the buffer set up for the block.
+ */
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
+ &bp, XFS_DATA_FORK);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF);
+
+ /*
+ * Initialize the header.
+ */
+ hdr = bp->b_addr;
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ memset(hdr3, 0, sizeof(*hdr3));
+ hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ hdr3->owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
+
+ } else
+ hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+
+ bf = xfs_dir2_data_bestfree_p(mp, hdr);
+ bf[0].offset = cpu_to_be16(geo->data_entry_offset);
+ bf[0].length = cpu_to_be16(geo->blksize - geo->data_entry_offset);
+ for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
+ bf[i].length = 0;
+ bf[i].offset = 0;
+ }
+
+ /*
+ * Set up an unused entry for the block's body.
+ */
+ dup = bp->b_addr + geo->data_entry_offset;
+ dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ dup->length = bf[0].length;
+ *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
+
+ /*
+ * Log it and return it.
+ */
+ xfs_dir2_data_log_header(args, bp);
+ xfs_dir2_data_log_unused(args, bp, dup);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Log an active data entry from the block.
+ */
+void
+xfs_dir2_data_log_entry(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
+ xfs_dir2_data_entry_t *dep) /* data entry pointer */
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr),
+ (uint)((char *)(xfs_dir2_data_entry_tag_p(mp, dep) + 1) -
+ (char *)hdr - 1));
+}
+
+/*
+ * Log a data block header.
+ */
+void
+xfs_dir2_data_log_header(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
+{
+#ifdef DEBUG
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+#endif
+
+ xfs_trans_log_buf(args->trans, bp, 0, args->geo->data_entry_offset - 1);
+}
+
+/*
+ * Log a data unused entry.
+ */
+void
+xfs_dir2_data_log_unused(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
+ xfs_dir2_data_unused_t *dup) /* data unused pointer */
+{
+ xfs_dir2_data_hdr_t *hdr = bp->b_addr;
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+ /*
+ * Log the first part of the unused entry.
+ */
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr),
+ (uint)((char *)&dup->length + sizeof(dup->length) -
+ 1 - (char *)hdr));
+ /*
+ * Log the end (tag) of the unused entry.
+ */
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr),
+ (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr +
+ sizeof(xfs_dir2_data_off_t) - 1));
+}
+
+/*
+ * Make a byte range in the data block unused.
+ * Its current contents are unimportant.
+ */
+void
+xfs_dir2_data_make_free(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
+ xfs_dir2_data_aoff_t offset, /* starting byte offset */
+ xfs_dir2_data_aoff_t len, /* length in bytes */
+ int *needlogp, /* out: log header */
+ int *needscanp) /* out: regen bestfree */
+{
+ xfs_dir2_data_hdr_t *hdr; /* data block pointer */
+ xfs_dir2_data_free_t *dfp; /* bestfree pointer */
+ int needscan; /* need to regen bestfree */
+ xfs_dir2_data_unused_t *newdup; /* new unused entry */
+ xfs_dir2_data_unused_t *postdup; /* unused entry after us */
+ xfs_dir2_data_unused_t *prevdup; /* unused entry before us */
+ unsigned int end;
+ struct xfs_dir2_data_free *bf;
+
+ hdr = bp->b_addr;
+
+ /*
+ * Figure out where the end of the data area is.
+ */
+ end = xfs_dir3_data_end_offset(args->geo, hdr);
+ ASSERT(end != 0);
+
+ /*
+ * If this isn't the start of the block, then back up to
+ * the previous entry and see if it's free.
+ */
+ if (offset > args->geo->data_entry_offset) {
+ __be16 *tagp; /* tag just before us */
+
+ tagp = (__be16 *)((char *)hdr + offset) - 1;
+ prevdup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+ if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+ prevdup = NULL;
+ } else
+ prevdup = NULL;
+ /*
+ * If this isn't the end of the block, see if the entry after
+ * us is free.
+ */
+ if (offset + len < end) {
+ postdup =
+ (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+ if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+ postdup = NULL;
+ } else
+ postdup = NULL;
+ ASSERT(*needscanp == 0);
+ needscan = 0;
+ /*
+ * Previous and following entries are both free,
+ * merge everything into a single free entry.
+ */
+ bf = xfs_dir2_data_bestfree_p(args->dp->i_mount, hdr);
+ if (prevdup && postdup) {
+ xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */
+
+ /*
+ * See if prevdup and/or postdup are in bestfree table.
+ */
+ dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+ dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup);
+ /*
+ * We need a rescan unless there are exactly two free entries, namely
+ * our two. In that case we know exactly what's in the table; otherwise
+ * the third bestfree slot is in use and there might be more free
+ * entries that we aren't tracking.
+ */
+ needscan = (bf[2].length != 0);
+ /*
+ * Fix up the new big freespace.
+ */
+ be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
+ *xfs_dir2_data_unused_tag_p(prevdup) =
+ cpu_to_be16((char *)prevdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, prevdup);
+ if (!needscan) {
+ /*
+ * Has to be the case that entries 0 and 1 are
+ * dfp and dfp2 (don't know which is which), and
+ * entry 2 is empty.
+ * Remove entry 1 first then entry 0.
+ */
+ ASSERT(dfp && dfp2);
+ if (dfp == &bf[1]) {
+ dfp = &bf[0];
+ ASSERT(dfp2 == dfp);
+ dfp2 = &bf[1];
+ }
+ xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ /*
+ * Now insert the new entry.
+ */
+ dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup,
+ needlogp);
+ ASSERT(dfp == &bf[0]);
+ ASSERT(dfp->length == prevdup->length);
+ ASSERT(!dfp[1].length);
+ ASSERT(!dfp[2].length);
+ }
+ }
+ /*
+ * The entry before us is free, merge with it.
+ */
+ else if (prevdup) {
+ dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+ be16_add_cpu(&prevdup->length, len);
+ *xfs_dir2_data_unused_tag_p(prevdup) =
+ cpu_to_be16((char *)prevdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, prevdup);
+ /*
+ * If the previous entry was in the table, the new entry
+ * is longer, so it will be in the table too. Remove
+ * the old one and add the new one.
+ */
+ if (dfp) {
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp);
+ }
+ /*
+ * Otherwise we need a scan if the new entry is big enough.
+ */
+ else {
+ needscan = be16_to_cpu(prevdup->length) >
+ be16_to_cpu(bf[2].length);
+ }
+ }
+ /*
+ * The following entry is free, merge with it.
+ */
+ else if (postdup) {
+ dfp = xfs_dir2_data_freefind(hdr, bf, postdup);
+ newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+ newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup);
+ /*
+ * If the following entry was in the table, the new entry
+ * is longer, so it will be in the table too. Remove
+ * the old one and add the new one.
+ */
+ if (dfp) {
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
+ }
+ /*
+ * Otherwise we need a scan if the new entry is big enough.
+ */
+ else {
+ needscan = be16_to_cpu(newdup->length) >
+ be16_to_cpu(bf[2].length);
+ }
+ }
+ /*
+ * Neither neighbor is free. Make a new entry.
+ */
+ else {
+ newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+ newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup->length = cpu_to_be16(len);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
+ }
+ *needscanp = needscan;
+}
+
+/* Check our free data for obvious signs of corruption. */
+static inline xfs_failaddr_t
+xfs_dir2_data_check_free(
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_data_unused *dup,
+ xfs_dir2_data_aoff_t offset,
+ xfs_dir2_data_aoff_t len)
+{
+ if (hdr->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC) &&
+ hdr->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC) &&
+ hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) &&
+ hdr->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
+ return __this_address;
+ if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+ return __this_address;
+ if (offset < (char *)dup - (char *)hdr)
+ return __this_address;
+ if (offset + len > (char *)dup + be16_to_cpu(dup->length) - (char *)hdr)
+ return __this_address;
+ if ((char *)dup - (char *)hdr !=
+ be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)))
+ return __this_address;
+ return NULL;
+}
+
+/* Sanity-check a new bestfree entry. */
+static inline xfs_failaddr_t
+xfs_dir2_data_check_new_free(
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_data_free *dfp,
+ struct xfs_dir2_data_unused *newdup)
+{
+ if (dfp == NULL)
+ return __this_address;
+ if (dfp->length != newdup->length)
+ return __this_address;
+ if (be16_to_cpu(dfp->offset) != (char *)newdup - (char *)hdr)
+ return __this_address;
+ return NULL;
+}
+
+/*
+ * Take a byte range out of an existing unused space and make it un-free.
+ */
+int
+xfs_dir2_data_use_free(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
+ xfs_dir2_data_unused_t *dup, /* unused entry */
+ xfs_dir2_data_aoff_t offset, /* starting offset to use */
+ xfs_dir2_data_aoff_t len, /* length to use */
+ int *needlogp, /* out: need to log header */
+ int *needscanp) /* out: need regen bestfree */
+{
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
+ xfs_dir2_data_free_t *dfp; /* bestfree pointer */
+ xfs_dir2_data_unused_t *newdup; /* new unused entry */
+ xfs_dir2_data_unused_t *newdup2; /* another new unused entry */
+ struct xfs_dir2_data_free *bf;
+ xfs_failaddr_t fa;
+ int matchback; /* matches end of freespace */
+ int matchfront; /* matches start of freespace */
+ int needscan; /* need to regen bestfree */
+ int oldlen; /* old unused entry's length */
+
+ hdr = bp->b_addr;
+ fa = xfs_dir2_data_check_free(hdr, dup, offset, len);
+ if (fa)
+ goto corrupt;
+ /*
+ * Look up the entry in the bestfree table.
+ */
+ oldlen = be16_to_cpu(dup->length);
+ bf = xfs_dir2_data_bestfree_p(args->dp->i_mount, hdr);
+ dfp = xfs_dir2_data_freefind(hdr, bf, dup);
+ ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
+ /*
+ * Check for alignment with front and back of the entry.
+ */
+ matchfront = (char *)dup - (char *)hdr == offset;
+ matchback = (char *)dup + oldlen - (char *)hdr == offset + len;
+ ASSERT(*needscanp == 0);
+ needscan = 0;
+ /*
+ * If we matched it exactly we just need to get rid of it from
+ * the bestfree table.
+ */
+ if (matchfront && matchback) {
+ if (dfp) {
+ needscan = (bf[2].offset != 0);
+ if (!needscan)
+ xfs_dir2_data_freeremove(hdr, bf, dfp,
+ needlogp);
+ }
+ }
+ /*
+ * We match the first part of the entry.
+ * Make a new entry with the remaining freespace.
+ */
+ else if (matchfront) {
+ newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+ newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup->length = cpu_to_be16(oldlen - len);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup);
+ /*
+ * If it was in the table, remove it and add the new one.
+ */
+ if (dfp) {
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+ needlogp);
+ fa = xfs_dir2_data_check_new_free(hdr, dfp, newdup);
+ if (fa)
+ goto corrupt;
+ /*
+ * If the new entry landed in the last slot, we don't know whether
+ * a better candidate for that slot exists elsewhere in the block,
+ * so rescan.
+ */
+ needscan = dfp == &bf[2];
+ }
+ }
+ /*
+ * We match the last part of the entry.
+ * Trim the allocated space off the tail of the entry.
+ */
+ else if (matchback) {
+ newdup = dup;
+ newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup);
+ /*
+ * If it was in the table, remove it and add the new one.
+ */
+ if (dfp) {
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+ needlogp);
+ fa = xfs_dir2_data_check_new_free(hdr, dfp, newdup);
+ if (fa)
+ goto corrupt;
+ /*
+ * If the new entry landed in the last slot, we don't know whether
+ * a better candidate for that slot exists elsewhere in the block,
+ * so rescan.
+ */
+ needscan = dfp == &bf[2];
+ }
+ }
+ /*
+ * Poking out the middle of an entry.
+ * Make two new entries.
+ */
+ else {
+ newdup = dup;
+ newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup);
+ newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+ newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
+ *xfs_dir2_data_unused_tag_p(newdup2) =
+ cpu_to_be16((char *)newdup2 - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup2);
+ /*
+ * If the old entry was in the table, we need a rescan whenever the
+ * third bestfree slot was in use, since the two new entries are
+ * smaller than the old one.
+ * If no rescan is needed, there were only one or two entries in the
+ * table, so removing the old one and adding the two new ones will
+ * work.
+ */
+ if (dfp) {
+ needscan = (bf[2].length != 0);
+ if (!needscan) {
+ xfs_dir2_data_freeremove(hdr, bf, dfp,
+ needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup,
+ needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup2,
+ needlogp);
+ }
+ }
+ }
+ *needscanp = needscan;
+ return 0;
+corrupt:
+ xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, args->dp->i_mount,
+ hdr, sizeof(*hdr), __FILE__, __LINE__, fa);
+ return -EFSCORRUPTED;
+}
+
+/* Find the end of the entry data in a data/block format dir block. */
+unsigned int
+xfs_dir3_data_end_offset(
+ struct xfs_da_geometry *geo,
+ struct xfs_dir2_data_hdr *hdr)
+{
+ void *p;
+
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+ p = xfs_dir2_block_leaf_p(xfs_dir2_block_tail_p(geo, hdr));
+ return p - (void *)hdr;
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+ return geo->blksize;
+ default:
+ return 0;
+ }
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
new file mode 100644
index 000000000..cb9e950a9
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -0,0 +1,1824 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+
+/*
+ * Local function declarations.
+ */
+static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
+ int *indexp, struct xfs_buf **dbpp,
+ struct xfs_dir3_icleaf_hdr *leafhdr);
+static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args,
+ struct xfs_buf *bp, int first, int last);
+static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args,
+ struct xfs_buf *bp);
+
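+/*
+ * Convert an on-disk leaf header into the in-core format, handling both
+ * the dir2 and dir3 (CRC-enabled) layouts.
+ */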
+void
+xfs_dir2_leaf_hdr_from_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *to,
+ struct xfs_dir2_leaf *from)
+{
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_leaf *from3 = (struct xfs_dir3_leaf *)from;
+
+ to->forw = be32_to_cpu(from3->hdr.info.hdr.forw);
+ to->back = be32_to_cpu(from3->hdr.info.hdr.back);
+ to->magic = be16_to_cpu(from3->hdr.info.hdr.magic);
+ to->count = be16_to_cpu(from3->hdr.count);
+ to->stale = be16_to_cpu(from3->hdr.stale);
+ to->ents = from3->__ents;
+
+ ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
+ to->magic == XFS_DIR3_LEAFN_MAGIC);
+ } else {
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.count);
+ to->stale = be16_to_cpu(from->hdr.stale);
+ to->ents = from->__ents;
+
+ ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
+ to->magic == XFS_DIR2_LEAFN_MAGIC);
+ }
+}
+
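+/*
+ * Write an in-core leaf header back out in the appropriate on-disk format.
+ */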
+void
+xfs_dir2_leaf_hdr_to_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from)
+{
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_leaf *to3 = (struct xfs_dir3_leaf *)to;
+
+ ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
+ from->magic == XFS_DIR3_LEAFN_MAGIC);
+
+ to3->hdr.info.hdr.forw = cpu_to_be32(from->forw);
+ to3->hdr.info.hdr.back = cpu_to_be32(from->back);
+ to3->hdr.info.hdr.magic = cpu_to_be16(from->magic);
+ to3->hdr.count = cpu_to_be16(from->count);
+ to3->hdr.stale = cpu_to_be16(from->stale);
+ } else {
+ ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
+ from->magic == XFS_DIR2_LEAFN_MAGIC);
+
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.count = cpu_to_be16(from->count);
+ to->hdr.stale = cpu_to_be16(from->stale);
+ }
+}
+
+/*
+ * Check the internal consistency of a leaf1 block.
+ * Pop an assert if something is wrong.
+ */
+#ifdef DEBUG
+static xfs_failaddr_t
+xfs_dir3_leaf1_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+
+ if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+ if (be64_to_cpu(leaf3->info.blkno) != xfs_buf_daddr(bp))
+ return __this_address;
+ } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
+ return __this_address;
+
+ return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf, false);
+}
+
+static inline void
+xfs_dir3_leaf_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_dir3_leaf1_check(dp, bp);
+ if (!fa)
+ return;
+ xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, dp->i_mount,
+ bp->b_addr, BBTOB(bp->b_length), __FILE__, __LINE__,
+ fa);
+ ASSERT(0);
+}
+#else
+#define xfs_dir3_leaf_check(dp, bp)
+#endif
+
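+/*
+ * Check the internal consistency of a leaf block of either format,
+ * optionally including the more expensive per-entry checks.
+ */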
+xfs_failaddr_t
+xfs_dir3_leaf_check_int(
+ struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *hdr,
+ struct xfs_dir2_leaf *leaf,
+ bool expensive_checking)
+{
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ xfs_dir2_leaf_tail_t *ltp;
+ int stale;
+ int i;
+ bool isleaf1 = (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+ hdr->magic == XFS_DIR3_LEAF1_MAGIC);
+
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+
+ /*
+ * XXX (dgc): This value is not restrictive enough.
+ * Should factor in the size of the bests table as well.
+ * We can deduce a value for that from i_disk_size.
+ */
+ if (hdr->count > geo->leaf_max_ents)
+ return __this_address;
+
+ /* Leaves and bests don't overlap in leaf format. */
+ if (isleaf1 &&
+ (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
+ return __this_address;
+
+ if (!expensive_checking)
+ return NULL;
+
+ /* Check hash value order, count stale entries. */
+ for (i = stale = 0; i < hdr->count; i++) {
+ if (i + 1 < hdr->count) {
+ if (be32_to_cpu(hdr->ents[i].hashval) >
+ be32_to_cpu(hdr->ents[i + 1].hashval))
+ return __this_address;
+ }
+ if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ stale++;
+ if (isleaf1 && xfs_dir2_dataptr_to_db(geo,
+ be32_to_cpu(hdr->ents[i].address)) >=
+ be32_to_cpu(ltp->bestcount))
+ return __this_address;
+ }
+ if (hdr->stale != stale)
+ return __this_address;
+ return NULL;
+}
+
+/*
+ * We verify the magic numbers before decoding the leaf header so that on debug
+ * kernels we don't get assertion failures in xfs_dir2_leaf_hdr_from_disk() due
+ * to incorrect magic numbers.
+ */
+static xfs_failaddr_t
+xfs_dir3_leaf_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
+
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, bp->b_addr);
+ return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr, true);
+}
+
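+/*
+ * Verify a leaf block after reading it from disk: check the CRC on v5
+ * filesystems, then run the structural verifier.
+ */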
+static void
+xfs_dir3_leaf_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (xfs_has_crc(mp) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_dir3_leaf_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
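+/*
+ * Verify a leaf block before it is written: run the structural verifier,
+ * then stamp the LSN and recompute the CRC on v5 filesystems.
+ */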
+static void
+xfs_dir3_leaf_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_dir3_leaf_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
+ .name = "xfs_dir3_leaf1",
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
+};
+
+const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
+ .name = "xfs_dir3_leafn",
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
+};
+
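+/*
+ * Read a leaf1 block and set the buffer type for transaction logging.
+ */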
+int
+xfs_dir3_leaf_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf **bpp)
+{
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK,
+ &xfs_dir3_leaf1_buf_ops);
+ if (!err && tp && *bpp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
+ return err;
+}
+
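+/*
+ * Read a leafn block and set the buffer type for transaction logging.
+ */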
+int
+xfs_dir3_leafn_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf **bpp)
+{
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK,
+ &xfs_dir3_leafn_buf_ops);
+ if (!err && tp && *bpp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
+ return err;
+}
+
+/*
+ * Initialize a new leaf block, leaf1 or leafn magic accepted.
+ */
+static void
+xfs_dir3_leaf_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ xfs_ino_t owner,
+ uint16_t type)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+
+ ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+
+ memset(leaf3, 0, sizeof(*leaf3));
+
+ leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC)
+ ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
+ : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+ leaf3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ leaf3->info.owner = cpu_to_be64(owner);
+ uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid);
+ } else {
+ memset(leaf, 0, sizeof(*leaf));
+ leaf->hdr.info.magic = cpu_to_be16(type);
+ }
+
+ /*
+ * If it's a leaf-format directory initialize the tail.
+ * Caller is responsible for initializing the bests table.
+ */
+ if (type == XFS_DIR2_LEAF1_MAGIC) {
+ struct xfs_dir2_leaf_tail *ltp;
+
+ ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf);
+ ltp->bestcount = 0;
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF);
+ } else {
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+ }
+}
+
+int
+xfs_dir3_leaf_get_buf(
+ xfs_da_args_t *args,
+ xfs_dir2_db_t bno,
+ struct xfs_buf **bpp,
+ uint16_t magic)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) &&
+ bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
+
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno),
+ &bp, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
+ xfs_dir3_leaf_log_header(args, bp);
+ if (magic == XFS_DIR2_LEAF1_MAGIC)
+ xfs_dir3_leaf_log_tail(args, bp);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Convert a block form directory to a leaf form directory.
+ */
+int /* error */
+xfs_dir2_block_to_leaf(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *dbp) /* input block's buffer */
+{
+ __be16 *bestsp; /* leaf's bestsp entries */
+ xfs_dablk_t blkno; /* leaf block's bno */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_leaf_entry_t *blp; /* block's leaf entries */
+ xfs_dir2_block_tail_t *btp; /* block's tail */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ struct xfs_buf *lbp; /* leaf block's buffer */
+ xfs_dir2_db_t ldb; /* leaf block's bno */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */
+ int needlog; /* need to log block header */
+ int needscan; /* need to rescan bestfree */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_data_free *bf;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_block_to_leaf(args);
+
+ dp = args->dp;
+ tp = args->trans;
+ /*
+ * Add the leaf block to the inode.
+ * This interface will only put blocks in the leaf/node range.
+ * Since that's empty now, we'll get the root (block 0 in range).
+ */
+ if ((error = xfs_da_grow_inode(args, &blkno))) {
+ return error;
+ }
+ ldb = xfs_dir2_da_to_db(args->geo, blkno);
+ ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET));
+ /*
+ * Initialize the leaf block, get a buffer for it.
+ */
+ error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC);
+ if (error)
+ return error;
+
+ leaf = lbp->b_addr;
+ hdr = dbp->b_addr;
+ xfs_dir3_data_check(dp, dbp);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+
+ /*
+ * Set the counts in the leaf header.
+ */
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+ leafhdr.count = be32_to_cpu(btp->count);
+ leafhdr.stale = be32_to_cpu(btp->stale);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+
+ /*
+ * Could compact these but I think we always do the conversion
+ * after squeezing out stale entries.
+ */
+ memcpy(leafhdr.ents, blp,
+ be32_to_cpu(btp->count) * sizeof(struct xfs_dir2_leaf_entry));
+ xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, 0, leafhdr.count - 1);
+ needscan = 0;
+ needlog = 1;
+ /*
+ * Make the space formerly occupied by the leaf entries and block
+ * tail be free.
+ */
+ xfs_dir2_data_make_free(args, dbp,
+ (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+ (xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize -
+ (char *)blp),
+ &needlog, &needscan);
+ /*
+ * Fix up the block header, make it a data block.
+ */
+ dbp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF);
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+ hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+ else
+ hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ /*
+ * Set up leaf tail and bests table.
+ */
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ltp->bestcount = cpu_to_be32(1);
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ bestsp[0] = bf[0].length;
+ /*
+ * Log the data header and leaf bests table.
+ */
+ if (needlog)
+ xfs_dir2_data_log_header(args, dbp);
+ xfs_dir3_leaf_check(dp, lbp);
+ xfs_dir3_data_check(dp, dbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, 0);
+ return 0;
+}
+
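+/*
+ * Find the stale leaf entries nearest the insertion index: one strictly
+ * before it and one at or after it.
+ */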
+STATIC void
+xfs_dir3_leaf_find_stale(
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents,
+ int index,
+ int *lowstale,
+ int *highstale)
+{
+ /*
+ * Find the first stale entry before our index, if any.
+ */
+ for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) {
+ if (ents[*lowstale].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ break;
+ }
+
+ /*
+ * Find the first stale entry at or after our index, if any.
+ * Stop if the result would require moving more entries than using
+ * lowstale.
+ */
+ for (*highstale = index; *highstale < leafhdr->count; ++*highstale) {
+ if (ents[*highstale].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ break;
+ if (*lowstale >= 0 && index - *lowstale <= *highstale - index)
+ break;
+ }
+}
+
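+/*
+ * Find the slot for a new leaf entry at the given index, either by opening
+ * a hole in the table or by reusing the nearest stale entry.
+ */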
+struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_find_entry(
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents,
+ int index, /* leaf table position */
+ int compact, /* need to compact leaves */
+ int lowstale, /* index of prev stale leaf */
+ int highstale, /* index of next stale leaf */
+ int *lfloglow, /* low leaf logging index */
+ int *lfloghigh) /* high leaf logging index */
+{
+ if (!leafhdr->stale) {
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
+
+ /*
+ * Now we need to make room to insert the leaf entry.
+ *
+ * If there are no stale entries, just insert a hole at index.
+ */
+ lep = &ents[index];
+ if (index < leafhdr->count)
+ memmove(lep + 1, lep,
+ (leafhdr->count - index) * sizeof(*lep));
+
+ /*
+ * Record low and high logging indices for the leaf.
+ */
+ *lfloglow = index;
+ *lfloghigh = leafhdr->count++;
+ return lep;
+ }
+
+ /*
+ * There are stale entries.
+ *
+ * We will use one of them for the new entry. It's probably not at
+ * the right location, so we'll have to shift some up or down first.
+ *
+ * If we didn't compact before, we need to find the nearest stale
+ * entries before and after our insertion point.
+ */
+ if (compact == 0)
+ xfs_dir3_leaf_find_stale(leafhdr, ents, index,
+ &lowstale, &highstale);
+
+ /*
+ * If the low one is better, use it.
+ */
+ if (lowstale >= 0 &&
+ (highstale == leafhdr->count ||
+ index - lowstale - 1 < highstale - index)) {
+ ASSERT(index - lowstale - 1 >= 0);
+ ASSERT(ents[lowstale].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+
+ /*
+ * Copy entries up to cover the stale entry and make room
+ * for the new entry.
+ */
+ if (index - lowstale - 1 > 0) {
+ memmove(&ents[lowstale], &ents[lowstale + 1],
+ (index - lowstale - 1) *
+ sizeof(xfs_dir2_leaf_entry_t));
+ }
+ *lfloglow = min(lowstale, *lfloglow);
+ *lfloghigh = max(index - 1, *lfloghigh);
+ leafhdr->stale--;
+ return &ents[index - 1];
+ }
+
+ /*
+ * The high one is better, so use that one.
+ */
+ ASSERT(highstale - index >= 0);
+ ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+
+ /*
+ * Copy entries down to cover the stale entry and make room for the
+ * new entry.
+ */
+ if (highstale - index > 0) {
+ memmove(&ents[index + 1], &ents[index],
+ (highstale - index) * sizeof(xfs_dir2_leaf_entry_t));
+ }
+ *lfloglow = min(index, *lfloglow);
+ *lfloghigh = max(highstale, *lfloghigh);
+ leafhdr->stale--;
+ return &ents[index];
+}
+
+/*
+ * Add an entry to a leaf form directory.
+ */
+int /* error */
+xfs_dir2_leaf_addname(
+ struct xfs_da_args *args) /* operation arguments */
+{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_trans *tp = args->trans;
+ __be16 *bestsp; /* freespace table in leaf */
+ __be16 *tagp; /* end of data entry */
+ struct xfs_buf *dbp; /* data block buffer */
+ struct xfs_buf *lbp; /* leaf's buffer */
+ struct xfs_dir2_leaf *leaf; /* leaf structure */
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
+ struct xfs_dir2_data_hdr *hdr; /* data block header */
+ struct xfs_dir2_data_entry *dep; /* data block entry */
+ struct xfs_dir2_leaf_entry *lep; /* leaf entry table pointer */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir2_data_unused *dup; /* data unused entry */
+ struct xfs_dir2_leaf_tail *ltp; /* leaf tail pointer */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ int compact; /* need to compact leaves */
+ int error; /* error return value */
+ int grown; /* allocated new data block */
+ int highstale = 0; /* index of next stale leaf */
+ int i; /* temporary, index */
+ int index; /* leaf table position */
+ int length; /* length of new entry */
+ int lfloglow; /* low leaf logging index */
+ int lfloghigh; /* high leaf logging index */
+ int lowstale = 0; /* index of prev stale leaf */
+ int needbytes; /* leaf block bytes needed */
+ int needlog; /* need to log data header */
+ int needscan; /* need to rescan data free */
+ xfs_dir2_db_t use_block; /* data block number */
+
+ trace_xfs_dir2_leaf_addname(args);
+
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp);
+ if (error)
+ return error;
+
+ /*
+ * Look up the entry by hash value and name.
+ * We know it's not there, our caller has already done a lookup.
+ * So the index is of the entry to insert in front of.
+ * But if there are dup hash values the index is of the first of those.
+ */
+ index = xfs_dir2_leaf_search_hash(args, lbp);
+ leaf = lbp->b_addr;
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+ ents = leafhdr.ents;
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ length = xfs_dir2_data_entsize(dp->i_mount, args->namelen);
+
+ /*
+ * See if there are any entries with the same hash value
+ * and space in their block for the new entry.
+ * This is good because it puts multiple same-hash value entries
+ * in a data block, improving the lookup of those entries.
+ */
+ for (use_block = -1, lep = &ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+ index++, lep++) {
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+ ASSERT(i < be32_to_cpu(ltp->bestcount));
+ ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF));
+ if (be16_to_cpu(bestsp[i]) >= length) {
+ use_block = i;
+ break;
+ }
+ }
+ /*
+ * Didn't find a block yet, linear search all the data blocks.
+ */
+ if (use_block == -1) {
+ for (i = 0; i < be32_to_cpu(ltp->bestcount); i++) {
+ /*
+ * Remember a block we see that's missing.
+ */
+ if (bestsp[i] == cpu_to_be16(NULLDATAOFF) &&
+ use_block == -1)
+ use_block = i;
+ else if (be16_to_cpu(bestsp[i]) >= length) {
+ use_block = i;
+ break;
+ }
+ }
+ }
+ /*
+ * How many bytes do we need in the leaf block?
+ */
+ needbytes = 0;
+ if (!leafhdr.stale)
+ needbytes += sizeof(xfs_dir2_leaf_entry_t);
+ if (use_block == -1)
+ needbytes += sizeof(xfs_dir2_data_off_t);
+
+ /*
+ * Now kill use_block if it refers to a missing block, so we
+ * can use it as an indication of allocation needed.
+ */
+ if (use_block != -1 && bestsp[use_block] == cpu_to_be16(NULLDATAOFF))
+ use_block = -1;
+ /*
+ * If we don't have enough free bytes but we can make enough
+ * by compacting out stale entries, we'll do that.
+ */
+ if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes &&
+ leafhdr.stale > 1)
+ compact = 1;
+
+ /*
+ * Otherwise if we don't have enough free bytes we need to
+ * convert to node form.
+ */
+ else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) {
+ /*
+ * Just checking or no space reservation, give up.
+ */
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
+ args->total == 0) {
+ xfs_trans_brelse(tp, lbp);
+ return -ENOSPC;
+ }
+ /*
+ * Convert to node form.
+ */
+ error = xfs_dir2_leaf_to_node(args, lbp);
+ if (error)
+ return error;
+ /*
+ * Then add the new entry.
+ */
+ return xfs_dir2_node_addname(args);
+ }
+ /*
+ * Otherwise it will fit without compaction.
+ */
+ else
+ compact = 0;
+ /*
+ * If just checking, then it will fit unless we needed to allocate
+ * a new data block.
+ */
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+ xfs_trans_brelse(tp, lbp);
+ return use_block == -1 ? -ENOSPC : 0;
+ }
+ /*
+ * If no allocations are allowed, return now before we've
+ * changed anything.
+ */
+ if (args->total == 0 && use_block == -1) {
+ xfs_trans_brelse(tp, lbp);
+ return -ENOSPC;
+ }
+ /*
+ * Need to compact the leaf entries, removing stale ones.
+ * Leave one stale entry behind - the one closest to our
+ * insertion index - and we'll shift that one to our insertion
+ * point later.
+ */
+ if (compact) {
+ xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+ &highstale, &lfloglow, &lfloghigh);
+ }
+ /*
+ * There are stale entries, so seed the low and high logging indices
+ * with impossible values; they'll be fixed up later.
+ */
+ else if (leafhdr.stale) {
+ lfloglow = leafhdr.count;
+ lfloghigh = -1;
+ }
+ /*
+ * If there was no data block space found, we need to allocate
+ * a new one.
+ */
+ if (use_block == -1) {
+ /*
+ * Add the new data block.
+ */
+ if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
+ &use_block))) {
+ xfs_trans_brelse(tp, lbp);
+ return error;
+ }
+ /*
+ * Initialize the block.
+ */
+ if ((error = xfs_dir3_data_init(args, use_block, &dbp))) {
+ xfs_trans_brelse(tp, lbp);
+ return error;
+ }
+ /*
+ * If we're adding a new data block on the end we need to
+ * extend the bests table. Copy it up one entry.
+ */
+ if (use_block >= be32_to_cpu(ltp->bestcount)) {
+ bestsp--;
+ memmove(&bestsp[0], &bestsp[1],
+ be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
+ be32_add_cpu(&ltp->bestcount, 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0,
+ be32_to_cpu(ltp->bestcount) - 1);
+ }
+ /*
+ * If we're filling in a previously empty block just log it.
+ */
+ else
+ xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
+ hdr = dbp->b_addr;
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+ bestsp[use_block] = bf[0].length;
+ grown = 1;
+ } else {
+ /*
+ * Already had space in some data block.
+ * Just read that one in.
+ */
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, use_block),
+ 0, &dbp);
+ if (error) {
+ xfs_trans_brelse(tp, lbp);
+ return error;
+ }
+ hdr = dbp->b_addr;
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+ grown = 0;
+ }
+ /*
+ * Point to the biggest freespace in our data block.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+ needscan = needlog = 0;
+ /*
+ * Mark the initial part of our freespace in use for the new entry.
+ */
+ error = xfs_dir2_data_use_free(args, dbp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+ length, &needlog, &needscan);
+ if (error) {
+ xfs_trans_brelse(tp, lbp);
+ return error;
+ }
+ /*
+ * Initialize our new entry (at last).
+ */
+ dep = (xfs_dir2_data_entry_t *)dup;
+ dep->inumber = cpu_to_be64(args->inumber);
+ dep->namelen = args->namelen;
+ memcpy(dep->name, args->name, dep->namelen);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ /*
+	 * Rescan the data block to fix up the bestfree table if needed.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ /*
+ * Need to log the data block's header.
+ */
+ if (needlog)
+ xfs_dir2_data_log_header(args, dbp);
+ xfs_dir2_data_log_entry(args, dbp, dep);
+ /*
+ * If the bests table needs to be changed, do it.
+ * Log the change unless we've already done that.
+ */
+ if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) {
+ bestsp[use_block] = bf[0].length;
+ if (!grown)
+ xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
+ }
+
+ lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
+ highstale, &lfloglow, &lfloghigh);
+
+ /*
+ * Fill in the new leaf entry.
+ */
+ lep->hashval = cpu_to_be32(args->hashval);
+ lep->address = cpu_to_be32(
+ xfs_dir2_db_off_to_dataptr(args->geo, use_block,
+ be16_to_cpu(*tagp)));
+ /*
+ * Log the leaf fields and give up the buffers.
+ */
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_check(dp, lbp);
+ xfs_dir3_data_check(dp, dbp);
+ return 0;
+}
+
+/*
+ * Compact out any stale entries in the leaf.
+ * Log the header and changed leaf entries, if any.
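+ *
+ * For illustration: an ents[] array of [live, stale, live, stale, live]
+ * compresses in place to [live, live, live]; loglow records the first
+ * slot that actually changed (index 1 here), so only ents[1..2] need to
+ * be logged afterwards.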
+ */
+void
+xfs_dir3_leaf_compact(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_buf *bp) /* leaf buffer */
+{
+ int from; /* source leaf index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ int loglow; /* first leaf entry to log */
+ int to; /* target leaf index */
+ struct xfs_inode *dp = args->dp;
+
+ leaf = bp->b_addr;
+ if (!leafhdr->stale)
+ return;
+
+ /*
+ * Compress out the stale entries in place.
+ */
+ for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
+ if (leafhdr->ents[from].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ continue;
+ /*
+ * Only actually copy the entries that are different.
+ */
+ if (from > to) {
+ if (loglow == -1)
+ loglow = to;
+ leafhdr->ents[to] = leafhdr->ents[from];
+ }
+ to++;
+ }
+ /*
+ * Update and log the header, log the leaf entries.
+ */
+ ASSERT(leafhdr->stale == from - to);
+ leafhdr->count -= leafhdr->stale;
+ leafhdr->stale = 0;
+
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, leafhdr);
+ xfs_dir3_leaf_log_header(args, bp);
+ if (loglow != -1)
+ xfs_dir3_leaf_log_ents(args, leafhdr, bp, loglow, to - 1);
+}
+
+/*
+ * Compact the leaf entries, removing stale ones.
+ * Leave one stale entry behind - the one closest to our
+ * insertion index - and the caller will shift that one to our insertion
+ * point later.
+ * Return new insertion index, where the remaining stale entry is,
+ * and leaf logging indices.
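+ *
+ * For example (illustration only): given [live, stale, live, stale, live]
+ * and insertion index 3, the stale slot at index 3 is the one kept since
+ * it is nearest the insertion point; the array becomes
+ * [live, live, stale, live], the returned insertion index is 2, and
+ * highstale points at that surviving stale slot.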
+ */
+void
+xfs_dir3_leaf_compact_x1(
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents,
+ int *indexp, /* insertion index */
+ int *lowstalep, /* out: stale entry before us */
+ int *highstalep, /* out: stale entry after us */
+ int *lowlogp, /* out: low log index */
+ int *highlogp) /* out: high log index */
+{
+ int from; /* source copy index */
+ int highstale; /* stale entry at/after index */
+ int index; /* insertion index */
+ int keepstale; /* source index of kept stale */
+ int lowstale; /* stale entry before index */
+ int newindex=0; /* new insertion index */
+ int to; /* destination copy index */
+
+ ASSERT(leafhdr->stale > 1);
+ index = *indexp;
+
+ xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale);
+
+ /*
+ * Pick the better of lowstale and highstale.
+ */
+ if (lowstale >= 0 &&
+ (highstale == leafhdr->count ||
+ index - lowstale <= highstale - index))
+ keepstale = lowstale;
+ else
+ keepstale = highstale;
+ /*
+ * Copy the entries in place, removing all the stale entries
+ * except keepstale.
+ */
+ for (from = to = 0; from < leafhdr->count; from++) {
+ /*
+ * Notice the new value of index.
+ */
+ if (index == from)
+ newindex = to;
+ if (from != keepstale &&
+ ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+ if (from == to)
+ *lowlogp = to;
+ continue;
+ }
+ /*
+ * Record the new keepstale value for the insertion.
+ */
+ if (from == keepstale)
+ lowstale = highstale = to;
+ /*
+ * Copy only the entries that have moved.
+ */
+ if (from > to)
+ ents[to] = ents[from];
+ to++;
+ }
+ ASSERT(from > to);
+ /*
+ * If the insertion point was past the last entry,
+ * set the new insertion point accordingly.
+ */
+ if (index == from)
+ newindex = to;
+ *indexp = newindex;
+ /*
+ * Adjust the leaf header values.
+ */
+ leafhdr->count -= from - to;
+ leafhdr->stale = 1;
+ /*
+ * Remember the low/high stale value only in the "right"
+ * direction.
+ */
+ if (lowstale >= newindex)
+ lowstale = -1;
+ else
+ highstale = leafhdr->count;
+ *highlogp = leafhdr->count - 1;
+ *lowstalep = lowstale;
+ *highstalep = highstale;
+}
+
+/*
+ * Log the bests entries indicated from a leaf1 block.
+ */
+static void
+xfs_dir3_leaf_log_bests(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp, /* leaf buffer */
+ int first, /* first entry to log */
+ int last) /* last entry to log */
+{
+ __be16 *firstb; /* pointer to first entry */
+ __be16 *lastb; /* pointer to last entry */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC));
+
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ firstb = xfs_dir2_leaf_bests_p(ltp) + first;
+ lastb = xfs_dir2_leaf_bests_p(ltp) + last;
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)firstb - (char *)leaf),
+ (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
+}
+
+/*
+ * Log the leaf entries indicated from a leaf1 or leafn block.
+ */
+void
+xfs_dir3_leaf_log_ents(
+ struct xfs_da_args *args,
+ struct xfs_dir3_icleaf_hdr *hdr,
+ struct xfs_buf *bp,
+ int first,
+ int last)
+{
+ xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */
+ xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+ firstlep = &hdr->ents[first];
+ lastlep = &hdr->ents[last];
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)firstlep - (char *)leaf),
+ (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
+}
+
+/*
+ * Log the header of the leaf1 or leafn block.
+ */
+void
+xfs_dir3_leaf_log_header(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)&leaf->hdr - (char *)leaf),
+ args->geo->leaf_hdr_size - 1);
+}
+
+/*
+ * Log the tail of the leaf1 block.
+ */
+STATIC void
+xfs_dir3_leaf_log_tail(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf),
+ (uint)(args->geo->blksize - 1));
+}
+
+/*
+ * Look up the entry referred to by args in the leaf format directory.
+ * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which
+ * is also used by the node-format code.
+ */
+int
+xfs_dir2_leaf_lookup(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_buf *dbp; /* data block buffer */
+ xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ int index; /* found entry index */
+ struct xfs_buf *lbp; /* leaf buffer */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_leaf_lookup(args);
+
+ /*
+ * Look up name in the leaf block, returning both buffers and index.
+ */
+ error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr);
+ if (error)
+ return error;
+
+ tp = args->trans;
+ dp = args->dp;
+ xfs_dir3_leaf_check(dp, lbp);
+
+ /*
+ * Get to the leaf entry and contained data entry address.
+ */
+ lep = &leafhdr.ents[index];
+
+ /*
+ * Point to the data entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)dbp->b_addr +
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+ /*
+ * Return the found inode number & CI name if appropriate
+ */
+ args->inumber = be64_to_cpu(dep->inumber);
+ args->filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep);
+ error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+ xfs_trans_brelse(tp, dbp);
+ xfs_trans_brelse(tp, lbp);
+ return error;
+}
+
+/*
+ * Look up name/hash in the leaf block.
+ * Fill in indexp with the found index, and dbpp with the data buffer.
+ * If not found, dbpp will be NULL and -ENOENT comes back.
+ * lbpp will always be filled in with the leaf buffer unless there's an error.
+ */
+static int /* error */
+xfs_dir2_leaf_lookup_int(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf **lbpp, /* out: leaf buffer */
+ int *indexp, /* out: index in leaf block */
+ struct xfs_buf **dbpp, /* out: data buffer */
+ struct xfs_dir3_icleaf_hdr *leafhdr)
+{
+ xfs_dir2_db_t curdb = -1; /* current data block number */
+ struct xfs_buf *dbp = NULL; /* data buffer */
+ xfs_dir2_data_entry_t *dep; /* data entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ int index; /* index in leaf block */
+ struct xfs_buf *lbp; /* leaf buffer */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_db_t newdb; /* new data block number */
+ xfs_trans_t *tp; /* transaction pointer */
+ xfs_dir2_db_t cidb = -1; /* case match data block no. */
+ enum xfs_dacmp cmp; /* name compare result */
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp);
+ if (error)
+ return error;
+
+ *lbpp = lbp;
+ leaf = lbp->b_addr;
+ xfs_dir3_leaf_check(dp, lbp);
+ xfs_dir2_leaf_hdr_from_disk(mp, leafhdr, leaf);
+
+ /*
+ * Look for the first leaf entry with our hash value.
+ */
+ index = xfs_dir2_leaf_search_hash(args, lbp);
+ /*
+ * Loop over all the entries with the right hash value
+ * looking to match the name.
+ */
+ for (lep = &leafhdr->ents[index];
+ index < leafhdr->count &&
+ be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
+ /*
+ * Skip over stale leaf entries.
+ */
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Get the new data block number.
+ */
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
+ /*
+ * If it's not the same as the old data block number,
+ * need to pitch the old one and read the new one.
+ */
+ if (newdb != curdb) {
+ if (dbp)
+ xfs_trans_brelse(tp, dbp);
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, newdb),
+ 0, &dbp);
+ if (error) {
+ xfs_trans_brelse(tp, lbp);
+ return error;
+ }
+ curdb = newdb;
+ }
+ /*
+ * Point to the data entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
+ /*
+ * Compare name and if it's an exact match, return the index
+ * and buffer. If it's the first case-insensitive match, store
+ * the index and buffer and continue looking for an exact match.
+ */
+ cmp = xfs_dir2_compname(args, dep->name, dep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ args->cmpresult = cmp;
+ *indexp = index;
+ /* case exact match: return the current buffer. */
+ if (cmp == XFS_CMP_EXACT) {
+ *dbpp = dbp;
+ return 0;
+ }
+ cidb = curdb;
+ }
+ }
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ /*
+ * Here, we can only be doing a lookup (not a rename or remove).
+ * If a case-insensitive match was found earlier, re-read the
+ * appropriate data block if required and return it.
+ */
+ if (args->cmpresult == XFS_CMP_CASE) {
+ ASSERT(cidb != -1);
+ if (cidb != curdb) {
+ xfs_trans_brelse(tp, dbp);
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, cidb),
+ 0, &dbp);
+ if (error) {
+ xfs_trans_brelse(tp, lbp);
+ return error;
+ }
+ }
+ *dbpp = dbp;
+ return 0;
+ }
+ /*
+ * No match found, return -ENOENT.
+ */
+ ASSERT(cidb == -1);
+ if (dbp)
+ xfs_trans_brelse(tp, dbp);
+ xfs_trans_brelse(tp, lbp);
+ return -ENOENT;
+}
+
+/*
+ * Remove an entry from a leaf format directory.
+ */
+int /* error */
+xfs_dir2_leaf_removename(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_da_geometry *geo = args->geo;
+ __be16 *bestsp; /* leaf block best freespace */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
+ xfs_dir2_db_t db; /* data block number */
+ struct xfs_buf *dbp; /* data block buffer */
+ xfs_dir2_data_entry_t *dep; /* data entry structure */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ xfs_dir2_db_t i; /* temporary data block # */
+ int index; /* index into leaf entries */
+ struct xfs_buf *lbp; /* leaf buffer */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+ int needlog; /* need to log data header */
+ int needscan; /* need to rescan data frees */
+ xfs_dir2_data_off_t oldbest; /* old value of best free */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_leaf_removename(args);
+
+ /*
+ * Lookup the leaf entry, get the leaf and data blocks read in.
+ */
+ error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr);
+ if (error)
+ return error;
+
+ dp = args->dp;
+ leaf = lbp->b_addr;
+ hdr = dbp->b_addr;
+ xfs_dir3_data_check(dp, dbp);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+
+ /*
+ * Point to the leaf entry, use that to point to the data entry.
+ */
+ lep = &leafhdr.ents[index];
+ db = xfs_dir2_dataptr_to_db(geo, be32_to_cpu(lep->address));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(geo, be32_to_cpu(lep->address)));
+ needscan = needlog = 0;
+ oldbest = be16_to_cpu(bf[0].length);
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ if (be16_to_cpu(bestsp[db]) != oldbest) {
+ xfs_buf_mark_corrupt(lbp);
+ return -EFSCORRUPTED;
+ }
+ /*
+ * Mark the former data entry unused.
+ */
+ xfs_dir2_data_make_free(args, dbp,
+ (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog,
+ &needscan);
+ /*
+ * We just mark the leaf entry stale by putting a null in it.
+ */
+ leafhdr.stale++;
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+
+ lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, index, index);
+
+ /*
+ * Scan the freespace in the data block again if necessary,
+ * log the data block header if necessary.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(args, dbp);
+ /*
+ * If the longest freespace in the data block has changed,
+ * put the new value in the bests table and log that.
+ */
+ if (be16_to_cpu(bf[0].length) != oldbest) {
+ bestsp[db] = bf[0].length;
+ xfs_dir3_leaf_log_bests(args, lbp, db, db);
+ }
+ xfs_dir3_data_check(dp, dbp);
+ /*
+ * If the data block is now empty then get rid of the data block.
+ */
+ if (be16_to_cpu(bf[0].length) ==
+ geo->blksize - geo->data_entry_offset) {
+ ASSERT(db != geo->datablk);
+ if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+ /*
+			 * Nope, we can't get rid of it: doing so would
+			 * require allocating a bmap btree block and there
+			 * is no space reservation. Just go on, returning
+			 * success and leaving the empty block in place.
+ */
+ if (error == -ENOSPC && args->total == 0)
+ error = 0;
+ xfs_dir3_leaf_check(dp, lbp);
+ return error;
+ }
+ dbp = NULL;
+ /*
+ * If this is the last data block then compact the
+ * bests table by getting rid of entries.
+ */
+ if (db == be32_to_cpu(ltp->bestcount) - 1) {
+ /*
+ * Look for the last active entry (i).
+ */
+ for (i = db - 1; i > 0; i--) {
+ if (bestsp[i] != cpu_to_be16(NULLDATAOFF))
+ break;
+ }
+ /*
+ * Copy the table down so inactive entries at the
+ * end are removed.
+ */
+ memmove(&bestsp[db - i], bestsp,
+ (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
+ be32_add_cpu(&ltp->bestcount, -(db - i));
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0,
+ be32_to_cpu(ltp->bestcount) - 1);
+ } else
+ bestsp[db] = cpu_to_be16(NULLDATAOFF);
+ }
+ /*
+ * If the data block was not the first one, drop it.
+ */
+ else if (db != geo->datablk)
+ dbp = NULL;
+
+ xfs_dir3_leaf_check(dp, lbp);
+ /*
+ * See if we can convert to block form.
+ */
+ return xfs_dir2_leaf_to_block(args, lbp, dbp);
+}
+
+/*
+ * Replace the inode number in a leaf format directory entry.
+ */
+int /* error */
+xfs_dir2_leaf_replace(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_buf *dbp; /* data block buffer */
+ xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ int index; /* index of leaf entry */
+ struct xfs_buf *lbp; /* leaf buffer */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_leaf_replace(args);
+
+ /*
+ * Look up the entry.
+ */
+ error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr);
+ if (error)
+ return error;
+
+ dp = args->dp;
+ /*
+ * Point to the leaf entry, get data address from it.
+ */
+ lep = &leafhdr.ents[index];
+ /*
+ * Point to the data entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)dbp->b_addr +
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+ ASSERT(args->inumber != be64_to_cpu(dep->inumber));
+ /*
+ * Put the new inode number in, log it.
+ */
+ dep->inumber = cpu_to_be64(args->inumber);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ tp = args->trans;
+ xfs_dir2_data_log_entry(args, dbp, dep);
+ xfs_dir3_leaf_check(dp, lbp);
+ xfs_trans_brelse(tp, lbp);
+ return 0;
+}
+
+/*
+ * Return index in the leaf block (lbp) which is either the first
+ * one with this hash value, or if there are none, the insert point
+ * for that hash value.
+ */
+int /* index value */
+xfs_dir2_leaf_search_hash(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *lbp) /* leaf buffer */
+{
+ xfs_dahash_t hash=0; /* hash from this entry */
+ xfs_dahash_t hashwant; /* hash value looking for */
+ int high; /* high leaf index */
+ int low; /* low leaf index */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ int mid=0; /* current leaf index */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ xfs_dir2_leaf_hdr_from_disk(args->dp->i_mount, &leafhdr, lbp->b_addr);
+
+ /*
+ * Note, the table cannot be empty, so we have to go through the loop.
+ * Binary search the leaf entries looking for our hash value.
+ */
+ for (lep = leafhdr.ents, low = 0, high = leafhdr.count - 1,
+ hashwant = args->hashval;
+ low <= high; ) {
+ mid = (low + high) >> 1;
+ if ((hash = be32_to_cpu(lep[mid].hashval)) == hashwant)
+ break;
+ if (hash < hashwant)
+ low = mid + 1;
+ else
+ high = mid - 1;
+ }
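+	/*
+	 * Many directory entries can share a hash value, so the binary
+	 * search may stop anywhere inside a run of duplicates (e.g. for
+	 * hashvals 5,7,7,7,9 and hashwant 7 it may stop at the middle 7).
+	 * The fixups below move mid back to the first duplicate, or forward
+	 * to the insertion point if the hash wasn't found.
+	 */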
+ /*
+ * Found one, back up through all the equal hash values.
+ */
+ if (hash == hashwant) {
+ while (mid > 0 && be32_to_cpu(lep[mid - 1].hashval) == hashwant) {
+ mid--;
+ }
+ }
+ /*
+ * Need to point to an entry higher than ours.
+ */
+ else if (hash < hashwant)
+ mid++;
+ return mid;
+}
+
+/*
+ * Trim off a trailing data block. We know it's empty since the leaf
+ * freespace table says so.
+ */
+int /* error */
+xfs_dir2_leaf_trim_data(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *lbp, /* leaf buffer */
+ xfs_dir2_db_t db) /* data block number */
+{
+ struct xfs_da_geometry *geo = args->geo;
+ __be16 *bestsp; /* leaf bests table */
+ struct xfs_buf *dbp; /* data block buffer */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ dp = args->dp;
+ tp = args->trans;
+ /*
+ * Read the offending data block. We need its buffer.
+ */
+ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(geo, db), 0, &dbp);
+ if (error)
+ return error;
+
+ leaf = lbp->b_addr;
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+
+#ifdef DEBUG
+{
+ struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
+ struct xfs_dir2_data_free *bf =
+ xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+ ASSERT(be16_to_cpu(bf[0].length) ==
+ geo->blksize - geo->data_entry_offset);
+ ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
+}
+#endif
+
+ /*
+ * Get rid of the data block.
+ */
+ if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+ ASSERT(error != -ENOSPC);
+ xfs_trans_brelse(tp, dbp);
+ return error;
+ }
+ /*
+ * Eliminate the last bests entry from the table.
+ */
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ be32_add_cpu(&ltp->bestcount, -1);
+ memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ return 0;
+}
+
+static inline size_t
+xfs_dir3_leaf_size(
+ struct xfs_dir3_icleaf_hdr *hdr,
+ int counts)
+{
+ int entries;
+ int hdrsize;
+
+ entries = hdr->count - hdr->stale;
+ if (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+ hdr->magic == XFS_DIR2_LEAFN_MAGIC)
+ hdrsize = sizeof(struct xfs_dir2_leaf_hdr);
+ else
+ hdrsize = sizeof(struct xfs_dir3_leaf_hdr);
+
+ return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t)
+ + counts * sizeof(xfs_dir2_data_off_t)
+ + sizeof(xfs_dir2_leaf_tail_t);
+}
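+
+/*
+ * For example (numbers purely illustrative): a leafn block with 500 entries,
+ * of which 20 are stale, plus a freespace block tracking 4 data blocks,
+ * needs hdrsize + 480 leaf entries + 4 bests entries + the tail; the
+ * node-to-leaf conversion below only proceeds if that total fits in a
+ * single directory block (args->geo->blksize).
+ */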
+
+/*
+ * Convert node form directory to leaf form directory.
+ * The root of the node form dir needs to already be a LEAFN block.
+ * Just return if we can't do anything.
+ */
+int /* error */
+xfs_dir2_node_to_leaf(
+ xfs_da_state_t *state) /* directory operation state */
+{
+ xfs_da_args_t *args; /* operation arguments */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ struct xfs_buf *fbp; /* buffer for freespace block */
+ xfs_fileoff_t fo; /* freespace file offset */
+ struct xfs_buf *lbp; /* buffer for leaf block */
+ xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_mount_t *mp; /* filesystem mount point */
+ int rval; /* successful free trim? */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir3_icfree_hdr freehdr;
+
+ /*
+ * There's more than a leaf level in the btree, so there must
+ * be multiple leafn blocks. Give up.
+ */
+ if (state->path.active > 1)
+ return 0;
+ args = state->args;
+
+ trace_xfs_dir2_node_to_leaf(args);
+
+ mp = state->mp;
+ dp = args->dp;
+ tp = args->trans;
+ /*
+ * Get the last offset in the file.
+ */
+ if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) {
+ return error;
+ }
+ fo -= args->geo->fsbcount;
+ /*
+ * If there are freespace blocks other than the first one,
+ * take this opportunity to remove trailing empty freespace blocks
+ * that may have been left behind during no-space-reservation
+ * operations.
+ */
+ while (fo > args->geo->freeblk) {
+ if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
+ return error;
+ }
+ if (rval)
+ fo -= args->geo->fsbcount;
+ else
+ return 0;
+ }
+ /*
+ * Now find the block just before the freespace block.
+ */
+ if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
+ return error;
+ }
+ /*
+ * If it's not the single leaf block, give up.
+ */
+ if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize)
+ return 0;
+ lbp = state->path.blk[0].bp;
+ leaf = lbp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+
+ /*
+ * Read the freespace block.
+ */
+ error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp);
+ if (error)
+ return error;
+ xfs_dir2_free_hdr_from_disk(mp, &freehdr, fbp->b_addr);
+
+ ASSERT(!freehdr.firstdb);
+
+ /*
+ * Now see if the leafn and free data will fit in a leaf1.
+ * If not, release the buffer and give up.
+ */
+ if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) {
+ xfs_trans_brelse(tp, fbp);
+ return 0;
+ }
+
+ /*
+ * If the leaf has any stale entries in it, compress them out.
+ */
+ if (leafhdr.stale)
+ xfs_dir3_leaf_compact(args, &leafhdr, lbp);
+
+ lbp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF);
+ leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC)
+ ? XFS_DIR2_LEAF1_MAGIC
+ : XFS_DIR3_LEAF1_MAGIC;
+
+ /*
+ * Set up the leaf tail from the freespace block.
+ */
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ltp->bestcount = cpu_to_be32(freehdr.nvalid);
+
+ /*
+ * Set up the leaf bests table.
+ */
+ memcpy(xfs_dir2_leaf_bests_p(ltp), freehdr.bests,
+ freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
+
+ xfs_dir2_leaf_hdr_to_disk(mp, leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
+
+ /*
+ * Get rid of the freespace block.
+ */
+ error = xfs_dir2_shrink_inode(args,
+ xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET),
+ fbp);
+ if (error) {
+ /*
+		 * -ENOSPC can't happen here because it only occurs when
+		 * punching out the middle of an extent, and this is an
+		 * isolated block.
+ */
+ ASSERT(error != -ENOSPC);
+ return error;
+ }
+ fbp = NULL;
+ /*
+ * Now see if we can convert the single-leaf directory
+ * down to a block form directory.
+ * This routine always kills the dabuf for the leaf, so
+ * eliminate it from the path.
+ */
+ error = xfs_dir2_leaf_to_block(args, lbp, NULL);
+ state->path.blk[0].bp = NULL;
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
new file mode 100644
index 000000000..7a03aeb9f
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -0,0 +1,2337 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+
+/*
+ * Function declarations.
+ */
+static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args,
+ int index);
+static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *blk1,
+ xfs_da_state_blk_t *blk2);
+static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
+ int index, xfs_da_state_blk_t *dblk,
+ int *rval);
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+ (db / geo->free_max_bests);
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return db % geo->free_max_bests;
+}
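+
+/*
+ * Worked example (values are illustrative only): if free_max_bests were
+ * 100, data block 250 would live in the third free block
+ * (xfs_dir2_db_to_fdb() returns the free-space db for 250 / 100 = 2) and
+ * its best-free entry would sit at index 250 % 100 = 50 within that block
+ * (xfs_dir2_db_to_fdindex()).
+ */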
+
+/*
+ * Check internal consistency of a leafn block.
+ */
+#ifdef DEBUG
+static xfs_failaddr_t
+xfs_dir3_leafn_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+
+ if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+ if (be64_to_cpu(leaf3->info.blkno) != xfs_buf_daddr(bp))
+ return __this_address;
+ } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
+ return __this_address;
+
+ return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf, false);
+}
+
+static inline void
+xfs_dir3_leaf_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_dir3_leafn_check(dp, bp);
+ if (!fa)
+ return;
+ xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, dp->i_mount,
+ bp->b_addr, BBTOB(bp->b_length), __FILE__, __LINE__,
+ fa);
+ ASSERT(0);
+}
+#else
+#define xfs_dir3_leaf_check(dp, bp)
+#endif
+
+static xfs_failaddr_t
+xfs_dir3_free_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+
+ if (!xfs_verify_magic(bp, hdr->magic))
+ return __this_address;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return __this_address;
+ }
+
+ /* XXX: should bounds check the xfs_dir3_icfree_hdr here */
+
+ return NULL;
+}
+
+static void
+xfs_dir3_free_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (xfs_has_crc(mp) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_dir3_free_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+static void
+xfs_dir3_free_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_dir3_free_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
+ .name = "xfs_dir3_free",
+ .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC),
+ cpu_to_be32(XFS_DIR3_FREE_MAGIC) },
+ .verify_read = xfs_dir3_free_read_verify,
+ .verify_write = xfs_dir3_free_write_verify,
+ .verify_struct = xfs_dir3_free_verify,
+};
+
+/* Everything ok in the free block header? */
+static xfs_failaddr_t
+xfs_dir3_free_header_check(
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ int maxbests = mp->m_dir_geo->free_max_bests;
+ unsigned int firstdb;
+
+ firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) -
+ xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) *
+ maxbests;
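+	/*
+	 * i.e. the Nth free block must describe data blocks starting at
+	 * N * free_max_bests; e.g. (illustration) the second free block
+	 * (N = 1) with free_max_bests = 100 must have firstdb == 100.
+	 */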
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+ if (be32_to_cpu(hdr3->firstdb) != firstdb)
+ return __this_address;
+ if (be32_to_cpu(hdr3->nvalid) > maxbests)
+ return __this_address;
+ if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused))
+ return __this_address;
+ if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
+ return __this_address;
+ } else {
+ struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+
+ if (be32_to_cpu(hdr->firstdb) != firstdb)
+ return __this_address;
+ if (be32_to_cpu(hdr->nvalid) > maxbests)
+ return __this_address;
+ if (be32_to_cpu(hdr->nvalid) < be32_to_cpu(hdr->nused))
+ return __this_address;
+ }
+ return NULL;
+}
+
+static int
+__xfs_dir3_free_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ unsigned int flags,
+ struct xfs_buf **bpp)
+{
+ xfs_failaddr_t fa;
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, fbno, flags, bpp, XFS_DATA_FORK,
+ &xfs_dir3_free_buf_ops);
+ if (err || !*bpp)
+ return err;
+
+ /* Check things that we can't do in the verifier. */
+ fa = xfs_dir3_free_header_check(dp, fbno, *bpp);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ return -EFSCORRUPTED;
+ }
+
+	/* A try-read that lands in a hole returns no error and a NULL *bpp. */
+ if (tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
+
+ return 0;
+}
+
+void
+xfs_dir2_free_hdr_from_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir3_icfree_hdr *to,
+ struct xfs_dir2_free *from)
+{
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_free *from3 = (struct xfs_dir3_free *)from;
+
+ to->magic = be32_to_cpu(from3->hdr.hdr.magic);
+ to->firstdb = be32_to_cpu(from3->hdr.firstdb);
+ to->nvalid = be32_to_cpu(from3->hdr.nvalid);
+ to->nused = be32_to_cpu(from3->hdr.nused);
+ to->bests = from3->bests;
+
+ ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
+ } else {
+ to->magic = be32_to_cpu(from->hdr.magic);
+ to->firstdb = be32_to_cpu(from->hdr.firstdb);
+ to->nvalid = be32_to_cpu(from->hdr.nvalid);
+ to->nused = be32_to_cpu(from->hdr.nused);
+ to->bests = from->bests;
+
+ ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
+ }
+}
+
+static void
+xfs_dir2_free_hdr_to_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir2_free *to,
+ struct xfs_dir3_icfree_hdr *from)
+{
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_free *to3 = (struct xfs_dir3_free *)to;
+
+ ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
+
+ to3->hdr.hdr.magic = cpu_to_be32(from->magic);
+ to3->hdr.firstdb = cpu_to_be32(from->firstdb);
+ to3->hdr.nvalid = cpu_to_be32(from->nvalid);
+ to3->hdr.nused = cpu_to_be32(from->nused);
+ } else {
+ ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
+
+ to->hdr.magic = cpu_to_be32(from->magic);
+ to->hdr.firstdb = cpu_to_be32(from->firstdb);
+ to->hdr.nvalid = cpu_to_be32(from->nvalid);
+ to->hdr.nused = cpu_to_be32(from->nused);
+ }
+}
+
+int
+xfs_dir2_free_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf **bpp)
+{
+ return __xfs_dir3_free_read(tp, dp, fbno, 0, bpp);
+}
+
+static int
+xfs_dir2_free_try_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf **bpp)
+{
+ return __xfs_dir3_free_read(tp, dp, fbno, XFS_DABUF_MAP_HOLE_OK, bpp);
+}
+
+static int
+xfs_dir3_free_get_buf(
+ xfs_da_args_t *args,
+ xfs_dir2_db_t fbno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ int error;
+ struct xfs_dir3_icfree_hdr hdr;
+
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno),
+ &bp, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF);
+ bp->b_ops = &xfs_dir3_free_buf_ops;
+
+ /*
+ * Initialize the new block to be empty, and remember
+ * its first slot as our empty slot.
+ */
+ memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
+ memset(&hdr, 0, sizeof(hdr));
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+ hdr.magic = XFS_DIR3_FREE_MAGIC;
+
+ hdr3->hdr.blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid);
+ } else
+ hdr.magic = XFS_DIR2_FREE_MAGIC;
+ xfs_dir2_free_hdr_to_disk(mp, bp->b_addr, &hdr);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Log entries from a freespace block.
+ */
+STATIC void
+xfs_dir2_free_log_bests(
+ struct xfs_da_args *args,
+ struct xfs_dir3_icfree_hdr *hdr,
+ struct xfs_buf *bp,
+ int first, /* first entry to log */
+ int last) /* last entry to log */
+{
+ struct xfs_dir2_free *free = bp->b_addr;
+
+ ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+ free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+ xfs_trans_log_buf(args->trans, bp,
+ (char *)&hdr->bests[first] - (char *)free,
+ (char *)&hdr->bests[last] - (char *)free +
+ sizeof(hdr->bests[0]) - 1);
+}
+
+/*
+ * Log header from a freespace block.
+ */
+static void
+xfs_dir2_free_log_header(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
+{
+#ifdef DEBUG
+ xfs_dir2_free_t *free; /* freespace structure */
+
+ free = bp->b_addr;
+ ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+ free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+#endif
+ xfs_trans_log_buf(args->trans, bp, 0,
+ args->geo->free_hdr_size - 1);
+}
+
+/*
+ * Convert a leaf-format directory to a node-format directory.
+ * We need to change the magic number of the leaf block, and copy
+ * the freespace table out of the leaf block into its own block.
+ */
+int /* error */
+xfs_dir2_leaf_to_node(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *lbp) /* leaf buffer */
+{
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ struct xfs_buf *fbp; /* freespace buffer */
+ xfs_dir2_db_t fdb; /* freespace block number */
+ __be16 *from; /* pointer to freespace entry */
+ int i; /* leaf freespace index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+ int n; /* count of live freespc ents */
+ xfs_dir2_data_off_t off; /* freespace entry value */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icfree_hdr freehdr;
+
+ trace_xfs_dir2_leaf_to_node(args);
+
+ dp = args->dp;
+ tp = args->trans;
+ /*
+ * Add a freespace block to the directory.
+ */
+ if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
+ return error;
+ }
+ ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
+ /*
+ * Get the buffer for the new freespace block.
+ */
+ error = xfs_dir3_free_get_buf(args, fdb, &fbp);
+ if (error)
+ return error;
+
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, fbp->b_addr);
+ leaf = lbp->b_addr;
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ if (be32_to_cpu(ltp->bestcount) >
+ (uint)dp->i_disk_size / args->geo->blksize) {
+ xfs_buf_mark_corrupt(lbp);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Copy freespace entries from the leaf block to the new block.
+ * Count active entries.
+ */
+ from = xfs_dir2_leaf_bests_p(ltp);
+ for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++) {
+ off = be16_to_cpu(*from);
+ if (off != NULLDATAOFF)
+ n++;
+ freehdr.bests[i] = cpu_to_be16(off);
+ }
+
+ /*
+ * Now initialize the freespace block header.
+ */
+ freehdr.nused = n;
+ freehdr.nvalid = be32_to_cpu(ltp->bestcount);
+
+ xfs_dir2_free_hdr_to_disk(dp->i_mount, fbp->b_addr, &freehdr);
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, 0, freehdr.nvalid - 1);
+ xfs_dir2_free_log_header(args, fbp);
+
+ /*
+ * Converting the leaf to a leafnode is just a matter of changing the
+ * magic number and the ops. Do the change directly to the buffer as
+ * it's less work (and less code) than decoding the header to host
+ * format and back again.
+ */
+ if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC))
+ leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+ else
+ leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+ lbp->b_ops = &xfs_dir3_leafn_buf_ops;
+ xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
+ return 0;
+}
+
+/*
+ * Add a leaf entry to a leaf block in a node-form directory.
+ * The other work necessary is done from the caller.
+ */
+static int /* error */
+xfs_dir2_leafn_add(
+ struct xfs_buf *bp, /* leaf buffer */
+ struct xfs_da_args *args, /* operation arguments */
+ int index) /* insertion pt for new entry */
+{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir2_leaf_entry *lep;
+ struct xfs_dir2_leaf_entry *ents;
+ int compact; /* compacting stale leaves */
+ int highstale = 0; /* next stale entry */
+ int lfloghigh; /* high leaf entry logging */
+ int lfloglow; /* low leaf entry logging */
+ int lowstale = 0; /* previous stale entry */
+
+ trace_xfs_dir2_leafn_add(args, index);
+
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+ ents = leafhdr.ents;
+
+ /*
+ * Quick check just to make sure we are not going to index
+	 * into other people's memory.
+ */
+ if (index < 0) {
+ xfs_buf_mark_corrupt(bp);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+	 * If the block already holds the maximum number of leaf entries
+	 * and there are no stale entries, the new entry won't fit and the
+	 * caller will do a split. If there are stale entries we'll do a
+	 * compact.
+ */
+
+ if (leafhdr.count == args->geo->leaf_max_ents) {
+ if (!leafhdr.stale)
+ return -ENOSPC;
+ compact = leafhdr.stale > 1;
+ } else
+ compact = 0;
+ ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval);
+ ASSERT(index == leafhdr.count ||
+ be32_to_cpu(ents[index].hashval) >= args->hashval);
+
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+ return 0;
+
+ /*
+ * Compact out all but one stale leaf entry. Leaves behind
+ * the entry closest to index.
+ */
+ if (compact)
+ xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+ &highstale, &lfloglow, &lfloghigh);
+ else if (leafhdr.stale) {
+ /*
+ * Set impossible logging indices for this case.
+ */
+ lfloglow = leafhdr.count;
+ lfloghigh = -1;
+ }
+
+ /*
+ * Insert the new entry, log everything.
+ */
+ lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
+ highstale, &lfloglow, &lfloghigh);
+
+ lep->hashval = cpu_to_be32(args->hashval);
+ lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo,
+ args->blkno, args->index));
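+	/*
+	 * Note: the leaf "address" packs the data block number and the byte
+	 * offset within it into a single 32-bit dataptr; it is decoded again
+	 * with xfs_dir2_dataptr_to_db()/xfs_dir2_dataptr_to_off() on lookup.
+	 */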
+
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, bp);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, bp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_check(dp, bp);
+ return 0;
+}
+
+#ifdef DEBUG
+static void
+xfs_dir2_free_hdr_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp,
+ xfs_dir2_db_t db)
+{
+ struct xfs_dir3_icfree_hdr hdr;
+
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &hdr, bp->b_addr);
+
+ ASSERT((hdr.firstdb % dp->i_mount->m_dir_geo->free_max_bests) == 0);
+ ASSERT(hdr.firstdb <= db);
+ ASSERT(db < hdr.firstdb + hdr.nvalid);
+}
+#else
+#define xfs_dir2_free_hdr_check(dp, bp, db)
+#endif /* DEBUG */
+
+/*
+ * Return the last hash value in the leaf.
+ * Stale entries are ok.
+ */
+xfs_dahash_t /* hash value */
+xfs_dir2_leaf_lasthash(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp, /* leaf buffer */
+ int *count) /* count of entries in leaf */
+{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, bp->b_addr);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
+
+ if (count)
+ *count = leafhdr.count;
+ if (!leafhdr.count)
+ return 0;
+ return be32_to_cpu(leafhdr.ents[leafhdr.count - 1].hashval);
+}
+
+/*
+ * Look up a leaf entry for space to add a name in a node-format leaf block.
+ * The extrablk in state is a freespace block.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_addname(
+ struct xfs_buf *bp, /* leaf buffer */
+ xfs_da_args_t *args, /* operation arguments */
+ int *indexp, /* out: leaf entry index */
+ xfs_da_state_t *state) /* state to fill in */
+{
+ struct xfs_buf *curbp = NULL; /* current data/free buffer */
+ xfs_dir2_db_t curdb = -1; /* current data block number */
+ xfs_dir2_db_t curfdb = -1; /* current free block number */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ int fi; /* free entry index */
+ xfs_dir2_free_t *free = NULL; /* free block structure */
+ int index; /* leaf entry index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ int length; /* length of new data entry */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_db_t newdb; /* new data block number */
+ xfs_dir2_db_t newfdb; /* new free block number */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ leaf = bp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
+
+ xfs_dir3_leaf_check(dp, bp);
+ ASSERT(leafhdr.count > 0);
+
+ /*
+ * Look up the hash value in the leaf entries.
+ */
+ index = xfs_dir2_leaf_search_hash(args, bp);
+ /*
+ * Do we have a buffer coming in?
+ */
+ if (state->extravalid) {
+ /* If so, it's a free block buffer, get the block number. */
+ curbp = state->extrablk.bp;
+ curfdb = state->extrablk.blkno;
+ free = curbp->b_addr;
+ ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+ free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+ }
+ length = xfs_dir2_data_entsize(mp, args->namelen);
+ /*
+ * Loop over leaf entries with the right hash value.
+ */
+ for (lep = &leafhdr.ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
+ /*
+ * Skip stale leaf entries.
+ */
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Pull the data block number from the entry.
+ */
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
+ /*
+ * For addname, we're looking for a place to put the new entry.
+ * We want to use a data block with an entry of equal
+ * hash value to ours if there is one with room.
+ *
+ * If this block isn't the data block we already have
+ * in hand, take a look at it.
+ */
+ if (newdb != curdb) {
+ struct xfs_dir3_icfree_hdr freehdr;
+
+ curdb = newdb;
+ /*
+ * Convert the data block to the free block
+ * holding its freespace information.
+ */
+ newfdb = xfs_dir2_db_to_fdb(args->geo, newdb);
+ /*
+ * If it's not the one we have in hand, read it in.
+ */
+ if (newfdb != curfdb) {
+ /*
+ * If we had one before, drop it.
+ */
+ if (curbp)
+ xfs_trans_brelse(tp, curbp);
+
+ error = xfs_dir2_free_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo,
+ newfdb),
+ &curbp);
+ if (error)
+ return error;
+ free = curbp->b_addr;
+
+ xfs_dir2_free_hdr_check(dp, curbp, curdb);
+ }
+ /*
+ * Get the index for our entry.
+ */
+ fi = xfs_dir2_db_to_fdindex(args->geo, curdb);
+ /*
+ * If it has room, return it.
+ */
+ xfs_dir2_free_hdr_from_disk(mp, &freehdr, free);
+ if (XFS_IS_CORRUPT(mp,
+ freehdr.bests[fi] ==
+ cpu_to_be16(NULLDATAOFF))) {
+ if (curfdb != newfdb)
+ xfs_trans_brelse(tp, curbp);
+ return -EFSCORRUPTED;
+ }
+ curfdb = newfdb;
+ if (be16_to_cpu(freehdr.bests[fi]) >= length)
+ goto out;
+ }
+ }
+ /* Didn't find any space */
+ fi = -1;
+out:
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ if (curbp) {
+ /* Giving back a free block. */
+ state->extravalid = 1;
+ state->extrablk.bp = curbp;
+ state->extrablk.index = fi;
+ state->extrablk.blkno = curfdb;
+
+ /*
+ * Important: this magic number is not in the buffer - it's for
+ * buffer type information and therefore only the free/data type
+ * matters here, not whether CRCs are enabled or not.
+ */
+ state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
+ } else {
+ state->extravalid = 0;
+ }
+ /*
+ * Return the index, that will be the insertion point.
+ */
+ *indexp = index;
+ return -ENOENT;
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * The extrablk in state is a data block.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_entry(
+ struct xfs_buf *bp, /* leaf buffer */
+ xfs_da_args_t *args, /* operation arguments */
+ int *indexp, /* out: leaf entry index */
+ xfs_da_state_t *state) /* state to fill in */
+{
+ struct xfs_buf *curbp = NULL; /* current data/free buffer */
+ xfs_dir2_db_t curdb = -1; /* current data block number */
+ xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ int index; /* leaf entry index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_db_t newdb; /* new data block number */
+ xfs_trans_t *tp; /* transaction pointer */
+ enum xfs_dacmp cmp; /* comparison result */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ leaf = bp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
+
+ xfs_dir3_leaf_check(dp, bp);
+ if (leafhdr.count <= 0) {
+ xfs_buf_mark_corrupt(bp);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Look up the hash value in the leaf entries.
+ */
+ index = xfs_dir2_leaf_search_hash(args, bp);
+ /*
+ * Do we have a buffer coming in?
+ */
+ if (state->extravalid) {
+ curbp = state->extrablk.bp;
+ curdb = state->extrablk.blkno;
+ }
+ /*
+ * Loop over leaf entries with the right hash value.
+ */
+ for (lep = &leafhdr.ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
+ /*
+ * Skip stale leaf entries.
+ */
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Pull the data block number from the entry.
+ */
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
+ /*
+ * Not adding a new entry, so we really want to find
+ * the name given to us.
+ *
+ * If it's a different data block, go get it.
+ */
+ if (newdb != curdb) {
+ /*
+ * If we had a block before that we aren't saving
+ * for a CI name, drop it
+ */
+ if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT ||
+ curdb != state->extrablk.blkno))
+ xfs_trans_brelse(tp, curbp);
+ /*
+			 * If we need the block that is saved with a CI match,
+			 * use it; otherwise read in the new data block.
+ */
+ if (args->cmpresult != XFS_CMP_DIFFERENT &&
+ newdb == state->extrablk.blkno) {
+ ASSERT(state->extravalid);
+ curbp = state->extrablk.bp;
+ } else {
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo,
+ newdb),
+ 0, &curbp);
+ if (error)
+ return error;
+ }
+ xfs_dir3_data_check(dp, curbp);
+ curdb = newdb;
+ }
+ /*
+ * Point to the data entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
+ /*
+ * Compare the entry and if it's an exact match, return
+ * EEXIST immediately. If it's the first case-insensitive
+ * match, store the block & inode number and continue looking.
+ */
+ cmp = xfs_dir2_compname(args, dep->name, dep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ /* If there is a CI match block, drop it */
+ if (args->cmpresult != XFS_CMP_DIFFERENT &&
+ curdb != state->extrablk.blkno)
+ xfs_trans_brelse(tp, state->extrablk.bp);
+ args->cmpresult = cmp;
+ args->inumber = be64_to_cpu(dep->inumber);
+ args->filetype = xfs_dir2_data_get_ftype(mp, dep);
+ *indexp = index;
+ state->extravalid = 1;
+ state->extrablk.bp = curbp;
+ state->extrablk.blkno = curdb;
+ state->extrablk.index = (int)((char *)dep -
+ (char *)curbp->b_addr);
+ state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+ curbp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
+ if (cmp == XFS_CMP_EXACT)
+ return -EEXIST;
+ }
+ }
+ ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
+ if (curbp) {
+ if (args->cmpresult == XFS_CMP_DIFFERENT) {
+ /* Giving back last used data block. */
+ state->extravalid = 1;
+ state->extrablk.bp = curbp;
+ state->extrablk.index = -1;
+ state->extrablk.blkno = curdb;
+ state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+ curbp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
+ } else {
+ /* If the curbp is not the CI match block, drop it */
+ if (state->extrablk.bp != curbp)
+ xfs_trans_brelse(tp, curbp);
+ }
+ } else {
+ state->extravalid = 0;
+ }
+ *indexp = index;
+ return -ENOENT;
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * If this is an addname then the extrablk in state is a freespace block,
+ * otherwise it's a data block.
+ */
+int
+xfs_dir2_leafn_lookup_int(
+ struct xfs_buf *bp, /* leaf buffer */
+ xfs_da_args_t *args, /* operation arguments */
+ int *indexp, /* out: leaf entry index */
+ xfs_da_state_t *state) /* state to fill in */
+{
+ if (args->op_flags & XFS_DA_OP_ADDNAME)
+ return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp,
+ state);
+ return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state);
+}
+
+/*
+ * Move count leaf entries from source to destination leaf.
+ * Log entries and headers. Stale entries are preserved.
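+ *
+ * Roughly (for illustration): if the copy lands in the middle of the
+ * destination, a hole is first opened by shifting the existing entries up;
+ * the entries are then copied across, stale ones included, and both
+ * headers' count/stale fields are adjusted to match.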
+ */
+static void
+xfs_dir3_leafn_moveents(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *bp_s, /* source */
+ struct xfs_dir3_icleaf_hdr *shdr,
+ struct xfs_dir2_leaf_entry *sents,
+ int start_s,/* source leaf index */
+ struct xfs_buf *bp_d, /* destination */
+ struct xfs_dir3_icleaf_hdr *dhdr,
+ struct xfs_dir2_leaf_entry *dents,
+ int start_d,/* destination leaf index */
+ int count) /* count of leaves to copy */
+{
+ int stale; /* count stale leaves copied */
+
+ trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
+
+ /*
+ * Silently return if nothing to do.
+ */
+ if (count == 0)
+ return;
+
+ /*
+ * If the destination index is not the end of the current
+ * destination leaf entries, open up a hole in the destination
+ * to hold the new entries.
+ */
+ if (start_d < dhdr->count) {
+ memmove(&dents[start_d + count], &dents[start_d],
+ (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir3_leaf_log_ents(args, dhdr, bp_d, start_d + count,
+ count + dhdr->count - 1);
+ }
+ /*
+ * If the source has stale leaves, count the ones in the copy range
+ * so we can update the header correctly.
+ */
+ if (shdr->stale) {
+ int i; /* temp leaf index */
+
+ for (i = start_s, stale = 0; i < start_s + count; i++) {
+ if (sents[i].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ stale++;
+ }
+ } else
+ stale = 0;
+ /*
+ * Copy the leaf entries from source to destination.
+ */
+ memcpy(&dents[start_d], &sents[start_s],
+ count * sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir3_leaf_log_ents(args, dhdr, bp_d, start_d, start_d + count - 1);
+
+ /*
+ * If there are source entries after the ones we copied,
+ * delete the ones we copied by sliding the next ones down.
+ */
+ if (start_s + count < shdr->count) {
+ memmove(&sents[start_s], &sents[start_s + count],
+ count * sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir3_leaf_log_ents(args, shdr, bp_s, start_s,
+ start_s + count - 1);
+ }
+
+ /*
+ * Update the headers and log them.
+ */
+ shdr->count -= count;
+ shdr->stale -= stale;
+ dhdr->count += count;
+ dhdr->stale += stale;
+}
+
+/*
+ * Determine the sort order of two leaf blocks.
+ * Returns 1 if both are valid and leaf2 should be before leaf1, else 0.
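+ *
+ * Example (illustrative): if leaf1 covers hashvals 10..50 and leaf2 covers
+ * 3..9, leaf2's hashes sort lower, so this returns 1 and
+ * xfs_dir2_leafn_rebalance() swaps the two blocks.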
+ */
+int /* sort order */
+xfs_dir2_leafn_order(
+ struct xfs_inode *dp,
+ struct xfs_buf *leaf1_bp, /* leaf1 buffer */
+ struct xfs_buf *leaf2_bp) /* leaf2 buffer */
+{
+ struct xfs_dir2_leaf *leaf1 = leaf1_bp->b_addr;
+ struct xfs_dir2_leaf *leaf2 = leaf2_bp->b_addr;
+ struct xfs_dir2_leaf_entry *ents1;
+ struct xfs_dir2_leaf_entry *ents2;
+ struct xfs_dir3_icleaf_hdr hdr1;
+ struct xfs_dir3_icleaf_hdr hdr2;
+
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr1, leaf1);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf2);
+ ents1 = hdr1.ents;
+ ents2 = hdr2.ents;
+
+ if (hdr1.count > 0 && hdr2.count > 0 &&
+ (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
+ be32_to_cpu(ents2[hdr2.count - 1].hashval) <
+ be32_to_cpu(ents1[hdr1.count - 1].hashval)))
+ return 1;
+ return 0;
+}
+
+/*
+ * Rebalance leaf entries between two leaf blocks.
+ * This is actually only called when the second block is new,
+ * though the code deals with the general case.
+ * A new entry will be inserted in one of the blocks, and that
+ * entry is taken into account when balancing.
+ */
+static void
+xfs_dir2_leafn_rebalance(
+ xfs_da_state_t *state, /* btree cursor */
+ xfs_da_state_blk_t *blk1, /* first btree block */
+ xfs_da_state_blk_t *blk2) /* second btree block */
+{
+ xfs_da_args_t *args; /* operation arguments */
+ int count; /* count (& direction) leaves */
+ int isleft; /* new goes in left leaf */
+ xfs_dir2_leaf_t *leaf1; /* first leaf structure */
+ xfs_dir2_leaf_t *leaf2; /* second leaf structure */
+ int mid; /* midpoint leaf index */
+#if defined(DEBUG) || defined(XFS_WARN)
+ int oldstale; /* old count of stale leaves */
+#endif
+ int oldsum; /* old total leaf count */
+ int swap_blocks; /* swapped leaf blocks */
+ struct xfs_dir2_leaf_entry *ents1;
+ struct xfs_dir2_leaf_entry *ents2;
+ struct xfs_dir3_icleaf_hdr hdr1;
+ struct xfs_dir3_icleaf_hdr hdr2;
+ struct xfs_inode *dp = state->args->dp;
+
+ args = state->args;
+ /*
+ * If the block order is wrong, swap the arguments.
+ */
+ swap_blocks = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp);
+ if (swap_blocks)
+ swap(blk1, blk2);
+
+ leaf1 = blk1->bp->b_addr;
+ leaf2 = blk2->bp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr1, leaf1);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf2);
+ ents1 = hdr1.ents;
+ ents2 = hdr2.ents;
+
+ oldsum = hdr1.count + hdr2.count;
+#if defined(DEBUG) || defined(XFS_WARN)
+ oldstale = hdr1.stale + hdr2.stale;
+#endif
+ mid = oldsum >> 1;
+
+ /*
+	 * If the old combined entry count was odd, the count after adding
+	 * the new entry will be even, so use the new entry's hashval to
+	 * decide which block it belongs in before dividing the entries.
+ */
+ if (oldsum & 1) {
+ xfs_dahash_t midhash; /* middle entry hash value */
+
+ if (mid >= hdr1.count)
+ midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval);
+ else
+ midhash = be32_to_cpu(ents1[mid].hashval);
+ isleft = args->hashval <= midhash;
+ }
+ /*
+ * If the old count is even then the new count is odd, so there's
+ * no preferred side for the new entry.
+ * Pick the left one.
+ */
+ else
+ isleft = 1;
+ /*
+ * Calculate moved entry count. Positive means left-to-right,
+ * negative means right-to-left. Then move the entries.
+ */
+ count = hdr1.count - mid + (isleft == 0);
+ if (count > 0)
+ xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1,
+ hdr1.count - count, blk2->bp,
+ &hdr2, ents2, 0, count);
+ else if (count < 0)
+ xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0,
+ blk1->bp, &hdr1, ents1,
+ hdr1.count, count);
+
+ ASSERT(hdr1.count + hdr2.count == oldsum);
+ ASSERT(hdr1.stale + hdr2.stale == oldstale);
+
+ /* log the changes made when moving the entries */
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf1, &hdr1);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf2, &hdr2);
+ xfs_dir3_leaf_log_header(args, blk1->bp);
+ xfs_dir3_leaf_log_header(args, blk2->bp);
+
+ xfs_dir3_leaf_check(dp, blk1->bp);
+ xfs_dir3_leaf_check(dp, blk2->bp);
+
+ /*
+ * Mark whether we're inserting into the old or new leaf.
+ */
+ if (hdr1.count < hdr2.count)
+ state->inleaf = swap_blocks;
+ else if (hdr1.count > hdr2.count)
+ state->inleaf = !swap_blocks;
+ else
+ state->inleaf = swap_blocks ^ (blk1->index <= hdr1.count);
+ /*
+ * Adjust the expected index for insertion.
+ */
+ if (!state->inleaf)
+ blk2->index = blk1->index - hdr1.count;
+
+ /*
+ * Finally sanity check just to make sure we are not returning a
+ * negative index
+ */
+ if (blk2->index < 0) {
+ state->inleaf = 1;
+ blk2->index = 0;
+ xfs_alert(dp->i_mount,
+ "%s: picked the wrong leaf? reverting original leaf: blk1->index %d",
+ __func__, blk1->index);
+ }
+}
+
+static int
+xfs_dir3_data_block_free(
+ xfs_da_args_t *args,
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_free *free,
+ xfs_dir2_db_t fdb,
+ int findex,
+ struct xfs_buf *fbp,
+ int longest)
+{
+ int logfree = 0;
+ struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_inode *dp = args->dp;
+
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free);
+ if (hdr) {
+ /*
+ * Data block is not empty, just set the free entry to the new
+ * value.
+ */
+ freehdr.bests[findex] = cpu_to_be16(longest);
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex);
+ return 0;
+ }
+
+ /* One less used entry in the free table. */
+ freehdr.nused--;
+
+ /*
+ * If this was the last entry in the table, we can trim the table size
+ * back. There might be other entries at the end referring to
+	 * non-existent data blocks; trim those off too.
+ */
+ if (findex == freehdr.nvalid - 1) {
+ int i; /* free entry index */
+
+ for (i = findex - 1; i >= 0; i--) {
+ if (freehdr.bests[i] != cpu_to_be16(NULLDATAOFF))
+ break;
+ }
+ freehdr.nvalid = i + 1;
+ logfree = 0;
+ } else {
+ /* Not the last entry, just punch it out. */
+ freehdr.bests[findex] = cpu_to_be16(NULLDATAOFF);
+ logfree = 1;
+ }
+
+ xfs_dir2_free_hdr_to_disk(dp->i_mount, free, &freehdr);
+ xfs_dir2_free_log_header(args, fbp);
+
+ /*
+ * If there are no useful entries left in the block, get rid of the
+ * block if we can.
+ */
+ if (!freehdr.nused) {
+ int error;
+
+ error = xfs_dir2_shrink_inode(args, fdb, fbp);
+ if (error == 0) {
+ fbp = NULL;
+ logfree = 0;
+ } else if (error != -ENOSPC || args->total != 0)
+ return error;
+ /*
+ * It's possible to get ENOSPC if there is no
+		 * space reservation. In this case someone
+ * else will eventually get rid of this block.
+ */
+ }
+
+ /* Log the free entry that changed, unless we got rid of it. */
+ if (logfree)
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex);
+ return 0;
+}
+
+/*
+ * Remove an entry from a node directory.
+ * This removes the leaf entry and the data entry,
+ * and updates the free block if necessary.
+ */
+static int /* error */
+xfs_dir2_leafn_remove(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *bp, /* leaf buffer */
+ int index, /* leaf entry index */
+ xfs_da_state_blk_t *dblk, /* data block */
+ int *rval) /* resulting block needs join */
+{
+ struct xfs_da_geometry *geo = args->geo;
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
+ xfs_dir2_db_t db; /* data block number */
+ struct xfs_buf *dbp; /* data block buffer */
+ xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ int longest; /* longest data free entry */
+ int off; /* data block entry offset */
+ int needlog; /* need to log data header */
+ int needscan; /* need to rescan data frees */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_leafn_remove(args, index);
+
+ dp = args->dp;
+ tp = args->trans;
+ leaf = bp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+
+ /*
+ * Point to the entry we're removing.
+ */
+ lep = &leafhdr.ents[index];
+
+ /*
+ * Extract the data block and offset from the entry.
+ */
+ db = xfs_dir2_dataptr_to_db(geo, be32_to_cpu(lep->address));
+ ASSERT(dblk->blkno == db);
+ off = xfs_dir2_dataptr_to_off(geo, be32_to_cpu(lep->address));
+ ASSERT(dblk->index == off);
+
+ /*
+ * Kill the leaf entry by marking it stale.
+ * Log the leaf block changes.
+ */
+ leafhdr.stale++;
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, bp);
+
+ lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, bp, index, index);
+
+ /*
+ * Make the data entry free. Keep track of the longest freespace
+ * in the data block in case it changes.
+ */
+ dbp = dblk->bp;
+ hdr = dbp->b_addr;
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+ longest = be16_to_cpu(bf[0].length);
+ needlog = needscan = 0;
+ xfs_dir2_data_make_free(args, dbp, off,
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog,
+ &needscan);
+ /*
+ * Rescan the data block freespaces for bestfree.
+ * Log the data block header if needed.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(args, dbp);
+ xfs_dir3_data_check(dp, dbp);
+ /*
+ * If the longest data block freespace changes, need to update
+ * the corresponding freeblock entry.
+ */
+ if (longest < be16_to_cpu(bf[0].length)) {
+ int error; /* error return value */
+ struct xfs_buf *fbp; /* freeblock buffer */
+ xfs_dir2_db_t fdb; /* freeblock block number */
+ int findex; /* index in freeblock entries */
+ xfs_dir2_free_t *free; /* freeblock structure */
+
+ /*
+ * Convert the data block number to a free block,
+ * read in the free block.
+ */
+ fdb = xfs_dir2_db_to_fdb(geo, db);
+ error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(geo, fdb),
+ &fbp);
+ if (error)
+ return error;
+ free = fbp->b_addr;
+#ifdef DEBUG
+ {
+ struct xfs_dir3_icfree_hdr freehdr;
+
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free);
+ ASSERT(freehdr.firstdb == geo->free_max_bests *
+ (fdb - xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET)));
+ }
+#endif
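+		/*
+		 * Each freespace block covers geo->free_max_bests data
+		 * blocks (see the ASSERT above), so as a hypothetical
+		 * example with free_max_bests == 200, data block 405 would
+		 * map to the third freespace block with findex 5.
+		 */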
+ /*
+ * Calculate which entry we need to fix.
+ */
+ findex = xfs_dir2_db_to_fdindex(geo, db);
+ longest = be16_to_cpu(bf[0].length);
+ /*
+ * If the data block is now empty we can get rid of it
+ * (usually).
+ */
+ if (longest == geo->blksize - geo->data_entry_offset) {
+ /*
+ * Try to punch out the data block.
+ */
+ error = xfs_dir2_shrink_inode(args, db, dbp);
+ if (error == 0) {
+ dblk->bp = NULL;
+ hdr = NULL;
+ }
+ /*
+ * We can get ENOSPC if there's no space reservation.
+			 * In this case just drop the buffer and someone else
+ * will eventually get rid of the empty block.
+ */
+ else if (!(error == -ENOSPC && args->total == 0))
+ return error;
+ }
+ /*
+ * If we got rid of the data block, we can eliminate that entry
+ * in the free block.
+ */
+ error = xfs_dir3_data_block_free(args, hdr, free,
+ fdb, findex, fbp, longest);
+ if (error)
+ return error;
+ }
+
+ xfs_dir3_leaf_check(dp, bp);
+ /*
+ * Return indication of whether this leaf block is empty enough
+ * to justify trying to join it with a neighbor.
+ */
+ *rval = (geo->leaf_hdr_size +
+		 (uint)sizeof(leafhdr.ents[0]) * (leafhdr.count - leafhdr.stale)) <
+ geo->magicpct;
+ return 0;
+}
+
+/*
+ * Split the leaf entries in the old block into old and new blocks.
+ */
+int /* error */
+xfs_dir2_leafn_split(
+ xfs_da_state_t *state, /* btree cursor */
+ xfs_da_state_blk_t *oldblk, /* original block */
+ xfs_da_state_blk_t *newblk) /* newly created block */
+{
+ xfs_da_args_t *args; /* operation arguments */
+ xfs_dablk_t blkno; /* new leaf block number */
+ int error; /* error return value */
+ struct xfs_inode *dp;
+
+ /*
+ * Allocate space for a new leaf node.
+ */
+ args = state->args;
+ dp = args->dp;
+ ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
+ error = xfs_da_grow_inode(args, &blkno);
+ if (error) {
+ return error;
+ }
+ /*
+ * Initialize the new leaf block.
+ */
+ error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno),
+ &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
+ if (error)
+ return error;
+
+ newblk->blkno = blkno;
+ newblk->magic = XFS_DIR2_LEAFN_MAGIC;
+ /*
+ * Rebalance the entries across the two leaves, link the new
+ * block into the leaves.
+ */
+ xfs_dir2_leafn_rebalance(state, oldblk, newblk);
+ error = xfs_da3_blk_link(state, oldblk, newblk);
+ if (error) {
+ return error;
+ }
+ /*
+ * Insert the new entry in the correct block.
+ */
+ if (state->inleaf)
+ error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index);
+ else
+ error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index);
+ /*
+ * Update last hashval in each block since we added the name.
+ */
+ oldblk->hashval = xfs_dir2_leaf_lasthash(dp, oldblk->bp, NULL);
+ newblk->hashval = xfs_dir2_leaf_lasthash(dp, newblk->bp, NULL);
+ xfs_dir3_leaf_check(dp, oldblk->bp);
+ xfs_dir3_leaf_check(dp, newblk->bp);
+ return error;
+}
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor. Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+int /* error */
+xfs_dir2_leafn_toosmall(
+ xfs_da_state_t *state, /* btree cursor */
+ int *action) /* resulting action to take */
+{
+ xfs_da_state_blk_t *blk; /* leaf block */
+ xfs_dablk_t blkno; /* leaf block number */
+ struct xfs_buf *bp; /* leaf buffer */
+ int bytes; /* bytes in use */
+ int count; /* leaf live entry count */
+ int error; /* error return value */
+ int forward; /* sibling block direction */
+ int i; /* sibling counter */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ int rval; /* result from path_shift */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_inode *dp = state->args->dp;
+
+ /*
+ * Check for the degenerate case of the block being over 50% full.
+ * If so, it's not worth even looking to see if we might be able
+ * to coalesce with a sibling.
+ */
+ blk = &state->path.blk[state->path.active - 1];
+ leaf = blk->bp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+ ents = leafhdr.ents;
+ xfs_dir3_leaf_check(dp, blk->bp);
+
+ count = leafhdr.count - leafhdr.stale;
+ bytes = state->args->geo->leaf_hdr_size + count * sizeof(ents[0]);
+ if (bytes > (state->args->geo->blksize >> 1)) {
+ /*
+ * Blk over 50%, don't try to join.
+ */
+ *action = 0;
+ return 0;
+ }
+ /*
+ * Check for the degenerate case of the block being empty.
+ * If the block is empty, we'll simply delete it, no need to
+ * coalesce it with a sibling block. We choose (arbitrarily)
+ * to merge with the forward block unless it is NULL.
+ */
+ if (count == 0) {
+ /*
+ * Make altpath point to the block we want to keep and
+ * path point to the block we want to drop (this one).
+ */
+ forward = (leafhdr.forw != 0);
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
+ &rval);
+ if (error)
+ return error;
+ *action = rval ? 2 : 0;
+ return 0;
+ }
+ /*
+ * Examine each sibling block to see if we can coalesce with
+ * at least 25% free space to spare. We need to figure out
+ * whether to merge with the forward or the backward block.
+ * We prefer coalescing with the lower numbered sibling so as
+ * to shrink a directory over time.
+ */
+ forward = leafhdr.forw < leafhdr.back;
+ for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
+ struct xfs_dir3_icleaf_hdr hdr2;
+
+ blkno = forward ? leafhdr.forw : leafhdr.back;
+ if (blkno == 0)
+ continue;
+ /*
+ * Read the sibling leaf block.
+ */
+ error = xfs_dir3_leafn_read(state->args->trans, dp, blkno, &bp);
+ if (error)
+ return error;
+
+ /*
+ * Count bytes in the two blocks combined.
+ */
+ count = leafhdr.count - leafhdr.stale;
+ bytes = state->args->geo->blksize -
+ (state->args->geo->blksize >> 2);
+
+ leaf = bp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf);
+ ents = hdr2.ents;
+ count += hdr2.count - hdr2.stale;
+ bytes -= count * sizeof(ents[0]);
+
+ /*
+ * Fits with at least 25% to spare.
+ */
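+		/*
+		 * As a hypothetical example with a 4k directory block, the
+		 * threshold computed above is 3072 bytes, i.e. roughly 384
+		 * live 8-byte leaf entries across the two blocks.
+		 */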
+ if (bytes >= 0)
+ break;
+ xfs_trans_brelse(state->args->trans, bp);
+ }
+ /*
+ * Didn't like either block, give up.
+ */
+ if (i >= 2) {
+ *action = 0;
+ return 0;
+ }
+
+ /*
+ * Make altpath point to the block we want to keep (the lower
+ * numbered block) and path point to the block we want to drop.
+ */
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ if (blkno < blk->blkno)
+ error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
+ &rval);
+ else
+ error = xfs_da3_path_shift(state, &state->path, forward, 0,
+ &rval);
+ if (error) {
+ return error;
+ }
+ *action = rval ? 0 : 1;
+ return 0;
+}
+
+/*
+ * Move all the leaf entries from drop_blk to save_blk.
+ * This is done as part of a join operation.
+ */
+void
+xfs_dir2_leafn_unbalance(
+ xfs_da_state_t *state, /* cursor */
+ xfs_da_state_blk_t *drop_blk, /* dead block */
+ xfs_da_state_blk_t *save_blk) /* surviving block */
+{
+ xfs_da_args_t *args; /* operation arguments */
+ xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */
+ xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */
+ struct xfs_dir3_icleaf_hdr savehdr;
+ struct xfs_dir3_icleaf_hdr drophdr;
+ struct xfs_dir2_leaf_entry *sents;
+ struct xfs_dir2_leaf_entry *dents;
+ struct xfs_inode *dp = state->args->dp;
+
+ args = state->args;
+ ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ drop_leaf = drop_blk->bp->b_addr;
+ save_leaf = save_blk->bp->b_addr;
+
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &savehdr, save_leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &drophdr, drop_leaf);
+ sents = savehdr.ents;
+ dents = drophdr.ents;
+
+ /*
+ * If there are any stale leaf entries, take this opportunity
+ * to purge them.
+ */
+ if (drophdr.stale)
+ xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp);
+ if (savehdr.stale)
+ xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp);
+
+ /*
+ * Move the entries from drop to the appropriate end of save.
+ */
+ drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval);
+ if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp))
+ xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+ save_blk->bp, &savehdr, sents, 0,
+ drophdr.count);
+ else
+ xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+ save_blk->bp, &savehdr, sents,
+ savehdr.count, drophdr.count);
+ save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
+
+ /* log the changes made when moving the entries */
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, save_leaf, &savehdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, drop_leaf, &drophdr);
+ xfs_dir3_leaf_log_header(args, save_blk->bp);
+ xfs_dir3_leaf_log_header(args, drop_blk->bp);
+
+ xfs_dir3_leaf_check(dp, save_blk->bp);
+ xfs_dir3_leaf_check(dp, drop_blk->bp);
+}
+
+/*
+ * Add a new data block to the directory at the free space index that the caller
+ * has specified.
+ */
+static int
+xfs_dir2_node_add_datablk(
+ struct xfs_da_args *args,
+ struct xfs_da_state_blk *fblk,
+ xfs_dir2_db_t *dbno,
+ struct xfs_buf **dbpp,
+ struct xfs_buf **fbpp,
+ struct xfs_dir3_icfree_hdr *hdr,
+ int *findex)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_data_free *bf;
+ xfs_dir2_db_t fbno;
+ struct xfs_buf *fbp;
+ struct xfs_buf *dbp;
+ int error;
+
+ /* Not allowed to allocate, return failure. */
+ if (args->total == 0)
+ return -ENOSPC;
+
+ /* Allocate and initialize the new data block. */
+ error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbno);
+ if (error)
+ return error;
+ error = xfs_dir3_data_init(args, *dbno, &dbp);
+ if (error)
+ return error;
+
+ /*
+ * Get the freespace block corresponding to the data block
+ * that was just allocated.
+ */
+ fbno = xfs_dir2_db_to_fdb(args->geo, *dbno);
+ error = xfs_dir2_free_try_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, fbno), &fbp);
+ if (error)
+ return error;
+
+ /*
+ * If there wasn't a freespace block, the read will
+ * return a NULL fbp. Allocate and initialize a new one.
+ */
+ if (!fbp) {
+ error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fbno);
+ if (error)
+ return error;
+
+ if (XFS_IS_CORRUPT(mp,
+ xfs_dir2_db_to_fdb(args->geo, *dbno) !=
+ fbno)) {
+ xfs_alert(mp,
+"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld",
+ __func__, (unsigned long long)dp->i_ino,
+ (long long)xfs_dir2_db_to_fdb(args->geo, *dbno),
+ (long long)*dbno, (long long)fbno);
+ if (fblk) {
+ xfs_alert(mp,
+ " fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
+ fblk, (unsigned long long)fblk->blkno,
+ fblk->index, fblk->magic);
+ } else {
+ xfs_alert(mp, " ... fblk is NULL");
+ }
+ return -EFSCORRUPTED;
+ }
+
+ /* Get a buffer for the new block. */
+ error = xfs_dir3_free_get_buf(args, fbno, &fbp);
+ if (error)
+ return error;
+ xfs_dir2_free_hdr_from_disk(mp, hdr, fbp->b_addr);
+
+ /* Remember the first slot as our empty slot. */
+ hdr->firstdb = (fbno - xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET)) *
+ args->geo->free_max_bests;
+ } else {
+ xfs_dir2_free_hdr_from_disk(mp, hdr, fbp->b_addr);
+ }
+
+ /* Set the freespace block index from the data block number. */
+ *findex = xfs_dir2_db_to_fdindex(args->geo, *dbno);
+
+ /* Extend the freespace table if the new data block is off the end. */
+ if (*findex >= hdr->nvalid) {
+ ASSERT(*findex < args->geo->free_max_bests);
+ hdr->nvalid = *findex + 1;
+ hdr->bests[*findex] = cpu_to_be16(NULLDATAOFF);
+ }
+
+ /*
+ * If this entry was for an empty data block (this should always be
+ * true) then update the header.
+ */
+ if (hdr->bests[*findex] == cpu_to_be16(NULLDATAOFF)) {
+ hdr->nused++;
+ xfs_dir2_free_hdr_to_disk(mp, fbp->b_addr, hdr);
+ xfs_dir2_free_log_header(args, fbp);
+ }
+
+ /* Update the freespace value for the new block in the table. */
+ bf = xfs_dir2_data_bestfree_p(mp, dbp->b_addr);
+ hdr->bests[*findex] = bf[0].length;
+
+ *dbpp = dbp;
+ *fbpp = fbp;
+ return 0;
+}
+
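+/*
+ * Find a freespace entry pointing to a data block with at least "length"
+ * bytes free, preferring the block the caller's lookup already found.
+ * Returns with *dbnop set to the data block number, or to -1 if no
+ * existing data block has enough room and the caller must allocate one.
+ */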
+static int
+xfs_dir2_node_find_freeblk(
+ struct xfs_da_args *args,
+ struct xfs_da_state_blk *fblk,
+ xfs_dir2_db_t *dbnop,
+ struct xfs_buf **fbpp,
+ struct xfs_dir3_icfree_hdr *hdr,
+ int *findexp,
+ int length)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_buf *fbp = NULL;
+ xfs_dir2_db_t firstfbno;
+ xfs_dir2_db_t lastfbno;
+ xfs_dir2_db_t ifbno = -1;
+ xfs_dir2_db_t dbno = -1;
+ xfs_dir2_db_t fbno;
+ xfs_fileoff_t fo;
+ int findex = 0;
+ int error;
+
+ /*
+ * If we came in with a freespace block that means that lookup
+ * found an entry with our hash value. This is the freespace
+ * block for that data entry.
+ */
+ if (fblk) {
+ fbp = fblk->bp;
+ findex = fblk->index;
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, hdr, fbp->b_addr);
+ if (findex >= 0) {
+ /* caller already found the freespace for us. */
+ ASSERT(findex < hdr->nvalid);
+ ASSERT(be16_to_cpu(hdr->bests[findex]) != NULLDATAOFF);
+ ASSERT(be16_to_cpu(hdr->bests[findex]) >= length);
+ dbno = hdr->firstdb + findex;
+ goto found_block;
+ }
+
+ /*
+ * The data block looked at didn't have enough room.
+ * We'll start at the beginning of the freespace entries.
+ */
+ ifbno = fblk->blkno;
+ xfs_trans_brelse(tp, fbp);
+ fbp = NULL;
+ fblk->bp = NULL;
+ }
+
+ /*
+ * If we don't have a data block yet, we're going to scan the freespace
+ * data for a data block with enough free space in it.
+ */
+ error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK);
+ if (error)
+ return error;
+ lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
+ firstfbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET);
+
+ for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) {
+ /* If it's ifbno we already looked at it. */
+ if (fbno == ifbno)
+ continue;
+
+ /*
+ * Read the block. There can be holes in the freespace blocks,
+ * so this might not succeed. This should be really rare, so
+ * there's no reason to avoid it.
+ */
+ error = xfs_dir2_free_try_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, fbno),
+ &fbp);
+ if (error)
+ return error;
+ if (!fbp)
+ continue;
+
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, hdr, fbp->b_addr);
+
+ /* Scan the free entry array for a large enough free space. */
+ for (findex = hdr->nvalid - 1; findex >= 0; findex--) {
+ if (be16_to_cpu(hdr->bests[findex]) != NULLDATAOFF &&
+ be16_to_cpu(hdr->bests[findex]) >= length) {
+ dbno = hdr->firstdb + findex;
+ goto found_block;
+ }
+ }
+
+ /* Didn't find free space, go on to next free block */
+ xfs_trans_brelse(tp, fbp);
+ }
+
+found_block:
+ *dbnop = dbno;
+ *fbpp = fbp;
+ *findexp = findex;
+ return 0;
+}
+
+/*
+ * Add the data entry for a node-format directory name addition.
+ * The leaf entry is added in xfs_dir2_leafn_add.
+ * We may enter with a freespace block that the lookup found.
+ */
+static int
+xfs_dir2_node_addname_int(
+ struct xfs_da_args *args, /* operation arguments */
+ struct xfs_da_state_blk *fblk) /* optional freespace block */
+{
+ struct xfs_dir2_data_unused *dup; /* data unused entry pointer */
+ struct xfs_dir2_data_entry *dep; /* data entry pointer */
+ struct xfs_dir2_data_hdr *hdr; /* data block header */
+ struct xfs_dir2_data_free *bf;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_buf *dbp; /* data block buffer */
+ struct xfs_buf *fbp; /* freespace buffer */
+ xfs_dir2_data_aoff_t aoff;
+ xfs_dir2_db_t dbno; /* data block number */
+ int error; /* error return value */
+ int findex; /* freespace entry index */
+ int length; /* length of the new entry */
+ int logfree = 0; /* need to log free entry */
+ int needlog = 0; /* need to log data header */
+ int needscan = 0; /* need to rescan data frees */
+ __be16 *tagp; /* data entry tag pointer */
+
+ length = xfs_dir2_data_entsize(dp->i_mount, args->namelen);
+ error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &freehdr,
+ &findex, length);
+ if (error)
+ return error;
+
+ /*
+ * Now we know if we must allocate blocks, so if we are checking whether
+ * we can insert without allocation then we can return now.
+ */
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+ if (dbno == -1)
+ return -ENOSPC;
+ return 0;
+ }
+
+ /*
+ * If we don't have a data block, we need to allocate one and make
+ * the freespace entries refer to it.
+ */
+ if (dbno == -1) {
+ /* we're going to have to log the free block index later */
+ logfree = 1;
+ error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp,
+ &freehdr, &findex);
+ } else {
+ /* Read the data block in. */
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, dbno),
+ 0, &dbp);
+ }
+ if (error)
+ return error;
+
+	/* Set up to use the data block now. */
+ hdr = dbp->b_addr;
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+ ASSERT(be16_to_cpu(bf[0].length) >= length);
+
+ /* Point to the existing unused space. */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+
+	/* Mark the first part of the unused space as in use by us. */
+ aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+ error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length,
+ &needlog, &needscan);
+ if (error) {
+ xfs_trans_brelse(tp, dbp);
+ return error;
+ }
+
+ /* Fill in the new entry and log it. */
+ dep = (xfs_dir2_data_entry_t *)dup;
+ dep->inumber = cpu_to_be64(args->inumber);
+ dep->namelen = args->namelen;
+ memcpy(dep->name, args->name, dep->namelen);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ xfs_dir2_data_log_entry(args, dbp, dep);
+
+ /* Rescan the freespace and log the data block if needed. */
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(args, dbp);
+
+ /* If the freespace block entry is now wrong, update it. */
+ if (freehdr.bests[findex] != bf[0].length) {
+ freehdr.bests[findex] = bf[0].length;
+ logfree = 1;
+ }
+
+ /* Log the freespace entry if needed. */
+ if (logfree)
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex);
+
+ /* Return the data block and offset in args. */
+ args->blkno = (xfs_dablk_t)dbno;
+ args->index = be16_to_cpu(*tagp);
+ return 0;
+}
+
+/*
+ * Top-level node form directory addname routine.
+ */
+int /* error */
+xfs_dir2_node_addname(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_da_state_blk_t *blk; /* leaf block for insert */
+ int error; /* error return value */
+ int rval; /* sub-return value */
+ xfs_da_state_t *state; /* btree cursor */
+
+ trace_xfs_dir2_node_addname(args);
+
+ /*
+ * Allocate and initialize the state (btree cursor).
+ */
+ state = xfs_da_state_alloc(args);
+ /*
+ * Look up the name. We're not supposed to find it, but
+ * this gives us the insertion point.
+ */
+ error = xfs_da3_node_lookup_int(state, &rval);
+ if (error)
+ rval = error;
+ if (rval != -ENOENT) {
+ goto done;
+ }
+ /*
+ * Add the data entry to a data block.
+ * Extravalid is set to a freeblock found by lookup.
+ */
+ rval = xfs_dir2_node_addname_int(args,
+ state->extravalid ? &state->extrablk : NULL);
+ if (rval) {
+ goto done;
+ }
+ blk = &state->path.blk[state->path.active - 1];
+ ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ /*
+ * Add the new leaf entry.
+ */
+ rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
+ if (rval == 0) {
+ /*
+ * It worked, fix the hash values up the btree.
+ */
+ if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
+ xfs_da3_fixhashpath(state, &state->path);
+ } else {
+ /*
+ * It didn't work, we need to split the leaf block.
+ */
+ if (args->total == 0) {
+ ASSERT(rval == -ENOSPC);
+ goto done;
+ }
+ /*
+ * Split the leaf block and insert the new entry.
+ */
+ rval = xfs_da3_split(state);
+ }
+done:
+ xfs_da_state_free(state);
+ return rval;
+}
+
+/*
+ * Lookup an entry in a node-format directory.
+ * All the real work happens in xfs_da3_node_lookup_int.
+ * The only real output is the inode number of the entry.
+ */
+int /* error */
+xfs_dir2_node_lookup(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ int error; /* error return value */
+ int i; /* btree level */
+ int rval; /* operation return value */
+ xfs_da_state_t *state; /* btree cursor */
+
+ trace_xfs_dir2_node_lookup(args);
+
+ /*
+ * Allocate and initialize the btree cursor.
+ */
+ state = xfs_da_state_alloc(args);
+
+ /*
+ * Fill in the path to the entry in the cursor.
+ */
+ error = xfs_da3_node_lookup_int(state, &rval);
+ if (error)
+ rval = error;
+ else if (rval == -ENOENT && args->cmpresult == XFS_CMP_CASE) {
+ /* If a CI match, dup the actual name and return -EEXIST */
+ xfs_dir2_data_entry_t *dep;
+
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)state->extrablk.bp->b_addr +
+ state->extrablk.index);
+ rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+ }
+ /*
+ * Release the btree blocks and leaf block.
+ */
+ for (i = 0; i < state->path.active; i++) {
+ xfs_trans_brelse(args->trans, state->path.blk[i].bp);
+ state->path.blk[i].bp = NULL;
+ }
+ /*
+ * Release the data block if we have it.
+ */
+ if (state->extravalid && state->extrablk.bp) {
+ xfs_trans_brelse(args->trans, state->extrablk.bp);
+ state->extrablk.bp = NULL;
+ }
+ xfs_da_state_free(state);
+ return rval;
+}
+
+/*
+ * Remove an entry from a node-format directory.
+ */
+int /* error */
+xfs_dir2_node_removename(
+ struct xfs_da_args *args) /* operation arguments */
+{
+ struct xfs_da_state_blk *blk; /* leaf block */
+ int error; /* error return value */
+ int rval; /* operation return value */
+ struct xfs_da_state *state; /* btree cursor */
+
+ trace_xfs_dir2_node_removename(args);
+
+ /*
+ * Allocate and initialize the btree cursor.
+ */
+ state = xfs_da_state_alloc(args);
+
+ /* Look up the entry we're deleting, set up the cursor. */
+ error = xfs_da3_node_lookup_int(state, &rval);
+ if (error)
+ goto out_free;
+
+ /* Didn't find it, upper layer screwed up. */
+ if (rval != -EEXIST) {
+ error = rval;
+ goto out_free;
+ }
+
+ blk = &state->path.blk[state->path.active - 1];
+ ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(state->extravalid);
+ /*
+ * Remove the leaf and data entries.
+ * Extrablk refers to the data block.
+ */
+ error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
+ &state->extrablk, &rval);
+ if (error)
+ goto out_free;
+ /*
+ * Fix the hash values up the btree.
+ */
+ xfs_da3_fixhashpath(state, &state->path);
+ /*
+ * If we need to join leaf blocks, do it.
+ */
+ if (rval && state->path.active > 1)
+ error = xfs_da3_join(state);
+ /*
+ * If no errors so far, try conversion to leaf format.
+ */
+ if (!error)
+ error = xfs_dir2_node_to_leaf(state);
+out_free:
+ xfs_da_state_free(state);
+ return error;
+}
+
+/*
+ * Replace an entry's inode number in a node-format directory.
+ */
+int /* error */
+xfs_dir2_node_replace(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_da_state_blk_t *blk; /* leaf block */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
+ xfs_dir2_data_entry_t *dep; /* data entry changed */
+ int error; /* error return value */
+ int i; /* btree level */
+ xfs_ino_t inum; /* new inode number */
+ int ftype; /* new file type */
+ int rval; /* internal return value */
+ xfs_da_state_t *state; /* btree cursor */
+
+ trace_xfs_dir2_node_replace(args);
+
+ /*
+ * Allocate and initialize the btree cursor.
+ */
+ state = xfs_da_state_alloc(args);
+
+ /*
+	 * We have to save the new inode number and ftype since
+	 * xfs_da3_node_lookup_int() is going to overwrite them.
+ */
+ inum = args->inumber;
+ ftype = args->filetype;
+
+ /*
+ * Lookup the entry to change in the btree.
+ */
+ error = xfs_da3_node_lookup_int(state, &rval);
+ if (error) {
+ rval = error;
+ }
+ /*
+ * It should be found, since the vnodeops layer has looked it up
+ * and locked it. But paranoia is good.
+ */
+ if (rval == -EEXIST) {
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ /*
+ * Find the leaf entry.
+ */
+ blk = &state->path.blk[state->path.active - 1];
+ ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(state->extravalid);
+
+ xfs_dir2_leaf_hdr_from_disk(state->mp, &leafhdr,
+ blk->bp->b_addr);
+ /*
+ * Point to the data entry.
+ */
+ hdr = state->extrablk.bp->b_addr;
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(leafhdr.ents[blk->index].address)));
+ ASSERT(inum != be64_to_cpu(dep->inumber));
+ /*
+ * Fill in the new inode number and log the entry.
+ */
+ dep->inumber = cpu_to_be64(inum);
+ xfs_dir2_data_put_ftype(state->mp, dep, ftype);
+ xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
+ rval = 0;
+ }
+ /*
+ * Didn't find it, and we're holding a data block. Drop it.
+ */
+ else if (state->extravalid) {
+ xfs_trans_brelse(args->trans, state->extrablk.bp);
+ state->extrablk.bp = NULL;
+ }
+ /*
+ * Release all the buffers in the cursor.
+ */
+ for (i = 0; i < state->path.active; i++) {
+ xfs_trans_brelse(args->trans, state->path.blk[i].bp);
+ state->path.blk[i].bp = NULL;
+ }
+ xfs_da_state_free(state);
+ return rval;
+}
+
+/*
+ * Trim off a trailing empty freespace block.
+ * Return (in rvalp) 1 if we did it, 0 if not.
+ */
+int /* error */
+xfs_dir2_node_trim_free(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_fileoff_t fo, /* free block number */
+ int *rvalp) /* out: did something */
+{
+ struct xfs_buf *bp; /* freespace buffer */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ xfs_dir2_free_t *free; /* freespace structure */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icfree_hdr freehdr;
+
+ dp = args->dp;
+ tp = args->trans;
+
+ *rvalp = 0;
+
+ /*
+ * Read the freespace block.
+ */
+ error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
+ if (error)
+ return error;
+ /*
+ * There can be holes in freespace. If fo is a hole, there's
+ * nothing to do.
+ */
+ if (!bp)
+ return 0;
+ free = bp->b_addr;
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free);
+
+ /*
+ * If there are used entries, there's nothing to do.
+ */
+ if (freehdr.nused > 0) {
+ xfs_trans_brelse(tp, bp);
+ return 0;
+ }
+ /*
+ * Blow the block away.
+ */
+ error = xfs_dir2_shrink_inode(args,
+ xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp);
+ if (error) {
+ /*
+ * Can't fail with ENOSPC since that only happens with no
+ * space reservation, when breaking up an extent into two
+ * pieces. This is the last block of an extent.
+ */
+ ASSERT(error != -ENOSPC);
+ xfs_trans_brelse(tp, bp);
+ return error;
+ }
+ /*
+ * Return that we succeeded.
+ */
+ *rvalp = 1;
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
new file mode 100644
index 000000000..7404a9ff1
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -0,0 +1,209 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_DIR2_PRIV_H__
+#define __XFS_DIR2_PRIV_H__
+
+struct dir_context;
+
+/*
+ * In-core version of the leaf and free block headers to abstract the
+ * differences in the v2 and v3 disk format of the headers.
+ */
+struct xfs_dir3_icleaf_hdr {
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t stale;
+
+ /*
+ * Pointer to the on-disk format entries, which are behind the
+ * variable size (v4 vs v5) header in the on-disk block.
+ */
+ struct xfs_dir2_leaf_entry *ents;
+};
+
+struct xfs_dir3_icfree_hdr {
+ uint32_t magic;
+ uint32_t firstdb;
+ uint32_t nvalid;
+ uint32_t nused;
+
+ /*
+ * Pointer to the on-disk format entries, which are behind the
+ * variable size (v4 vs v5) header in the on-disk block.
+ */
+ __be16 *bests;
+};
+
+/* xfs_dir2.c */
+xfs_dahash_t xfs_ascii_ci_hashname(const struct xfs_name *name);
+enum xfs_dacmp xfs_ascii_ci_compname(struct xfs_da_args *args,
+ const unsigned char *name, int len);
+extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
+ xfs_dir2_db_t *dbp);
+extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
+ const unsigned char *name, int len);
+
+
+/* xfs_dir2_block.c */
+extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_buf **bpp);
+extern int xfs_dir2_block_addname(struct xfs_da_args *args);
+extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_block_removename(struct xfs_da_args *args);
+extern int xfs_dir2_block_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
+ struct xfs_buf *lbp, struct xfs_buf *dbp);
+
+/* xfs_dir2_data.c */
+struct xfs_dir2_data_free *xfs_dir2_data_bestfree_p(struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr);
+__be16 *xfs_dir2_data_entry_tag_p(struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep);
+uint8_t xfs_dir2_data_get_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep);
+void xfs_dir2_data_put_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep, uint8_t ftype);
+
+#ifdef DEBUG
+extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+#else
+#define xfs_dir3_data_check(dp,bp)
+#endif
+
+extern xfs_failaddr_t __xfs_dir3_data_check(struct xfs_inode *dp,
+ struct xfs_buf *bp);
+int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp);
+int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
+ unsigned int flags);
+
+extern struct xfs_dir2_data_free *
+xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup,
+ int *loghead);
+extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
+ struct xfs_buf **bpp);
+
+/* xfs_dir2_leaf.c */
+void xfs_dir2_leaf_hdr_from_disk(struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *to, struct xfs_dir2_leaf *from);
+void xfs_dir2_leaf_hdr_to_disk(struct xfs_mount *mp, struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from);
+int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, struct xfs_buf **bpp);
+int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, struct xfs_buf **bpp);
+extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
+ struct xfs_buf *dbp);
+extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
+extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
+ struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp);
+extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents, int *indexp,
+ int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
+extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
+ struct xfs_buf **bpp, uint16_t magic);
+extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
+ struct xfs_dir3_icleaf_hdr *hdr, struct xfs_buf *bp, int first,
+ int last);
+extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
+ struct xfs_buf *bp);
+extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
+ struct xfs_buf *lbp);
+extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
+ struct xfs_buf *lbp, xfs_dir2_db_t db);
+extern struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents, int index, int compact,
+ int lowstale, int highstale, int *lfloglow, int *lfloghigh);
+extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
+
+extern xfs_failaddr_t xfs_dir3_leaf_check_int(struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf,
+ bool expensive_checks);
+
+/* xfs_dir2_node.c */
+void xfs_dir2_free_hdr_from_disk(struct xfs_mount *mp,
+ struct xfs_dir3_icfree_hdr *to, struct xfs_dir2_free *from);
+extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
+ struct xfs_buf *lbp);
+extern xfs_dahash_t xfs_dir2_leaf_lasthash(struct xfs_inode *dp,
+ struct xfs_buf *bp, int *count);
+extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp,
+ struct xfs_da_args *args, int *indexp,
+ struct xfs_da_state *state);
+extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp,
+ struct xfs_buf *leaf2_bp);
+extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk);
+extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
+extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk);
+extern int xfs_dir2_node_addname(struct xfs_da_args *args);
+extern int xfs_dir2_node_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_node_removename(struct xfs_da_args *args);
+extern int xfs_dir2_node_replace(struct xfs_da_args *args);
+extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
+ int *rvalp);
+extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, struct xfs_buf **bpp);
+
+/* xfs_dir2_sf.c */
+xfs_ino_t xfs_dir2_sf_get_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep);
+xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *hdr);
+void xfs_dir2_sf_put_parent_ino(struct xfs_dir2_sf_hdr *hdr, xfs_ino_t ino);
+uint8_t xfs_dir2_sf_get_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep);
+struct xfs_dir2_sf_entry *xfs_dir2_sf_nextentry(struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep);
+extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
+ struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
+extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
+ int size, xfs_dir2_sf_hdr_t *sfhp);
+extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
+extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
+extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
+extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip);
+int xfs_dir2_sf_entsize(struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr, int len);
+void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino);
+void xfs_dir2_sf_put_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep, uint8_t ftype);
+
+/* xfs_dir2_readdir.c */
+extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct dir_context *ctx, size_t bufsize);
+
+static inline unsigned int
+xfs_dir2_data_entsize(
+ struct xfs_mount *mp,
+ unsigned int namelen)
+{
+ unsigned int len;
+
+ len = offsetof(struct xfs_dir2_data_entry, name[0]) + namelen +
+ sizeof(xfs_dir2_data_off_t) /* tag */;
+ if (xfs_has_ftype(mp))
+ len += sizeof(uint8_t);
+ return round_up(len, XFS_DIR2_DATA_ALIGN);
+}
+
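+/*
+ * For illustration, assuming the usual layout of struct xfs_dir2_data_entry
+ * (8-byte inumber followed by a 1-byte namelen): with ftype enabled, a
+ * 10-byte name needs 9 + 10 + 1 (ftype) + 2 (tag) = 22 bytes, which
+ * xfs_dir2_data_entsize() rounds up to the next multiple of
+ * XFS_DIR2_DATA_ALIGN, i.e. 24 bytes with the usual 8-byte alignment.
+ */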
+xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp,
+ const struct xfs_name *name);
+enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args,
+ const unsigned char *name, int len);
+
+#endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
new file mode 100644
index 000000000..8cd37e6e9
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -0,0 +1,1293 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trace.h"
+
+/*
+ * Prototypes for internal functions.
+ */
+static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
+ xfs_dir2_sf_entry_t *sfep,
+ xfs_dir2_data_aoff_t offset,
+ int new_isize);
+static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
+ int new_isize);
+static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
+ xfs_dir2_sf_entry_t **sfepp,
+ xfs_dir2_data_aoff_t *offsetp);
+#ifdef DEBUG
+static void xfs_dir2_sf_check(xfs_da_args_t *args);
+#else
+#define xfs_dir2_sf_check(args)
+#endif /* DEBUG */
+
+static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
+static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
+
+int
+xfs_dir2_sf_entsize(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ int len)
+{
+ int count = len;
+
+ count += sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
+ count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
+
+ if (xfs_has_ftype(mp))
+ count += sizeof(uint8_t);
+ return count;
+}
+
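+/*
+ * For illustration: with 4-byte inode numbers and the ftype feature enabled,
+ * an 8-byte name costs 8 + 3 (the namelen byte plus the 2-byte offset that
+ * make up struct xfs_dir2_sf_entry) + 4 (ino) + 1 (ftype) = 16 bytes of
+ * shortform space.
+ */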
+struct xfs_dir2_sf_entry *
+xfs_dir2_sf_nextentry(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return (void *)sfep + xfs_dir2_sf_entsize(mp, hdr, sfep->namelen);
+}
+
+/*
+ * In short-form directory entries the inode numbers are stored at a variable
+ * offset behind the entry name. If the entry stores a filetype value, then it
+ * sits between the name and the inode number. The actual inode numbers can
+ * come in two formats as well, either 4 bytes or 8 bytes wide.
+ */
+xfs_ino_t
+xfs_dir2_sf_get_ino(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ uint8_t *from = sfep->name + sfep->namelen;
+
+ if (xfs_has_ftype(mp))
+ from++;
+
+ if (!hdr->i8count)
+ return get_unaligned_be32(from);
+ return get_unaligned_be64(from) & XFS_MAXINUMBER;
+}
+
+void
+xfs_dir2_sf_put_ino(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep,
+ xfs_ino_t ino)
+{
+ uint8_t *to = sfep->name + sfep->namelen;
+
+ ASSERT(ino <= XFS_MAXINUMBER);
+
+ if (xfs_has_ftype(mp))
+ to++;
+
+ if (hdr->i8count)
+ put_unaligned_be64(ino, to);
+ else
+ put_unaligned_be32(ino, to);
+}
+
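+/*
+ * Note: a regular shortform entry is laid out as
+ *	[namelen:1][offset:2][name:namelen][ftype:0 or 1][inumber:4 or 8]
+ * which is why the accessors above step past name + namelen (plus the
+ * optional ftype byte) to reach the inode number. The parent inode
+ * accessors below instead use the fixed "parent" field in the header.
+ */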
+xfs_ino_t
+xfs_dir2_sf_get_parent_ino(
+ struct xfs_dir2_sf_hdr *hdr)
+{
+ if (!hdr->i8count)
+ return get_unaligned_be32(hdr->parent);
+ return get_unaligned_be64(hdr->parent) & XFS_MAXINUMBER;
+}
+
+void
+xfs_dir2_sf_put_parent_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ xfs_ino_t ino)
+{
+ ASSERT(ino <= XFS_MAXINUMBER);
+
+ if (hdr->i8count)
+ put_unaligned_be64(ino, hdr->parent);
+ else
+ put_unaligned_be32(ino, hdr->parent);
+}
+
+/*
+ * The file type field is stored at the end of the name for filetype enabled
+ * shortform directories, or not at all otherwise.
+ */
+uint8_t
+xfs_dir2_sf_get_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ if (xfs_has_ftype(mp)) {
+ uint8_t ftype = sfep->name[sfep->namelen];
+
+ if (ftype < XFS_DIR3_FT_MAX)
+ return ftype;
+ }
+
+ return XFS_DIR3_FT_UNKNOWN;
+}
+
+void
+xfs_dir2_sf_put_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep,
+ uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+
+ if (xfs_has_ftype(mp))
+ sfep->name[sfep->namelen] = ftype;
+}
+
+/*
+ * Given a block directory (dp/block), calculate its size as a shortform (sf)
+ * directory and a header for the sf directory, if it will fit in the
+ * space currently present in the inode. If it won't fit, the output
+ * size is too big (but not accurate).
+ */
+int /* size for sf form */
+xfs_dir2_block_sfsize(
+ xfs_inode_t *dp, /* incore inode pointer */
+ xfs_dir2_data_hdr_t *hdr, /* block directory data */
+ xfs_dir2_sf_hdr_t *sfhp) /* output: header for sf form */
+{
+ xfs_dir2_dataptr_t addr; /* data entry address */
+ xfs_dir2_leaf_entry_t *blp; /* leaf area of the block */
+ xfs_dir2_block_tail_t *btp; /* tail area of the block */
+ int count; /* shortform entry count */
+ xfs_dir2_data_entry_t *dep; /* data entry in the block */
+ int i; /* block entry index */
+ int i8count; /* count of big-inode entries */
+ int isdot; /* entry is "." */
+ int isdotdot; /* entry is ".." */
+ xfs_mount_t *mp; /* mount structure pointer */
+ int namelen; /* total name bytes */
+ xfs_ino_t parent = 0; /* parent inode number */
+	int			size = 0; /* total computed size */
+ int has_ftype;
+ struct xfs_da_geometry *geo;
+
+ mp = dp->i_mount;
+ geo = mp->m_dir_geo;
+
+ /*
+ * if there is a filetype field, add the extra byte to the namelen
+ * for each entry that we see.
+ */
+ has_ftype = xfs_has_ftype(mp) ? 1 : 0;
+
+ count = i8count = namelen = 0;
+ btp = xfs_dir2_block_tail_p(geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+
+ /*
+ * Iterate over the block's data entries by using the leaf pointers.
+ */
+ for (i = 0; i < be32_to_cpu(btp->count); i++) {
+ if ((addr = be32_to_cpu(blp[i].address)) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Calculate the pointer to the entry at hand.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(geo, addr));
+ /*
+ * Detect . and .., so we can special-case them.
+ * . is not included in sf directories.
+ * .. is included by just the parent inode number.
+ */
+ isdot = dep->namelen == 1 && dep->name[0] == '.';
+ isdotdot =
+ dep->namelen == 2 &&
+ dep->name[0] == '.' && dep->name[1] == '.';
+
+ if (!isdot)
+ i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
+
+ /* take into account the file type field */
+ if (!isdot && !isdotdot) {
+ count++;
+ namelen += dep->namelen + has_ftype;
+ } else if (isdotdot)
+ parent = be64_to_cpu(dep->inumber);
+ /*
+ * Calculate the new size, see if we should give up yet.
+ */
+ size = xfs_dir2_sf_hdr_size(i8count) + /* header */
+ count * 3 * sizeof(u8) + /* namelen + offset */
+ namelen + /* name */
+ (i8count ? /* inumber */
+ count * XFS_INO64_SIZE :
+ count * XFS_INO32_SIZE);
+ if (size > xfs_inode_data_fork_size(dp))
+ return size; /* size value is a failure */
+ }
+ /*
+ * Create the output header, if it worked.
+ */
+ sfhp->count = count;
+ sfhp->i8count = i8count;
+ xfs_dir2_sf_put_parent_ino(sfhp, parent);
+ return size;
+}
+
+/*
+ * Convert a block format directory to shortform.
+ * Caller has already checked that it will fit, and built us a header.
+ */
+int /* error */
+xfs_dir2_block_to_sf(
+ struct xfs_da_args *args, /* operation arguments */
+ struct xfs_buf *bp,
+ int size, /* shortform directory size */
+ struct xfs_dir2_sf_hdr *sfhp) /* shortform directory hdr */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int error; /* error return value */
+ int logflags; /* inode logging flags */
+ struct xfs_dir2_sf_entry *sfep; /* shortform entry */
+ struct xfs_dir2_sf_hdr *sfp; /* shortform directory header */
+ unsigned int offset = args->geo->data_entry_offset;
+ unsigned int end;
+
+ trace_xfs_dir2_block_to_sf(args);
+
+ /*
+ * Allocate a temporary destination buffer the size of the inode to
+ * format the data into. Once we have formatted the data, we can free
+ * the block and copy the formatted data into the inode literal area.
+ */
+ sfp = kmem_alloc(mp->m_sb.sb_inodesize, 0);
+ memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
+
+ /*
+ * Loop over the active and unused entries. Stop when we reach the
+ * leaf/tail portion of the block.
+ */
+ end = xfs_dir3_data_end_offset(args->geo, bp->b_addr);
+ sfep = xfs_dir2_sf_firstentry(sfp);
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
+
+ /*
+ * If it's unused, just skip over it.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ offset += be16_to_cpu(dup->length);
+ continue;
+ }
+
+ /*
+ * Skip .
+ */
+ if (dep->namelen == 1 && dep->name[0] == '.')
+ ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino);
+ /*
+ * Skip .., but make sure the inode number is right.
+ */
+ else if (dep->namelen == 2 &&
+ dep->name[0] == '.' && dep->name[1] == '.')
+ ASSERT(be64_to_cpu(dep->inumber) ==
+ xfs_dir2_sf_get_parent_ino(sfp));
+ /*
+ * Normal entry, copy it into shortform.
+ */
+ else {
+ sfep->namelen = dep->namelen;
+ xfs_dir2_sf_put_offset(sfep, offset);
+ memcpy(sfep->name, dep->name, dep->namelen);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
+ be64_to_cpu(dep->inumber));
+ xfs_dir2_sf_put_ftype(mp, sfep,
+ xfs_dir2_data_get_ftype(mp, dep));
+
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
+ }
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
+ }
+ ASSERT((char *)sfep - (char *)sfp == size);
+
+ /* now we are done with the block, we can shrink the inode */
+ logflags = XFS_ILOG_CORE;
+ error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
+ if (error) {
+ ASSERT(error != -ENOSPC);
+ goto out;
+ }
+
+ /*
+ * The buffer is now unconditionally gone, whether
+ * xfs_dir2_shrink_inode worked or not.
+ *
+ * Convert the inode to local format and copy the data in.
+ */
+ ASSERT(dp->i_df.if_bytes == 0);
+ xfs_init_local_fork(dp, XFS_DATA_FORK, sfp, size);
+ dp->i_df.if_format = XFS_DINODE_FMT_LOCAL;
+ dp->i_disk_size = size;
+
+ logflags |= XFS_ILOG_DDATA;
+ xfs_dir2_sf_check(args);
+out:
+ xfs_trans_log_inode(args->trans, dp, logflags);
+ kmem_free(sfp);
+ return error;
+}
+
+/*
+ * Add a name to a shortform directory.
+ * There are two algorithms, "easy" and "hard", which we decide on
+ * before changing anything.
+ * Convert to block form if the new entry won't fit as shortform.
+ */
+int /* error */
+xfs_dir2_sf_addname(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ int incr_isize; /* total change in size */
+ int new_isize; /* size after adding name */
+ int objchange; /* changing to 8-byte inodes */
+ xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */
+ int pick; /* which algorithm to use */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */
+
+ trace_xfs_dir2_sf_addname(args);
+
+ ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
+ dp = args->dp;
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
+ ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+ /*
+ * Compute entry (and change in) size.
+ */
+ incr_isize = xfs_dir2_sf_entsize(dp->i_mount, sfp, args->namelen);
+ objchange = 0;
+
+ /*
+ * Do we have to change to 8 byte inodes?
+ */
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
+ /*
+ * Yes, adjust the inode size. old count + (parent + new)
+ */
+ incr_isize += (sfp->count + 2) * XFS_INO64_DIFF;
+ objchange = 1;
+ }
+
+ new_isize = (int)dp->i_disk_size + incr_isize;
+ /*
+ * Won't fit as shortform any more (due to size),
+ * or the pick routine says it won't (due to offset values).
+ */
+ if (new_isize > xfs_inode_data_fork_size(dp) ||
+ (pick =
+ xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
+ /*
+ * Just checking or no space reservation, it doesn't fit.
+ */
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+ return -ENOSPC;
+ /*
+ * Convert to block form then add the name.
+ */
+ error = xfs_dir2_sf_to_block(args);
+ if (error)
+ return error;
+ return xfs_dir2_block_addname(args);
+ }
+ /*
+ * Just checking, it fits.
+ */
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+ return 0;
+ /*
+ * Do it the easy way - just add it at the end.
+ */
+ if (pick == 1)
+ xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize);
+ /*
+ * Do it the hard way - look for a place to insert the new entry.
+ * Convert to 8 byte inode numbers first if necessary.
+ */
+ else {
+ ASSERT(pick == 2);
+ if (objchange)
+ xfs_dir2_sf_toino8(args);
+ xfs_dir2_sf_addname_hard(args, objchange, new_isize);
+ }
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ return 0;
+}
+
+/*
+ * Add the new entry the "easy" way.
+ * This is copying the old directory and adding the new entry at the end.
+ * Since it's sorted by "offset" we need room after the last offset
+ * that's already there, and then room to convert to a block directory.
+ * This is already checked by the pick routine.
+ */
+static void
+xfs_dir2_sf_addname_easy(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_dir2_sf_entry_t *sfep, /* pointer to new entry */
+ xfs_dir2_data_aoff_t offset, /* offset to use for new ent */
+ int new_isize) /* new directory size */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int byteoff; /* byte offset in sf dir */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ byteoff = (int)((char *)sfep - (char *)sfp);
+ /*
+ * Grow the in-inode space.
+ */
+ xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen),
+ XFS_DATA_FORK);
+ /*
+ * Need to set up again due to realloc of the inode data.
+ */
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
+ /*
+ * Fill in the new entry.
+ */
+ sfep->namelen = args->namelen;
+ xfs_dir2_sf_put_offset(sfep, offset);
+ memcpy(sfep->name, args->name, sfep->namelen);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep, args->inumber);
+ xfs_dir2_sf_put_ftype(mp, sfep, args->filetype);
+
+ /*
+ * Update the header and inode.
+ */
+ sfp->count++;
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
+ sfp->i8count++;
+ dp->i_disk_size = new_isize;
+ xfs_dir2_sf_check(args);
+}
+
+/*
+ * Add the new entry the "hard" way.
+ * The caller has already converted to 8 byte inode numbers if necessary,
+ * in which case we need to leave the i8count at 1.
+ * Find a hole that the new entry will fit into, and copy
+ * the first part of the entries, the new entry, and the last part of
+ * the entries.
+ */
+/* ARGSUSED */
+static void
+xfs_dir2_sf_addname_hard(
+ xfs_da_args_t *args, /* operation arguments */
+ int objchange, /* changing inode number size */
+ int new_isize) /* new directory size */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int add_datasize; /* data size need for new ent */
+ char *buf; /* buffer for old */
+ int eof; /* reached end of old dir */
+ int nbytes; /* temp for byte copies */
+ xfs_dir2_data_aoff_t new_offset; /* next offset value */
+ xfs_dir2_data_aoff_t offset; /* current offset value */
+ int old_isize; /* previous size */
+ xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */
+ xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */
+ xfs_dir2_sf_entry_t *sfep; /* entry in new dir */
+ xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */
+
+ /*
+ * Copy the old directory to a temporary buffer.
+ */
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ old_isize = (int)dp->i_disk_size;
+ buf = kmem_alloc(old_isize, 0);
+ oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+ memcpy(oldsfp, sfp, old_isize);
+ /*
+ * Loop over the old directory finding the place we're going
+ * to insert the new entry.
+ * If it's going to end up at the end then oldsfep will point there.
+ */
+ for (offset = args->geo->data_first_offset,
+ oldsfep = xfs_dir2_sf_firstentry(oldsfp),
+ add_datasize = xfs_dir2_data_entsize(mp, args->namelen),
+ eof = (char *)oldsfep == &buf[old_isize];
+ !eof;
+ offset = new_offset + xfs_dir2_data_entsize(mp, oldsfep->namelen),
+ oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep),
+ eof = (char *)oldsfep == &buf[old_isize]) {
+ new_offset = xfs_dir2_sf_get_offset(oldsfep);
+ if (offset + add_datasize <= new_offset)
+ break;
+ }
+ /*
+ * Get rid of the old directory, then allocate space for
+ * the new one. We do this so xfs_idata_realloc won't copy
+ * the data.
+ */
+ xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
+ xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
+ /*
+ * Reset the pointer since the buffer was reallocated.
+ */
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ /*
+ * Copy the first part of the directory, including the header.
+ */
+ nbytes = (int)((char *)oldsfep - (char *)oldsfp);
+ memcpy(sfp, oldsfp, nbytes);
+ sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes);
+ /*
+ * Fill in the new entry, and update the header counts.
+ */
+ sfep->namelen = args->namelen;
+ xfs_dir2_sf_put_offset(sfep, offset);
+ memcpy(sfep->name, args->name, sfep->namelen);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep, args->inumber);
+ xfs_dir2_sf_put_ftype(mp, sfep, args->filetype);
+ sfp->count++;
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
+ sfp->i8count++;
+ /*
+ * If there's more left to copy, do that.
+ */
+ if (!eof) {
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
+ memcpy(sfep, oldsfep, old_isize - nbytes);
+ }
+ kmem_free(buf);
+ dp->i_disk_size = new_isize;
+ xfs_dir2_sf_check(args);
+}
+
+/*
+ * Decide if the new entry will fit at all.
+ * If it will fit, pick between adding the new entry to the end (easy)
+ * or somewhere else (hard).
+ * Return 0 (won't fit), 1 (easy), 2 (hard).
+ */
+/*ARGSUSED*/
+static int /* pick result */
+xfs_dir2_sf_addname_pick(
+ xfs_da_args_t *args, /* operation arguments */
+ int objchange, /* inode # size changes */
+ xfs_dir2_sf_entry_t **sfepp, /* out(1): new entry ptr */
+ xfs_dir2_data_aoff_t *offsetp) /* out(1): new offset */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int holefit; /* found hole it will fit in */
+ int i; /* entry number */
+ xfs_dir2_data_aoff_t offset; /* data block offset */
+ xfs_dir2_sf_entry_t *sfep; /* shortform entry */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ int size; /* entry's data size */
+ int used; /* data bytes used */
+
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ size = xfs_dir2_data_entsize(mp, args->namelen);
+ offset = args->geo->data_first_offset;
+ sfep = xfs_dir2_sf_firstentry(sfp);
+ holefit = 0;
+ /*
+ * Loop over sf entries.
+ * Keep track of data offset and whether we've seen a place
+ * to insert the new entry.
+ */
+ for (i = 0; i < sfp->count; i++) {
+ if (!holefit)
+ holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
+ offset = xfs_dir2_sf_get_offset(sfep) +
+ xfs_dir2_data_entsize(mp, sfep->namelen);
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
+ }
+ /*
+ * Calculate data bytes used excluding the new entry, if this
+ * was a data block (block form directory).
+ */
+ used = offset +
+ (sfp->count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+ (uint)sizeof(xfs_dir2_block_tail_t);
+ /*
+ * If it won't fit even in block form then we can't insert it as
+ * shortform either; the caller will go back, convert to block form,
+ * then try the insert there and convert to leaf form if needed.
+ */
+ if (used + (holefit ? 0 : size) > args->geo->blksize)
+ return 0;
+ /*
+ * If changing the inode number size, do it the hard way.
+ */
+ if (objchange)
+ return 2;
+ /*
+ * If it won't fit at the end then do it the hard way (use the hole).
+ */
+ if (used + size > args->geo->blksize)
+ return 2;
+ /*
+ * Do it the easy way.
+ */
+ *sfepp = sfep;
+ *offsetp = offset;
+ return 1;
+}
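Editorial aside (not part of the patch): the fit test above projects what the directory would cost as a single block-form directory block. A minimal standalone sketch of that arithmetic, with the 8-byte leaf-entry and block-tail sizes assumed purely for illustration:

#include <stdbool.h>
#include <stdint.h>

#define LEAF_ENTRY_SIZE 8       /* assumed sizeof(xfs_dir2_leaf_entry_t) */
#define BLOCK_TAIL_SIZE 8       /* assumed sizeof(xfs_dir2_block_tail_t) */

/*
 * data_used: bytes of data-block space consumed by ".", ".." and the
 * existing entries; nentries: sfp->count.  The "+ 3" mirrors the
 * (sfp->count + 3) term above (".", "..", plus the new entry).
 */
static bool would_fit_as_one_block(uint32_t data_used, uint32_t new_entsize,
                                   uint32_t nentries, uint32_t dirblksize,
                                   bool found_hole)
{
        uint32_t used = data_used +
                        (nentries + 3) * LEAF_ENTRY_SIZE + BLOCK_TAIL_SIZE;

        if (!found_hole)
                used += new_entsize;
        return used <= dirblksize;
}

If this returns false the shortform code gives up, matching the "return 0" path above, and the caller converts to block (and possibly leaf) format.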
+
+#ifdef DEBUG
+/*
+ * Check consistency of shortform directory, assert if bad.
+ */
+static void
+xfs_dir2_sf_check(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int i; /* entry number */
+ int i8count; /* number of big inode#s */
+ xfs_ino_t ino; /* entry inode number */
+ int offset; /* data offset */
+ xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ offset = args->geo->data_first_offset;
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
+ i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
+
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
+ i < sfp->count;
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
+ ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
+ i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
+ offset =
+ xfs_dir2_sf_get_offset(sfep) +
+ xfs_dir2_data_entsize(mp, sfep->namelen);
+ ASSERT(xfs_dir2_sf_get_ftype(mp, sfep) < XFS_DIR3_FT_MAX);
+ }
+ ASSERT(i8count == sfp->i8count);
+ ASSERT((char *)sfep - (char *)sfp == dp->i_disk_size);
+ ASSERT(offset +
+ (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+ (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize);
+}
+#endif /* DEBUG */
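Editorial aside (not part of the patch): a rough picture of the packed shortform layout that xfs_dir2_sf_check() walks and xfs_dir2_sf_verify() below re-validates. Field widths here are from memory and meant only as orientation; the authoritative definitions live in xfs_da_format.h.

/*
 * Approximate shortform directory layout (illustrative only):
 *
 *   header:  count (1 byte)
 *            i8count (1 byte)
 *            parent inode (4 bytes, or 8 if i8count != 0)
 *   entry:   namelen (1 byte)
 *            offset (2 bytes)   -- where the entry would land in block form
 *            name (namelen bytes)
 *            [file type byte]   -- only if the ftype feature is enabled
 *            inode number (4 or 8 bytes, per i8count)
 *
 * Nothing is aligned, which is why every field is read and written through
 * the xfs_dir2_sf_get_* and xfs_dir2_sf_put_* helpers rather than by direct
 * member access.
 */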
+
+/* Verify the consistency of an inline directory. */
+xfs_failaddr_t
+xfs_dir2_sf_verify(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ struct xfs_dir2_sf_hdr *sfp;
+ struct xfs_dir2_sf_entry *sfep;
+ struct xfs_dir2_sf_entry *next_sfep;
+ char *endp;
+ xfs_ino_t ino;
+ int i;
+ int i8count;
+ int offset;
+ int64_t size;
+ int error;
+ uint8_t filetype;
+
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+
+ sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data;
+ size = ifp->if_bytes;
+
+ /*
+ * Give up if the directory is way too short.
+ */
+ if (size <= offsetof(struct xfs_dir2_sf_hdr, parent) ||
+ size < xfs_dir2_sf_hdr_size(sfp->i8count))
+ return __this_address;
+
+ endp = (char *)sfp + size;
+
+ /* Check .. entry */
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
+ i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
+ error = xfs_dir_ino_validate(mp, ino);
+ if (error)
+ return __this_address;
+ offset = mp->m_dir_geo->data_first_offset;
+
+ /* Check all reported entries */
+ sfep = xfs_dir2_sf_firstentry(sfp);
+ for (i = 0; i < sfp->count; i++) {
+ /*
+ * struct xfs_dir2_sf_entry has a variable length.
+ * Check the fixed-offset parts of the structure are
+ * within the data buffer.
+ */
+ if (((char *)sfep + sizeof(*sfep)) >= endp)
+ return __this_address;
+
+ /* Don't allow names with known bad length. */
+ if (sfep->namelen == 0)
+ return __this_address;
+
+ /*
+ * Check that the variable-length part of the structure is
+ * within the data buffer. The next entry starts after the
+ * name component, so nextentry is an acceptable test.
+ */
+ next_sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
+ if (endp < (char *)next_sfep)
+ return __this_address;
+
+ /* Check that the offsets always increase. */
+ if (xfs_dir2_sf_get_offset(sfep) < offset)
+ return __this_address;
+
+ /* Check the inode number. */
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
+ i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
+ error = xfs_dir_ino_validate(mp, ino);
+ if (error)
+ return __this_address;
+
+ /* Check the file type. */
+ filetype = xfs_dir2_sf_get_ftype(mp, sfep);
+ if (filetype >= XFS_DIR3_FT_MAX)
+ return __this_address;
+
+ offset = xfs_dir2_sf_get_offset(sfep) +
+ xfs_dir2_data_entsize(mp, sfep->namelen);
+
+ sfep = next_sfep;
+ }
+ if (i8count != sfp->i8count)
+ return __this_address;
+ if ((void *)sfep != (void *)endp)
+ return __this_address;
+
+ /* Make sure this whole thing ought to be in local format. */
+ if (offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+ (uint)sizeof(xfs_dir2_block_tail_t) > mp->m_dir_geo->blksize)
+ return __this_address;
+
+ return NULL;
+}
+
+/*
+ * Create a new (shortform) directory.
+ */
+int /* error, always 0 */
+xfs_dir2_sf_create(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_ino_t pino) /* parent inode number */
+{
+ xfs_inode_t *dp; /* incore directory inode */
+ int i8count; /* parent inode is an 8-byte number */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ int size; /* directory size */
+
+ trace_xfs_dir2_sf_create(args);
+
+ dp = args->dp;
+
+ ASSERT(dp != NULL);
+ ASSERT(dp->i_disk_size == 0);
+ /*
+ * If it's currently a zero-length extent file,
+ * convert it to local format.
+ */
+ if (dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS) {
+ dp->i_df.if_format = XFS_DINODE_FMT_LOCAL;
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+ }
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_df.if_bytes == 0);
+ i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
+ size = xfs_dir2_sf_hdr_size(i8count);
+ /*
+ * Make a buffer for the data.
+ */
+ xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+ /*
+ * Fill in the header,
+ */
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp->i8count = i8count;
+ /*
+ * Now we can put in the inode number, since i8count is set.
+ */
+ xfs_dir2_sf_put_parent_ino(sfp, pino);
+ sfp->count = 0;
+ dp->i_disk_size = size;
+ xfs_dir2_sf_check(args);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ return 0;
+}
+
+/*
+ * Lookup an entry in a shortform directory.
+ * Returns -EEXIST if found, -ENOENT if not found.
+ */
+int /* error */
+xfs_dir2_sf_lookup(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int i; /* entry index */
+ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ enum xfs_dacmp cmp; /* comparison result */
+ xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */
+
+ trace_xfs_dir2_sf_lookup(args);
+
+ xfs_dir2_sf_check(args);
+
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
+ ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+ /*
+ * Special case for .
+ */
+ if (args->namelen == 1 && args->name[0] == '.') {
+ args->inumber = dp->i_ino;
+ args->cmpresult = XFS_CMP_EXACT;
+ args->filetype = XFS_DIR3_FT_DIR;
+ return -EEXIST;
+ }
+ /*
+ * Special case for ..
+ */
+ if (args->namelen == 2 &&
+ args->name[0] == '.' && args->name[1] == '.') {
+ args->inumber = xfs_dir2_sf_get_parent_ino(sfp);
+ args->cmpresult = XFS_CMP_EXACT;
+ args->filetype = XFS_DIR3_FT_DIR;
+ return -EEXIST;
+ }
+ /*
+ * Loop over all the entries trying to match ours.
+ */
+ ci_sfep = NULL;
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
+ /*
+ * Compare name and if it's an exact match, return the inode
+ * number. If it's the first case-insensitive match, store the
+ * inode number and continue looking for an exact match.
+ */
+ cmp = xfs_dir2_compname(args, sfep->name, sfep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ args->cmpresult = cmp;
+ args->inumber = xfs_dir2_sf_get_ino(mp, sfp, sfep);
+ args->filetype = xfs_dir2_sf_get_ftype(mp, sfep);
+ if (cmp == XFS_CMP_EXACT)
+ return -EEXIST;
+ ci_sfep = sfep;
+ }
+ }
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ /*
+ * Here, we can only be doing a lookup (not a rename or replace).
+ * If a case-insensitive match was not found, return -ENOENT.
+ */
+ if (!ci_sfep)
+ return -ENOENT;
+ /* otherwise process the CI match as required by the caller */
+ return xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
+}
+
+/*
+ * Remove an entry from a shortform directory.
+ */
+int /* error */
+xfs_dir2_sf_removename(
+ xfs_da_args_t *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int byteoff; /* offset of removed entry */
+ int entsize; /* this entry's size */
+ int i; /* shortform entry index */
+ int newsize; /* new inode size */
+ int oldsize; /* old inode size */
+ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+
+ trace_xfs_dir2_sf_removename(args);
+
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ oldsize = (int)dp->i_disk_size;
+ ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent));
+ ASSERT(dp->i_df.if_bytes == oldsize);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count));
+ /*
+ * Loop over the old directory entries.
+ * Find the one we're deleting.
+ */
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
+ if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+ XFS_CMP_EXACT) {
+ ASSERT(xfs_dir2_sf_get_ino(mp, sfp, sfep) ==
+ args->inumber);
+ break;
+ }
+ }
+ /*
+ * Didn't find it.
+ */
+ if (i == sfp->count)
+ return -ENOENT;
+ /*
+ * Calculate sizes.
+ */
+ byteoff = (int)((char *)sfep - (char *)sfp);
+ entsize = xfs_dir2_sf_entsize(mp, sfp, args->namelen);
+ newsize = oldsize - entsize;
+ /*
+ * Copy the part, if any, after the removed entry, sliding it down.
+ */
+ if (byteoff + entsize < oldsize)
+ memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize,
+ oldsize - (byteoff + entsize));
+ /*
+ * Fix up the header and file size.
+ */
+ sfp->count--;
+ dp->i_disk_size = newsize;
+ /*
+ * Reallocate, making it smaller.
+ */
+ xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ /*
+ * Are we changing inode number size?
+ */
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+ if (sfp->i8count == 1)
+ xfs_dir2_sf_toino4(args);
+ else
+ sfp->i8count--;
+ }
+ xfs_dir2_sf_check(args);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ return 0;
+}
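Editorial aside (not part of the patch): the remove path above is the classic "slide the tail down, then shrink" idiom for packed variable-length records. A minimal sketch of just that step, with hypothetical names:

#include <stddef.h>
#include <string.h>

/* Remove entsize bytes at byteoff from a packed buffer of oldsize bytes. */
static size_t remove_packed_record(char *buf, size_t oldsize,
                                   size_t byteoff, size_t entsize)
{
        if (byteoff + entsize < oldsize)
                memmove(buf + byteoff, buf + byteoff + entsize,
                        oldsize - (byteoff + entsize));
        return oldsize - entsize;  /* new size; caller shrinks the buffer */
}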
+
+/*
+ * Check whether the sf dir replace operation needs more blocks.
+ */
+static bool
+xfs_dir2_sf_replace_needblock(
+ struct xfs_inode *dp,
+ xfs_ino_t inum)
+{
+ int newsize;
+ struct xfs_dir2_sf_hdr *sfp;
+
+ if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL)
+ return false;
+
+ sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data;
+ newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
+
+ return inum > XFS_DIR2_MAX_SHORT_INUM &&
+ sfp->i8count == 0 && newsize > xfs_inode_data_fork_size(dp);
+}
+
+/*
+ * Replace the inode number of an entry in a shortform directory.
+ */
+int /* error */
+xfs_dir2_sf_replace(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int i; /* entry index */
+ xfs_ino_t ino = 0; /* entry old inode number */
+ int i8elevated; /* sf_toino8 set i8count=1 */
+ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+
+ trace_xfs_dir2_sf_replace(args);
+
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
+ ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+
+ /*
+ * The new inode number is large, so we need to convert to 8-byte inodes.
+ */
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
+ int error; /* error return value */
+
+ /*
+ * Won't fit as shortform, convert to block then do replace.
+ */
+ if (xfs_dir2_sf_replace_needblock(dp, args->inumber)) {
+ error = xfs_dir2_sf_to_block(args);
+ if (error)
+ return error;
+ return xfs_dir2_block_replace(args);
+ }
+ /*
+ * Still fits, convert to 8-byte now.
+ */
+ xfs_dir2_sf_toino8(args);
+ i8elevated = 1;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ } else
+ i8elevated = 0;
+
+ ASSERT(args->namelen != 1 || args->name[0] != '.');
+ /*
+ * Replace ..'s entry.
+ */
+ if (args->namelen == 2 &&
+ args->name[0] == '.' && args->name[1] == '.') {
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
+ ASSERT(args->inumber != ino);
+ xfs_dir2_sf_put_parent_ino(sfp, args->inumber);
+ }
+ /*
+ * Normal entry, look for the name.
+ */
+ else {
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
+ if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+ XFS_CMP_EXACT) {
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
+ ASSERT(args->inumber != ino);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
+ args->inumber);
+ xfs_dir2_sf_put_ftype(mp, sfep, args->filetype);
+ break;
+ }
+ }
+ /*
+ * Didn't find it.
+ */
+ if (i == sfp->count) {
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ if (i8elevated)
+ xfs_dir2_sf_toino4(args);
+ return -ENOENT;
+ }
+ }
+ /*
+ * See if the old number was large, the new number is small.
+ */
+ if (ino > XFS_DIR2_MAX_SHORT_INUM &&
+ args->inumber <= XFS_DIR2_MAX_SHORT_INUM) {
+ /*
+ * And the old count was one, so we need to convert to small.
+ */
+ if (sfp->i8count == 1)
+ xfs_dir2_sf_toino4(args);
+ else
+ sfp->i8count--;
+ }
+ /*
+ * See if the old number was small, the new number is large.
+ */
+ if (ino <= XFS_DIR2_MAX_SHORT_INUM &&
+ args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+ /*
+ * Add to the i8count unless we just converted to 8-byte
+ * inodes (which does an implied i8count = 1).
+ */
+ ASSERT(sfp->i8count != 0);
+ if (!i8elevated)
+ sfp->i8count++;
+ }
+ xfs_dir2_sf_check(args);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
+ return 0;
+}
+
+/*
+ * Convert from 8-byte inode numbers to 4-byte inode numbers.
+ * The last 8-byte inode number is gone, but the count is still 1.
+ */
+static void
+xfs_dir2_sf_toino4(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ char *buf; /* old dir's buffer */
+ int i; /* entry index */
+ int newsize; /* new inode size */
+ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
+ xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */
+ int oldsize; /* old inode size */
+ xfs_dir2_sf_entry_t *sfep; /* new sf entry */
+ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
+
+ trace_xfs_dir2_sf_toino4(args);
+
+ /*
+ * Copy the old directory to the buffer.
+ * Then nuke it from the inode, and add the new buffer to the inode.
+ * Don't want xfs_idata_realloc copying the data here.
+ */
+ oldsize = dp->i_df.if_bytes;
+ buf = kmem_alloc(oldsize, 0);
+ oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(oldsfp->i8count == 1);
+ memcpy(buf, oldsfp, oldsize);
+ /*
+ * Compute the new inode size.
+ */
+ newsize = oldsize - (oldsfp->count + 1) * XFS_INO64_DIFF;
+ xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+ xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+ /*
+ * Reset our pointers, the data has moved.
+ */
+ oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ /*
+ * Fill in the new header.
+ */
+ sfp->count = oldsfp->count;
+ sfp->i8count = 0;
+ xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp));
+ /*
+ * Copy the entries field by field.
+ */
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+ oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+ i < sfp->count;
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep),
+ oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep)) {
+ sfep->namelen = oldsfep->namelen;
+ memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
+ memcpy(sfep->name, oldsfep->name, sfep->namelen);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
+ xfs_dir2_sf_get_ino(mp, oldsfp, oldsfep));
+ xfs_dir2_sf_put_ftype(mp, sfep,
+ xfs_dir2_sf_get_ftype(mp, oldsfep));
+ }
+ /*
+ * Clean up the inode.
+ */
+ kmem_free(buf);
+ dp->i_disk_size = newsize;
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
+
+/*
+ * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers.
+ * The new entry w/ an 8-byte inode number is not there yet; we leave with
+ * i8count set to 1, but no corresponding 8-byte entry.
+ */
+static void
+xfs_dir2_sf_toino8(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ char *buf; /* old dir's buffer */
+ int i; /* entry index */
+ int newsize; /* new inode size */
+ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
+ xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */
+ int oldsize; /* old inode size */
+ xfs_dir2_sf_entry_t *sfep; /* new sf entry */
+ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
+
+ trace_xfs_dir2_sf_toino8(args);
+
+ /*
+ * Copy the old directory to the buffer.
+ * Then nuke it from the inode, and add the new buffer to the inode.
+ * Don't want xfs_idata_realloc copying the data here.
+ */
+ oldsize = dp->i_df.if_bytes;
+ buf = kmem_alloc(oldsize, 0);
+ oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(oldsfp->i8count == 0);
+ memcpy(buf, oldsfp, oldsize);
+ /*
+ * Compute the new inode size (nb: entry count + 1 for parent)
+ */
+ newsize = oldsize + (oldsfp->count + 1) * XFS_INO64_DIFF;
+ xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+ xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+ /*
+ * Reset our pointers, the data has moved.
+ */
+ oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ /*
+ * Fill in the new header.
+ */
+ sfp->count = oldsfp->count;
+ sfp->i8count = 1;
+ xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp));
+ /*
+ * Copy the entries field by field.
+ */
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+ oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+ i < sfp->count;
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep),
+ oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep)) {
+ sfep->namelen = oldsfep->namelen;
+ memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
+ memcpy(sfep->name, oldsfep->name, sfep->namelen);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
+ xfs_dir2_sf_get_ino(mp, oldsfp, oldsfep));
+ xfs_dir2_sf_put_ftype(mp, sfep,
+ xfs_dir2_sf_get_ftype(mp, oldsfep));
+ }
+ /*
+ * Clean up the inode.
+ */
+ kmem_free(buf);
+ dp->i_disk_size = newsize;
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
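Editorial worked example (assuming XFS_INO64_DIFF is the 4-byte difference between an 8-byte and a 4-byte inode number): a shortform directory with count == 3 entries occupying, say, 50 bytes grows by (3 + 1) * 4 = 16 bytes to 66 bytes in xfs_dir2_sf_toino8(), the "+ 1" covering the parent inode number in the header; xfs_dir2_sf_toino4() above applies the same arithmetic as a subtraction.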
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
new file mode 100644
index 000000000..15a362e2f
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -0,0 +1,325 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_error.h"
+
+int
+xfs_calc_dquots_per_chunk(
+ unsigned int nbblks) /* basic block units */
+{
+ ASSERT(nbblks > 0);
+ return BBTOB(nbblks) / sizeof(struct xfs_dqblk);
+}
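Editorial worked example (sizes assumed for illustration): BBTOB() converts 512-byte basic blocks to bytes, so for an 8-basic-block buffer BBTOB(8) = 4096 bytes; if sizeof(struct xfs_dqblk) were 136 bytes, xfs_calc_dquots_per_chunk(8) would return 4096 / 136 = 30 dquots per chunk.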
+
+/*
+ * Do some primitive error checking on ondisk dquot data structures.
+ *
+ * The xfs_dqblk structure /contains/ the xfs_disk_dquot structure;
+ * we verify them separately because at some points we have only the
+ * smaller xfs_disk_dquot structure available.
+ */
+
+xfs_failaddr_t
+xfs_dquot_verify(
+ struct xfs_mount *mp,
+ struct xfs_disk_dquot *ddq,
+ xfs_dqid_t id) /* used only during quotacheck */
+{
+ __u8 ddq_type;
+
+ /*
+ * We can encounter an uninitialized dquot buffer for 2 reasons:
+ * 1. If we crash while deleting the quotainode(s), and those blks got
+ * used for user data. This is because we take the path of regular
+ * file deletion; however, the size field of quotainodes is never
+ * updated, so all the tricks that we play in itruncate_finish
+ * don't quite matter.
+ *
+ * 2. We don't replay the quota buffers when there's a quotaoff logitem.
+ * But the allocation will be replayed so we'll end up with an
+ * uninitialized quota block.
+ *
+ * This is all fine; things are still consistent, and we haven't lost
+ * any quota information. Just don't complain about bad dquot blks.
+ */
+ if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC))
+ return __this_address;
+ if (ddq->d_version != XFS_DQUOT_VERSION)
+ return __this_address;
+
+ if (ddq->d_type & ~XFS_DQTYPE_ANY)
+ return __this_address;
+ ddq_type = ddq->d_type & XFS_DQTYPE_REC_MASK;
+ if (ddq_type != XFS_DQTYPE_USER &&
+ ddq_type != XFS_DQTYPE_PROJ &&
+ ddq_type != XFS_DQTYPE_GROUP)
+ return __this_address;
+
+ if ((ddq->d_type & XFS_DQTYPE_BIGTIME) &&
+ !xfs_has_bigtime(mp))
+ return __this_address;
+
+ if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && !ddq->d_id)
+ return __this_address;
+
+ if (id != -1 && id != be32_to_cpu(ddq->d_id))
+ return __this_address;
+
+ if (!ddq->d_id)
+ return NULL;
+
+ if (ddq->d_blk_softlimit &&
+ be64_to_cpu(ddq->d_bcount) > be64_to_cpu(ddq->d_blk_softlimit) &&
+ !ddq->d_btimer)
+ return __this_address;
+
+ if (ddq->d_ino_softlimit &&
+ be64_to_cpu(ddq->d_icount) > be64_to_cpu(ddq->d_ino_softlimit) &&
+ !ddq->d_itimer)
+ return __this_address;
+
+ if (ddq->d_rtb_softlimit &&
+ be64_to_cpu(ddq->d_rtbcount) > be64_to_cpu(ddq->d_rtb_softlimit) &&
+ !ddq->d_rtbtimer)
+ return __this_address;
+
+ return NULL;
+}
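Editorial aside (not part of the patch): a portable toy version of the "failure address" convention this verifier uses, where NULL means the structure checks out and any non-NULL value identifies the failing check. The kernel pins the report to a code address via __this_address; this sketch returns a string instead so it stays standalone:

#include <stddef.h>
#include <stdint.h>

struct record { uint16_t magic; uint8_t version; };

static const char *record_verify(const struct record *r)
{
        if (r->magic != 0x4451)         /* arbitrary example magic */
                return "bad magic";
        if (r->version != 1)
                return "bad version";
        return NULL;                    /* all checks passed */
}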
+
+xfs_failaddr_t
+xfs_dqblk_verify(
+ struct xfs_mount *mp,
+ struct xfs_dqblk *dqb,
+ xfs_dqid_t id) /* used only during quotacheck */
+{
+ if (xfs_has_crc(mp) &&
+ !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+
+ return xfs_dquot_verify(mp, &dqb->dd_diskdq, id);
+}
+
+/*
+ * Reset an ondisk dquot block to a freshly initialized state.
+ */
+void
+xfs_dqblk_repair(
+ struct xfs_mount *mp,
+ struct xfs_dqblk *dqb,
+ xfs_dqid_t id,
+ xfs_dqtype_t type)
+{
+ /*
+ * Typically, a repair is only requested by quotacheck.
+ */
+ ASSERT(id != -1);
+ memset(dqb, 0, sizeof(struct xfs_dqblk));
+
+ dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+ dqb->dd_diskdq.d_type = type;
+ dqb->dd_diskdq.d_id = cpu_to_be32(id);
+
+ if (xfs_has_crc(mp)) {
+ uuid_copy(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid);
+ xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+}
+
+STATIC bool
+xfs_dquot_buf_verify_crc(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ bool readahead)
+{
+ struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
+ int ndquots;
+ int i;
+
+ if (!xfs_has_crc(mp))
+ return true;
+
+ /*
+ * if we are in log recovery, the quota subsystem has not been
+ * initialised so we have no quotainfo structure. In that case, we need
+ * to manually calculate the number of dquots in the buffer.
+ */
+ if (mp->m_quotainfo)
+ ndquots = mp->m_quotainfo->qi_dqperchunk;
+ else
+ ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
+
+ for (i = 0; i < ndquots; i++, d++) {
+ if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF)) {
+ if (!readahead)
+ xfs_buf_verifier_error(bp, -EFSBADCRC, __func__,
+ d, sizeof(*d), __this_address);
+ return false;
+ }
+ }
+ return true;
+}
+
+STATIC xfs_failaddr_t
+xfs_dquot_buf_verify(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ bool readahead)
+{
+ struct xfs_dqblk *dqb = bp->b_addr;
+ xfs_failaddr_t fa;
+ xfs_dqid_t id = 0;
+ int ndquots;
+ int i;
+
+ /*
+ * if we are in log recovery, the quota subsystem has not been
+ * initialised so we have no quotainfo structure. In that case, we need
+ * to manually calculate the number of dquots in the buffer.
+ */
+ if (mp->m_quotainfo)
+ ndquots = mp->m_quotainfo->qi_dqperchunk;
+ else
+ ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
+
+ /*
+ * On the first read of the buffer, verify that each dquot is valid.
+ * We don't know what the id of the dquot is supposed to be, just that
+ * they should be increasing monotonically within the buffer. If the
+ * first id is corrupt, then it will fail on the second dquot in the
+ * buffer so corruptions could point to the wrong dquot in this case.
+ */
+ for (i = 0; i < ndquots; i++) {
+ struct xfs_disk_dquot *ddq;
+
+ ddq = &dqb[i].dd_diskdq;
+
+ if (i == 0)
+ id = be32_to_cpu(ddq->d_id);
+
+ fa = xfs_dqblk_verify(mp, &dqb[i], id + i);
+ if (fa) {
+ if (!readahead)
+ xfs_buf_verifier_error(bp, -EFSCORRUPTED,
+ __func__, &dqb[i],
+ sizeof(struct xfs_dqblk), fa);
+ return fa;
+ }
+ }
+
+ return NULL;
+}
+
+static xfs_failaddr_t
+xfs_dquot_buf_verify_struct(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ return xfs_dquot_buf_verify(mp, bp, false);
+}
+
+static void
+xfs_dquot_buf_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (!xfs_dquot_buf_verify_crc(mp, bp, false))
+ return;
+ xfs_dquot_buf_verify(mp, bp, false);
+}
+
+/*
+ * readahead errors are silent and simply leave the buffer as !done so a real
+ * read will then be run with the xfs_dquot_buf_ops verifier. See
+ * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than
+ * reporting the failure.
+ */
+static void
+xfs_dquot_buf_readahead_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (!xfs_dquot_buf_verify_crc(mp, bp, true) ||
+ xfs_dquot_buf_verify(mp, bp, true) != NULL) {
+ xfs_buf_ioerror(bp, -EIO);
+ bp->b_flags &= ~XBF_DONE;
+ }
+}
+
+/*
+ * we don't calculate the CRC here as that is done when the dquot is flushed to
+ * the buffer after the update is done. This ensures that the dquot in the
+ * buffer always has an up-to-date CRC value.
+ */
+static void
+xfs_dquot_buf_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ xfs_dquot_buf_verify(mp, bp, false);
+}
+
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+ .name = "xfs_dquot",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
+ .verify_read = xfs_dquot_buf_read_verify,
+ .verify_write = xfs_dquot_buf_write_verify,
+ .verify_struct = xfs_dquot_buf_verify_struct,
+};
+
+const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
+ .name = "xfs_dquot_ra",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
+ .verify_read = xfs_dquot_buf_readahead_verify,
+ .verify_write = xfs_dquot_buf_write_verify,
+};
+
+/* Convert an on-disk timer value into an incore timer value. */
+time64_t
+xfs_dquot_from_disk_ts(
+ struct xfs_disk_dquot *ddq,
+ __be32 dtimer)
+{
+ uint32_t t = be32_to_cpu(dtimer);
+
+ if (t != 0 && (ddq->d_type & XFS_DQTYPE_BIGTIME))
+ return xfs_dq_bigtime_to_unix(t);
+
+ return t;
+}
+
+/* Convert an incore timer value into an on-disk timer value. */
+__be32
+xfs_dquot_to_disk_ts(
+ struct xfs_dquot *dqp,
+ time64_t timer)
+{
+ uint32_t t = timer;
+
+ if (timer != 0 && (dqp->q_type & XFS_DQTYPE_BIGTIME))
+ t = xfs_dq_unix_to_bigtime(timer);
+
+ return cpu_to_be32(t);
+}
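Editorial aside (not part of the patch): the two helpers above imply a simple shift-based scaling for bigtime quota timers. A sketch with the shift value assumed to be 2 (4-second granularity); the real constants and helpers live in the XFS headers:

#include <stdint.h>

#define EXAMPLE_BIGTIME_SHIFT 2         /* assumed granularity: 4 seconds */

static int64_t example_bigtime_to_unix(uint32_t ondisk)
{
        return (int64_t)ondisk << EXAMPLE_BIGTIME_SHIFT;
}

static uint32_t example_unix_to_bigtime(int64_t unix_secs)
{
        return (uint32_t)(unix_secs >> EXAMPLE_BIGTIME_SHIFT);
}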
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
new file mode 100644
index 000000000..536290816
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2017 Oracle.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ERRORTAG_H_
+#define __XFS_ERRORTAG_H_
+
+/*
+ * error injection tags - the labels can be anything you want
+ * but each tag should have its own unique number
+ */
+
+#define XFS_ERRTAG_NOERROR 0
+#define XFS_ERRTAG_IFLUSH_1 1
+#define XFS_ERRTAG_IFLUSH_2 2
+#define XFS_ERRTAG_IFLUSH_3 3
+#define XFS_ERRTAG_IFLUSH_4 4
+#define XFS_ERRTAG_IFLUSH_5 5
+#define XFS_ERRTAG_IFLUSH_6 6
+#define XFS_ERRTAG_DA_READ_BUF 7
+#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8
+#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9
+#define XFS_ERRTAG_ALLOC_READ_AGF 10
+#define XFS_ERRTAG_IALLOC_READ_AGI 11
+#define XFS_ERRTAG_ITOBP_INOTOBP 12
+#define XFS_ERRTAG_IUNLINK 13
+#define XFS_ERRTAG_IUNLINK_REMOVE 14
+#define XFS_ERRTAG_DIR_INO_VALIDATE 15
+#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16
+#define XFS_ERRTAG_IODONE_IOERR 17
+#define XFS_ERRTAG_STRATREAD_IOERR 18
+#define XFS_ERRTAG_STRATCMPL_IOERR 19
+#define XFS_ERRTAG_DIOWRITE_IOERR 20
+#define XFS_ERRTAG_BMAPIFORMAT 21
+#define XFS_ERRTAG_FREE_EXTENT 22
+#define XFS_ERRTAG_RMAP_FINISH_ONE 23
+#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE 24
+#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25
+#define XFS_ERRTAG_BMAP_FINISH_ONE 26
+#define XFS_ERRTAG_AG_RESV_CRITICAL 27
+/*
+ * DEBUG mode instrumentation to test and/or trigger delayed allocation
+ * block killing in the event of failed writes. When enabled, all
+ * buffered writes are silently dropped and handled as if they failed.
+ * All delalloc blocks in the range of the write (including pre-existing
+ * delalloc blocks!) are tossed as part of the write failure error
+ * handling sequence.
+ */
+#define XFS_ERRTAG_DROP_WRITES 28
+#define XFS_ERRTAG_LOG_BAD_CRC 29
+#define XFS_ERRTAG_LOG_ITEM_PIN 30
+#define XFS_ERRTAG_BUF_LRU_REF 31
+#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32
+#define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33
+#define XFS_ERRTAG_IUNLINK_FALLBACK 34
+#define XFS_ERRTAG_BUF_IOERROR 35
+#define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36
+#define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37
+#define XFS_ERRTAG_AG_RESV_FAIL 38
+#define XFS_ERRTAG_LARP 39
+#define XFS_ERRTAG_DA_LEAF_SPLIT 40
+#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41
+#define XFS_ERRTAG_MAX 42
+
+/*
+ * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
+ */
+#define XFS_RANDOM_DEFAULT 100
+#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4)
+#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_FREE_EXTENT 1
+#define XFS_RANDOM_RMAP_FINISH_ONE 1
+#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1
+#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
+#define XFS_RANDOM_BMAP_FINISH_ONE 1
+#define XFS_RANDOM_AG_RESV_CRITICAL 4
+#define XFS_RANDOM_DROP_WRITES 1
+#define XFS_RANDOM_LOG_BAD_CRC 1
+#define XFS_RANDOM_LOG_ITEM_PIN 1
+#define XFS_RANDOM_BUF_LRU_REF 2
+#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
+#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
+#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1
+#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1
+#define XFS_RANDOM_AG_RESV_FAIL 1
+#define XFS_RANDOM_LARP 1
+#define XFS_RANDOM_DA_LEAF_SPLIT 1
+#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
+
+#endif /* __XFS_ERRORTAG_H_ */
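Editorial aside (not part of the patch): what the "random factors" above mean in practice. A conceptual stand-in for the injection decision, not the kernel's actual xfs_errortag_test():

#include <stdbool.h>
#include <stdlib.h>

/* A tag with factor N fires on roughly 1 in N opportunities; 1 == always. */
static bool example_should_inject(unsigned int random_factor)
{
        if (random_factor == 0)
                return false;           /* tag not enabled */
        return (rand() % random_factor) == 0;
}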
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
new file mode 100644
index 000000000..371dc0723
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -0,0 +1,1829 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_FORMAT_H__
+#define __XFS_FORMAT_H__
+
+/*
+ * XFS On Disk Format Definitions
+ *
+ * This header file defines all the on-disk format definitions for
+ * general XFS objects. Directory and attribute related objects are defined in
+ * xfs_da_format.h, while log and log item formats are defined in
+ * xfs_log_format.h. Everything else goes here.
+ */
+
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_buf;
+struct xfs_ifork;
+
+/*
+ * Super block
+ * Fits into a sector-sized buffer at address 0 of each allocation group.
+ * Only the first of these is ever updated except during growfs.
+ */
+#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */
+#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */
+#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */
+#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */
+#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */
+#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */
+#define XFS_SB_VERSION_NUMBITS 0x000f
+#define XFS_SB_VERSION_ALLFBITS 0xfff0
+#define XFS_SB_VERSION_ATTRBIT 0x0010
+#define XFS_SB_VERSION_NLINKBIT 0x0020
+#define XFS_SB_VERSION_QUOTABIT 0x0040
+#define XFS_SB_VERSION_ALIGNBIT 0x0080
+#define XFS_SB_VERSION_DALIGNBIT 0x0100
+#define XFS_SB_VERSION_SHAREDBIT 0x0200
+#define XFS_SB_VERSION_LOGV2BIT 0x0400
+#define XFS_SB_VERSION_SECTORBIT 0x0800
+#define XFS_SB_VERSION_EXTFLGBIT 0x1000
+#define XFS_SB_VERSION_DIRV2BIT 0x2000
+#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */
+#define XFS_SB_VERSION_MOREBITSBIT 0x8000
+
+/*
+ * The size of a single extended attribute on disk is limited by
+ * the size of index values within the attribute entries themselves.
+ * These are be16 fields, so we can only support attribute data
+ * sizes up to 2^16 bytes in length.
+ */
+#define XFS_XATTR_SIZE_MAX (1 << 16)
+
+/*
+ * Supported feature bit list is just all bits in the versionnum field because
+ * we've used them all up and understand them all. Except, of course, for the
+ * shared superblock bit, whose purpose nobody knows, so it is unsupported.
+ */
+#define XFS_SB_VERSION_OKBITS \
+ ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
+ ~XFS_SB_VERSION_SHAREDBIT)
+
+/*
+ * There are two words to hold XFS "feature" bits: the original
+ * word, sb_versionnum, and sb_features2. Whenever a bit is set in
+ * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set.
+ *
+ * These defines represent bits in sb_features2.
+ */
+#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001
+#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
+#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
+#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
+#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
+#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
+#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
+#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */
+
+#define XFS_SB_VERSION2_OKBITS \
+ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
+ XFS_SB_VERSION2_ATTR2BIT | \
+ XFS_SB_VERSION2_PROJID32BIT | \
+ XFS_SB_VERSION2_FTYPE)
+
+/* Maximum size of the xfs filesystem label, no terminating NULL */
+#define XFSLABEL_MAX 12
+
+/*
+ * Superblock - in core version. Must match the ondisk version below.
+ * Must be padded to 64 bit alignment.
+ */
+typedef struct xfs_sb {
+ uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
+ uint32_t sb_blocksize; /* logical block size, bytes */
+ xfs_rfsblock_t sb_dblocks; /* number of data blocks */
+ xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
+ xfs_rtblock_t sb_rextents; /* number of realtime extents */
+ uuid_t sb_uuid; /* user-visible file system unique id */
+ xfs_fsblock_t sb_logstart; /* starting block of log if internal */
+ xfs_ino_t sb_rootino; /* root inode number */
+ xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
+ xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */
+ xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */
+ xfs_agblock_t sb_agblocks; /* size of an allocation group */
+ xfs_agnumber_t sb_agcount; /* number of allocation groups */
+ xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */
+ xfs_extlen_t sb_logblocks; /* number of log blocks */
+ uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */
+ uint16_t sb_sectsize; /* volume sector size, bytes */
+ uint16_t sb_inodesize; /* inode size, bytes */
+ uint16_t sb_inopblock; /* inodes per block */
+ char sb_fname[XFSLABEL_MAX]; /* file system name */
+ uint8_t sb_blocklog; /* log2 of sb_blocksize */
+ uint8_t sb_sectlog; /* log2 of sb_sectsize */
+ uint8_t sb_inodelog; /* log2 of sb_inodesize */
+ uint8_t sb_inopblog; /* log2 of sb_inopblock */
+ uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */
+ uint8_t sb_rextslog; /* log2 of sb_rextents */
+ uint8_t sb_inprogress; /* mkfs is in progress, don't mount */
+ uint8_t sb_imax_pct; /* max % of fs for inode space */
+ /* statistics */
+ /*
+ * These fields must remain contiguous. If you really
+ * want to change their layout, make sure you fix the
+ * code in xfs_trans_apply_sb_deltas().
+ */
+ uint64_t sb_icount; /* allocated inodes */
+ uint64_t sb_ifree; /* free inodes */
+ uint64_t sb_fdblocks; /* free data blocks */
+ uint64_t sb_frextents; /* free realtime extents */
+ /*
+ * End contiguous fields.
+ */
+ xfs_ino_t sb_uquotino; /* user quota inode */
+ xfs_ino_t sb_gquotino; /* group quota inode */
+ uint16_t sb_qflags; /* quota flags */
+ uint8_t sb_flags; /* misc. flags */
+ uint8_t sb_shared_vn; /* shared version number */
+ xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */
+ uint32_t sb_unit; /* stripe or raid unit */
+ uint32_t sb_width; /* stripe or raid width */
+ uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */
+ uint8_t sb_logsectlog; /* log2 of the log sector size */
+ uint16_t sb_logsectsize; /* sector size for the log, bytes */
+ uint32_t sb_logsunit; /* stripe unit size for the log */
+ uint32_t sb_features2; /* additional feature bits */
+
+ /*
+ * bad features2 field as a result of failing to pad the sb structure to
+ * 64 bits. Some machines will be using this field for features2 bits.
+ * Easiest just to mark it bad and not use it for anything else.
+ *
+ * This is not kept up to date in memory; it is always overwritten by
+ * the value in sb_features2 when formatting the incore superblock to
+ * the disk buffer.
+ */
+ uint32_t sb_bad_features2;
+
+ /* version 5 superblock fields start here */
+
+ /* feature masks */
+ uint32_t sb_features_compat;
+ uint32_t sb_features_ro_compat;
+ uint32_t sb_features_incompat;
+ uint32_t sb_features_log_incompat;
+
+ uint32_t sb_crc; /* superblock crc */
+ xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */
+
+ xfs_ino_t sb_pquotino; /* project quota inode */
+ xfs_lsn_t sb_lsn; /* last write sequence */
+ uuid_t sb_meta_uuid; /* metadata file system unique id */
+
+ /* must be padded to 64 bit alignment */
+} xfs_sb_t;
+
+#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc)
+
+/*
+ * Superblock - on disk version. Must match the in core version above.
+ * Must be padded to 64 bit alignment.
+ */
+struct xfs_dsb {
+ __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */
+ __be32 sb_blocksize; /* logical block size, bytes */
+ __be64 sb_dblocks; /* number of data blocks */
+ __be64 sb_rblocks; /* number of realtime blocks */
+ __be64 sb_rextents; /* number of realtime extents */
+ uuid_t sb_uuid; /* user-visible file system unique id */
+ __be64 sb_logstart; /* starting block of log if internal */
+ __be64 sb_rootino; /* root inode number */
+ __be64 sb_rbmino; /* bitmap inode for realtime extents */
+ __be64 sb_rsumino; /* summary inode for rt bitmap */
+ __be32 sb_rextsize; /* realtime extent size, blocks */
+ __be32 sb_agblocks; /* size of an allocation group */
+ __be32 sb_agcount; /* number of allocation groups */
+ __be32 sb_rbmblocks; /* number of rt bitmap blocks */
+ __be32 sb_logblocks; /* number of log blocks */
+ __be16 sb_versionnum; /* header version == XFS_SB_VERSION */
+ __be16 sb_sectsize; /* volume sector size, bytes */
+ __be16 sb_inodesize; /* inode size, bytes */
+ __be16 sb_inopblock; /* inodes per block */
+ char sb_fname[XFSLABEL_MAX]; /* file system name */
+ __u8 sb_blocklog; /* log2 of sb_blocksize */
+ __u8 sb_sectlog; /* log2 of sb_sectsize */
+ __u8 sb_inodelog; /* log2 of sb_inodesize */
+ __u8 sb_inopblog; /* log2 of sb_inopblock */
+ __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */
+ __u8 sb_rextslog; /* log2 of sb_rextents */
+ __u8 sb_inprogress; /* mkfs is in progress, don't mount */
+ __u8 sb_imax_pct; /* max % of fs for inode space */
+ /* statistics */
+ /*
+ * These fields must remain contiguous. If you really
+ * want to change their layout, make sure you fix the
+ * code in xfs_trans_apply_sb_deltas().
+ */
+ __be64 sb_icount; /* allocated inodes */
+ __be64 sb_ifree; /* free inodes */
+ __be64 sb_fdblocks; /* free data blocks */
+ __be64 sb_frextents; /* free realtime extents */
+ /*
+ * End contiguous fields.
+ */
+ __be64 sb_uquotino; /* user quota inode */
+ __be64 sb_gquotino; /* group quota inode */
+ __be16 sb_qflags; /* quota flags */
+ __u8 sb_flags; /* misc. flags */
+ __u8 sb_shared_vn; /* shared version number */
+ __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */
+ __be32 sb_unit; /* stripe or raid unit */
+ __be32 sb_width; /* stripe or raid width */
+ __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */
+ __u8 sb_logsectlog; /* log2 of the log sector size */
+ __be16 sb_logsectsize; /* sector size for the log, bytes */
+ __be32 sb_logsunit; /* stripe unit size for the log */
+ __be32 sb_features2; /* additional feature bits */
+ /*
+ * bad features2 field as a result of failing to pad the sb
+ * structure to 64 bits. Some machines will be using this field
+ * for features2 bits. Easiest just to mark it bad and not use
+ * it for anything else.
+ */
+ __be32 sb_bad_features2;
+
+ /* version 5 superblock fields start here */
+
+ /* feature masks */
+ __be32 sb_features_compat;
+ __be32 sb_features_ro_compat;
+ __be32 sb_features_incompat;
+ __be32 sb_features_log_incompat;
+
+ __le32 sb_crc; /* superblock crc */
+ __be32 sb_spino_align; /* sparse inode chunk alignment */
+
+ __be64 sb_pquotino; /* project quota inode */
+ __be64 sb_lsn; /* last write sequence */
+ uuid_t sb_meta_uuid; /* metadata file system unique id */
+
+ /* must be padded to 64 bit alignment */
+};
+
+/*
+ * Misc. Flags - warning - these will be cleared by xfs_repair unless
+ * a feature bit is set when the flag is used.
+ */
+#define XFS_SBF_NOFLAGS 0x00 /* no flags set */
+#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */
+
+/*
+ * define max. shared version we can interoperate with
+ */
+#define XFS_SB_MAX_SHARED_VN 0
+
+#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
+
+static inline bool xfs_sb_is_v5(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
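Editorial example (hypothetical value): the low XFS_SB_VERSION_NUMBITS nibble carries the version number and the upper 0xfff0 bits are feature flags, so a v4 superblock might have sb_versionnum == 0xb4a4, where 0xb4a4 & 0x000f == 4 selects the version and the remaining 0xb4a0 bits are features.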
+
+/*
+ * Detect a mismatched features2 field. Older kernels read/wrote
+ * this into the wrong slot, so to be safe we keep them in sync.
+ */
+static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
+{
+ return sbp->sb_bad_features2 != sbp->sb_features2;
+}
+
+static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
+{
+ return xfs_sb_is_v5(sbp) ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
+}
+
+static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
+{
+ sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
+}
+
+static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
+{
+ sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
+}
+
+static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
+{
+ sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+ sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
+}
+
+static inline void xfs_sb_version_addprojid32(struct xfs_sb *sbp)
+{
+ sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+ sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
+}
+
+/*
+ * Extended v5 superblock feature masks. These are to be used for new v5
+ * superblock features only.
+ *
+ * Compat features are new features that old kernels will not notice and are
+ * not affected by, so they can still mount read-write without issues.
+ *
+ * RO-Compat (read only) are features that old kernels can read but will break
+ * if they write. Hence only read-only mounts of such filesystems are allowed on
+ * kernels that don't support the feature bit.
+ *
+ * InCompat features are features which old kernels will not understand, and
+ * so they must not mount the filesystem at all.
+ *
+ * Log-InCompat features are for changes to log formats or new transactions that
+ * can't be replayed on older kernels. The fields are set when the filesystem is
+ * mounted, and a clean unmount clears the fields.
+ */
+#define XFS_SB_FEAT_COMPAT_ALL 0
+#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL
+static inline bool
+xfs_sb_has_compat_feature(
+ struct xfs_sb *sbp,
+ uint32_t feature)
+{
+ return (sbp->sb_features_compat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
+#define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */
+#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */
+#define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */
+#define XFS_SB_FEAT_RO_COMPAT_ALL \
+ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \
+ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
+ XFS_SB_FEAT_RO_COMPAT_REFLINK| \
+ XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
+#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
+static inline bool
+xfs_sb_has_ro_compat_feature(
+ struct xfs_sb *sbp,
+ uint32_t feature)
+{
+ return (sbp->sb_features_ro_compat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
+#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
+#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */
+#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */
+#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
+#define XFS_SB_FEAT_INCOMPAT_ALL \
+ (XFS_SB_FEAT_INCOMPAT_FTYPE| \
+ XFS_SB_FEAT_INCOMPAT_SPINODES| \
+ XFS_SB_FEAT_INCOMPAT_META_UUID| \
+ XFS_SB_FEAT_INCOMPAT_BIGTIME| \
+ XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
+ XFS_SB_FEAT_INCOMPAT_NREXT64)
+
+#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
+static inline bool
+xfs_sb_has_incompat_feature(
+ struct xfs_sb *sbp,
+ uint32_t feature)
+{
+ return (sbp->sb_features_incompat & feature) != 0;
+}
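Editorial sketch (not part of the patch) of how the *_ALL and *_UNKNOWN masks above are typically consumed at mount time; the policy shown is illustrative rather than the kernel's exact code: unknown incompat bits block the mount outright, while unknown ro-compat bits only block a read-write mount.

static int example_check_features(struct xfs_sb *sbp, bool readonly)
{
        if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_UNKNOWN)
                return -EINVAL;         /* cannot mount at all */
        if (!readonly &&
            (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_UNKNOWN))
                return -EINVAL;         /* read-write mount not allowed */
        return 0;
}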
+
+#define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS (1 << 0) /* Delayed Attributes */
+#define XFS_SB_FEAT_INCOMPAT_LOG_ALL \
+ (XFS_SB_FEAT_INCOMPAT_LOG_XATTRS)
+#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
+static inline bool
+xfs_sb_has_incompat_log_feature(
+ struct xfs_sb *sbp,
+ uint32_t feature)
+{
+ return (sbp->sb_features_log_incompat & feature) != 0;
+}
+
+static inline void
+xfs_sb_remove_incompat_log_features(
+ struct xfs_sb *sbp)
+{
+ sbp->sb_features_log_incompat &= ~XFS_SB_FEAT_INCOMPAT_LOG_ALL;
+}
+
+static inline void
+xfs_sb_add_incompat_log_features(
+ struct xfs_sb *sbp,
+ unsigned int features)
+{
+ sbp->sb_features_log_incompat |= features;
+}
+
+static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp)
+{
+ return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat &
+ XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
+}
+
+static inline bool
+xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+{
+ return (ino == sbp->sb_uquotino ||
+ ino == sbp->sb_gquotino ||
+ ino == sbp->sb_pquotino);
+}
+
+#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
+#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
+
+#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
+#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \
+ xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
+#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \
+ XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
+
+/*
+ * File system sector to basic block conversions.
+ */
+#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log)
+
+/*
+ * File system block to basic block conversions.
+ */
+#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
+#define XFS_BB_TO_FSB(mp,bb) \
+ (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
+#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log)
+
+/*
+ * File system block to byte conversions.
+ */
+#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSB(mp,b) \
+ ((((uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSBT(mp,b) (((uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
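Editorial worked example (parameters assumed for illustration): with 4096-byte filesystem blocks (sb_blocklog == 12) and 512-byte basic blocks, m_blkbb_log is 3, so XFS_FSB_TO_BB(mp, 5) == 40 basic blocks and XFS_FSB_TO_B(mp, 5) == 20480 bytes; XFS_B_TO_FSB(mp, 5000) rounds up to 2 blocks while XFS_B_TO_FSBT(mp, 5000) truncates to 1.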
+
+/*
+ * Allocation group header
+ *
+ * This is divided into three structures, placed in sequential 512-byte
+ * buffers after a copy of the superblock (also in a 512-byte buffer).
+ */
+#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */
+#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */
+#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */
+#define XFS_AGF_VERSION 1
+#define XFS_AGI_VERSION 1
+
+#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION)
+#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
+
+/*
+ * Btree number 0 is bno, 1 is cnt, 2 is rmap. This value gives the size of the
+ * arrays below.
+ */
+#define XFS_BTNUM_AGF ((int)XFS_BTNUM_RMAPi + 1)
+
+/*
+ * The second word of agf_levels in the first a.g. overlaps the EFS
+ * superblock's magic number. Since the magic numbers valid for EFS
+ * are > 64k, our value cannot be confused for an EFS superblock's.
+ */
+
+typedef struct xfs_agf {
+ /*
+ * Common allocation group header information
+ */
+ __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */
+ __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */
+ __be32 agf_seqno; /* sequence # starting from 0 */
+ __be32 agf_length; /* size in blocks of a.g. */
+ /*
+ * Freespace and rmap information
+ */
+ __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */
+ __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
+
+ __be32 agf_flfirst; /* first freelist block's index */
+ __be32 agf_fllast; /* last freelist block's index */
+ __be32 agf_flcount; /* count of blocks in freelist */
+ __be32 agf_freeblks; /* total free blocks */
+
+ __be32 agf_longest; /* longest free space */
+ __be32 agf_btreeblks; /* # of blocks held in AGF btrees */
+ uuid_t agf_uuid; /* uuid of filesystem */
+
+ __be32 agf_rmap_blocks; /* rmapbt blocks used */
+ __be32 agf_refcount_blocks; /* refcountbt blocks used */
+
+ __be32 agf_refcount_root; /* refcount tree root block */
+ __be32 agf_refcount_level; /* refcount btree levels */
+
+ /*
+ * reserve some contiguous space for future logged fields before we add
+ * the unlogged fields. This makes the range logging via flags and
+ * structure offsets much simpler.
+ */
+ __be64 agf_spare64[14];
+
+ /* unlogged fields, written during buffer writeback. */
+ __be64 agf_lsn; /* last write sequence */
+ __be32 agf_crc; /* crc of agf sector */
+ __be32 agf_spare2;
+
+ /* structure must be padded to 64 bit alignment */
+} xfs_agf_t;
+
+#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
+
+#define XFS_AGF_MAGICNUM (1u << 0)
+#define XFS_AGF_VERSIONNUM (1u << 1)
+#define XFS_AGF_SEQNO (1u << 2)
+#define XFS_AGF_LENGTH (1u << 3)
+#define XFS_AGF_ROOTS (1u << 4)
+#define XFS_AGF_LEVELS (1u << 5)
+#define XFS_AGF_FLFIRST (1u << 6)
+#define XFS_AGF_FLLAST (1u << 7)
+#define XFS_AGF_FLCOUNT (1u << 8)
+#define XFS_AGF_FREEBLKS (1u << 9)
+#define XFS_AGF_LONGEST (1u << 10)
+#define XFS_AGF_BTREEBLKS (1u << 11)
+#define XFS_AGF_UUID (1u << 12)
+#define XFS_AGF_RMAP_BLOCKS (1u << 13)
+#define XFS_AGF_REFCOUNT_BLOCKS (1u << 14)
+#define XFS_AGF_REFCOUNT_ROOT (1u << 15)
+#define XFS_AGF_REFCOUNT_LEVEL (1u << 16)
+#define XFS_AGF_SPARE64 (1u << 17)
+#define XFS_AGF_NUM_BITS 18
+#define XFS_AGF_ALL_BITS ((1u << XFS_AGF_NUM_BITS) - 1)
+
+#define XFS_AGF_FLAGS \
+ { XFS_AGF_MAGICNUM, "MAGICNUM" }, \
+ { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \
+ { XFS_AGF_SEQNO, "SEQNO" }, \
+ { XFS_AGF_LENGTH, "LENGTH" }, \
+ { XFS_AGF_ROOTS, "ROOTS" }, \
+ { XFS_AGF_LEVELS, "LEVELS" }, \
+ { XFS_AGF_FLFIRST, "FLFIRST" }, \
+ { XFS_AGF_FLLAST, "FLLAST" }, \
+ { XFS_AGF_FLCOUNT, "FLCOUNT" }, \
+ { XFS_AGF_FREEBLKS, "FREEBLKS" }, \
+ { XFS_AGF_LONGEST, "LONGEST" }, \
+ { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
+ { XFS_AGF_UUID, "UUID" }, \
+ { XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" }, \
+ { XFS_AGF_REFCOUNT_BLOCKS, "REFCOUNT_BLOCKS" }, \
+ { XFS_AGF_REFCOUNT_ROOT, "REFCOUNT_ROOT" }, \
+ { XFS_AGF_REFCOUNT_LEVEL, "REFCOUNT_LEVEL" }, \
+ { XFS_AGF_SPARE64, "SPARE64" }
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
+#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
+
+/*
+ * Size of the unlinked inode hash table in the agi.
+ */
+#define XFS_AGI_UNLINKED_BUCKETS 64
+
+typedef struct xfs_agi {
+ /*
+ * Common allocation group header information
+ */
+ __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */
+ __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */
+ __be32 agi_seqno; /* sequence # starting from 0 */
+ __be32 agi_length; /* size in blocks of a.g. */
+ /*
+ * Inode information
+ * Inodes are mapped by interpreting the inode number, so no
+ * mapping data is needed here.
+ */
+ __be32 agi_count; /* count of allocated inodes */
+ __be32 agi_root; /* root of inode btree */
+ __be32 agi_level; /* levels in inode btree */
+ __be32 agi_freecount; /* number of free inodes */
+
+ __be32 agi_newino; /* new inode just allocated */
+ __be32 agi_dirino; /* last directory inode chunk */
+ /*
+ * Hash table of inodes which have been unlinked but are
+ * still being referenced.
+ */
+ __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+ /*
+ * This marks the end of logging region 1 and start of logging region 2.
+ */
+ uuid_t agi_uuid; /* uuid of filesystem */
+ __be32 agi_crc; /* crc of agi sector */
+ __be32 agi_pad32;
+ __be64 agi_lsn; /* last write sequence */
+
+ __be32 agi_free_root; /* root of the free inode btree */
+ __be32 agi_free_level;/* levels in free inode btree */
+
+ __be32 agi_iblocks; /* inobt blocks used */
+ __be32 agi_fblocks; /* finobt blocks used */
+
+ /* structure must be padded to 64 bit alignment */
+} xfs_agi_t;
+
+#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
+
+#define XFS_AGI_MAGICNUM (1u << 0)
+#define XFS_AGI_VERSIONNUM (1u << 1)
+#define XFS_AGI_SEQNO (1u << 2)
+#define XFS_AGI_LENGTH (1u << 3)
+#define XFS_AGI_COUNT (1u << 4)
+#define XFS_AGI_ROOT (1u << 5)
+#define XFS_AGI_LEVEL (1u << 6)
+#define XFS_AGI_FREECOUNT (1u << 7)
+#define XFS_AGI_NEWINO (1u << 8)
+#define XFS_AGI_DIRINO (1u << 9)
+#define XFS_AGI_UNLINKED (1u << 10)
+#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
+#define XFS_AGI_ALL_BITS_R1 ((1u << XFS_AGI_NUM_BITS_R1) - 1)
+#define XFS_AGI_FREE_ROOT (1u << 11)
+#define XFS_AGI_FREE_LEVEL (1u << 12)
+#define XFS_AGI_IBLOCKS (1u << 13) /* both inobt/finobt block counters */
+#define XFS_AGI_NUM_BITS_R2 14
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
+#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
+
+/*
+ * The third a.g. block contains the a.g. freelist, an array
+ * of block pointers to blocks owned by the allocation btree code.
+ */
+#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
+#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
+#define XFS_BUF_TO_AGFL(bp) ((struct xfs_agfl *)((bp)->b_addr))
+
+struct xfs_agfl {
+ __be32 agfl_magicnum;
+ __be32 agfl_seqno;
+ uuid_t agfl_uuid;
+ __be64 agfl_lsn;
+ __be32 agfl_crc;
+} __attribute__((packed));
+
+#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
+
+#define XFS_AGB_TO_FSB(mp,agno,agbno) \
+ (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
+#define XFS_FSB_TO_AGNO(mp,fsbno) \
+ ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
+#define XFS_FSB_TO_AGBNO(mp,fsbno) \
+ ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
+#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
+ ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
+ (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
+#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
+
+/*
+ * For checking for bad ranges of xfs_daddr_t's, covering multiple
+ * allocation groups or a single xfs_daddr_t that's a superblock copy.
+ */
+#define XFS_AG_CHECK_DADDR(mp,d,len) \
+ ((len) == 1 ? \
+ ASSERT((d) == XFS_SB_DADDR || \
+ xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
+ ASSERT(xfs_daddr_to_agno(mp, d) == \
+ xfs_daddr_to_agno(mp, (d) + (len) - 1)))
+
+/*
+ * XFS Timestamps
+ * ==============
+ *
+ * Traditional ondisk inode timestamps consist of signed 32-bit counters for
+ * seconds and nanoseconds; time zero is the Unix epoch, Jan 1 00:00:00 UTC
+ * 1970, which means that the timestamp epoch is the same as the Unix epoch.
+ * Therefore, the ondisk min and max defined here can be used directly to
+ * constrain the incore timestamps on a Unix system. Note that we actually
+ * encode a __be64 value on disk.
+ *
+ * When the bigtime feature is enabled, ondisk inode timestamps become an
+ * unsigned 64-bit nanoseconds counter. This means that the bigtime inode
+ * timestamp epoch is the start of the classic timestamp range, which is
+ * Dec 13 20:45:52 UTC 1901. Because the epochs are not the same, callers
+ * /must/ use the bigtime conversion functions when encoding and decoding raw
+ * timestamps.
+ */
+typedef __be64 xfs_timestamp_t;
+
+/* Legacy timestamp encoding format. */
+struct xfs_legacy_timestamp {
+ __be32 t_sec; /* timestamp seconds */
+ __be32 t_nsec; /* timestamp nanoseconds */
+};
+
+/*
+ * Smallest possible ondisk seconds value with traditional timestamps. This
+ * corresponds exactly with the incore timestamp Dec 13 20:45:52 UTC 1901.
+ */
+#define XFS_LEGACY_TIME_MIN ((int64_t)S32_MIN)
+
+/*
+ * Largest possible ondisk seconds value with traditional timestamps. This
+ * corresponds exactly with the incore timestamp Jan 19 03:14:07 UTC 2038.
+ */
+#define XFS_LEGACY_TIME_MAX ((int64_t)S32_MAX)
+
+/*
+ * Smallest possible ondisk seconds value with bigtime timestamps. This
+ * corresponds (after conversion to a Unix timestamp) with the traditional
+ * minimum timestamp of Dec 13 20:45:52 UTC 1901.
+ */
+#define XFS_BIGTIME_TIME_MIN ((int64_t)0)
+
+/*
+ * Largest supported ondisk seconds value with bigtime timestamps. This
+ * corresponds (after conversion to a Unix timestamp) with an incore timestamp
+ * of Jul 2 20:20:24 UTC 2486.
+ *
+ * We round down the ondisk limit so that the bigtime quota and inode max
+ * timestamps will be the same.
+ */
+#define XFS_BIGTIME_TIME_MAX ((int64_t)((-1ULL / NSEC_PER_SEC) & ~0x3ULL))
+
+/*
+ * Bigtime epoch is set exactly to the minimum time value that a traditional
+ * 32-bit timestamp can represent when using the Unix epoch as a reference.
+ * Hence the Unix epoch is at a fixed offset into the supported bigtime
+ * timestamp range.
+ *
+ * The bigtime epoch also matches the minimum value an on-disk 32-bit XFS
+ * timestamp can represent so we will not lose any fidelity in converting
+ * to/from unix and bigtime timestamps.
+ *
+ * The following conversion factor converts a seconds counter from the Unix
+ * epoch to the bigtime epoch.
+ */
+#define XFS_BIGTIME_EPOCH_OFFSET (-(int64_t)S32_MIN)
+
+/* Convert a timestamp from the Unix epoch to the bigtime epoch. */
+static inline uint64_t xfs_unix_to_bigtime(time64_t unix_seconds)
+{
+ return (uint64_t)unix_seconds + XFS_BIGTIME_EPOCH_OFFSET;
+}
+
+/* Convert a timestamp from the bigtime epoch to the Unix epoch. */
+static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds)
+{
+ return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET;
+}
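+
+/*
+ * Worked example (illustrative only): XFS_BIGTIME_EPOCH_OFFSET equals 2^31
+ * (2147483648), so the Unix epoch (unix_seconds == 0) encodes as the ondisk
+ * value 2147483648, and the smallest ondisk value 0 decodes back to
+ * -2147483648 seconds, i.e. Dec 13 20:45:52 UTC 1901, the classic minimum
+ * timestamp described above.
+ */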
+
+/*
+ * On-disk inode structure.
+ *
+ * This is just the header or "dinode core"; the inode is expanded to fill a
+ * variable size, with the leftover area split into a data and an attribute
+ * fork.
+ * The format of the data and attribute fork depends on the format of the
+ * inode as indicated by di_format and di_aformat. To access the data and
+ * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
+ * below.
+ *
+ * There is a very similar struct xfs_log_dinode which matches the layout of
+ * this structure, but is kept in native format instead of big endian.
+ *
+ * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
+ * padding field for v3 inodes.
+ */
+#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
+struct xfs_dinode {
+ __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
+ __be16 di_mode; /* mode and type of file */
+ __u8 di_version; /* inode version */
+ __u8 di_format; /* format of di_c data */
+ __be16 di_onlink; /* old number of links to file */
+ __be32 di_uid; /* owner's user id */
+ __be32 di_gid; /* owner's group id */
+ __be32 di_nlink; /* number of links to file */
+ __be16 di_projid_lo; /* lower part of owner's project id */
+ __be16 di_projid_hi; /* higher part of owner's project id */
+ union {
+ /* Number of data fork extents if NREXT64 is set */
+ __be64 di_big_nextents;
+
+ /* Padding for V3 inodes without NREXT64 set. */
+ __be64 di_v3_pad;
+
+ /* Padding and inode flush counter for V2 inodes. */
+ struct {
+ __u8 di_v2_pad[6];
+ __be16 di_flushiter;
+ };
+ };
+ xfs_timestamp_t di_atime; /* time last accessed */
+ xfs_timestamp_t di_mtime; /* time last modified */
+ xfs_timestamp_t di_ctime; /* time created/inode modified */
+ __be64 di_size; /* number of bytes in file */
+ __be64 di_nblocks; /* # of direct & btree blocks used */
+ __be32 di_extsize; /* basic/minimum extent size for file */
+ union {
+ /*
+ * For V2 inodes and V3 inodes without NREXT64 set, this
+ * is the number of data and attr fork extents.
+ */
+ struct {
+ __be32 di_nextents;
+ __be16 di_anextents;
+ } __packed;
+
+ /* Number of attr fork extents if NREXT64 is set. */
+ struct {
+ __be32 di_big_anextents;
+ __be16 di_nrext64_pad;
+ } __packed;
+ } __packed;
+ __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
+ __s8 di_aformat; /* format of attr fork's data */
+ __be32 di_dmevmask; /* DMIG event mask */
+ __be16 di_dmstate; /* DMIG state info */
+ __be16 di_flags; /* random flags, XFS_DIFLAG_... */
+ __be32 di_gen; /* generation number */
+
+ /* di_next_unlinked is the only non-core field in the old dinode */
+ __be32 di_next_unlinked;/* agi unlinked list ptr */
+
+ /* start of the extended dinode, writable fields */
+ __le32 di_crc; /* CRC of the inode */
+ __be64 di_changecount; /* number of attribute changes */
+ __be64 di_lsn; /* flush sequence */
+ __be64 di_flags2; /* more random flags */
+ __be32 di_cowextsize; /* basic cow extent size for file */
+ __u8 di_pad2[12]; /* more padding for future expansion */
+
+ /* fields only written to during inode creation */
+ xfs_timestamp_t di_crtime; /* time created */
+ __be64 di_ino; /* inode number */
+ uuid_t di_uuid; /* UUID of the filesystem */
+
+ /* structure must be padded to 64 bit alignment */
+};
+
+#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
+
+#define DI_MAX_FLUSH 0xffff
+
+/*
+ * Size of the core inode on disk. Version 1 and 2 inodes have
+ * the same size, but version 3 has grown a few additional fields.
+ */
+static inline uint xfs_dinode_size(int version)
+{
+ if (version == 3)
+ return sizeof(struct xfs_dinode);
+ return offsetof(struct xfs_dinode, di_crc);
+}
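+
+/*
+ * Worked example (illustrative only, assuming the natural packing of the
+ * fields above): offsetof(struct xfs_dinode, di_crc) works out to 100 bytes,
+ * so a v1/v2 inode core occupies 100 bytes on disk, while the full v3
+ * structure (through di_uuid) occupies 176 bytes.
+ */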
+
+/*
+ * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
+ * Since the pathconf interface is signed, we use 2^31 - 1 instead.
+ */
+#define XFS_MAXLINK ((1U << 31) - 1U)
+
+/*
+ * Values for di_format
+ *
+ * This enum is used in string mapping in xfs_trace.h; please keep the
+ * TRACE_DEFINE_ENUMs for it up to date.
+ */
+enum xfs_dinode_fmt {
+ XFS_DINODE_FMT_DEV, /* xfs_dev_t */
+ XFS_DINODE_FMT_LOCAL, /* bulk data */
+ XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
+ XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
+ XFS_DINODE_FMT_UUID /* added long ago, but never used */
+};
+
+#define XFS_INODE_FORMAT_STR \
+ { XFS_DINODE_FMT_DEV, "dev" }, \
+ { XFS_DINODE_FMT_LOCAL, "local" }, \
+ { XFS_DINODE_FMT_EXTENTS, "extent" }, \
+ { XFS_DINODE_FMT_BTREE, "btree" }, \
+ { XFS_DINODE_FMT_UUID, "uuid" }
+
+/*
+ * Max values for extnum and aextnum.
+ *
+ * The original on-disk extent counts were held in signed fields, resulting in
+ * maximum extent counts of 2^31 and 2^15 for the data and attr forks
+ * respectively. Similarly the maximum extent length is limited to 2^21 blocks
+ * by the 21-bit wide blockcount field of a BMBT extent record.
+ *
+ * The newly introduced data fork extent counter can hold a 64-bit value,
+ * however the maximum number of extents in a file is also limited to 2^54
+ * extents by the 54-bit wide startoff field of a BMBT extent record.
+ *
+ * It is further limited by the maximum supported file size of 2^63
+ * *bytes*. This leads to a maximum extent count for maximally sized filesystem
+ * blocks (64kB) of:
+ *
+ * 2^63 bytes / 2^16 bytes per block = 2^47 blocks
+ *
+ * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence
+ * 2^48 was chosen as the maximum data fork extent count.
+ *
+ * The maximum file size that can be represented by the data fork extent counter
+ * in the worst case occurs when all extents are 1 block in length and each
+ * block is 1KB in size.
+ *
+ * With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing the maximum extent count
+ * and with 1KB sized blocks, a file can reach up to
+ * 1KB * (2^31) = 2TB
+ *
+ * This is much larger than the theoretical maximum size of a directory
+ * i.e. XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB.
+ *
+ * Hence, a directory inode can never overflow its data fork extent counter.
+ */
+#define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1))
+#define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1))
+
+/*
+ * When we upgrade an inode to the large extent counts, the maximum value by
+ * which the extent count can increase is bound by the change in size of the
+ * on-disk field. No upgrade operation should ever be adding more than a few
+ * tens of extents, so if we get a really large value it is a sign of a code bug
+ * or corruption.
+ */
+#define XFS_MAX_EXTCNT_UPGRADE_NR \
+ min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL, \
+ XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL)
+
+/*
+ * Inode minimum and maximum sizes.
+ */
+#define XFS_DINODE_MIN_LOG 8
+#define XFS_DINODE_MAX_LOG 11
+#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG)
+#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG)
+
+/*
+ * Inode size for given fs.
+ */
+#define XFS_DINODE_SIZE(mp) \
+ (xfs_has_v3inodes(mp) ? \
+ sizeof(struct xfs_dinode) : \
+ offsetof(struct xfs_dinode, di_crc))
+#define XFS_LITINO(mp) \
+ ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(mp))
+
+/*
+ * Inode data & attribute fork sizes, per inode.
+ */
+#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
+
+#define XFS_DFORK_DSIZE(dip,mp) \
+ ((dip)->di_forkoff ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp))
+#define XFS_DFORK_ASIZE(dip,mp) \
+ ((dip)->di_forkoff ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0)
+#define XFS_DFORK_SIZE(dip,mp,w) \
+ ((w) == XFS_DATA_FORK ? \
+ XFS_DFORK_DSIZE(dip, mp) : \
+ XFS_DFORK_ASIZE(dip, mp))
+
+#define XFS_DFORK_MAXEXT(dip, mp, w) \
+ (XFS_DFORK_SIZE(dip, mp, w) / sizeof(struct xfs_bmbt_rec))
+
+/*
+ * Return pointers to the data or attribute forks.
+ */
+#define XFS_DFORK_DPTR(dip) \
+ ((char *)dip + xfs_dinode_size(dip->di_version))
+#define XFS_DFORK_APTR(dip) \
+ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
+#define XFS_DFORK_PTR(dip,w) \
+ ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
+
+#define XFS_DFORK_FORMAT(dip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ (dip)->di_format : \
+ (dip)->di_aformat)
+
+/*
+ * For block and character special files the 32-bit dev_t is stored at the
+ * beginning of the data fork.
+ */
+static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
+{
+ return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
+}
+
+static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
+{
+ *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
+}
+
+/*
+ * Values for di_flags
+ */
+#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */
+#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */
+#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */
+#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */
+#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */
+#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */
+#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */
+#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */
+#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */
+#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */
+#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */
+#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */
+#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
+#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */
+#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */
+/* Do not use bit 15, di_flags is legacy and unchanging now */
+
+#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
+#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
+#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
+#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT)
+#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT)
+#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT)
+#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT)
+#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT)
+#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT)
+#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT)
+#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
+#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT)
+#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
+#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT)
+#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT)
+
+#define XFS_DIFLAG_ANY \
+ (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
+ XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
+ XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
+ XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
+ XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
+
+/*
+ * Values for di_flags2. These start by being exposed to userspace in the
+ * upper 16 bits of the XFS_XFLAG_s range.
+ */
+#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */
+#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
+#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
+#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
+#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
+
+#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
+#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
+#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
+#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
+#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
+
+#define XFS_DIFLAG2_ANY \
+ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
+ XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
+
+static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
+{
+ return dip->di_version >= 3 &&
+ (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME));
+}
+
+static inline bool xfs_dinode_has_large_extent_counts(
+ const struct xfs_dinode *dip)
+{
+ return dip->di_version >= 3 &&
+ (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64));
+}
+
+/*
+ * Inode number format:
+ * low inopblog bits - offset in block
+ * next agblklog bits - block number in ag
+ * next agno_log bits - ag number
+ * high agno_log-agblklog-inopblog bits - 0
+ */
+#define XFS_INO_MASK(k) (uint32_t)((1ULL << (k)) - 1)
+#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog
+#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog
+#define XFS_INO_AGINO_BITS(mp) ((mp)->m_ino_geo.agino_log)
+#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log
+#define XFS_INO_BITS(mp) \
+ XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)
+#define XFS_INO_TO_AGNO(mp,i) \
+ ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
+#define XFS_INO_TO_AGINO(mp,i) \
+ ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
+#define XFS_INO_TO_AGBNO(mp,i) \
+ (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
+ XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
+#define XFS_INO_TO_OFFSET(mp,i) \
+ ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#define XFS_INO_TO_FSB(mp,i) \
+ XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
+#define XFS_AGINO_TO_INO(mp,a,i) \
+ (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
+#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp))
+#define XFS_AGINO_TO_OFFSET(mp,i) \
+ ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#define XFS_OFFBNO_TO_AGINO(mp,b,o) \
+ ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
+#define XFS_FSB_TO_INO(mp, b) ((xfs_ino_t)((b) << XFS_INO_OFFSET_BITS(mp)))
+#define XFS_AGB_TO_AGINO(mp, b) ((xfs_agino_t)((b) << XFS_INO_OFFSET_BITS(mp)))
+
+#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
+#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL))
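+
+/*
+ * Worked example (illustrative only, using a hypothetical geometry with
+ * sb_inopblog = 4 and sb_agblklog = 16, hence agino_log = 20): for inode
+ * number 0x12345,
+ *
+ *	XFS_INO_TO_AGNO()   = 0x12345 >> 20     = 0
+ *	XFS_INO_TO_AGINO()  = 0x12345 & 0xfffff = 0x12345
+ *	XFS_INO_TO_AGBNO()  = 0x12345 >> 4      = 0x1234
+ *	XFS_INO_TO_OFFSET() = 0x12345 & 0xf     = 0x5
+ */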
+
+/*
+ * RealTime Device format definitions
+ */
+
+/* Min and max rt extent sizes, specified in bytes */
+#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
+#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
+#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
+
+#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
+#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
+#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
+#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
+
+/*
+ * RT Summary and bit manipulation macros.
+ */
+#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
+#define XFS_SUMOFFSTOBLOCK(mp,s) \
+ (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
+#define XFS_SUMPTR(mp,bp,so) \
+ ((xfs_suminfo_t *)((bp)->b_addr + \
+ (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
+
+#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
+#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
+#define XFS_BITTOWORD(mp,bi) \
+ ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
+
+#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
+#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
+
+#define XFS_RTLOBIT(w) xfs_lowbit32(w)
+#define XFS_RTHIBIT(w) xfs_highbit32(w)
+
+#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
+
+/*
+ * Dquot and dquot block format definitions
+ */
+#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
+#define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */
+
+#define XFS_DQTYPE_USER (1u << 0) /* user dquot record */
+#define XFS_DQTYPE_PROJ (1u << 1) /* project dquot record */
+#define XFS_DQTYPE_GROUP (1u << 2) /* group dquot record */
+#define XFS_DQTYPE_BIGTIME (1u << 7) /* large expiry timestamps */
+
+/* bitmask to determine if this is a user/group/project dquot */
+#define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \
+ XFS_DQTYPE_PROJ | \
+ XFS_DQTYPE_GROUP)
+
+#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK | \
+ XFS_DQTYPE_BIGTIME)
+
+/*
+ * XFS Quota Timers
+ * ================
+ *
+ * Traditional quota grace period expiration timers are an unsigned 32-bit
+ * seconds counter; time zero is the Unix epoch, Jan 1 00:00:00 UTC 1970.
+ * Note that an expiration value of zero means that the quota limit has not
+ * been reached, and therefore no expiration has been set. Therefore, the
+ * ondisk min and max defined here can be used directly to constrain the incore
+ * quota expiration timestamps on a Unix system.
+ *
+ * When bigtime is enabled, we trade two bits of precision to expand the
+ * expiration timeout range to match that of big inode timestamps. The min and
+ * max recorded here are the on-disk limits, not a Unix timestamp.
+ *
+ * The grace period for each quota type is stored in the root dquot (id = 0)
+ * and is applied to a non-root dquot when it exceeds the soft or hard limits.
+ * The length of quota grace periods are unsigned 32-bit quantities measured in
+ * units of seconds. A value of zero means to use the default period.
+ */
+
+/*
+ * Smallest possible ondisk quota expiration value with traditional timestamps.
+ * This corresponds exactly with the incore expiration Jan 1 00:00:01 UTC 1970.
+ */
+#define XFS_DQ_LEGACY_EXPIRY_MIN ((int64_t)1)
+
+/*
+ * Largest possible ondisk quota expiration value with traditional timestamps.
+ * This corresponds exactly with the incore expiration Feb 7 06:28:15 UTC 2106.
+ */
+#define XFS_DQ_LEGACY_EXPIRY_MAX ((int64_t)U32_MAX)
+
+/*
+ * Smallest possible ondisk quota expiration value with bigtime timestamps.
+ * This corresponds (after conversion to a Unix timestamp) with the incore
+ * expiration of Jan 1 00:00:04 UTC 1970.
+ */
+#define XFS_DQ_BIGTIME_EXPIRY_MIN (XFS_DQ_LEGACY_EXPIRY_MIN)
+
+/*
+ * Largest supported ondisk quota expiration value with bigtime timestamps.
+ * This corresponds (after conversion to a Unix timestamp) with an incore
+ * expiration of Jul 2 20:20:24 UTC 2486.
+ *
+ * The ondisk field supports values up to -1U, which corresponds to an incore
+ * expiration in 2514. This is beyond the maximum that a bigtime inode
+ * timestamp can represent, so we cap the maximum bigtime quota expiration to
+ * the maximum inode timestamp.
+ */
+#define XFS_DQ_BIGTIME_EXPIRY_MAX ((int64_t)4074815106U)
+
+/*
+ * The following conversion factors assist in converting a quota expiration
+ * timestamp between the incore and ondisk formats.
+ */
+#define XFS_DQ_BIGTIME_SHIFT (2)
+#define XFS_DQ_BIGTIME_SLACK ((int64_t)(1ULL << XFS_DQ_BIGTIME_SHIFT) - 1)
+
+/* Convert an incore quota expiration timestamp to an ondisk bigtime value. */
+static inline uint32_t xfs_dq_unix_to_bigtime(time64_t unix_seconds)
+{
+ /*
+ * Round the expiration timestamp up to the nearest bigtime timestamp
+ * that we can store, to give users the most time to fix problems.
+ */
+ return ((uint64_t)unix_seconds + XFS_DQ_BIGTIME_SLACK) >>
+ XFS_DQ_BIGTIME_SHIFT;
+}
+
+/* Convert an ondisk bigtime quota expiration value to an incore timestamp. */
+static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds)
+{
+ return (time64_t)ondisk_seconds << XFS_DQ_BIGTIME_SHIFT;
+}
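+
+/*
+ * Worked example (illustrative only): an incore expiration of 1000000001
+ * seconds encodes as (1000000001 + 3) >> 2 = 250000001, and decoding gives
+ * 250000001 << 2 = 1000000004.  The ondisk rounding therefore only ever
+ * pushes an expiration later, by at most XFS_DQ_BIGTIME_SLACK (3) seconds.
+ */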
+
+/*
+ * Default quota grace periods, ranging from zero (use the compiled defaults)
+ * to ~136 years. These are applied to a non-root dquot that has exceeded
+ * either limit.
+ */
+#define XFS_DQ_GRACE_MIN ((int64_t)0)
+#define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX)
+
+/*
+ * This is the main portion of the on-disk representation of quota information
+ * for a user. We pad this with some more expansion room to construct the on
+ * disk structure.
+ */
+struct xfs_disk_dquot {
+ __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
+ __u8 d_version; /* dquot version */
+ __u8 d_type; /* XFS_DQTYPE_USER/PROJ/GROUP */
+ __be32 d_id; /* user,project,group id */
+ __be64 d_blk_hardlimit;/* absolute limit on disk blks */
+ __be64 d_blk_softlimit;/* preferred limit on disk blks */
+ __be64 d_ino_hardlimit;/* maximum # allocated inodes */
+ __be64 d_ino_softlimit;/* preferred inode limit */
+ __be64 d_bcount; /* disk blocks owned by the user */
+ __be64 d_icount; /* inodes owned by the user */
+ __be32 d_itimer; /* zero if within inode limits; if not,
+ this is when we refuse service */
+ __be32 d_btimer; /* similar to above; for disk blocks */
+ __be16 d_iwarns; /* warnings issued wrt num inodes */
+ __be16 d_bwarns; /* warnings issued wrt disk blocks */
+ __be32 d_pad0; /* 64 bit align */
+ __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */
+ __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */
+ __be64 d_rtbcount; /* realtime blocks owned */
+ __be32 d_rtbtimer; /* similar to above; for RT disk blocks */
+ __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */
+ __be16 d_pad;
+};
+
+/*
+ * This is what goes on disk. This is separated from the xfs_disk_dquot because
+ * carrying the unnecessary padding would be a waste of memory.
+ */
+struct xfs_dqblk {
+ struct xfs_disk_dquot dd_diskdq; /* portion living incore as well */
+ char dd_fill[4];/* filling for posterity */
+
+ /*
+ * These two are only present on filesystems with the CRC bits set.
+ */
+ __be32 dd_crc; /* checksum */
+ __be64 dd_lsn; /* last modification in log */
+ uuid_t dd_uuid; /* location information */
+};
+
+#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
+
+/*
+ * This defines the unit of allocation of dquots.
+ *
+ * Currently, it is just one file system block, and a 4K blk contains 30
+ * (136 * 30 = 4080) dquots. It's probably not worth trying to make
+ * this more dynamic.
+ *
+ * However, if this number is changed, we have to make sure that we don't
+ * implicitly assume that we do allocations in chunks of a single filesystem
+ * block in the dquot/xqm code.
+ *
+ * This is part of the ondisk format because the structure size is not a power
+ * of two, which leaves slack at the end of the disk block.
+ */
+#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
+
+/*
+ * Remote symlink format and access functions.
+ */
+#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */
+
+struct xfs_dsymlink_hdr {
+ __be32 sl_magic;
+ __be32 sl_offset;
+ __be32 sl_bytes;
+ __be32 sl_crc;
+ uuid_t sl_uuid;
+ __be64 sl_owner;
+ __be64 sl_blkno;
+ __be64 sl_lsn;
+};
+
+#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc)
+
+#define XFS_SYMLINK_MAXLEN 1024
+/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 3 extents back from
+ * bmapi when crc headers are taken into account.
+ */
+#define XFS_SYMLINK_MAPS 3
+
+#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \
+ ((bufsize) - (xfs_has_crc((mp)) ? \
+ sizeof(struct xfs_dsymlink_hdr) : 0))
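+
+/*
+ * Worked example (illustrative only, assuming the natural packing of struct
+ * xfs_dsymlink_hdr): on a CRC-enabled filesystem with the minimum 512-byte
+ * block size, each block holds 512 - 56 = 456 bytes of target path, so the
+ * 1024-byte maximum path needs at most three blocks, which is why
+ * XFS_SYMLINK_MAPS is 3.
+ */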
+
+
+/*
+ * Allocation Btree format definitions
+ *
+ * There are two on-disk btrees, one sorted by blockno and one sorted
+ * by blockcount and blockno. All blocks look the same to make the code
+ * simpler; if we have time later, we'll make the optimizations.
+ */
+#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */
+#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */
+#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */
+#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */
+
+/*
+ * Data record/key structure
+ */
+typedef struct xfs_alloc_rec {
+ __be32 ar_startblock; /* starting block number */
+ __be32 ar_blockcount; /* count of free blocks */
+} xfs_alloc_rec_t, xfs_alloc_key_t;
+
+typedef struct xfs_alloc_rec_incore {
+ xfs_agblock_t ar_startblock; /* starting block number */
+ xfs_extlen_t ar_blockcount; /* count of free blocks */
+} xfs_alloc_rec_incore_t;
+
+/* btree pointer type */
+typedef __be32 xfs_alloc_ptr_t;
+
+/*
+ * Block numbers in the AG:
+ * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
+ */
+#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
+#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
+
+
+/*
+ * Inode Allocation Btree format definitions
+ *
+ * There is a btree for the inode map per allocation group.
+ */
+#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
+#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */
+#define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */
+#define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */
+
+typedef uint64_t xfs_inofree_t;
+#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
+#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
+#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
+#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
+
+#define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */
+#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(uint16_t))
+#define XFS_INODES_PER_HOLEMASK_BIT \
+ (XFS_INODES_PER_CHUNK / (NBBY * sizeof(uint16_t)))
+
+static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
+{
+ return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
+}
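+
+/*
+ * Worked example (illustrative only): xfs_inobt_maskn(8, 4) evaluates to
+ * ((1 << 4) - 1) << 8 = 0xf00, i.e. bits 8-11 set, while xfs_inobt_maskn(0, 64)
+ * takes the n >= XFS_INODES_PER_CHUNK branch and yields (0 - 1) << 0, which
+ * is all 64 bits set (XFS_INOBT_ALL_FREE).
+ */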
+
+/*
+ * The on-disk inode record structure has two formats. The original "full"
+ * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount
+ * and replaces the 3 high-order freecount bytes with the holemask and inode
+ * count.
+ *
+ * The holemask of the sparse record format allows an inode chunk to have holes
+ * that refer to blocks not owned by the inode record. This facilitates inode
+ * allocation in the event of severe free space fragmentation.
+ */
+typedef struct xfs_inobt_rec {
+ __be32 ir_startino; /* starting inode number */
+ union {
+ struct {
+ __be32 ir_freecount; /* count of free inodes */
+ } f;
+ struct {
+ __be16 ir_holemask;/* hole mask for sparse chunks */
+ __u8 ir_count; /* total inode count */
+ __u8 ir_freecount; /* count of free inodes */
+ } sp;
+ } ir_u;
+ __be64 ir_free; /* free inode mask */
+} xfs_inobt_rec_t;
+
+typedef struct xfs_inobt_rec_incore {
+ xfs_agino_t ir_startino; /* starting inode number */
+ uint16_t ir_holemask; /* hole mask for sparse chunks */
+ uint8_t ir_count; /* total inode count */
+ uint8_t ir_freecount; /* count of free inodes (set bits) */
+ xfs_inofree_t ir_free; /* free inode mask */
+} xfs_inobt_rec_incore_t;
+
+static inline bool xfs_inobt_issparse(uint16_t holemask)
+{
+ /* non-zero holemask represents a sparse rec. */
+ return holemask;
+}
+
+/*
+ * Key structure
+ */
+typedef struct xfs_inobt_key {
+ __be32 ir_startino; /* starting inode number */
+} xfs_inobt_key_t;
+
+/* btree pointer type */
+typedef __be32 xfs_inobt_ptr_t;
+
+/*
+ * block numbers in the AG.
+ */
+#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
+#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
+
+/*
+ * Reverse mapping btree format definitions
+ *
+ * There is a btree for the reverse map per allocation group
+ */
+#define XFS_RMAP_CRC_MAGIC 0x524d4233 /* 'RMB3' */
+
+/*
+ * Ownership info for an extent. This is used to create reverse-mapping
+ * entries.
+ */
+#define XFS_OWNER_INFO_ATTR_FORK (1 << 0)
+#define XFS_OWNER_INFO_BMBT_BLOCK (1 << 1)
+struct xfs_owner_info {
+ uint64_t oi_owner;
+ xfs_fileoff_t oi_offset;
+ unsigned int oi_flags;
+};
+
+/*
+ * Special owner types.
+ *
+ * Seeing as we only support up to 8EB, we have the upper bit of the owner field
+ * to tell us we have a special owner value. We use these for static metadata
+ * allocated at mkfs/growfs time, as well as for freespace management metadata.
+ */
+#define XFS_RMAP_OWN_NULL (-1ULL) /* No owner, for growfs */
+#define XFS_RMAP_OWN_UNKNOWN (-2ULL) /* Unknown owner, for EFI recovery */
+#define XFS_RMAP_OWN_FS (-3ULL) /* static fs metadata */
+#define XFS_RMAP_OWN_LOG (-4ULL) /* static fs metadata */
+#define XFS_RMAP_OWN_AG (-5ULL) /* AG freespace btree blocks */
+#define XFS_RMAP_OWN_INOBT (-6ULL) /* Inode btree blocks */
+#define XFS_RMAP_OWN_INODES (-7ULL) /* Inode chunk */
+#define XFS_RMAP_OWN_REFC (-8ULL) /* refcount tree */
+#define XFS_RMAP_OWN_COW (-9ULL) /* cow allocations */
+#define XFS_RMAP_OWN_MIN (-10ULL) /* guard */
+
+#define XFS_RMAP_NON_INODE_OWNER(owner) (!!((owner) & (1ULL << 63)))
+
+/*
+ * Data record structure
+ */
+struct xfs_rmap_rec {
+ __be32 rm_startblock; /* extent start block */
+ __be32 rm_blockcount; /* extent length */
+ __be64 rm_owner; /* extent owner */
+ __be64 rm_offset; /* offset within the owner */
+};
+
+/*
+ * rmap btree record
+ * rm_offset:63 is the attribute fork flag
+ * rm_offset:62 is the bmbt block flag
+ * rm_offset:61 is the unwritten extent flag (same as l0:63 in bmbt)
+ * rm_offset:54-60 aren't used and should be zero
+ * rm_offset:0-53 is the block offset within the inode
+ */
+#define XFS_RMAP_OFF_ATTR_FORK ((uint64_t)1ULL << 63)
+#define XFS_RMAP_OFF_BMBT_BLOCK ((uint64_t)1ULL << 62)
+#define XFS_RMAP_OFF_UNWRITTEN ((uint64_t)1ULL << 61)
+
+#define XFS_RMAP_LEN_MAX ((uint32_t)~0U)
+#define XFS_RMAP_OFF_FLAGS (XFS_RMAP_OFF_ATTR_FORK | \
+ XFS_RMAP_OFF_BMBT_BLOCK | \
+ XFS_RMAP_OFF_UNWRITTEN)
+#define XFS_RMAP_OFF_MASK ((uint64_t)0x3FFFFFFFFFFFFFULL)
+
+#define XFS_RMAP_OFF(off) ((off) & XFS_RMAP_OFF_MASK)
+
+#define XFS_RMAP_IS_BMBT_BLOCK(off) (!!((off) & XFS_RMAP_OFF_BMBT_BLOCK))
+#define XFS_RMAP_IS_ATTR_FORK(off) (!!((off) & XFS_RMAP_OFF_ATTR_FORK))
+#define XFS_RMAP_IS_UNWRITTEN(off) (!!((off) & XFS_RMAP_OFF_UNWRITTEN))
+
+#define RMAPBT_STARTBLOCK_BITLEN 32
+#define RMAPBT_BLOCKCOUNT_BITLEN 32
+#define RMAPBT_OWNER_BITLEN 64
+#define RMAPBT_ATTRFLAG_BITLEN 1
+#define RMAPBT_BMBTFLAG_BITLEN 1
+#define RMAPBT_EXNTFLAG_BITLEN 1
+#define RMAPBT_UNUSED_OFFSET_BITLEN 7
+#define RMAPBT_OFFSET_BITLEN 54
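+
+/*
+ * Worked example (illustrative only): an attr fork extent mapped at file
+ * offset 100 blocks stores rm_offset = XFS_RMAP_OFF_ATTR_FORK | 100;
+ * XFS_RMAP_OFF() then recovers 100, XFS_RMAP_IS_ATTR_FORK() is true, and the
+ * BMBT and UNWRITTEN predicates are false.
+ */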
+
+/*
+ * Key structure
+ *
+ * We don't use the length for lookups
+ */
+struct xfs_rmap_key {
+ __be32 rm_startblock; /* extent start block */
+ __be64 rm_owner; /* extent owner */
+ __be64 rm_offset; /* offset within the owner */
+} __attribute__((packed));
+
+/* btree pointer type */
+typedef __be32 xfs_rmap_ptr_t;
+
+#define XFS_RMAP_BLOCK(mp) \
+ (xfs_has_finobt(((mp))) ? \
+ XFS_FIBT_BLOCK(mp) + 1 : \
+ XFS_IBT_BLOCK(mp) + 1)
+
+/*
+ * Reference Count Btree format definitions
+ *
+ */
+#define XFS_REFC_CRC_MAGIC 0x52334643 /* 'R3FC' */
+
+unsigned int xfs_refc_block(struct xfs_mount *mp);
+
+/*
+ * Data record/key structure
+ *
+ * Each record associates a range of physical blocks (starting at
+ * rc_startblock and ending rc_blockcount blocks later) with a reference
+ * count (rc_refcount). Extents that are being used to stage a copy on
+ * write (CoW) operation are recorded in the refcount btree with a
+ * refcount of 1. All other records must have a refcount > 1 and must
+ * track an extent mapped only by file data forks.
+ *
+ * Extents with a single owner (attributes, metadata, non-shared file
+ * data) are not tracked here. Free space is also not tracked here.
+ * This is consistent with pre-reflink XFS.
+ */
+
+/*
+ * Extents that are being used to stage a copy on write are stored
+ * in the refcount btree with a refcount of 1 and the upper bit set
+ * on the startblock. This speeds up mount time deletion of stale
+ * staging extents because they're all at the right side of the tree.
+ */
+#define XFS_REFC_COWFLAG (1U << 31)
+#define REFCNTBT_COWFLAG_BITLEN 1
+#define REFCNTBT_AGBLOCK_BITLEN 31
+
+struct xfs_refcount_rec {
+ __be32 rc_startblock; /* starting block number */
+ __be32 rc_blockcount; /* count of blocks */
+ __be32 rc_refcount; /* number of inodes linked here */
+};
+
+struct xfs_refcount_key {
+ __be32 rc_startblock; /* starting block number */
+};
+
+#define MAXREFCOUNT ((xfs_nlink_t)~0U)
+#define MAXREFCEXTLEN ((xfs_extlen_t)~0U)
+
+/* btree pointer type */
+typedef __be32 xfs_refcount_ptr_t;
+
+
+/*
+ * BMAP Btree format definitions
+ *
+ * This includes both the root block definition that sits inside an inode fork
+ * and the record/pointer formats for the leaf/node in the blocks.
+ */
+#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
+#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */
+
+/*
+ * Bmap root header, on-disk form only.
+ */
+typedef struct xfs_bmdr_block {
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+} xfs_bmdr_block_t;
+
+/*
+ * Bmap btree record and extent descriptor.
+ * l0:63 is an extent flag (value 1 indicates non-normal).
+ * l0:9-62 are startoff.
+ * l0:0-8 and l1:21-63 are startblock.
+ * l1:0-20 are blockcount.
+ */
+#define BMBT_EXNTFLAG_BITLEN 1
+#define BMBT_STARTOFF_BITLEN 54
+#define BMBT_STARTBLOCK_BITLEN 52
+#define BMBT_BLOCKCOUNT_BITLEN 21
+
+#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1)
+#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1)
+
+#define XFS_MAX_BMBT_EXTLEN ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK))
+
+/*
+ * bmbt records have a file offset (block) field that is 54 bits wide, so this
+ * is the largest xfs_fileoff_t that we ever expect to see.
+ */
+#define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK)
+
+typedef struct xfs_bmbt_rec {
+ __be64 l0, l1;
+} xfs_bmbt_rec_t;
+
+typedef uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
+typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
+
+/*
+ * Values and macros for delayed-allocation startblock fields.
+ */
+#define STARTBLOCKVALBITS 17
+#define STARTBLOCKMASKBITS (15 + 20)
+#define STARTBLOCKMASK \
+ (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+
+static inline int isnullstartblock(xfs_fsblock_t x)
+{
+ return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
+}
+
+static inline xfs_fsblock_t nullstartblock(int k)
+{
+ ASSERT(k < (1 << STARTBLOCKVALBITS));
+ return STARTBLOCKMASK | (k);
+}
+
+static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
+{
+ return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
+}
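+
+/*
+ * Worked example (illustrative only): nullstartblock(7) sets every bit of
+ * STARTBLOCKMASK (bits 17-51) plus the value 7, so isnullstartblock() is
+ * true and startblockval() recovers the stored count of 7.  A real start
+ * block never has all of the mask bits set, so it fails isnullstartblock().
+ */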
+
+/*
+ * Key structure for non-leaf levels of the tree.
+ */
+typedef struct xfs_bmbt_key {
+ __be64 br_startoff; /* starting file offset */
+} xfs_bmbt_key_t, xfs_bmdr_key_t;
+
+/* btree pointer type */
+typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
+
+
+/*
+ * Generic Btree block format definitions
+ *
+ * This is a combination of the actual format used on disk for short and long
+ * format btrees. The first three fields are shared by both formats, but the
+ * pointers are different and should be used with care.
+ *
+ * To get the size of the actual short or long form headers please use the size
+ * macros below. Never use sizeof(xfs_btree_block).
+ *
+ * The blkno, crc, lsn, owner and uuid fields are only available in filesystems
+ * with the crc feature bit, and all accesses to them must be conditional on
+ * that flag.
+ */
+/* short form block header */
+struct xfs_btree_block_shdr {
+ __be32 bb_leftsib;
+ __be32 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be32 bb_owner;
+ __le32 bb_crc;
+};
+
+/* long form block header */
+struct xfs_btree_block_lhdr {
+ __be64 bb_leftsib;
+ __be64 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be64 bb_owner;
+ __le32 bb_crc;
+ __be32 bb_pad; /* padding for alignment */
+};
+
+struct xfs_btree_block {
+ __be32 bb_magic; /* magic number for block type */
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+ union {
+ struct xfs_btree_block_shdr s;
+ struct xfs_btree_block_lhdr l;
+ } bb_u; /* rest */
+};
+
+/* size of a short form block */
+#define XFS_BTREE_SBLOCK_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ offsetof(struct xfs_btree_block_shdr, bb_blkno))
+/* size of a long form block */
+#define XFS_BTREE_LBLOCK_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ offsetof(struct xfs_btree_block_lhdr, bb_blkno))
+
+/* sizes of CRC enabled btree blocks */
+#define XFS_BTREE_SBLOCK_CRC_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ sizeof(struct xfs_btree_block_shdr))
+#define XFS_BTREE_LBLOCK_CRC_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ sizeof(struct xfs_btree_block_lhdr))
+
+#define XFS_BTREE_SBLOCK_CRC_OFF \
+ offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
+#define XFS_BTREE_LBLOCK_CRC_OFF \
+ offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
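+
+/*
+ * Worked example (illustrative only, assuming the natural packing above):
+ * XFS_BTREE_SBLOCK_LEN is 8 + 8 = 16 bytes and XFS_BTREE_LBLOCK_LEN is
+ * 8 + 16 = 24 bytes, while the CRC-enabled variants grow to 56 and 72 bytes
+ * respectively once the blkno/lsn/uuid/owner/crc fields are included.
+ */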
+
+/*
+ * On-disk XFS access control list structure.
+ */
+struct xfs_acl_entry {
+ __be32 ae_tag;
+ __be32 ae_id;
+ __be16 ae_perm;
+ __be16 ae_pad; /* fill the implicit hole in the structure */
+};
+
+struct xfs_acl {
+ __be32 acl_cnt;
+ struct xfs_acl_entry acl_entry[];
+};
+
+/*
+ * The number of ACL entries allowed is defined by the on-disk format.
+ * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
+ * limited only by the maximum size of the xattr that stores the information.
+ */
+#define XFS_ACL_MAX_ENTRIES(mp) \
+ (xfs_has_crc(mp) \
+ ? (XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+ sizeof(struct xfs_acl_entry) \
+ : 25)
+
+#define XFS_ACL_SIZE(cnt) \
+ (sizeof(struct xfs_acl) + \
+ sizeof(struct xfs_acl_entry) * cnt)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+ XFS_ACL_SIZE(XFS_ACL_MAX_ENTRIES((mp)))
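+
+/*
+ * Worked example (illustrative only, assuming the natural packing above):
+ * each xfs_acl_entry is 12 bytes and the xfs_acl header is 4 bytes, so the
+ * largest ACL a v4 (non-CRC) filesystem can store is XFS_ACL_SIZE(25) =
+ * 4 + 25 * 12 = 304 bytes.
+ */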
+
+
+/* On-disk XFS extended attribute names */
+#define SGI_ACL_FILE "SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
+#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
+
+#endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
new file mode 100644
index 000000000..1cfd5bc65
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -0,0 +1,851 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
+/*
+ * Copyright (c) 1995-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_FS_H__
+#define __XFS_FS_H__
+
+/*
+ * SGI's XFS filesystem's major stuff (constants, structures)
+ */
+
+/*
+ * Direct I/O attribute record used with XFS_IOC_DIOINFO
+ * d_miniosz is the min xfer size, xfer size multiple and file seek offset
+ * alignment.
+ */
+#ifndef HAVE_DIOATTR
+struct dioattr {
+ __u32 d_mem; /* data buffer memory alignment */
+ __u32 d_miniosz; /* min xfer size */
+ __u32 d_maxiosz; /* max xfer size */
+};
+#endif
+
+/*
+ * Structure for XFS_IOC_GETBMAP.
+ * On input, fill in bmv_offset and bmv_length of the first structure
+ * to indicate the area of interest in the file, and bmv_count with the
+ * number of array elements (including the first); bmv_entries is given
+ * back as the number of entries actually filled in. The first structure
+ * is updated on return to give the offset and length for the next call.
+ */
+#ifndef HAVE_GETBMAP
+struct getbmap {
+ __s64 bmv_offset; /* file offset of segment in blocks */
+ __s64 bmv_block; /* starting block (64-bit daddr_t) */
+ __s64 bmv_length; /* length of segment, blocks */
+ __s32 bmv_count; /* # of entries in array incl. 1st */
+ __s32 bmv_entries; /* # of entries filled in (output) */
+};
+#endif
+
+/*
+ * Structure for XFS_IOC_GETBMAPX. Fields bmv_offset through bmv_entries
+ * are used exactly as in the getbmap structure. The getbmapx structure
+ * has additional bmv_iflags and bmv_oflags fields. The bmv_iflags field
+ * is only used for the first structure. It contains input flags
+ * specifying XFS_IOC_GETBMAPX actions. The bmv_oflags field is filled
+ * in by the XFS_IOC_GETBMAPX command for each returned structure after
+ * the first.
+ */
+#ifndef HAVE_GETBMAPX
+struct getbmapx {
+ __s64 bmv_offset; /* file offset of segment in blocks */
+ __s64 bmv_block; /* starting block (64-bit daddr_t) */
+ __s64 bmv_length; /* length of segment, blocks */
+ __s32 bmv_count; /* # of entries in array incl. 1st */
+ __s32 bmv_entries; /* # of entries filled in (output). */
+ __s32 bmv_iflags; /* input flags (1st structure) */
+ __s32 bmv_oflags; /* output flags (after 1st structure)*/
+ __s32 bmv_unused1; /* future use */
+ __s32 bmv_unused2; /* future use */
+};
+#endif
+
+/* bmv_iflags values - set by XFS_IOC_GETBMAPX caller. */
+#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */
+#define BMV_IF_NO_DMAPI_READ 0x2 /* Deprecated */
+#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
+#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
+#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */
+#define BMV_IF_COWFORK 0x20 /* return CoW fork rather than data */
+#define BMV_IF_VALID \
+ (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \
+ BMV_IF_DELALLOC|BMV_IF_NO_HOLES|BMV_IF_COWFORK)
+
+/* bmv_oflags values - returned for each non-header segment */
+#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
+#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
+#define BMV_OF_LAST 0x4 /* segment is the last in the file */
+#define BMV_OF_SHARED 0x8 /* segment shared with another file */
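+
+/*
+ * Usage sketch (illustrative only, not part of the ABI): a userspace caller
+ * might drive XFS_IOC_GETBMAPX (defined later in this header) roughly as
+ * follows, where "fd" is an assumed open file descriptor, use() stands in
+ * for whatever the caller does with each extent, and offsets and lengths are
+ * expressed in 512-byte basic blocks.  The first array element carries the
+ * request and replies start at index 1:
+ *
+ *	struct getbmapx map[33];
+ *	int i;
+ *
+ *	memset(map, 0, sizeof(map));
+ *	map[0].bmv_offset = 0;			// map from the start of the file
+ *	map[0].bmv_length = -1;			// through the end of the file
+ *	map[0].bmv_count = 33;			// header plus up to 32 extents
+ *	map[0].bmv_iflags = BMV_IF_PREALLOC;	// flag unwritten preallocations
+ *	if (ioctl(fd, XFS_IOC_GETBMAPX, map) == 0)
+ *		for (i = 1; i <= map[0].bmv_entries; i++)
+ *			use(map[i].bmv_offset, map[i].bmv_length,
+ *			    map[i].bmv_oflags);
+ */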
+
+/* fmr_owner special values for FS_IOC_GETFSMAP */
+#define XFS_FMR_OWN_FREE FMR_OWN_FREE /* free space */
+#define XFS_FMR_OWN_UNKNOWN FMR_OWN_UNKNOWN /* unknown owner */
+#define XFS_FMR_OWN_FS FMR_OWNER('X', 1) /* static fs metadata */
+#define XFS_FMR_OWN_LOG FMR_OWNER('X', 2) /* journalling log */
+#define XFS_FMR_OWN_AG FMR_OWNER('X', 3) /* per-AG metadata */
+#define XFS_FMR_OWN_INOBT FMR_OWNER('X', 4) /* inode btree blocks */
+#define XFS_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */
+#define XFS_FMR_OWN_REFC FMR_OWNER('X', 6) /* refcount tree */
+#define XFS_FMR_OWN_COW FMR_OWNER('X', 7) /* cow staging */
+#define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */
+
+/*
+ * File segment locking set data type for 64 bit access.
+ * Also used for all the RESV/FREE interfaces.
+ */
+typedef struct xfs_flock64 {
+ __s16 l_type;
+ __s16 l_whence;
+ __s64 l_start;
+ __s64 l_len; /* len == 0 means until end of file */
+ __s32 l_sysid;
+ __u32 l_pid;
+ __s32 l_pad[4]; /* reserve area */
+} xfs_flock64_t;
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY_V1
+ */
+struct xfs_fsop_geom_v1 {
+ __u32 blocksize; /* filesystem (data) block size */
+ __u32 rtextsize; /* realtime extent size */
+ __u32 agblocks; /* fsblocks in an AG */
+ __u32 agcount; /* number of allocation groups */
+ __u32 logblocks; /* fsblocks in the log */
+ __u32 sectsize; /* (data) sector size, bytes */
+ __u32 inodesize; /* inode size in bytes */
+ __u32 imaxpct; /* max allowed inode space(%) */
+ __u64 datablocks; /* fsblocks in data subvolume */
+ __u64 rtblocks; /* fsblocks in realtime subvol */
+ __u64 rtextents; /* rt extents in realtime subvol*/
+ __u64 logstart; /* starting fsblock of the log */
+ unsigned char uuid[16]; /* unique id of the filesystem */
+ __u32 sunit; /* stripe unit, fsblocks */
+ __u32 swidth; /* stripe width, fsblocks */
+ __s32 version; /* structure version */
+ __u32 flags; /* superblock version flags */
+ __u32 logsectsize; /* log sector size, bytes */
+ __u32 rtsectsize; /* realtime sector size, bytes */
+ __u32 dirblocksize; /* directory block size, bytes */
+};
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY_V4
+ */
+struct xfs_fsop_geom_v4 {
+ __u32 blocksize; /* filesystem (data) block size */
+ __u32 rtextsize; /* realtime extent size */
+ __u32 agblocks; /* fsblocks in an AG */
+ __u32 agcount; /* number of allocation groups */
+ __u32 logblocks; /* fsblocks in the log */
+ __u32 sectsize; /* (data) sector size, bytes */
+ __u32 inodesize; /* inode size in bytes */
+ __u32 imaxpct; /* max allowed inode space(%) */
+ __u64 datablocks; /* fsblocks in data subvolume */
+ __u64 rtblocks; /* fsblocks in realtime subvol */
+ __u64 rtextents; /* rt extents in realtime subvol*/
+ __u64 logstart; /* starting fsblock of the log */
+ unsigned char uuid[16]; /* unique id of the filesystem */
+ __u32 sunit; /* stripe unit, fsblocks */
+ __u32 swidth; /* stripe width, fsblocks */
+ __s32 version; /* structure version */
+ __u32 flags; /* superblock version flags */
+ __u32 logsectsize; /* log sector size, bytes */
+ __u32 rtsectsize; /* realtime sector size, bytes */
+ __u32 dirblocksize; /* directory block size, bytes */
+ __u32 logsunit; /* log stripe unit, bytes */
+};
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY
+ */
+struct xfs_fsop_geom {
+ __u32 blocksize; /* filesystem (data) block size */
+ __u32 rtextsize; /* realtime extent size */
+ __u32 agblocks; /* fsblocks in an AG */
+ __u32 agcount; /* number of allocation groups */
+ __u32 logblocks; /* fsblocks in the log */
+ __u32 sectsize; /* (data) sector size, bytes */
+ __u32 inodesize; /* inode size in bytes */
+ __u32 imaxpct; /* max allowed inode space(%) */
+ __u64 datablocks; /* fsblocks in data subvolume */
+ __u64 rtblocks; /* fsblocks in realtime subvol */
+ __u64 rtextents; /* rt extents in realtime subvol*/
+ __u64 logstart; /* starting fsblock of the log */
+ unsigned char uuid[16]; /* unique id of the filesystem */
+ __u32 sunit; /* stripe unit, fsblocks */
+ __u32 swidth; /* stripe width, fsblocks */
+ __s32 version; /* structure version */
+ __u32 flags; /* superblock version flags */
+ __u32 logsectsize; /* log sector size, bytes */
+ __u32 rtsectsize; /* realtime sector size, bytes */
+ __u32 dirblocksize; /* directory block size, bytes */
+ __u32 logsunit; /* log stripe unit, bytes */
+ uint32_t sick; /* o: unhealthy fs & rt metadata */
+ uint32_t checked; /* o: checked fs & rt metadata */
+ __u64 reserved[17]; /* reserved space */
+};
+
+#define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */
+#define XFS_FSOP_GEOM_SICK_UQUOTA (1 << 1) /* user quota */
+#define XFS_FSOP_GEOM_SICK_GQUOTA (1 << 2) /* group quota */
+#define XFS_FSOP_GEOM_SICK_PQUOTA (1 << 3) /* project quota */
+#define XFS_FSOP_GEOM_SICK_RT_BITMAP (1 << 4) /* realtime bitmap */
+#define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */
+
+/* Output for XFS_FS_COUNTS */
+typedef struct xfs_fsop_counts {
+ __u64 freedata; /* free data section blocks */
+ __u64 freertx; /* free rt extents */
+ __u64 freeino; /* free inodes */
+ __u64 allocino; /* total allocated inodes */
+} xfs_fsop_counts_t;
+
+/* Input/Output for XFS_GET_RESBLKS and XFS_SET_RESBLKS */
+typedef struct xfs_fsop_resblks {
+ __u64 resblks;
+ __u64 resblks_avail;
+} xfs_fsop_resblks_t;
+
+#define XFS_FSOP_GEOM_VERSION 0
+#define XFS_FSOP_GEOM_VERSION_V5 5
+
+#define XFS_FSOP_GEOM_FLAGS_ATTR (1 << 0) /* attributes in use */
+#define XFS_FSOP_GEOM_FLAGS_NLINK (1 << 1) /* 32-bit nlink values */
+#define XFS_FSOP_GEOM_FLAGS_QUOTA (1 << 2) /* quotas enabled */
+#define XFS_FSOP_GEOM_FLAGS_IALIGN (1 << 3) /* inode alignment */
+#define XFS_FSOP_GEOM_FLAGS_DALIGN (1 << 4) /* large data alignment */
+#define XFS_FSOP_GEOM_FLAGS_SHARED (1 << 5) /* read-only shared */
+#define XFS_FSOP_GEOM_FLAGS_EXTFLG (1 << 6) /* special extent flag */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2 (1 << 7) /* directory version 2 */
+#define XFS_FSOP_GEOM_FLAGS_LOGV2 (1 << 8) /* log format version 2 */
+#define XFS_FSOP_GEOM_FLAGS_SECTOR (1 << 9) /* sector sizes >1BB */
+#define XFS_FSOP_GEOM_FLAGS_ATTR2 (1 << 10) /* inline attributes rework */
+#define XFS_FSOP_GEOM_FLAGS_PROJID32 (1 << 11) /* 32-bit project IDs */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI (1 << 12) /* ASCII only CI names */
+ /* -- Do not use -- (1 << 13) SGI parent pointers */
+#define XFS_FSOP_GEOM_FLAGS_LAZYSB (1 << 14) /* lazy superblock counters */
+#define XFS_FSOP_GEOM_FLAGS_V5SB (1 << 15) /* version 5 superblock */
+#define XFS_FSOP_GEOM_FLAGS_FTYPE (1 << 16) /* inode directory types */
+#define XFS_FSOP_GEOM_FLAGS_FINOBT (1 << 17) /* free inode btree */
+#define XFS_FSOP_GEOM_FLAGS_SPINODES (1 << 18) /* sparse inode chunks */
+#define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */
+#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */
+#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */
+#define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */
+#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */
+
+/*
+ * Minimum and maximum sizes needed for growth checks.
+ *
+ * Block counts are in units of filesystem blocks, not basic blocks.
+ */
+#define XFS_MIN_AG_BLOCKS 64
+#define XFS_MIN_LOG_BLOCKS 512ULL
+#define XFS_MAX_LOG_BLOCKS (1024 * 1024ULL)
+#define XFS_MIN_LOG_BYTES (10 * 1024 * 1024ULL)
+
+/*
+ * Limits on sb_agblocks/sb_agblklog -- mkfs won't format AGs smaller than
+ * 16MB or larger than 1TB.
+ */
+#define XFS_MIN_AG_BYTES (1ULL << 24) /* 16 MB */
+#define XFS_MAX_AG_BYTES (1ULL << 40) /* 1 TB */
+#define XFS_MAX_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_BLOCKSIZE)
+#define XFS_MAX_CRC_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_CRC_BLOCKSIZE)
+
+/* keep the maximum size under 2^31 by a small amount */
+#define XFS_MAX_LOG_BYTES \
+ ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES)
+
+/* Used for sanity checks on superblock */
+#define XFS_MAX_DBLOCKS(s) ((xfs_rfsblock_t)(s)->sb_agcount * (s)->sb_agblocks)
+#define XFS_MIN_DBLOCKS(s) ((xfs_rfsblock_t)((s)->sb_agcount - 1) * \
+ (s)->sb_agblocks + XFS_MIN_AG_BLOCKS)
+
+/*
+ * Output for XFS_IOC_AG_GEOMETRY
+ */
+struct xfs_ag_geometry {
+ uint32_t ag_number; /* i/o: AG number */
+ uint32_t ag_length; /* o: length in blocks */
+ uint32_t ag_freeblks; /* o: free space */
+ uint32_t ag_icount; /* o: inodes allocated */
+ uint32_t ag_ifree; /* o: inodes free */
+ uint32_t ag_sick; /* o: sick things in ag */
+ uint32_t ag_checked; /* o: checked metadata in ag */
+ uint32_t ag_flags; /* i/o: flags for this ag */
+ uint64_t ag_reserved[12];/* o: zero */
+};
+#define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */
+#define XFS_AG_GEOM_SICK_AGF (1 << 1) /* AGF header */
+#define XFS_AG_GEOM_SICK_AGFL (1 << 2) /* AGFL header */
+#define XFS_AG_GEOM_SICK_AGI (1 << 3) /* AGI header */
+#define XFS_AG_GEOM_SICK_BNOBT (1 << 4) /* free space by block */
+#define XFS_AG_GEOM_SICK_CNTBT (1 << 5) /* free space by length */
+#define XFS_AG_GEOM_SICK_INOBT (1 << 6) /* inode index */
+#define XFS_AG_GEOM_SICK_FINOBT (1 << 7) /* free inode index */
+#define XFS_AG_GEOM_SICK_RMAPBT (1 << 8) /* reverse mappings */
+#define XFS_AG_GEOM_SICK_REFCNTBT (1 << 9) /* reference counts */
+
+/*
+ * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
+ */
+typedef struct xfs_growfs_data {
+ __u64 newblocks; /* new data subvol size, fsblocks */
+ __u32 imaxpct; /* new inode space percentage limit */
+} xfs_growfs_data_t;
+
+typedef struct xfs_growfs_log {
+ __u32 newblocks; /* new log size, fsblocks */
+ __u32 isint; /* 1 if new log is internal */
+} xfs_growfs_log_t;
+
+typedef struct xfs_growfs_rt {
+ __u64 newblocks; /* new realtime size, fsblocks */
+ __u32 extsize; /* new realtime extent size, fsblocks */
+} xfs_growfs_rt_t;
+
+
+/*
+ * Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE
+ */
+typedef struct xfs_bstime {
+ __kernel_long_t tv_sec; /* seconds */
+ __s32 tv_nsec; /* and nanoseconds */
+} xfs_bstime_t;
+
+struct xfs_bstat {
+ __u64 bs_ino; /* inode number */
+ __u16 bs_mode; /* type and mode */
+ __u16 bs_nlink; /* number of links */
+ __u32 bs_uid; /* user id */
+ __u32 bs_gid; /* group id */
+ __u32 bs_rdev; /* device value */
+ __s32 bs_blksize; /* block size */
+ __s64 bs_size; /* file size */
+ xfs_bstime_t bs_atime; /* access time */
+ xfs_bstime_t bs_mtime; /* modify time */
+ xfs_bstime_t bs_ctime; /* inode change time */
+ int64_t bs_blocks; /* number of blocks */
+ __u32 bs_xflags; /* extended flags */
+ __s32 bs_extsize; /* extent size */
+ __s32 bs_extents; /* number of extents */
+ __u32 bs_gen; /* generation count */
+ __u16 bs_projid_lo; /* lower part of project id */
+#define bs_projid bs_projid_lo /* (previously just bs_projid) */
+ __u16 bs_forkoff; /* inode fork offset in bytes */
+ __u16 bs_projid_hi; /* higher part of project id */
+ uint16_t bs_sick; /* sick inode metadata */
+ uint16_t bs_checked; /* checked inode metadata */
+ unsigned char bs_pad[2]; /* pad space, unused */
+ __u32 bs_cowextsize; /* cow extent size */
+ __u32 bs_dmevmask; /* DMIG event mask */
+ __u16 bs_dmstate; /* DMIG state info */
+ __u16 bs_aextents; /* attribute number of extents */
+};
+
+/* New bulkstat structure that reports v5 features and fixes padding issues */
+struct xfs_bulkstat {
+ uint64_t bs_ino; /* inode number */
+ uint64_t bs_size; /* file size */
+
+ uint64_t bs_blocks; /* number of blocks */
+ uint64_t bs_xflags; /* extended flags */
+
+ int64_t bs_atime; /* access time, seconds */
+ int64_t bs_mtime; /* modify time, seconds */
+
+ int64_t bs_ctime; /* inode change time, seconds */
+ int64_t bs_btime; /* creation time, seconds */
+
+ uint32_t bs_gen; /* generation count */
+ uint32_t bs_uid; /* user id */
+ uint32_t bs_gid; /* group id */
+ uint32_t bs_projectid; /* project id */
+
+ uint32_t bs_atime_nsec; /* access time, nanoseconds */
+ uint32_t bs_mtime_nsec; /* modify time, nanoseconds */
+ uint32_t bs_ctime_nsec; /* inode change time, nanoseconds */
+ uint32_t bs_btime_nsec; /* creation time, nanoseconds */
+
+ uint32_t bs_blksize; /* block size */
+ uint32_t bs_rdev; /* device value */
+ uint32_t bs_cowextsize_blks; /* cow extent size hint, blocks */
+ uint32_t bs_extsize_blks; /* extent size hint, blocks */
+
+ uint32_t bs_nlink; /* number of links */
+ uint32_t bs_extents; /* 32-bit data fork extent counter */
+ uint32_t bs_aextents; /* attribute number of extents */
+ uint16_t bs_version; /* structure version */
+ uint16_t bs_forkoff; /* inode fork offset in bytes */
+
+ uint16_t bs_sick; /* sick inode metadata */
+ uint16_t bs_checked; /* checked inode metadata */
+ uint16_t bs_mode; /* type and mode */
+ uint16_t bs_pad2; /* zeroed */
+ uint64_t bs_extents64; /* 64-bit data fork extent counter */
+
+ uint64_t bs_pad[6]; /* zeroed */
+};
+
+#define XFS_BULKSTAT_VERSION_V1 (1)
+#define XFS_BULKSTAT_VERSION_V5 (5)
+
+/* bs_sick flags */
+#define XFS_BS_SICK_INODE (1 << 0) /* inode core */
+#define XFS_BS_SICK_BMBTD (1 << 1) /* data fork */
+#define XFS_BS_SICK_BMBTA (1 << 2) /* attr fork */
+#define XFS_BS_SICK_BMBTC (1 << 3) /* cow fork */
+#define XFS_BS_SICK_DIR (1 << 4) /* directory */
+#define XFS_BS_SICK_XATTR (1 << 5) /* extended attributes */
+#define XFS_BS_SICK_SYMLINK (1 << 6) /* symbolic link remote target */
+#define XFS_BS_SICK_PARENT (1 << 7) /* parent pointers */
+
+/*
+ * Project quota id helpers (previously projid was 16bit only
+ * and using two 16bit values to hold the new 32bit projid was chosen
+ * to retain compatibility with "old" filesystems).
+ */
+static inline uint32_t
+bstat_get_projid(const struct xfs_bstat *bs)
+{
+ return (uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo;
+}
+
+/*
+ * The user-level BulkStat Request interface structure.
+ */
+struct xfs_fsop_bulkreq {
+ __u64 __user *lastip; /* last inode # pointer */
+ __s32 icount; /* count of entries in buffer */
+ void __user *ubuffer;/* user buffer for inode desc. */
+ __s32 __user *ocount; /* output count pointer */
+};
+
+/*
+ * Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS).
+ */
+struct xfs_inogrp {
+ __u64 xi_startino; /* starting inode number */
+ __s32 xi_alloccount; /* # bits set in allocmask */
+ __u64 xi_allocmask; /* mask of allocated inodes */
+};
+
+/* New inumbers structure that reports v5 features and fixes padding issues */
+struct xfs_inumbers {
+ uint64_t xi_startino; /* starting inode number */
+ uint64_t xi_allocmask; /* mask of allocated inodes */
+ uint8_t xi_alloccount; /* # bits set in allocmask */
+ uint8_t xi_version; /* version */
+ uint8_t xi_padding[6]; /* zero */
+};
+
+#define XFS_INUMBERS_VERSION_V1 (1)
+#define XFS_INUMBERS_VERSION_V5 (5)
+
+/* Header for bulk inode requests. */
+struct xfs_bulk_ireq {
+ uint64_t ino; /* I/O: start with this inode */
+ uint32_t flags; /* I/O: operation flags */
+ uint32_t icount; /* I: count of entries in buffer */
+ uint32_t ocount; /* O: count of entries filled out */
+ uint32_t agno; /* I: see comment for IREQ_AGNO */
+ uint64_t reserved[5]; /* must be zero */
+};
+
+/*
+ * Only return results from the specified @agno. If @ino is zero, start
+ * with the first inode of @agno.
+ */
+#define XFS_BULK_IREQ_AGNO (1U << 0)
+
+/*
+ * Return bulkstat information for a single inode, where the @ino value is a
+ * special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_*
+ * values below. Not compatible with XFS_BULK_IREQ_AGNO.
+ */
+#define XFS_BULK_IREQ_SPECIAL (1U << 1)
+
+/*
+ * Return data fork extent count via xfs_bulkstat->bs_extents64 field and assign
+ * 0 to xfs_bulkstat->bs_extents when the flag is set. Otherwise, use
+ * xfs_bulkstat->bs_extents for returning data fork extent count and set
+ * xfs_bulkstat->bs_extents64 to 0. In the second case, return -EOVERFLOW and
+ * assign 0 to xfs_bulkstat->bs_extents if data fork extent count is larger than
+ * XFS_MAX_EXTCNT_DATA_FORK_OLD.
+ */
+#define XFS_BULK_IREQ_NREXT64 (1U << 2)
+
+#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
+ XFS_BULK_IREQ_SPECIAL | \
+ XFS_BULK_IREQ_NREXT64)
+
+/* Operate on the root directory inode. */
+#define XFS_BULK_IREQ_SPECIAL_ROOT (1)
+
+/*
+ * ioctl structures for v5 bulkstat and inumbers requests
+ */
+struct xfs_bulkstat_req {
+ struct xfs_bulk_ireq hdr;
+ struct xfs_bulkstat bulkstat[];
+};
+#define XFS_BULKSTAT_REQ_SIZE(nr) (sizeof(struct xfs_bulkstat_req) + \
+ (nr) * sizeof(struct xfs_bulkstat))
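The request header and flags above are easiest to see from userspace. Below is a minimal sketch, not part of this header: it assumes an xfsprogs-style userspace header <xfs/xfs.h> exposes these definitions and that fd is any open descriptor on the target XFS filesystem.

/*
 * Illustrative only: walk the inodes of AG 0 with the v5 bulkstat ioctl.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>		/* assumed header providing the definitions above */

static int bulkstat_ag0(int fd)
{
	const unsigned int batch = 64;		/* arbitrary batch size */
	struct xfs_bulkstat_req *req;
	int ret;

	req = calloc(1, XFS_BULKSTAT_REQ_SIZE(batch));
	if (!req)
		return -1;

	req->hdr.icount = batch;
	req->hdr.flags = XFS_BULK_IREQ_AGNO;	/* confine the walk to hdr.agno */
	req->hdr.agno = 0;
	req->hdr.ino = 0;			/* zero: start at the first inode of the AG */

	do {
		ret = ioctl(fd, XFS_IOC_BULKSTAT, req);
		if (ret)
			break;
		for (unsigned int i = 0; i < req->hdr.ocount; i++)
			printf("ino %llu size %llu\n",
			       (unsigned long long)req->bulkstat[i].bs_ino,
			       (unsigned long long)req->bulkstat[i].bs_size);
		/* the kernel advances hdr.ino past the last inode it examined */
	} while (req->hdr.ocount > 0);

	free(req);
	return ret;
}

Using XFS_BULK_IREQ_SPECIAL with ino set to XFS_BULK_IREQ_SPECIAL_ROOT (defined above) would instead return a single record for the root directory inode.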
+
+struct xfs_inumbers_req {
+ struct xfs_bulk_ireq hdr;
+ struct xfs_inumbers inumbers[];
+};
+#define XFS_INUMBERS_REQ_SIZE(nr) (sizeof(struct xfs_inumbers_req) + \
+ (nr) * sizeof(struct xfs_inumbers))
+
+/*
+ * Error injection.
+ */
+typedef struct xfs_error_injection {
+ __s32 fd;
+ __s32 errtag;
+} xfs_error_injection_t;
+
+
+/*
+ * Speculative preallocation trimming.
+ */
+#define XFS_EOFBLOCKS_VERSION 1
+struct xfs_fs_eofblocks {
+ __u32 eof_version;
+ __u32 eof_flags;
+ uid_t eof_uid;
+ gid_t eof_gid;
+ prid_t eof_prid;
+ __u32 pad32;
+ __u64 eof_min_file_size;
+ __u64 pad64[12];
+};
+
+/* eof_flags values */
+#define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */
+#define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */
+#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */
+#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */
+#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */
+#define XFS_EOF_FLAGS_UNION (1 << 5) /* union filter algorithm;
+ * kernel only, not included in
+ * valid mask */
+#define XFS_EOF_FLAGS_VALID \
+ (XFS_EOF_FLAGS_SYNC | \
+ XFS_EOF_FLAGS_UID | \
+ XFS_EOF_FLAGS_GID | \
+ XFS_EOF_FLAGS_PRID | \
+ XFS_EOF_FLAGS_MINFILESIZE)
+
+
+/*
+ * The user-level Handle Request interface structure.
+ */
+typedef struct xfs_fsop_handlereq {
+ __u32 fd; /* fd for FD_TO_HANDLE */
+ void __user *path; /* user pathname */
+ __u32 oflags; /* open flags */
+ void __user *ihandle;/* user supplied handle */
+ __u32 ihandlen; /* user supplied length */
+ void __user *ohandle;/* user buffer for handle */
+ __u32 __user *ohandlen;/* user buffer length */
+} xfs_fsop_handlereq_t;
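As a usage illustration for the handle request structure above, here is a sketch under assumptions: fd is any open descriptor on the same XFS filesystem, the definitions come from an xfsprogs-style userspace header, and real applications would normally go through libhandle rather than raw ioctls.

/* Illustrative only: translate a path into an opaque xfs_handle. */
#include <sys/ioctl.h>
#include <xfs/xfs.h>		/* assumed header providing the definitions above */

static int path_to_xfs_handle(int fd, const char *path, struct xfs_handle *h)
{
	struct xfs_fsop_handlereq hreq = { 0 };
	__u32 hlen = 0;

	hreq.path = (void *)path;	/* pathname to translate */
	hreq.ohandle = h;		/* the handle is copied out here */
	hreq.ohandlen = &hlen;		/* the handle length is written back here */

	return ioctl(fd, XFS_IOC_PATH_TO_HANDLE, &hreq);
}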
+
+/*
+ * Compound structures for passing args through Handle Request interfaces
+ * xfs_attrlist_by_handle, xfs_attrmulti_by_handle
+ * - ioctls: XFS_IOC_ATTRLIST_BY_HANDLE, and XFS_IOC_ATTRMULTI_BY_HANDLE
+ */
+
+/*
+ * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface.
+ *
+ * NOTE: Must match the values declared in libattr without the XFS_IOC_ prefix.
+ */
+#define XFS_IOC_ATTR_ROOT 0x0002 /* use attrs in root namespace */
+#define XFS_IOC_ATTR_SECURE 0x0008 /* use attrs in security namespace */
+#define XFS_IOC_ATTR_CREATE 0x0010 /* fail if attr already exists */
+#define XFS_IOC_ATTR_REPLACE 0x0020 /* fail if attr does not exist */
+
+typedef struct xfs_attrlist_cursor {
+ __u32 opaque[4];
+} xfs_attrlist_cursor_t;
+
+/*
+ * Define how lists of attribute names are returned to userspace from the
+ * XFS_IOC_ATTRLIST_BY_HANDLE ioctl. struct xfs_attrlist is the header at the
+ * beginning of the returned buffer, and each entry in al_offset contains the
+ * relative offset of an xfs_attrlist_ent containing the actual entry.
+ *
+ * NOTE: struct xfs_attrlist must match struct attrlist defined in libattr, and
+ * struct xfs_attrlist_ent must match struct attrlist_ent defined in libattr.
+ */
+struct xfs_attrlist {
+ __s32 al_count; /* number of entries in attrlist */
+ __s32 al_more; /* T/F: more attrs (do call again) */
+ __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */
+};
+
+struct xfs_attrlist_ent { /* data from attr_list() */
+ __u32 a_valuelen; /* number bytes in value of attr */
+ char a_name[1]; /* attr name (NULL terminated) */
+};
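The buffer layout described above is easiest to see with a small walker. A sketch, assuming buf points at a buffer already filled in by XFS_IOC_ATTRLIST_BY_HANDLE:

/* Illustrative only: print every name in a returned attrlist buffer. */
#include <stdio.h>

static void dump_attrlist(const void *buf)
{
	const struct xfs_attrlist *al = buf;
	int i;

	for (i = 0; i < al->al_count; i++) {
		/* each al_offset entry is a byte offset from the start of the buffer */
		const struct xfs_attrlist_ent *ent =
			(const void *)((const char *)buf + al->al_offset[i]);

		printf("attr \"%s\", value length %u\n",
		       ent->a_name, (unsigned int)ent->a_valuelen);
	}
	if (al->al_more)
		printf("more entries remain; repeat the ioctl with the same cursor\n");
}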
+
+typedef struct xfs_fsop_attrlist_handlereq {
+ struct xfs_fsop_handlereq hreq; /* handle interface structure */
+ struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
+ __u32 flags; /* which namespace to use */
+ __u32 buflen; /* length of buffer supplied */
+ void __user *buffer; /* returned names */
+} xfs_fsop_attrlist_handlereq_t;
+
+typedef struct xfs_attr_multiop {
+ __u32 am_opcode;
+#define ATTR_OP_GET 1 /* return the indicated attr's value */
+#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */
+#define ATTR_OP_REMOVE 3 /* remove the indicated attr */
+ __s32 am_error;
+ void __user *am_attrname;
+ void __user *am_attrvalue;
+ __u32 am_length;
+ __u32 am_flags; /* XFS_IOC_ATTR_* */
+} xfs_attr_multiop_t;
+
+typedef struct xfs_fsop_attrmulti_handlereq {
+ struct xfs_fsop_handlereq hreq; /* handle interface structure */
+ __u32 opcount;/* count of following multiop */
+ struct xfs_attr_multiop __user *ops; /* attr_multi data */
+} xfs_fsop_attrmulti_handlereq_t;
+
+/*
+ * per machine unique filesystem identifier types.
+ */
+typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */
+
+typedef struct xfs_fid {
+ __u16 fid_len; /* length of remainder */
+ __u16 fid_pad;
+ __u32 fid_gen; /* generation number */
+ __u64 fid_ino; /* 64 bits inode number */
+} xfs_fid_t;
+
+typedef struct xfs_handle {
+ union {
+ __s64 align; /* force alignment of ha_fid */
+ xfs_fsid_t _ha_fsid; /* unique file system identifier */
+ } ha_u;
+ xfs_fid_t ha_fid; /* file system specific file ID */
+} xfs_handle_t;
+#define ha_fsid ha_u._ha_fsid
+
+/*
+ * Structure passed to XFS_IOC_SWAPEXT
+ */
+typedef struct xfs_swapext
+{
+ int64_t sx_version; /* version */
+#define XFS_SX_VERSION 0
+ int64_t sx_fdtarget; /* fd of target file */
+ int64_t sx_fdtmp; /* fd of tmp file */
+ xfs_off_t sx_offset; /* offset into file */
+ xfs_off_t sx_length; /* length from offset */
+ char sx_pad[16]; /* pad space, unused */
+ struct xfs_bstat sx_stat; /* stat of target b4 copy */
+} xfs_swapext_t;
+
+/*
+ * Flags for going down operation
+ */
+#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
+#define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
+#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
+
+/* metadata scrubbing */
+struct xfs_scrub_metadata {
+ __u32 sm_type; /* What to check? */
+ __u32 sm_flags; /* flags; see below. */
+ __u64 sm_ino; /* inode number. */
+ __u32 sm_gen; /* inode generation. */
+ __u32 sm_agno; /* ag number. */
+ __u64 sm_reserved[5]; /* pad to 64 bytes */
+};
+
+/*
+ * Metadata types and flags for scrub operation.
+ */
+
+/* Scrub subcommands. */
+#define XFS_SCRUB_TYPE_PROBE 0 /* presence test ioctl */
+#define XFS_SCRUB_TYPE_SB 1 /* superblock */
+#define XFS_SCRUB_TYPE_AGF 2 /* AG free header */
+#define XFS_SCRUB_TYPE_AGFL 3 /* AG free list */
+#define XFS_SCRUB_TYPE_AGI 4 /* AG inode header */
+#define XFS_SCRUB_TYPE_BNOBT 5 /* freesp by block btree */
+#define XFS_SCRUB_TYPE_CNTBT 6 /* freesp by length btree */
+#define XFS_SCRUB_TYPE_INOBT 7 /* inode btree */
+#define XFS_SCRUB_TYPE_FINOBT 8 /* free inode btree */
+#define XFS_SCRUB_TYPE_RMAPBT 9 /* reverse mapping btree */
+#define XFS_SCRUB_TYPE_REFCNTBT 10 /* reference count btree */
+#define XFS_SCRUB_TYPE_INODE 11 /* inode record */
+#define XFS_SCRUB_TYPE_BMBTD 12 /* data fork block mapping */
+#define XFS_SCRUB_TYPE_BMBTA 13 /* attr fork block mapping */
+#define XFS_SCRUB_TYPE_BMBTC 14 /* CoW fork block mapping */
+#define XFS_SCRUB_TYPE_DIR 15 /* directory */
+#define XFS_SCRUB_TYPE_XATTR 16 /* extended attribute */
+#define XFS_SCRUB_TYPE_SYMLINK 17 /* symbolic link */
+#define XFS_SCRUB_TYPE_PARENT 18 /* parent pointers */
+#define XFS_SCRUB_TYPE_RTBITMAP 19 /* realtime bitmap */
+#define XFS_SCRUB_TYPE_RTSUM 20 /* realtime summary */
+#define XFS_SCRUB_TYPE_UQUOTA 21 /* user quotas */
+#define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */
+#define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */
+#define XFS_SCRUB_TYPE_FSCOUNTERS 24 /* fs summary counters */
+
+/* Number of scrub subcommands. */
+#define XFS_SCRUB_TYPE_NR 25
+
+/* i: Repair this metadata. */
+#define XFS_SCRUB_IFLAG_REPAIR (1u << 0)
+
+/* o: Metadata object needs repair. */
+#define XFS_SCRUB_OFLAG_CORRUPT (1u << 1)
+
+/*
+ * o: Metadata object could be optimized. It's not corrupt, but
+ * we could improve on it somehow.
+ */
+#define XFS_SCRUB_OFLAG_PREEN (1u << 2)
+
+/* o: Cross-referencing failed. */
+#define XFS_SCRUB_OFLAG_XFAIL (1u << 3)
+
+/* o: Metadata object disagrees with cross-referenced metadata. */
+#define XFS_SCRUB_OFLAG_XCORRUPT (1u << 4)
+
+/* o: Scan was not complete. */
+#define XFS_SCRUB_OFLAG_INCOMPLETE (1u << 5)
+
+/* o: Metadata object looked funny but isn't corrupt. */
+#define XFS_SCRUB_OFLAG_WARNING (1u << 6)
+
+/*
+ * o: IFLAG_REPAIR was set but metadata object did not need fixing or
+ * optimization and has therefore not been altered.
+ */
+#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7)
+
+#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR)
+#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \
+ XFS_SCRUB_OFLAG_PREEN | \
+ XFS_SCRUB_OFLAG_XFAIL | \
+ XFS_SCRUB_OFLAG_XCORRUPT | \
+ XFS_SCRUB_OFLAG_INCOMPLETE | \
+ XFS_SCRUB_OFLAG_WARNING | \
+ XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED)
+#define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT)
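A minimal sketch of driving the scrub ioctl from userspace, under the same assumptions as the earlier sketches (fd is an open descriptor on the filesystem, definitions come from an xfsprogs-style header); checking the AGF of AG 0 is purely an illustrative choice:

/* Illustrative only: scrub the AGF of AG 0 and report the outcome flags. */
#include <stdio.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>		/* assumed header providing the definitions above */

static int scrub_agf0(int fd)
{
	struct xfs_scrub_metadata sm = { 0 };

	sm.sm_type = XFS_SCRUB_TYPE_AGF;
	sm.sm_agno = 0;

	if (ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm) < 0)
		return -1;

	if (sm.sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | XFS_SCRUB_OFLAG_XCORRUPT))
		printf("AGF 0 needs repair\n");
	else if (sm.sm_flags & XFS_SCRUB_OFLAG_PREEN)
		printf("AGF 0 could be optimized\n");
	else if (sm.sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
		printf("scan of AGF 0 did not complete\n");
	else
		printf("AGF 0 is clean\n");
	return 0;
}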
+
+/*
+ * ioctl limits
+ */
+#ifdef XATTR_LIST_MAX
+# define XFS_XATTR_LIST_MAX XATTR_LIST_MAX
+#else
+# define XFS_XATTR_LIST_MAX 65536
+#endif
+
+
+/*
+ * ioctl commands that are used by Linux filesystems
+ */
+#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
+#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS
+#define XFS_IOC_GETVERSION FS_IOC_GETVERSION
+
+/*
+ * ioctl commands that replace IRIX fcntl()'s
+ * For 'documentation' purposes more than anything else,
+ * the "cmd #" field reflects the IRIX fcntl number.
+ */
+/* XFS_IOC_ALLOCSP ------- deprecated 10 */
+/* XFS_IOC_FREESP -------- deprecated 11 */
+#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr)
+#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR
+#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR
+/* XFS_IOC_ALLOCSP64 ----- deprecated 36 */
+/* XFS_IOC_FREESP64 ------ deprecated 37 */
+#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap)
+/* XFS_IOC_FSSETDM ------- deprecated 39 */
+#define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64)
+#define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64)
+#define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64)
+#define XFS_IOC_UNRESVSP64 _IOW ('X', 43, struct xfs_flock64)
+#define XFS_IOC_GETBMAPA _IOWR('X', 44, struct getbmap)
+#define XFS_IOC_FSGETXATTRA _IOR ('X', 45, struct fsxattr)
+/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */
+/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
+#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
+#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
+#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks)
+/* XFS_IOC_GETFSMAP ------ hoisted 59 */
+#define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata)
+#define XFS_IOC_AG_GEOMETRY _IOWR('X', 61, struct xfs_ag_geometry)
+
+/*
+ * ioctl commands that replace IRIX syssgi()'s
+ */
+#define XFS_IOC_FSGEOMETRY_V1 _IOR ('X', 100, struct xfs_fsop_geom_v1)
+#define XFS_IOC_FSBULKSTAT _IOWR('X', 101, struct xfs_fsop_bulkreq)
+#define XFS_IOC_FSBULKSTAT_SINGLE _IOWR('X', 102, struct xfs_fsop_bulkreq)
+#define XFS_IOC_FSINUMBERS _IOWR('X', 103, struct xfs_fsop_bulkreq)
+#define XFS_IOC_PATH_TO_FSHANDLE _IOWR('X', 104, struct xfs_fsop_handlereq)
+#define XFS_IOC_PATH_TO_HANDLE _IOWR('X', 105, struct xfs_fsop_handlereq)
+#define XFS_IOC_FD_TO_HANDLE _IOWR('X', 106, struct xfs_fsop_handlereq)
+#define XFS_IOC_OPEN_BY_HANDLE _IOWR('X', 107, struct xfs_fsop_handlereq)
+#define XFS_IOC_READLINK_BY_HANDLE _IOWR('X', 108, struct xfs_fsop_handlereq)
+#define XFS_IOC_SWAPEXT _IOWR('X', 109, struct xfs_swapext)
+#define XFS_IOC_FSGROWFSDATA _IOW ('X', 110, struct xfs_growfs_data)
+#define XFS_IOC_FSGROWFSLOG _IOW ('X', 111, struct xfs_growfs_log)
+#define XFS_IOC_FSGROWFSRT _IOW ('X', 112, struct xfs_growfs_rt)
+#define XFS_IOC_FSCOUNTS _IOR ('X', 113, struct xfs_fsop_counts)
+#define XFS_IOC_SET_RESBLKS _IOWR('X', 114, struct xfs_fsop_resblks)
+#define XFS_IOC_GET_RESBLKS _IOR ('X', 115, struct xfs_fsop_resblks)
+#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection)
+#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
+/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
+
+#define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */
+#define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */
+
+/* XFS_IOC_FSSETDM_BY_HANDLE -- deprecated 121 */
+#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
+#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
+#define XFS_IOC_FSGEOMETRY_V4 _IOR ('X', 124, struct xfs_fsop_geom_v4)
+#define XFS_IOC_GOINGDOWN _IOR ('X', 125, uint32_t)
+#define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom)
+#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req)
+#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req)
+/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
+
+
+#ifndef HAVE_BBMACROS
+/*
+ * Block I/O parameterization. A basic block (BB) is the lowest size of
+ * filesystem allocation, and must equal 512. Length units given to bio
+ * routines are in BB's.
+ */
+#define BBSHIFT 9
+#define BBSIZE (1<<BBSHIFT)
+#define BBMASK (BBSIZE-1)
+#define BTOBB(bytes) (((__u64)(bytes) + BBSIZE - 1) >> BBSHIFT)
+#define BTOBBT(bytes) ((__u64)(bytes) >> BBSHIFT)
+#define BBTOB(bbs) ((bbs) << BBSHIFT)
+#endif
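A few worked values for the conversion macros above, expressed as compile-time checks; illustrative only and assuming the macros are in scope:

/* Worked examples of the basic-block conversions. */
_Static_assert(BTOBB(4096) == 8,    "4096 bytes round up to eight 512-byte basic blocks");
_Static_assert(BTOBB(100)  == 1,    "partial basic blocks are rounded up by BTOBB");
_Static_assert(BTOBBT(100) == 0,    "BTOBBT truncates rather than rounding up");
_Static_assert(BBTOB(8)    == 4096, "eight basic blocks back to bytes");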
+
+#endif /* __XFS_FS_H__ */
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
new file mode 100644
index 000000000..99e796256
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -0,0 +1,190 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_HEALTH_H__
+#define __XFS_HEALTH_H__
+
+/*
+ * In-Core Filesystem Health Assessments
+ * =====================================
+ *
+ * We'd like to be able to summarize the current health status of the
+ * filesystem so that the administrator knows when it's necessary to schedule
+ * some downtime for repairs. Until then, we would also like to avoid abrupt
+ * shutdowns due to corrupt metadata.
+ *
+ * The online scrub feature evaluates the health of all filesystem metadata.
+ * When scrub detects corruption in a piece of metadata it will set the
+ * corresponding sickness flag, and repair will clear it if successful. If
+ * problems remain at unmount time, we can also request manual intervention by
+ * logging a notice to run xfs_repair.
+ *
+ * Each health tracking group uses a pair of fields for reporting. The
+ * "checked" field tell us if a given piece of metadata has ever been examined,
+ * and the "sick" field tells us if that piece was found to need repairs.
+ * Therefore we can conclude that for a given sick flag value:
+ *
+ * - checked && sick => metadata needs repair
+ * - checked && !sick => metadata is ok
+ * - !checked => has not been examined since mount
+ */
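The three conclusions above map onto a tiny classifier. This is purely illustrative; the enum and function below are not part of this header:

/* Illustrative only: interpret one (sick, checked) pair for a metadata group. */
enum health_state { HEALTH_UNKNOWN, HEALTH_OK, HEALTH_NEEDS_REPAIR };

static inline enum health_state
classify_health(unsigned int sick, unsigned int checked, unsigned int mask)
{
	if (!(checked & mask))
		return HEALTH_UNKNOWN;		/* not examined since mount */
	if (sick & mask)
		return HEALTH_NEEDS_REPAIR;	/* checked && sick */
	return HEALTH_OK;			/* checked && !sick */
}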
+
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_inode;
+struct xfs_fsop_geom;
+
+/* Observable health issues for metadata spanning the entire filesystem. */
+#define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */
+#define XFS_SICK_FS_UQUOTA (1 << 1) /* user quota */
+#define XFS_SICK_FS_GQUOTA (1 << 2) /* group quota */
+#define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */
+
+/* Observable health issues for realtime volume metadata. */
+#define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */
+#define XFS_SICK_RT_SUMMARY (1 << 1) /* realtime summary */
+
+/* Observable health issues for AG metadata. */
+#define XFS_SICK_AG_SB (1 << 0) /* superblock */
+#define XFS_SICK_AG_AGF (1 << 1) /* AGF header */
+#define XFS_SICK_AG_AGFL (1 << 2) /* AGFL header */
+#define XFS_SICK_AG_AGI (1 << 3) /* AGI header */
+#define XFS_SICK_AG_BNOBT (1 << 4) /* free space by block */
+#define XFS_SICK_AG_CNTBT (1 << 5) /* free space by length */
+#define XFS_SICK_AG_INOBT (1 << 6) /* inode index */
+#define XFS_SICK_AG_FINOBT (1 << 7) /* free inode index */
+#define XFS_SICK_AG_RMAPBT (1 << 8) /* reverse mappings */
+#define XFS_SICK_AG_REFCNTBT (1 << 9) /* reference counts */
+
+/* Observable health issues for inode metadata. */
+#define XFS_SICK_INO_CORE (1 << 0) /* inode core */
+#define XFS_SICK_INO_BMBTD (1 << 1) /* data fork */
+#define XFS_SICK_INO_BMBTA (1 << 2) /* attr fork */
+#define XFS_SICK_INO_BMBTC (1 << 3) /* cow fork */
+#define XFS_SICK_INO_DIR (1 << 4) /* directory */
+#define XFS_SICK_INO_XATTR (1 << 5) /* extended attributes */
+#define XFS_SICK_INO_SYMLINK (1 << 6) /* symbolic link remote target */
+#define XFS_SICK_INO_PARENT (1 << 7) /* parent pointers */
+
+/* Primary evidence of health problems in a given group. */
+#define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \
+ XFS_SICK_FS_UQUOTA | \
+ XFS_SICK_FS_GQUOTA | \
+ XFS_SICK_FS_PQUOTA)
+
+#define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \
+ XFS_SICK_RT_SUMMARY)
+
+#define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \
+ XFS_SICK_AG_AGF | \
+ XFS_SICK_AG_AGFL | \
+ XFS_SICK_AG_AGI | \
+ XFS_SICK_AG_BNOBT | \
+ XFS_SICK_AG_CNTBT | \
+ XFS_SICK_AG_INOBT | \
+ XFS_SICK_AG_FINOBT | \
+ XFS_SICK_AG_RMAPBT | \
+ XFS_SICK_AG_REFCNTBT)
+
+#define XFS_SICK_INO_PRIMARY (XFS_SICK_INO_CORE | \
+ XFS_SICK_INO_BMBTD | \
+ XFS_SICK_INO_BMBTA | \
+ XFS_SICK_INO_BMBTC | \
+ XFS_SICK_INO_DIR | \
+ XFS_SICK_INO_XATTR | \
+ XFS_SICK_INO_SYMLINK | \
+ XFS_SICK_INO_PARENT)
+
+/* These functions must be provided by the xfs implementation. */
+
+void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_health_unmount(struct xfs_mount *mp);
+
+/* Now some helpers. */
+
+static inline bool
+xfs_fs_has_sickness(struct xfs_mount *mp, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_fs_measure_sickness(mp, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_rt_has_sickness(struct xfs_mount *mp, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_rt_measure_sickness(mp, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_ag_measure_sickness(pag, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_inode_measure_sickness(ip, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_fs_is_healthy(struct xfs_mount *mp)
+{
+ return !xfs_fs_has_sickness(mp, -1U);
+}
+
+static inline bool
+xfs_rt_is_healthy(struct xfs_mount *mp)
+{
+ return !xfs_rt_has_sickness(mp, -1U);
+}
+
+static inline bool
+xfs_ag_is_healthy(struct xfs_perag *pag)
+{
+ return !xfs_ag_has_sickness(pag, -1U);
+}
+
+static inline bool
+xfs_inode_is_healthy(struct xfs_inode *ip)
+{
+ return !xfs_inode_has_sickness(ip, -1U);
+}
+
+void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo);
+void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
+void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs);
+
+#endif /* __XFS_HEALTH_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
new file mode 100644
index 000000000..94db50eb7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -0,0 +1,2969 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_icreate_item.h"
+#include "xfs_icache.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+
+/*
+ * Lookup a record by ino in the btree given by cur.
+ */
+int /* error */
+xfs_inobt_lookup(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agino_t ino, /* starting inode of chunk */
+ xfs_lookup_t dir, /* <=, >=, == */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.i.ir_startino = ino;
+ cur->bc_rec.i.ir_holemask = 0;
+ cur->bc_rec.i.ir_count = 0;
+ cur->bc_rec.i.ir_freecount = 0;
+ cur->bc_rec.i.ir_free = 0;
+ return xfs_btree_lookup(cur, dir, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given.
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int /* error */
+xfs_inobt_update(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_inobt_rec_incore_t *irec) /* btree record */
+{
+ union xfs_btree_rec rec;
+
+ rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
+ if (xfs_has_sparseinodes(cur->bc_mp)) {
+ rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
+ rec.inobt.ir_u.sp.ir_count = irec->ir_count;
+ rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
+ } else {
+ /* ir_holemask/ir_count not supported on-disk */
+ rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
+ }
+ rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
+ return xfs_btree_update(cur, &rec);
+}
+
+/* Convert on-disk btree record to incore inobt record. */
+void
+xfs_inobt_btrec_to_irec(
+ struct xfs_mount *mp,
+ const union xfs_btree_rec *rec,
+ struct xfs_inobt_rec_incore *irec)
+{
+ irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+ if (xfs_has_sparseinodes(mp)) {
+ irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
+ irec->ir_count = rec->inobt.ir_u.sp.ir_count;
+ irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
+ } else {
+ /*
+ * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
+ * values for full inode chunks.
+ */
+ irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
+ irec->ir_count = XFS_INODES_PER_CHUNK;
+ irec->ir_freecount =
+ be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
+ }
+ irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_inobt_get_rec(
+ struct xfs_btree_cur *cur,
+ struct xfs_inobt_rec_incore *irec,
+ int *stat)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ union xfs_btree_rec *rec;
+ int error;
+ uint64_t realfree;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (error || *stat == 0)
+ return error;
+
+ xfs_inobt_btrec_to_irec(mp, rec, irec);
+
+ if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino))
+ goto out_bad_rec;
+ if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
+ irec->ir_count > XFS_INODES_PER_CHUNK)
+ goto out_bad_rec;
+ if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
+ goto out_bad_rec;
+
+ /* mask out sparse holes and verify the free count against the real free mask */
+ if (!xfs_inobt_issparse(irec->ir_holemask))
+ realfree = irec->ir_free;
+ else
+ realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec);
+ if (hweight64(realfree) != irec->ir_freecount)
+ goto out_bad_rec;
+
+ return 0;
+
+out_bad_rec:
+ xfs_warn(mp,
+ "%s Inode BTree record corruption in AG %d detected!",
+ cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free",
+ cur->bc_ag.pag->pag_agno);
+ xfs_warn(mp,
+"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
+ irec->ir_startino, irec->ir_count, irec->ir_freecount,
+ irec->ir_free, irec->ir_holemask);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Insert a single inobt record. Cursor must already point to desired location.
+ */
+int
+xfs_inobt_insert_rec(
+ struct xfs_btree_cur *cur,
+ uint16_t holemask,
+ uint8_t count,
+ int32_t freecount,
+ xfs_inofree_t free,
+ int *stat)
+{
+ cur->bc_rec.i.ir_holemask = holemask;
+ cur->bc_rec.i.ir_count = count;
+ cur->bc_rec.i.ir_freecount = freecount;
+ cur->bc_rec.i.ir_free = free;
+ return xfs_btree_insert(cur, stat);
+}
+
+/*
+ * Insert records describing a newly allocated inode chunk into the inobt.
+ */
+STATIC int
+xfs_inobt_insert(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_agino_t newino,
+ xfs_agino_t newlen,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+ xfs_agino_t thisino;
+ int i;
+ int error;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum);
+
+ for (thisino = newino;
+ thisino < newino + newlen;
+ thisino += XFS_INODES_PER_CHUNK) {
+ error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 0);
+
+ error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
+ XFS_INODES_PER_CHUNK,
+ XFS_INODES_PER_CHUNK,
+ XFS_INOBT_ALL_FREE, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 1);
+ }
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ return 0;
+}
+
+/*
+ * Verify that the number of free inodes in the AGI is correct.
+ */
+#ifdef DEBUG
+static int
+xfs_check_agi_freecount(
+ struct xfs_btree_cur *cur)
+{
+ if (cur->bc_nlevels == 1) {
+ xfs_inobt_rec_incore_t rec;
+ int freecount = 0;
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ return error;
+
+ do {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ return error;
+
+ if (i) {
+ freecount += rec.ir_freecount;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ return error;
+ }
+ } while (i == 1);
+
+ if (!xfs_is_shutdown(cur->bc_mp))
+ ASSERT(freecount == cur->bc_ag.pag->pagi_freecount);
+ }
+ return 0;
+}
+#else
+#define xfs_check_agi_freecount(cur) 0
+#endif
+
+/*
+ * Initialise a new set of inodes. When called without a transaction context
+ * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
+ * than logging them (which in a transaction context puts them into the AIL
+ * for writeback rather than the xfsbufd queue).
+ */
+int
+xfs_ialloc_inode_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct list_head *buffer_list,
+ int icount,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ xfs_agblock_t length,
+ unsigned int gen)
+{
+ struct xfs_buf *fbuf;
+ struct xfs_dinode *free;
+ int nbufs;
+ int version;
+ int i, j;
+ xfs_daddr_t d;
+ xfs_ino_t ino = 0;
+ int error;
+
+ /*
+ * Loop over the new block(s), filling in the inodes. For small block
+ * sizes, manipulate the inodes in buffers which are multiples of the
+ * block size.
+ */
+ nbufs = length / M_IGEO(mp)->blocks_per_cluster;
+
+ /*
+ * Figure out what version number to use in the inodes we create. If
+ * the superblock version has caught up to the one that supports the new
+ * inode format, then use the new inode version. Otherwise use the old
+ * version so that old kernels will continue to be able to use the file
+ * system.
+ *
+ * For v3 inodes, we also need to write the inode number into the inode,
+ * so calculate the first inode number of the chunk here as
+ * XFS_AGB_TO_AGINO() only works within a filesystem block, not
+ * across multiple filesystem blocks (such as a cluster) and so cannot
+ * be used in the cluster buffer loop below.
+ *
+ * Further, because we are writing the inode directly into the buffer
+ * and calculating a CRC on the entire inode, we have to log the entire
+ * inode so that the entire range the CRC covers is present in the log.
+ * That means for v3 inodes we log the entire buffer rather than just the
+ * inode cores.
+ */
+ if (xfs_has_v3inodes(mp)) {
+ version = 3;
+ ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));
+
+ /*
+ * log the initialisation that is about to take place as a
+ * logical operation. This means the transaction does not
+ * need to log the physical changes to the inode buffers as log
+ * recovery will know what initialisation is actually needed.
+ * Hence we only need to log the buffers as "ordered" buffers so
+ * they track in the AIL as if they were physically logged.
+ */
+ if (tp)
+ xfs_icreate_log(tp, agno, agbno, icount,
+ mp->m_sb.sb_inodesize, length, gen);
+ } else
+ version = 2;
+
+ for (j = 0; j < nbufs; j++) {
+ /*
+ * Get the block.
+ */
+ d = XFS_AGB_TO_DADDR(mp, agno, agbno +
+ (j * M_IGEO(mp)->blocks_per_cluster));
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
+ XBF_UNMAPPED, &fbuf);
+ if (error)
+ return error;
+
+ /* Initialize the inode buffers and log them appropriately. */
+ fbuf->b_ops = &xfs_inode_buf_ops;
+ xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
+ for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
+ int ioffset = i << mp->m_sb.sb_inodelog;
+
+ free = xfs_make_iptr(mp, fbuf, i);
+ free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ free->di_version = version;
+ free->di_gen = cpu_to_be32(gen);
+ free->di_next_unlinked = cpu_to_be32(NULLAGINO);
+
+ if (version == 3) {
+ free->di_ino = cpu_to_be64(ino);
+ ino++;
+ uuid_copy(&free->di_uuid,
+ &mp->m_sb.sb_meta_uuid);
+ xfs_dinode_calc_crc(mp, free);
+ } else if (tp) {
+ /* just log the inode core */
+ xfs_trans_log_buf(tp, fbuf, ioffset,
+ ioffset + XFS_DINODE_SIZE(mp) - 1);
+ }
+ }
+
+ if (tp) {
+ /*
+ * Mark the buffer as an inode allocation buffer so it
+ * sticks in the AIL at the point of this allocation
+ * transaction. This ensures that they are on disk before
+ * the tail of the log can be moved past this
+ * transaction (i.e. by preventing relogging from moving
+ * it forward in the log).
+ */
+ xfs_trans_inode_alloc_buf(tp, fbuf);
+ if (version == 3) {
+ /*
+ * Mark the buffer as ordered so that it is
+ * not physically logged in the transaction but
+ * is still tracked in the AIL as part of the
+ * transaction and pins the log appropriately.
+ */
+ xfs_trans_ordered_buf(tp, fbuf);
+ }
+ } else {
+ fbuf->b_flags |= XBF_DONE;
+ xfs_buf_delwri_queue(fbuf, buffer_list);
+ xfs_buf_relse(fbuf);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Align startino and allocmask for a recently allocated sparse chunk such that
+ * they are fit for insertion (or merge) into the on-disk inode btrees.
+ *
+ * Background:
+ *
+ * When enabled, sparse inode support increases the inode alignment from cluster
+ * size to inode chunk size. This means that the minimum range between two
+ * non-adjacent inode records in the inobt is large enough for a full inode
+ * record. This allows for cluster sized, cluster aligned block allocation
+ * without need to worry about whether the resulting inode record overlaps with
+ * another record in the tree. Without this basic rule, we would have to deal
+ * with the consequences of overlap by potentially undoing recent allocations in
+ * the inode allocation codepath.
+ *
+ * Because of this alignment rule (which is enforced on mount), there are two
+ * inobt possibilities for newly allocated sparse chunks. One is that the
+ * aligned inode record for the chunk covers a range of inodes not already
+ * covered in the inobt (i.e., it is safe to insert a new sparse record). The
+ * other is that a record already exists at the aligned startino that considers
+ * the newly allocated range as sparse. In the latter case, record content is
+ * merged in hope that sparse inode chunks fill to full chunks over time.
+ */
+STATIC void
+xfs_align_sparse_ino(
+ struct xfs_mount *mp,
+ xfs_agino_t *startino,
+ uint16_t *allocmask)
+{
+ xfs_agblock_t agbno;
+ xfs_agblock_t mod;
+ int offset;
+
+ agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
+ mod = agbno % mp->m_sb.sb_inoalignmt;
+ if (!mod)
+ return;
+
+ /* calculate the inode offset and align startino */
+ offset = XFS_AGB_TO_AGINO(mp, mod);
+ *startino -= offset;
+
+ /*
+ * Since startino has been aligned down, left shift allocmask such that
+ * it continues to represent the same physical inodes relative to the
+ * new startino.
+ */
+ *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
+}
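A worked example of the alignment arithmetic above, using the usual geometry of 64 inodes per chunk and 4 inodes per holemask bit (illustrative numbers only):

/*
 * Suppose a 32-inode sparse allocation starts 32 inodes past the aligned
 * chunk boundary, with an incoming allocmask of 0x00ff (one bit per 4 inodes):
 *
 *   offset     = 32                   inodes between startino and the boundary
 *   *startino -= 32                   aligned down to the chunk boundary
 *   *allocmask = 0x00ff << (32 / 4)
 *              = 0xff00               the allocation now maps to the upper
 *                                     half of the aligned inode record
 */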
+
+/*
+ * Determine whether the source inode record can merge into the target. Both
+ * records must be sparse, the inode ranges must match and there must be no
+ * allocation overlap between the records.
+ */
+STATIC bool
+__xfs_inobt_can_merge(
+ struct xfs_inobt_rec_incore *trec, /* tgt record */
+ struct xfs_inobt_rec_incore *srec) /* src record */
+{
+ uint64_t talloc;
+ uint64_t salloc;
+
+ /* records must cover the same inode range */
+ if (trec->ir_startino != srec->ir_startino)
+ return false;
+
+ /* both records must be sparse */
+ if (!xfs_inobt_issparse(trec->ir_holemask) ||
+ !xfs_inobt_issparse(srec->ir_holemask))
+ return false;
+
+ /* both records must track some inodes */
+ if (!trec->ir_count || !srec->ir_count)
+ return false;
+
+ /* can't exceed capacity of a full record */
+ if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
+ return false;
+
+ /* verify there is no allocation overlap */
+ talloc = xfs_inobt_irec_to_allocmask(trec);
+ salloc = xfs_inobt_irec_to_allocmask(srec);
+ if (talloc & salloc)
+ return false;
+
+ return true;
+}
+
+/*
+ * Merge the source inode record into the target. The caller must call
+ * __xfs_inobt_can_merge() to ensure the merge is valid.
+ */
+STATIC void
+__xfs_inobt_rec_merge(
+ struct xfs_inobt_rec_incore *trec, /* target */
+ struct xfs_inobt_rec_incore *srec) /* src */
+{
+ ASSERT(trec->ir_startino == srec->ir_startino);
+
+ /* combine the counts */
+ trec->ir_count += srec->ir_count;
+ trec->ir_freecount += srec->ir_freecount;
+
+ /*
+ * Merge the holemask and free mask. For both fields, 0 bits refer to
+ * allocated inodes. We combine the allocated ranges with bitwise AND.
+ */
+ trec->ir_holemask &= srec->ir_holemask;
+ trec->ir_free &= srec->ir_free;
+}
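A worked example of the merge, for the simplest case where two complementary sparse halves of a chunk are combined while every inode is still free (illustrative values):

/*
 *   target:  holemask 0xff00  count 32  freecount 32
 *   source:  holemask 0x00ff  count 32  freecount 32
 *   merged:  holemask 0x0000  count 64  freecount 64
 *
 * Newly built sparse records carry ir_free = XFS_INOBT_ALL_FREE, so the
 * bitwise AND of the free masks stays all ones; ANDing the holemasks clears
 * every hole bit, i.e. the merged record now describes a full chunk.
 */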
+
+/*
+ * Insert a new sparse inode chunk into the associated inode btree. The inode
+ * record for the sparse chunk is pre-aligned to a startino that should match
+ * any pre-existing sparse inode record in the tree. This allows sparse chunks
+ * to fill over time.
+ *
+ * This function supports two modes of handling preexisting records depending on
+ * the merge flag. If merge is true, the provided record is merged with the
+ * existing record and updated in place. The merged record is returned in nrec.
+ * If merge is false, an existing record is replaced with the provided record.
+ * If no preexisting record exists, the provided record is always inserted.
+ *
+ * It is considered corruption if a merge is requested and not possible. Given
+ * the sparse inode alignment constraints, this should never happen.
+ */
+STATIC int
+xfs_inobt_insert_sprec(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ int btnum,
+ struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */
+ bool merge) /* merge or replace */
+{
+ struct xfs_btree_cur *cur;
+ int error;
+ int i;
+ struct xfs_inobt_rec_incore rec;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum);
+
+ /* the new record is pre-aligned so we know where to look */
+ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ /* if nothing there, insert a new record and return */
+ if (i == 0) {
+ error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
+ nrec->ir_count, nrec->ir_freecount,
+ nrec->ir_free, &i);
+ if (error)
+ goto error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+
+ goto out;
+ }
+
+ /*
+ * A record exists at this startino. Merge or replace the record
+ * depending on what we've been asked to do.
+ */
+ if (merge) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+
+ /*
+ * This should never fail. If we have coexisting records that
+ * cannot merge, something is seriously wrong.
+ */
+ if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+
+ trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
+ rec.ir_holemask, nrec->ir_startino,
+ nrec->ir_holemask);
+
+ /* merge to nrec to output the updated record */
+ __xfs_inobt_rec_merge(nrec, &rec);
+
+ trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
+ nrec->ir_holemask);
+
+ error = xfs_inobt_rec_check_count(mp, nrec);
+ if (error)
+ goto error;
+ }
+
+ error = xfs_inobt_update(cur, nrec);
+ if (error)
+ goto error;
+
+out:
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Allocate new inodes in the allocation group specified by agbp. Returns 0 if
+ * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
+ * the caller knows it can try another AG, a hard -ENOSPC when over the maximum
+ * inode count threshold, or the usual negative error code for other errors.
+ */
+STATIC int
+xfs_ialloc_ag_alloc(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
+{
+ struct xfs_agi *agi;
+ struct xfs_alloc_arg args;
+ int error;
+ xfs_agino_t newino; /* new first inode's number */
+ xfs_agino_t newlen; /* new number of inodes */
+ int isaligned = 0; /* inode allocation at stripe */
+ /* unit boundary */
+ /* init. to full chunk */
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_ino_geometry *igeo = M_IGEO(tp->t_mountp);
+ uint16_t allocmask = (uint16_t) -1;
+ int do_sparse = 0;
+
+ memset(&args, 0, sizeof(args));
+ args.tp = tp;
+ args.mp = tp->t_mountp;
+ args.fsbno = NULLFSBLOCK;
+ args.oinfo = XFS_RMAP_OINFO_INODES;
+
+#ifdef DEBUG
+ /* randomly do sparse inode allocations */
+ if (xfs_has_sparseinodes(tp->t_mountp) &&
+ igeo->ialloc_min_blks < igeo->ialloc_blks)
+ do_sparse = prandom_u32_max(2);
+#endif
+
+ /*
+ * Locking will ensure that we don't have two callers in here
+ * at one time.
+ */
+ newlen = igeo->ialloc_inos;
+ if (igeo->maxicount &&
+ percpu_counter_read_positive(&args.mp->m_icount) + newlen >
+ igeo->maxicount)
+ return -ENOSPC;
+ args.minlen = args.maxlen = igeo->ialloc_blks;
+ /*
+ * First try to allocate inodes contiguous with the last-allocated
+ * chunk of inodes. If the filesystem is striped, this will fill
+ * an entire stripe unit with inodes.
+ */
+ agi = agbp->b_addr;
+ newino = be32_to_cpu(agi->agi_newino);
+ args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
+ igeo->ialloc_blks;
+ if (do_sparse)
+ goto sparse_alloc;
+ if (likely(newino != NULLAGINO &&
+ (args.agbno < be32_to_cpu(agi->agi_length)))) {
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
+ args.type = XFS_ALLOCTYPE_THIS_BNO;
+ args.prod = 1;
+
+ /*
+ * We need to take into account alignment here to ensure that
+ * we don't modify the free list if we fail to have an exact
+ * block. If we don't have an exact match, and every other
+ * allocation attempt fails, we'll end up cancelling
+ * a dirty transaction and shutting down.
+ *
+ * For an exact allocation, alignment must be 1,
+ * however we need to take cluster alignment into account when
+ * fixing up the freelist. Use the minalignslop field to
+ * indicate that extra blocks might be required for alignment,
+ * but not to use them in the actual exact allocation.
+ */
+ args.alignment = 1;
+ args.minalignslop = igeo->cluster_align - 1;
+
+ /* Allow space for the inode btree to split. */
+ args.minleft = igeo->inobt_maxlevels;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+
+ /*
+ * This request might have dirtied the transaction if the AG can
+ * satisfy the request, but the exact block was not available.
+ * If the allocation did fail, subsequent requests will relax
+ * the exact agbno requirement and increase the alignment
+ * instead. It is critical that the total size of the request
+ * (len + alignment + slop) does not increase from this point
+ * on, so reset minalignslop to ensure it is not included in
+ * subsequent requests.
+ */
+ args.minalignslop = 0;
+ }
+
+ if (unlikely(args.fsbno == NULLFSBLOCK)) {
+ /*
+ * Set the alignment for the allocation.
+ * If stripe alignment is turned on then align at stripe unit
+ * boundary.
+ * If the cluster size is smaller than a filesystem block
+ * then we're doing I/O for inodes in filesystem block size
+ * pieces, so don't need alignment anyway.
+ */
+ isaligned = 0;
+ if (igeo->ialloc_align) {
+ ASSERT(!xfs_has_noalign(args.mp));
+ args.alignment = args.mp->m_dalign;
+ isaligned = 1;
+ } else
+ args.alignment = igeo->cluster_align;
+ /*
+ * Need to figure out where to allocate the inode blocks.
+ * Ideally they should be spaced out through the AG.
+ * For now, just allocate blocks up front.
+ */
+ args.agbno = be32_to_cpu(agi->agi_root);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
+ /*
+ * Allocate a fixed-size extent of inodes.
+ */
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.prod = 1;
+ /*
+ * Allow space for the inode btree to split.
+ */
+ args.minleft = igeo->inobt_maxlevels;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+
+ /*
+ * If stripe alignment is turned on, then try again with cluster
+ * alignment.
+ */
+ if (isaligned && args.fsbno == NULLFSBLOCK) {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.agbno = be32_to_cpu(agi->agi_root);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
+ args.alignment = igeo->cluster_align;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+
+ /*
+ * Finally, try a sparse allocation if the filesystem supports it and
+ * the sparse allocation length is smaller than a full chunk.
+ */
+ if (xfs_has_sparseinodes(args.mp) &&
+ igeo->ialloc_min_blks < igeo->ialloc_blks &&
+ args.fsbno == NULLFSBLOCK) {
+sparse_alloc:
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.agbno = be32_to_cpu(agi->agi_root);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
+ args.alignment = args.mp->m_sb.sb_spino_align;
+ args.prod = 1;
+
+ args.minlen = igeo->ialloc_min_blks;
+ args.maxlen = args.minlen;
+
+ /*
+ * The inode record will be aligned to full chunk size. We must
+ * prevent sparse allocation from AG boundaries that result in
+ * invalid inode records, such as records that start at agbno 0
+ * or extend beyond the AG.
+ *
+ * Set min agbno to the first aligned, non-zero agbno and max to
+ * the last aligned agbno that is at least one full chunk from
+ * the end of the AG.
+ */
+ args.min_agbno = args.mp->m_sb.sb_inoalignmt;
+ args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
+ args.mp->m_sb.sb_inoalignmt) -
+ igeo->ialloc_blks;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+
+ newlen = XFS_AGB_TO_AGINO(args.mp, args.len);
+ ASSERT(newlen <= XFS_INODES_PER_CHUNK);
+ allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
+ }
+
+ if (args.fsbno == NULLFSBLOCK)
+ return -EAGAIN;
+
+ ASSERT(args.len == args.minlen);
+
+ /*
+ * Stamp and write the inode buffers.
+ *
+ * Seed the new inode cluster with a random generation number. This
+ * prevents short-term reuse of generation numbers if a chunk is
+ * freed and then immediately reallocated. We use random numbers
+ * rather than a linear progression to prevent the next generation
+ * number from being easily guessable.
+ */
+ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno,
+ args.agbno, args.len, get_random_u32());
+
+ if (error)
+ return error;
+ /*
+ * Convert the results.
+ */
+ newino = XFS_AGB_TO_AGINO(args.mp, args.agbno);
+
+ if (xfs_inobt_issparse(~allocmask)) {
+ /*
+ * We've allocated a sparse chunk. Align the startino and mask.
+ */
+ xfs_align_sparse_ino(args.mp, &newino, &allocmask);
+
+ rec.ir_startino = newino;
+ rec.ir_holemask = ~allocmask;
+ rec.ir_count = newlen;
+ rec.ir_freecount = newlen;
+ rec.ir_free = XFS_INOBT_ALL_FREE;
+
+ /*
+ * Insert the sparse record into the inobt and allow for a merge
+ * if necessary. If a merge does occur, rec is updated to the
+ * merged record.
+ */
+ error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag,
+ XFS_BTNUM_INO, &rec, true);
+ if (error == -EFSCORRUPTED) {
+ xfs_alert(args.mp,
+ "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
+ XFS_AGINO_TO_INO(args.mp, pag->pag_agno,
+ rec.ir_startino),
+ rec.ir_holemask, rec.ir_count);
+ xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+ if (error)
+ return error;
+
+ /*
+ * We can't merge the part we've just allocated into the finobt
+ * the way we did for the inobt, due to finobt semantics. The
+ * original record may or may not exist independent of whether
+ * physical inodes exist in this sparse chunk.
+ *
+ * We must update the finobt record based on the inobt record.
+ * rec contains the fully merged and up to date inobt record
+ * from the previous call. Set merge false to replace any
+ * existing record with this one.
+ */
+ if (xfs_has_finobt(args.mp)) {
+ error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag,
+ XFS_BTNUM_FINO, &rec, false);
+ if (error)
+ return error;
+ }
+ } else {
+ /* full chunk - insert new records to both btrees */
+ error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen,
+ XFS_BTNUM_INO);
+ if (error)
+ return error;
+
+ if (xfs_has_finobt(args.mp)) {
+ error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino,
+ newlen, XFS_BTNUM_FINO);
+ if (error)
+ return error;
+ }
+ }
+
+ /*
+ * Update AGI counts and newino.
+ */
+ be32_add_cpu(&agi->agi_count, newlen);
+ be32_add_cpu(&agi->agi_freecount, newlen);
+ pag->pagi_freecount += newlen;
+ pag->pagi_count += newlen;
+ agi->agi_newino = cpu_to_be32(newino);
+
+ /*
+ * Log allocation group header fields
+ */
+ xfs_ialloc_log_agi(tp, agbp,
+ XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
+ /*
+ * Modify/log superblock values for inode count and inode free count.
+ */
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
+ return 0;
+}
+
+/*
+ * Try to retrieve the next record to the left/right from the current one.
+ */
+STATIC int
+xfs_ialloc_next_rec(
+ struct xfs_btree_cur *cur,
+ xfs_inobt_rec_incore_t *rec,
+ int *done,
+ int left)
+{
+ int error;
+ int i;
+
+ if (left)
+ error = xfs_btree_decrement(cur, 0, &i);
+ else
+ error = xfs_btree_increment(cur, 0, &i);
+
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+STATIC int
+xfs_ialloc_get_rec(
+ struct xfs_btree_cur *cur,
+ xfs_agino_t agino,
+ xfs_inobt_rec_incore_t *rec,
+ int *done)
+{
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/*
+ * Return the offset of the first free inode in the record. If the inode chunk
+ * is sparsely allocated, we convert the record holemask to inode granularity
+ * and mask off the unallocated regions from the inode free mask.
+ */
+STATIC int
+xfs_inobt_first_free_inode(
+ struct xfs_inobt_rec_incore *rec)
+{
+ xfs_inofree_t realfree;
+
+ /* if there are no holes, return the first available offset */
+ if (!xfs_inobt_issparse(rec->ir_holemask))
+ return xfs_lowbit64(rec->ir_free);
+
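+	/*
+	 * Convert the holemask to an inode-granular allocation mask and mask
+	 * off free bits that fall within holes, so we only return an inode
+	 * that is physically allocated.
+	 */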
+ realfree = xfs_inobt_irec_to_allocmask(rec);
+ realfree &= rec->ir_free;
+
+ return xfs_lowbit64(realfree);
+}
+
+/*
+ * Allocate an inode using the inobt-only algorithm.
+ */
+STATIC int
+xfs_dialloc_ag_inobt(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_ino_t parent,
+ xfs_ino_t *inop)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = agbp->b_addr;
+ xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
+ xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
+ struct xfs_btree_cur *cur, *tcur;
+ struct xfs_inobt_rec_incore rec, trec;
+ xfs_ino_t ino;
+ int error;
+ int offset;
+ int i, j;
+ int searchdistance = 10;
+
+ ASSERT(pag->pagi_init);
+ ASSERT(pag->pagi_inodeok);
+ ASSERT(pag->pagi_freecount > 0);
+
+ restart_pagno:
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
+ /*
+ * If pagino is 0 (this is the root inode allocation) use newino.
+ * This must work because we've just allocated some.
+ */
+ if (!pagino)
+ pagino = be32_to_cpu(agi->agi_newino);
+
+ error = xfs_check_agi_freecount(cur);
+ if (error)
+ goto error0;
+
+ /*
+ * If in the same AG as the parent, try to get near the parent.
+ */
+ if (pagno == pag->pag_agno) {
+ int doneleft; /* done, to the left */
+ int doneright; /* done, to the right */
+
+ error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ error = xfs_inobt_get_rec(cur, &rec, &j);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, j != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ if (rec.ir_freecount > 0) {
+ /*
+ * Found a free inode in the same chunk
+ * as the parent, done.
+ */
+ goto alloc_inode;
+ }
+
+ /*
+ * In the same AG as parent, but parent's chunk is full.
+ */
+
+ /* duplicate the cursor, search left & right simultaneously */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+
+ /*
+ * Skip to last blocks looked up if same parent inode.
+ */
+ if (pagino != NULLAGINO &&
+ pag->pagl_pagino == pagino &&
+ pag->pagl_leftrec != NULLAGINO &&
+ pag->pagl_rightrec != NULLAGINO) {
+ error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
+ &trec, &doneleft);
+ if (error)
+ goto error1;
+
+ error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
+ &rec, &doneright);
+ if (error)
+ goto error1;
+ } else {
+ /* search left with tcur, back up 1 record */
+ error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+ if (error)
+ goto error1;
+
+ /* search right with cur, go forward 1 record. */
+ error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+ if (error)
+ goto error1;
+ }
+
+ /*
+ * Loop until we find an inode chunk with a free inode.
+ */
+ while (--searchdistance > 0 && (!doneleft || !doneright)) {
+ int useleft; /* using left inode chunk this time */
+
+ /* figure out the closer block if both are valid. */
+ if (!doneleft && !doneright) {
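+				/*
+				 * Compare the distance from the parent to the
+				 * last inode of the left chunk against the
+				 * distance to the first inode of the right
+				 * chunk, and walk towards whichever is closer.
+				 */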
+ useleft = pagino -
+ (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
+ rec.ir_startino - pagino;
+ } else {
+ useleft = !doneleft;
+ }
+
+ /* free inodes to the left? */
+ if (useleft && trec.ir_freecount) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ cur = tcur;
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ rec = trec;
+ goto alloc_inode;
+ }
+
+ /* free inodes to the right? */
+ if (!useleft && rec.ir_freecount) {
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto alloc_inode;
+ }
+
+ /* get next record to check */
+ if (useleft) {
+ error = xfs_ialloc_next_rec(tcur, &trec,
+ &doneleft, 1);
+ } else {
+ error = xfs_ialloc_next_rec(cur, &rec,
+ &doneright, 0);
+ }
+ if (error)
+ goto error1;
+ }
+
+ if (searchdistance <= 0) {
+ /*
+ * Not in range - save last search
+ * location and allocate a new inode
+ */
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+
+ } else {
+ /*
+			 * We've reached the end of the btree. Because we only
+			 * search a small chunk of the btree on each pass, there
+			 * are obviously free inodes closer to the parent inode
+			 * than where we are now. Restart the search.
+ */
+ pag->pagl_pagino = NULLAGINO;
+ pag->pagl_leftrec = NULLAGINO;
+ pag->pagl_rightrec = NULLAGINO;
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ goto restart_pagno;
+ }
+ }
+
+ /*
+ * In a different AG from the parent.
+ * See if the most recently allocated block has any free.
+ */
+ if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+ error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+ XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error0;
+
+ if (i == 1) {
+ error = xfs_inobt_get_rec(cur, &rec, &j);
+ if (error)
+ goto error0;
+
+ if (j == 1 && rec.ir_freecount > 0) {
+ /*
+ * The last chunk allocated in the group
+ * still has a free inode.
+ */
+ goto alloc_inode;
+ }
+ }
+ }
+
+ /*
+ * None left in the last group, search the whole AG
+ */
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ for (;;) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if (rec.ir_freecount > 0)
+ break;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ }
+
+alloc_inode:
+ offset = xfs_inobt_first_free_inode(&rec);
+ ASSERT(offset >= 0);
+ ASSERT(offset < XFS_INODES_PER_CHUNK);
+ ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
+ goto error0;
+ be32_add_cpu(&agi->agi_freecount, -1);
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+ pag->pagi_freecount--;
+
+ error = xfs_check_agi_freecount(cur);
+ if (error)
+ goto error0;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+ *inop = ino;
+ return 0;
+error1:
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+error0:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Use the free inode btree to allocate an inode based on distance from the
+ * parent. Note that the provided cursor may be deleted and replaced.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_near(
+ xfs_agino_t pagino,
+ struct xfs_btree_cur **ocur,
+ struct xfs_inobt_rec_incore *rec)
+{
+ struct xfs_btree_cur *lcur = *ocur; /* left search cursor */
+ struct xfs_btree_cur *rcur; /* right search cursor */
+ struct xfs_inobt_rec_incore rrec;
+ int error;
+ int i, j;
+
+ error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+ return error;
+
+ if (i == 1) {
+ error = xfs_inobt_get_rec(lcur, rec, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+
+ /*
+ * See if we've landed in the parent inode record. The finobt
+ * only tracks chunks with at least one free inode, so record
+ * existence is enough.
+ */
+ if (pagino >= rec->ir_startino &&
+ pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
+ return 0;
+ }
+
+ error = xfs_btree_dup_cursor(lcur, &rcur);
+ if (error)
+ return error;
+
+ error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
+ if (error)
+ goto error_rcur;
+ if (j == 1) {
+ error = xfs_inobt_get_rec(rcur, &rrec, &j);
+ if (error)
+ goto error_rcur;
+ if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) {
+ error = -EFSCORRUPTED;
+ goto error_rcur;
+ }
+ }
+
+ if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) {
+ error = -EFSCORRUPTED;
+ goto error_rcur;
+ }
+ if (i == 1 && j == 1) {
+ /*
+ * Both the left and right records are valid. Choose the closer
+ * inode chunk to the target.
+ */
+ if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
+ (rrec.ir_startino - pagino)) {
+ *rec = rrec;
+ xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+ *ocur = rcur;
+ } else {
+ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+ }
+ } else if (j == 1) {
+ /* only the right record is valid */
+ *rec = rrec;
+ xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+ *ocur = rcur;
+ } else if (i == 1) {
+ /* only the left record is valid */
+ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+ }
+
+ return 0;
+
+error_rcur:
+ xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Use the free inode btree to find a free inode based on a newino hint. If
+ * the hint is NULL, find the first free inode in the AG.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_newino(
+ struct xfs_agi *agi,
+ struct xfs_btree_cur *cur,
+ struct xfs_inobt_rec_incore *rec)
+{
+ int error;
+ int i;
+
+ if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+ error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+ XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+ return 0;
+ }
+ }
+
+ /*
+ * Find the first inode available in the AG.
+ */
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/*
+ * Update the inobt based on a modification made to the finobt. Also ensure that
+ * the records from both trees are equivalent post-modification.
+ */
+STATIC int
+xfs_dialloc_ag_update_inobt(
+ struct xfs_btree_cur *cur, /* inobt cursor */
+ struct xfs_inobt_rec_incore *frec, /* finobt record */
+ int offset) /* inode offset */
+{
+ struct xfs_inobt_rec_incore rec;
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+ ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ rec.ir_free != frec->ir_free ||
+ rec.ir_freecount != frec->ir_freecount))
+ return -EFSCORRUPTED;
+
+ return xfs_inobt_update(cur, &rec);
+}
+
+/*
+ * Allocate an inode using the free inode btree, if available. Otherwise, fall
+ * back to the inobt search algorithm.
+ *
+ * The caller selected an AG for us, and made sure that free inodes are
+ * available.
+ */
+static int
+xfs_dialloc_ag(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_ino_t parent,
+ xfs_ino_t *inop)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = agbp->b_addr;
+ xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
+ xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
+ struct xfs_btree_cur *cur; /* finobt cursor */
+ struct xfs_btree_cur *icur; /* inobt cursor */
+ struct xfs_inobt_rec_incore rec;
+ xfs_ino_t ino;
+ int error;
+ int offset;
+ int i;
+
+ if (!xfs_has_finobt(mp))
+ return xfs_dialloc_ag_inobt(tp, agbp, pag, parent, inop);
+
+ /*
+ * If pagino is 0 (this is the root inode allocation) use newino.
+ * This must work because we've just allocated some.
+ */
+ if (!pagino)
+ pagino = be32_to_cpu(agi->agi_newino);
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO);
+
+ error = xfs_check_agi_freecount(cur);
+ if (error)
+ goto error_cur;
+
+ /*
+ * The search algorithm depends on whether we're in the same AG as the
+ * parent. If so, find the closest available inode to the parent. If
+ * not, consider the agi hint or find the first free inode in the AG.
+ */
+ if (pag->pag_agno == pagno)
+ error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
+ else
+ error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
+ if (error)
+ goto error_cur;
+
+ offset = xfs_inobt_first_free_inode(&rec);
+ ASSERT(offset >= 0);
+ ASSERT(offset < XFS_INODES_PER_CHUNK);
+ ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
+
+ /*
+ * Modify or remove the finobt record.
+ */
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+ if (rec.ir_freecount)
+ error = xfs_inobt_update(cur, &rec);
+ else
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto error_cur;
+
+ /*
+ * The finobt has now been updated appropriately. We haven't updated the
+ * agi and superblock yet, so we can create an inobt cursor and validate
+ * the original freecount. If all is well, make the equivalent update to
+ * the inobt using the finobt record and offset information.
+ */
+ icur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
+
+ error = xfs_check_agi_freecount(icur);
+ if (error)
+ goto error_icur;
+
+ error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
+ if (error)
+ goto error_icur;
+
+ /*
+ * Both trees have now been updated. We must update the perag and
+ * superblock before we can check the freecount for each btree.
+ */
+ be32_add_cpu(&agi->agi_freecount, -1);
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+ pag->pagi_freecount--;
+
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+
+ error = xfs_check_agi_freecount(icur);
+ if (error)
+ goto error_icur;
+ error = xfs_check_agi_freecount(cur);
+ if (error)
+ goto error_icur;
+
+ xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ *inop = ino;
+ return 0;
+
+error_icur:
+ xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
+error_cur:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+static int
+xfs_dialloc_roll(
+ struct xfs_trans **tpp,
+ struct xfs_buf *agibp)
+{
+ struct xfs_trans *tp = *tpp;
+ struct xfs_dquot_acct *dqinfo;
+ int error;
+
+ /*
+	 * Hold on to the agibp across the commit so no other allocation can
+ * come in and take the free inodes we just allocated for our caller.
+ */
+ xfs_trans_bhold(tp, agibp);
+
+ /*
+ * We want the quota changes to be associated with the next transaction,
+ * NOT this one. So, detach the dqinfo from this and attach it to the
+ * next transaction.
+ */
+ dqinfo = tp->t_dqinfo;
+ tp->t_dqinfo = NULL;
+
+ error = xfs_trans_roll(&tp);
+
+	/* Re-attach the quota info that we detached from the previous transaction. */
+ tp->t_dqinfo = dqinfo;
+
+ /*
+ * Join the buffer even on commit error so that the buffer is released
+ * when the caller cancels the transaction and doesn't have to handle
+ * this error case specially.
+ */
+ xfs_trans_bjoin(tp, agibp);
+ *tpp = tp;
+ return error;
+}
+
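+/*
+ * Select the next AG to try for a new inode by advancing the per-mount AG
+ * rotor under its lock, wrapping back to AG 0 after the highest AG that may
+ * contain inodes. This round-robins new directories across the filesystem.
+ */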
+static xfs_agnumber_t
+xfs_ialloc_next_ag(
+ xfs_mount_t *mp)
+{
+ xfs_agnumber_t agno;
+
+ spin_lock(&mp->m_agirotor_lock);
+ agno = mp->m_agirotor;
+ if (++mp->m_agirotor >= mp->m_maxagi)
+ mp->m_agirotor = 0;
+ spin_unlock(&mp->m_agirotor_lock);
+
+ return agno;
+}
+
+static bool
+xfs_dialloc_good_ag(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ umode_t mode,
+ int flags,
+ bool ok_alloc)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_extlen_t ineed;
+ xfs_extlen_t longest = 0;
+ int needspace;
+ int error;
+
+ if (!pag->pagi_inodeok)
+ return false;
+
+ if (!pag->pagi_init) {
+ error = xfs_ialloc_read_agi(pag, tp, NULL);
+ if (error)
+ return false;
+ }
+
+ if (pag->pagi_freecount)
+ return true;
+ if (!ok_alloc)
+ return false;
+
+ if (!pag->pagf_init) {
+ error = xfs_alloc_read_agf(pag, tp, flags, NULL);
+ if (error)
+ return false;
+ }
+
+ /*
+ * Check that there is enough free space for the file plus a chunk of
+ * inodes if we need to allocate some. If this is the first pass across
+ * the AGs, take into account the potential space needed for alignment
+ * of inode chunks when checking the longest contiguous free space in
+ * the AG - this prevents us from getting ENOSPC because we have free
+ * space larger than ialloc_blks but alignment constraints prevent us
+ * from using it.
+ *
+ * If we can't find an AG with space for full alignment slack to be
+ * taken into account, we must be near ENOSPC in all AGs. Hence we
+ * don't include alignment for the second pass and so if we fail
+ * allocation due to alignment issues then it is most likely a real
+ * ENOSPC condition.
+ *
+ * XXX(dgc): this calculation is now bogus thanks to the per-ag
+ * reservations that xfs_alloc_fix_freelist() now does via
+ * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will
+ * be more than large enough for the check below to succeed, but
+ * xfs_alloc_space_available() will fail because of the non-zero
+ * metadata reservation and hence we won't actually be able to allocate
+ * more inodes in this AG. We do soooo much unnecessary work near ENOSPC
+ * because of this.
+ */
+ ineed = M_IGEO(mp)->ialloc_min_blks;
+ if (flags && ineed > 1)
+ ineed += M_IGEO(mp)->cluster_align;
+ longest = pag->pagf_longest;
+ if (!longest)
+ longest = pag->pagf_flcount > 0;
+ needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
+
+ if (pag->pagf_freeblks < needspace + ineed || longest < ineed)
+ return false;
+ return true;
+}
+
+static int
+xfs_dialloc_try_ag(
+ struct xfs_trans **tpp,
+ struct xfs_perag *pag,
+ xfs_ino_t parent,
+ xfs_ino_t *new_ino,
+ bool ok_alloc)
+{
+ struct xfs_buf *agbp;
+ xfs_ino_t ino;
+ int error;
+
+ /*
+ * Then read in the AGI buffer and recheck with the AGI buffer
+ * lock held.
+ */
+ error = xfs_ialloc_read_agi(pag, *tpp, &agbp);
+ if (error)
+ return error;
+
+ if (!pag->pagi_freecount) {
+ if (!ok_alloc) {
+ error = -EAGAIN;
+ goto out_release;
+ }
+
+ error = xfs_ialloc_ag_alloc(*tpp, agbp, pag);
+ if (error < 0)
+ goto out_release;
+
+ /*
+ * We successfully allocated space for an inode cluster in this
+ * AG. Roll the transaction so that we can allocate one of the
+ * new inodes.
+ */
+ ASSERT(pag->pagi_freecount > 0);
+ error = xfs_dialloc_roll(tpp, agbp);
+ if (error)
+ goto out_release;
+ }
+
+ /* Allocate an inode in the found AG */
+ error = xfs_dialloc_ag(*tpp, agbp, pag, parent, &ino);
+ if (!error)
+ *new_ino = ino;
+ return error;
+
+out_release:
+ xfs_trans_brelse(*tpp, agbp);
+ return error;
+}
+
+/*
+ * Allocate an on-disk inode.
+ *
+ * Mode is used to tell whether the new inode is a directory and hence where to
+ * locate it. The on-disk inode that is allocated will be returned in @new_ino
+ * on success, otherwise an error will be set to indicate the failure (e.g.
+ * -ENOSPC).
+ */
+int
+xfs_dialloc(
+ struct xfs_trans **tpp,
+ xfs_ino_t parent,
+ umode_t mode,
+ xfs_ino_t *new_ino)
+{
+ struct xfs_mount *mp = (*tpp)->t_mountp;
+ xfs_agnumber_t agno;
+ int error = 0;
+ xfs_agnumber_t start_agno;
+ struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ bool ok_alloc = true;
+ int flags;
+ xfs_ino_t ino;
+
+ /*
+ * Directories, symlinks, and regular files frequently allocate at least
+	 * one block, so factor in that potential expansion when we examine whether
+ * an AG has enough space for file creation.
+ */
+ if (S_ISDIR(mode))
+ start_agno = xfs_ialloc_next_ag(mp);
+ else {
+ start_agno = XFS_INO_TO_AGNO(mp, parent);
+ if (start_agno >= mp->m_maxagi)
+ start_agno = 0;
+ }
+
+ /*
+ * If we have already hit the ceiling of inode blocks then clear
+ * ok_alloc so we scan all available agi structures for a free
+ * inode.
+ *
+	 * Read a rough value of mp->m_icount via percpu_counter_read_positive,
+	 * which sacrifices precision but improves performance.
+ */
+ if (igeo->maxicount &&
+ percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos
+ > igeo->maxicount) {
+ ok_alloc = false;
+ }
+
+ /*
+ * Loop until we find an allocation group that either has free inodes
+ * or in which we can allocate some inodes. Iterate through the
+ * allocation groups upward, wrapping at the end.
+ */
+ agno = start_agno;
+ flags = XFS_ALLOC_FLAG_TRYLOCK;
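+	/*
+	 * The first pass around the AGs uses trylock semantics when reading
+	 * the AGF so that we don't stall behind busy AGs. If we wrap all the
+	 * way back to start_agno without success, clear the flag and make one
+	 * more blocking pass before giving up with -ENOSPC.
+	 */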
+ for (;;) {
+ pag = xfs_perag_get(mp, agno);
+ if (xfs_dialloc_good_ag(*tpp, pag, mode, flags, ok_alloc)) {
+ error = xfs_dialloc_try_ag(tpp, pag, parent,
+ &ino, ok_alloc);
+ if (error != -EAGAIN)
+ break;
+ }
+
+ if (xfs_is_shutdown(mp)) {
+ error = -EFSCORRUPTED;
+ break;
+ }
+ if (++agno == mp->m_maxagi)
+ agno = 0;
+ if (agno == start_agno) {
+ if (!flags) {
+ error = -ENOSPC;
+ break;
+ }
+ flags = 0;
+ }
+ xfs_perag_put(pag);
+ }
+
+ if (!error)
+ *new_ino = ino;
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Free the blocks of an inode chunk. We must consider that the inode chunk
+ * might be sparse and only free the regions that are allocated as part of the
+ * chunk.
+ */
+STATIC void
+xfs_difree_inode_chunk(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_inobt_rec_incore *rec)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp,
+ rec->ir_startino);
+ int startidx, endidx;
+ int nextbit;
+ xfs_agblock_t agbno;
+ int contigblk;
+ DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
+
+ if (!xfs_inobt_issparse(rec->ir_holemask)) {
+ /* not sparse, calculate extent info directly */
+ xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
+ M_IGEO(mp)->ialloc_blks,
+ &XFS_RMAP_OINFO_INODES);
+ return;
+ }
+
+ /* holemask is only 16-bits (fits in an unsigned long) */
+ ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
+ holemask[0] = rec->ir_holemask;
+
+ /*
+ * Find contiguous ranges of zeroes (i.e., allocated regions) in the
+ * holemask and convert the start/end index of each range to an extent.
+ * We start with the start and end index both pointing at the first 0 in
+ * the mask.
+ */
+ startidx = endidx = find_first_zero_bit(holemask,
+ XFS_INOBT_HOLEMASK_BITS);
+ nextbit = startidx + 1;
+ while (startidx < XFS_INOBT_HOLEMASK_BITS) {
+ nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
+ nextbit);
+ /*
+ * If the next zero bit is contiguous, update the end index of
+ * the current range and continue.
+ */
+ if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
+ nextbit == endidx + 1) {
+ endidx = nextbit;
+ goto next;
+ }
+
+ /*
+ * nextbit is not contiguous with the current end index. Convert
+ * the current start/end to an extent and add it to the free
+ * list.
+ */
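+		/*
+		 * Each holemask bit covers XFS_INODES_PER_HOLEMASK_BIT
+		 * inodes; scale by that and divide by the inodes per block
+		 * to turn bit indices into a block offset and length.
+		 */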
+ agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
+ mp->m_sb.sb_inopblock;
+ contigblk = ((endidx - startidx + 1) *
+ XFS_INODES_PER_HOLEMASK_BIT) /
+ mp->m_sb.sb_inopblock;
+
+ ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
+ ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
+ xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
+ contigblk, &XFS_RMAP_OINFO_INODES);
+
+ /* reset range to current bit and carry on... */
+ startidx = endidx = nextbit;
+
+next:
+ nextbit++;
+ }
+}
+
+STATIC int
+xfs_difree_inobt(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ struct xfs_icluster *xic,
+ struct xfs_inobt_rec_incore *orec)
+{
+ struct xfs_agi *agi = agbp->b_addr;
+ struct xfs_btree_cur *cur;
+ struct xfs_inobt_rec_incore rec;
+ int ilen;
+ int error;
+ int i;
+ int off;
+
+ ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+ ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
+
+ /*
+ * Initialize the cursor.
+ */
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
+
+ error = xfs_check_agi_freecount(cur);
+ if (error)
+ goto error0;
+
+ /*
+ * Look for the entry describing this inode.
+ */
+ if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
+ xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
+ __func__, error);
+ goto error0;
+ }
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
+ __func__, error);
+ goto error0;
+ }
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * Get the offset in the inode chunk.
+ */
+ off = agino - rec.ir_startino;
+ ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
+ ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
+ /*
+ * Mark the inode free & increment the count.
+ */
+ rec.ir_free |= XFS_INOBT_MASK(off);
+ rec.ir_freecount++;
+
+ /*
+ * When an inode chunk is free, it becomes eligible for removal. Don't
+ * remove the chunk if the block size is large enough for multiple inode
+ * chunks (that might not be free).
+ */
+ if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
+ mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
+ struct xfs_perag *pag = agbp->b_pag;
+
+ xic->deleted = true;
+ xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
+ rec.ir_startino);
+ xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
+
+ /*
+ * Remove the inode cluster from the AGI B+Tree, adjust the
+ * AGI and Superblock inode counts, and mark the disk space
+ * to be freed when the transaction is committed.
+ */
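+		/*
+		 * The chunk is now entirely free, so ir_freecount covers every
+		 * inode in the record. The inode being freed here was never
+		 * counted in the AGI/superblock free counts, hence the
+		 * "ilen - 1" adjustments below.
+		 */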
+ ilen = rec.ir_freecount;
+ be32_add_cpu(&agi->agi_count, -ilen);
+ be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
+ pag->pagi_freecount -= ilen - 1;
+ pag->pagi_count -= ilen;
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
+
+ if ((error = xfs_btree_delete(cur, &i))) {
+ xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
+ __func__, error);
+ goto error0;
+ }
+
+ xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
+ } else {
+ xic->deleted = false;
+
+ error = xfs_inobt_update(cur, &rec);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
+ __func__, error);
+ goto error0;
+ }
+
+ /*
+ * Change the inode free counts and log the ag/sb changes.
+ */
+ be32_add_cpu(&agi->agi_freecount, 1);
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+ pag->pagi_freecount++;
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
+ }
+
+ error = xfs_check_agi_freecount(cur);
+ if (error)
+ goto error0;
+
+ *orec = rec;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+
+error0:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Free an inode in the free inode btree.
+ */
+STATIC int
+xfs_difree_finobt(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ struct xfs_inobt_rec_incore *ibtrec) /* inobt record */
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_inobt_rec_incore rec;
+ int offset = agino - ibtrec->ir_startino;
+ int error;
+ int i;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO);
+
+ error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ if (i == 0) {
+ /*
+ * If the record does not exist in the finobt, we must have just
+ * freed an inode in a previously fully allocated chunk. If not,
+ * something is out of sync.
+ */
+ if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+
+ error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
+ ibtrec->ir_count,
+ ibtrec->ir_freecount,
+ ibtrec->ir_free, &i);
+ if (error)
+ goto error;
+ ASSERT(i == 1);
+
+ goto out;
+ }
+
+ /*
+ * Read and update the existing record. We could just copy the ibtrec
+ * across here, but that would defeat the purpose of having redundant
+ * metadata. By making the modifications independently, we can catch
+ * corruptions that we wouldn't see if we just copied from one record
+ * to another.
+ */
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+
+ rec.ir_free |= XFS_INOBT_MASK(offset);
+ rec.ir_freecount++;
+
+ if (XFS_IS_CORRUPT(mp,
+ rec.ir_free != ibtrec->ir_free ||
+ rec.ir_freecount != ibtrec->ir_freecount)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+
+ /*
+ * The content of inobt records should always match between the inobt
+ * and finobt. The lifecycle of records in the finobt is different from
+ * the inobt in that the finobt only tracks records with at least one
+ * free inode. Hence, if all of the inodes are free and we aren't
+ * keeping inode chunks permanently on disk, remove the record.
+ * Otherwise, update the record with the new information.
+ *
+	 * Note that we currently can't free chunks when the block size is large
+	 * enough to hold multiple chunks. Leave the finobt record in place so it
+	 * stays in sync with the inobt.
+ */
+ if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
+ mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto error;
+ ASSERT(i == 1);
+ } else {
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
+ goto error;
+ }
+
+out:
+ error = xfs_check_agi_freecount(cur);
+ if (error)
+ goto error;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Free a disk inode. This carefully avoids touching the incore inode; all
+ * incore manipulations are the caller's responsibility. The on-disk inode is
+ * not changed by this operation; only the btree (free inode mask) is changed.
+ */
+int
+xfs_difree(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_ino_t inode,
+ struct xfs_icluster *xic)
+{
+ /* REFERENCED */
+ xfs_agblock_t agbno; /* block number containing inode */
+ struct xfs_buf *agbp; /* buffer for allocation group header */
+ xfs_agino_t agino; /* allocation group inode number */
+ int error; /* error return value */
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inobt_rec_incore rec;/* btree record */
+
+ /*
+ * Break up inode number into its components.
+ */
+ if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) {
+ xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).",
+ __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno);
+ ASSERT(0);
+ return -EINVAL;
+ }
+ agino = XFS_INO_TO_AGINO(mp, inode);
+ if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
+ xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+ __func__, (unsigned long long)inode,
+ (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
+ ASSERT(0);
+ return -EINVAL;
+ }
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ if (agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
+ __func__, agbno, mp->m_sb.sb_agblocks);
+ ASSERT(0);
+ return -EINVAL;
+ }
+ /*
+ * Get the allocation group header.
+ */
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ /*
+ * Fix up the inode allocation btree.
+ */
+ error = xfs_difree_inobt(mp, tp, agbp, pag, agino, xic, &rec);
+ if (error)
+ goto error0;
+
+ /*
+ * Fix up the free inode btree.
+ */
+ if (xfs_has_finobt(mp)) {
+ error = xfs_difree_finobt(mp, tp, agbp, pag, agino, &rec);
+ if (error)
+ goto error0;
+ }
+
+ return 0;
+
+error0:
+ return error;
+}
+
+STATIC int
+xfs_imap_lookup(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ xfs_agblock_t agbno,
+ xfs_agblock_t *chunk_agbno,
+ xfs_agblock_t *offset_agbno,
+ int flags)
+{
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp;
+ int error;
+ int i;
+
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
+ if (error) {
+ xfs_alert(mp,
+ "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
+ __func__, error, pag->pag_agno);
+ return error;
+ }
+
+ /*
+	 * Look up the inode record for the given agino. If the record cannot be
+ * found, then it's an invalid inode number and we should abort. Once
+ * we have a record, we need to ensure it contains the inode number
+ * we are looking up.
+ */
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
+ if (!error) {
+ if (i)
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (!error && i == 0)
+ error = -EINVAL;
+ }
+
+ xfs_trans_brelse(tp, agbp);
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ return error;
+
+ /* check that the returned record contains the required inode */
+ if (rec.ir_startino > agino ||
+ rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino)
+ return -EINVAL;
+
+ /* for untrusted inodes check it is allocated first */
+ if ((flags & XFS_IGET_UNTRUSTED) &&
+ (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
+ return -EINVAL;
+
+ *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
+ *offset_agbno = agbno - *chunk_agbno;
+ return 0;
+}
+
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ */
+int
+xfs_imap(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_ino_t ino, /* inode to locate */
+ struct xfs_imap *imap, /* location map structure */
+ uint flags) /* flags for inode btree lookup */
+{
+ xfs_agblock_t agbno; /* block number of inode in the alloc group */
+ xfs_agino_t agino; /* inode number within alloc group */
+ xfs_agblock_t chunk_agbno; /* first block in inode chunk */
+ xfs_agblock_t cluster_agbno; /* first block in inode cluster */
+ int error; /* error code */
+ int offset; /* index of inode in its buffer */
+ xfs_agblock_t offset_agbno; /* blks from chunk start to inode */
+ struct xfs_perag *pag;
+
+ ASSERT(ino != NULLFSINO);
+
+ /*
+ * Split up the inode number into its parts.
+ */
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
+ agino = XFS_INO_TO_AGINO(mp, ino);
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ if (!pag || agbno >= mp->m_sb.sb_agblocks ||
+ ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
+ error = -EINVAL;
+#ifdef DEBUG
+ /*
+ * Don't output diagnostic information for untrusted inodes
+ * as they can be invalid without implying corruption.
+ */
+ if (flags & XFS_IGET_UNTRUSTED)
+ goto out_drop;
+ if (!pag) {
+ xfs_alert(mp,
+ "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
+ __func__, XFS_INO_TO_AGNO(mp, ino),
+ mp->m_sb.sb_agcount);
+ }
+ if (agbno >= mp->m_sb.sb_agblocks) {
+ xfs_alert(mp,
+ "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
+ __func__, (unsigned long long)agbno,
+ (unsigned long)mp->m_sb.sb_agblocks);
+ }
+ if (pag && ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
+ xfs_alert(mp,
+ "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
+ __func__, ino,
+ XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
+ }
+ xfs_stack_trace();
+#endif /* DEBUG */
+ goto out_drop;
+ }
+
+ /*
+ * For bulkstat and handle lookups, we have an untrusted inode number
+ * that we have to verify is valid. We cannot do this just by reading
+ * the inode buffer as it may have been unlinked and removed leaving
+ * inodes in stale state on disk. Hence we have to do a btree lookup
+ * in all cases where an untrusted inode number is passed.
+ */
+ if (flags & XFS_IGET_UNTRUSTED) {
+ error = xfs_imap_lookup(mp, tp, pag, agino, agbno,
+ &chunk_agbno, &offset_agbno, flags);
+ if (error)
+ goto out_drop;
+ goto out_map;
+ }
+
+ /*
+	 * If the inode cluster size is the same as the blocksize or
+	 * smaller, we can get to the buffer with simple arithmetic.
+ */
+ if (M_IGEO(mp)->blocks_per_cluster == 1) {
+ offset = XFS_INO_TO_OFFSET(mp, ino);
+ ASSERT(offset < mp->m_sb.sb_inopblock);
+
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno);
+ imap->im_len = XFS_FSB_TO_BB(mp, 1);
+ imap->im_boffset = (unsigned short)(offset <<
+ mp->m_sb.sb_inodelog);
+ error = 0;
+ goto out_drop;
+ }
+
+ /*
+	 * If the inode chunks are aligned then use simple maths to
+	 * find the location. Otherwise we have to do a btree lookup
+	 * to find it.
+ */
+ if (M_IGEO(mp)->inoalign_mask) {
+ offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
+ chunk_agbno = agbno - offset_agbno;
+ } else {
+ error = xfs_imap_lookup(mp, tp, pag, agino, agbno,
+ &chunk_agbno, &offset_agbno, flags);
+ if (error)
+ goto out_drop;
+ }
+
+out_map:
+ ASSERT(agbno >= chunk_agbno);
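+	/*
+	 * Round the chunk-relative offset down to the start of its inode
+	 * cluster, then work out the inode's index within that cluster so
+	 * we can compute its byte offset into the cluster buffer.
+	 */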
+ cluster_agbno = chunk_agbno +
+ ((offset_agbno / M_IGEO(mp)->blocks_per_cluster) *
+ M_IGEO(mp)->blocks_per_cluster);
+ offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
+ XFS_INO_TO_OFFSET(mp, ino);
+
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno);
+ imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
+ imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
+
+ /*
+	 * If the inode number maps to a block outside the bounds of the
+	 * file system then return an error rather than calling read_buf
+	 * and panicking when we get an error from the driver.
+ */
+ if ((imap->im_blkno + imap->im_len) >
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+ xfs_alert(mp,
+ "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
+ __func__, (unsigned long long) imap->im_blkno,
+ (unsigned long long) imap->im_len,
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+ error = -EINVAL;
+ goto out_drop;
+ }
+ error = 0;
+out_drop:
+ if (pag)
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Log specified fields for the ag hdr (inode section). The growth of the agi
+ * structure over time requires that we interpret the buffer as two logical
+ * regions delineated by the end of the unlinked list. This is due to the size
+ * of the hash table and its location in the middle of the agi.
+ *
+ * For example, a request to log a field before agi_unlinked and a field after
+ * agi_unlinked could cause us to log the entire hash table and use an excessive
+ * amount of log space. To avoid this behavior, log the region up through
+ * agi_unlinked in one call and the region after agi_unlinked through the end of
+ * the structure in another.
+ */
+void
+xfs_ialloc_log_agi(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint32_t fields)
+{
+ int first; /* first byte number */
+ int last; /* last byte number */
+ static const short offsets[] = { /* field starting offsets */
+ /* keep in sync with bit definitions */
+ offsetof(xfs_agi_t, agi_magicnum),
+ offsetof(xfs_agi_t, agi_versionnum),
+ offsetof(xfs_agi_t, agi_seqno),
+ offsetof(xfs_agi_t, agi_length),
+ offsetof(xfs_agi_t, agi_count),
+ offsetof(xfs_agi_t, agi_root),
+ offsetof(xfs_agi_t, agi_level),
+ offsetof(xfs_agi_t, agi_freecount),
+ offsetof(xfs_agi_t, agi_newino),
+ offsetof(xfs_agi_t, agi_dirino),
+ offsetof(xfs_agi_t, agi_unlinked),
+ offsetof(xfs_agi_t, agi_free_root),
+ offsetof(xfs_agi_t, agi_free_level),
+ offsetof(xfs_agi_t, agi_iblocks),
+ sizeof(xfs_agi_t)
+ };
+#ifdef DEBUG
+ struct xfs_agi *agi = bp->b_addr;
+
+ ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+#endif
+
+ /*
+ * Compute byte offsets for the first and last fields in the first
+ * region and log the agi buffer. This only logs up through
+ * agi_unlinked.
+ */
+ if (fields & XFS_AGI_ALL_BITS_R1) {
+ xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
+ &first, &last);
+ xfs_trans_log_buf(tp, bp, first, last);
+ }
+
+ /*
+ * Mask off the bits in the first region and calculate the first and
+ * last field offsets for any bits in the second region.
+ */
+ fields &= ~XFS_AGI_ALL_BITS_R1;
+ if (fields) {
+ xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
+ &first, &last);
+ xfs_trans_log_buf(tp, bp, first, last);
+ }
+}
+
+static xfs_failaddr_t
+xfs_agi_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_agi *agi = bp->b_addr;
+ int i;
+
+ if (xfs_has_crc(mp)) {
+ if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn)))
+ return __this_address;
+ }
+
+ /*
+ * Validate the magic number of the agi block.
+ */
+ if (!xfs_verify_magic(bp, agi->agi_magicnum))
+ return __this_address;
+ if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
+ return __this_address;
+
+ if (be32_to_cpu(agi->agi_level) < 1 ||
+ be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels)
+ return __this_address;
+
+ if (xfs_has_finobt(mp) &&
+ (be32_to_cpu(agi->agi_free_level) < 1 ||
+ be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels))
+ return __this_address;
+
+ /*
+	 * During growfs operations, the perag is not fully initialised, so we
+	 * can't use it for any useful checking. growfs ensures this by using
+	 * uncached buffers that don't have the perag attached, so we can
+	 * detect and avoid this problem.
+ */
+ if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
+ return __this_address;
+
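+	/*
+	 * Each unlinked list bucket head must either be empty (NULLAGINO) or
+	 * hold a value that passes the inode number verifier.
+	 */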
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO))
+ continue;
+ if (!xfs_verify_ino(mp, be32_to_cpu(agi->agi_unlinked[i])))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+static void
+xfs_agi_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (xfs_has_crc(mp) &&
+ !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_agi_verify(bp);
+ if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+static void
+xfs_agi_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_agi *agi = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_agi_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agi_buf_ops = {
+ .name = "xfs_agi",
+ .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
+ .verify_read = xfs_agi_read_verify,
+ .verify_write = xfs_agi_write_verify,
+ .verify_struct = xfs_agi_verify,
+};
+
+/*
+ * Read in the allocation group header (inode allocation section)
+ */
+int
+xfs_read_agi(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf **agibpp)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ int error;
+
+ trace_xfs_read_agi(pag->pag_mount, pag->pag_agno);
+
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops);
+ if (error)
+ return error;
+ if (tp)
+ xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF);
+
+ xfs_buf_set_ref(*agibpp, XFS_AGI_REF);
+ return 0;
+}
+
+/*
+ * Read in the agi and initialise the per-ag data. If the caller supplies a
+ * @agibpp, return the locked AGI buffer to them, otherwise release it.
+ */
+int
+xfs_ialloc_read_agi(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf **agibpp)
+{
+ struct xfs_buf *agibp;
+ struct xfs_agi *agi;
+ int error;
+
+ trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno);
+
+ error = xfs_read_agi(pag, tp, &agibp);
+ if (error)
+ return error;
+
+ agi = agibp->b_addr;
+ if (!pag->pagi_init) {
+ pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
+ pag->pagi_count = be32_to_cpu(agi->agi_count);
+ pag->pagi_init = 1;
+ }
+
+ /*
+ * It's possible for these to be out of sync if
+ * we are in the middle of a forced shutdown.
+ */
+ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
+ xfs_is_shutdown(pag->pag_mount));
+ if (agibpp)
+ *agibpp = agibp;
+ else
+ xfs_trans_brelse(tp, agibp);
+ return 0;
+}
+
+/* Is there an inode record covering a given range of inode numbers? */
+int
+xfs_ialloc_has_inode_record(
+ struct xfs_btree_cur *cur,
+ xfs_agino_t low,
+ xfs_agino_t high,
+ bool *exists)
+{
+ struct xfs_inobt_rec_incore irec;
+ xfs_agino_t agino;
+ uint16_t holemask;
+ int has_record;
+ int i;
+ int error;
+
+ *exists = false;
+ error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record);
+ while (error == 0 && has_record) {
+ error = xfs_inobt_get_rec(cur, &irec, &has_record);
+ if (error || irec.ir_startino > high)
+ break;
+
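+		/*
+		 * Walk the record's holemask one bit at a time; each bit
+		 * covers XFS_INODES_PER_HOLEMASK_BIT inodes, and a set bit
+		 * marks a hole with no inodes allocated in that subrange.
+		 */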
+ agino = irec.ir_startino;
+ holemask = irec.ir_holemask;
+ for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1,
+ i++, agino += XFS_INODES_PER_HOLEMASK_BIT) {
+ if (holemask & 1)
+ continue;
+ if (agino + XFS_INODES_PER_HOLEMASK_BIT > low &&
+ agino <= high) {
+ *exists = true;
+ return 0;
+ }
+ }
+
+ error = xfs_btree_increment(cur, 0, &has_record);
+ }
+ return error;
+}
+
+/* Is there an inode record covering a given extent? */
+int
+xfs_ialloc_has_inodes_at_extent(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool *exists)
+{
+ xfs_agino_t low;
+ xfs_agino_t high;
+
+ low = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
+ high = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;
+
+ return xfs_ialloc_has_inode_record(cur, low, high, exists);
+}
+
+struct xfs_ialloc_count_inodes {
+ xfs_agino_t count;
+ xfs_agino_t freecount;
+};
+
+/* Record inode counts across all inobt records. */
+STATIC int
+xfs_ialloc_count_inodes_rec(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore irec;
+ struct xfs_ialloc_count_inodes *ci = priv;
+
+ xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
+ ci->count += irec.ir_count;
+ ci->freecount += irec.ir_freecount;
+
+ return 0;
+}
+
+/* Count allocated and free inodes under an inobt. */
+int
+xfs_ialloc_count_inodes(
+ struct xfs_btree_cur *cur,
+ xfs_agino_t *count,
+ xfs_agino_t *freecount)
+{
+ struct xfs_ialloc_count_inodes ci = {0};
+ int error;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_INO);
+ error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
+ if (error)
+ return error;
+
+ *count = ci.count;
+ *freecount = ci.freecount;
+ return 0;
+}
+
+/*
+ * Initialize inode-related geometry information.
+ *
+ * Compute the inode btree min and max levels and set maxicount.
+ *
+ * Set the inode cluster size. This may still be overridden by the file
+ * system block size if it is larger than the chosen cluster size.
+ *
+ * For v5 filesystems, scale the cluster size with the inode size to keep a
+ * constant ratio of inode per cluster buffer, but only if mkfs has set the
+ * inode alignment value appropriately for larger cluster sizes.
+ *
+ * Then compute the inode cluster alignment information.
+ */
+void
+xfs_ialloc_setup_geometry(
+ struct xfs_mount *mp)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ uint64_t icount;
+ uint inodes;
+
+ igeo->new_diflags2 = 0;
+ if (xfs_has_bigtime(mp))
+ igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
+ if (xfs_has_large_extent_counts(mp))
+ igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;
+
+ /* Compute inode btree geometry. */
+ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
+ igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
+ igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+ igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
+ igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;
+
+ igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
+ sbp->sb_inopblock);
+ igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog;
+
+ if (sbp->sb_spino_align)
+ igeo->ialloc_min_blks = sbp->sb_spino_align;
+ else
+ igeo->ialloc_min_blks = igeo->ialloc_blks;
+
+ /* Compute and fill in value of m_ino_geo.inobt_maxlevels. */
+ inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
+ igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
+ inodes);
+ ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk());
+
+ /*
+ * Set the maximum inode count for this filesystem, being careful not
+ * to use obviously garbage sb_inopblog/sb_inopblock values. Regular
+ * users should never get here due to failing sb verification, but
+ * certain users (xfs_db) need to be usable even with corrupt metadata.
+ */
+ if (sbp->sb_imax_pct && igeo->ialloc_blks) {
+ /*
+ * Make sure the maximum inode count is a multiple
+ * of the units we allocate inodes in.
+ */
+ icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+ do_div(icount, 100);
+ do_div(icount, igeo->ialloc_blks);
+ igeo->maxicount = XFS_FSB_TO_INO(mp,
+ icount * igeo->ialloc_blks);
+ } else {
+ igeo->maxicount = 0;
+ }
+
+ /*
+	 * Compute the desired size of an inode cluster buffer, which
+ * starts at 8K and (on v5 filesystems) scales up with larger inode
+ * sizes.
+ *
+ * Preserve the desired inode cluster size because the sparse inodes
+ * feature uses that desired size (not the actual size) to compute the
+ * sparse inode alignment. The mount code validates this value, so we
+ * cannot change the behavior.
+ */
+ igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
+ if (xfs_has_v3inodes(mp)) {
+ int new_size = igeo->inode_cluster_size_raw;
+
+ new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
+ if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
+ igeo->inode_cluster_size_raw = new_size;
+ }
+
+ /* Calculate inode cluster ratios. */
+ if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize)
+ igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp,
+ igeo->inode_cluster_size_raw);
+ else
+ igeo->blocks_per_cluster = 1;
+ igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
+ igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);
+
+ /* Calculate inode cluster alignment. */
+ if (xfs_has_align(mp) &&
+ mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
+ igeo->cluster_align = mp->m_sb.sb_inoalignmt;
+ else
+ igeo->cluster_align = 1;
+ igeo->inoalign_mask = igeo->cluster_align - 1;
+ igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align);
+
+ /*
+ * If we are using stripe alignment, check whether
+ * the stripe unit is a multiple of the inode alignment
+ */
+ if (mp->m_dalign && igeo->inoalign_mask &&
+ !(mp->m_dalign & igeo->inoalign_mask))
+ igeo->ialloc_align = mp->m_dalign;
+ else
+ igeo->ialloc_align = 0;
+}
+
+/* Compute the location of the root directory inode that is laid out by mkfs. */
+xfs_ino_t
+xfs_ialloc_calc_rootino(
+ struct xfs_mount *mp,
+ int sunit)
+{
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agblock_t first_bno;
+
+ /*
+ * Pre-calculate the geometry of AG 0. We know what it looks like
+ * because libxfs knows how to create allocation groups now.
+ *
+ * first_bno is the first block in which mkfs could possibly have
+ * allocated the root directory inode, once we factor in the metadata
+ * that mkfs formats before it. Namely, the four AG headers...
+ */
+ first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
+
+ /* ...the two free space btree roots... */
+ first_bno += 2;
+
+ /* ...the inode btree root... */
+ first_bno += 1;
+
+ /* ...the initial AGFL... */
+ first_bno += xfs_alloc_min_freelist(mp, NULL);
+
+ /* ...the free inode btree root... */
+ if (xfs_has_finobt(mp))
+ first_bno++;
+
+ /* ...the reverse mapping btree root... */
+ if (xfs_has_rmapbt(mp))
+ first_bno++;
+
+ /* ...the reference count btree... */
+ if (xfs_has_reflink(mp))
+ first_bno++;
+
+ /*
+ * ...and the log, if it is allocated in the first allocation group.
+ *
+ * This can happen with filesystems that only have a single
+ * allocation group, or very odd geometries created by old mkfs
+ * versions on very small filesystems.
+ */
+ if (xfs_ag_contains_log(mp, 0))
+ first_bno += mp->m_sb.sb_logblocks;
+
+ /*
+ * Now round first_bno up to whatever allocation alignment is given
+ * by the filesystem or was passed in.
+ */
+ if (xfs_has_dalign(mp) && igeo->ialloc_align > 0)
+ first_bno = roundup(first_bno, sunit);
+ else if (xfs_has_align(mp) &&
+ mp->m_sb.sb_inoalignmt > 1)
+ first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);
+
+ return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
+}
+
+/*
+ * Ensure there are no sparse inode clusters that cross the new EOAG.
+ *
+ * This is a no-op for non-spinode filesystems since clusters are always fully
+ * allocated and checking the bnobt suffices. However, a spinode filesystem
+ * could have a record where the upper inodes are free blocks. If those blocks
+ * were removed from the filesystem, the inode record would extend beyond EOAG,
+ * which will be flagged as corruption.
+ */
+int
+xfs_ialloc_check_shrink(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf *agibp,
+ xfs_agblock_t new_length)
+{
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_btree_cur *cur;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ xfs_agino_t agino = XFS_AGB_TO_AGINO(mp, new_length);
+ int has;
+ int error;
+
+ if (!xfs_has_sparseinodes(mp))
+ return 0;
+
+ pag = xfs_perag_get(mp, agno);
+ cur = xfs_inobt_init_cursor(mp, tp, agibp, pag, XFS_BTNUM_INO);
+
+ /* Look up the inobt record that would correspond to the new EOFS. */
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
+ if (error || !has)
+ goto out;
+
+ error = xfs_inobt_get_rec(cur, &rec, &has);
+ if (error)
+ goto out;
+
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* If the record covers inodes that would be beyond EOFS, bail out. */
+ if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
+ error = -ENOSPC;
+ goto out;
+ }
+out:
+ xfs_btree_del_cursor(cur, error);
+ xfs_perag_put(pag);
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
new file mode 100644
index 000000000..9bbbca6ac
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_IALLOC_H__
+#define __XFS_IALLOC_H__
+
+struct xfs_buf;
+struct xfs_dinode;
+struct xfs_imap;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_btree_cur;
+
+/* Move inodes in clusters of this size */
+#define XFS_INODE_BIG_CLUSTER_SIZE 8192
+
+struct xfs_icluster {
+ bool deleted; /* record is deleted */
+ xfs_ino_t first_ino; /* first inode number */
+ uint64_t alloc; /* inode phys. allocation bitmap for
+ * sparse chunks */
+};
+
+/*
+ * Make an inode pointer out of the buffer/offset.
+ */
+static inline struct xfs_dinode *
+xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
+{
+ return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog);
+}
+
+/*
+ * Allocate an inode on disk. Mode is used to tell whether the new inode will
+ * need space, and whether it is a directory.
+ */
+int xfs_dialloc(struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode,
+ xfs_ino_t *new_ino);
+
+int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag,
+ xfs_ino_t ino, struct xfs_icluster *ifree);
+
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ */
+int
+xfs_imap(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_ino_t ino, /* inode to locate */
+ struct xfs_imap *imap, /* location map structure */
+ uint flags); /* flags for inode btree lookup */
+
+/*
+ * Log specified fields for the ag hdr (inode section)
+ */
+void
+xfs_ialloc_log_agi(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *bp, /* allocation group header buffer */
+ uint32_t fields); /* bitmask of fields to log */
+
+int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf **agibpp);
+int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf **agibpp);
+
+/*
+ * Lookup a record by ino in the btree given by cur.
+ */
+int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
+ xfs_lookup_t dir, int *stat);
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+ xfs_inobt_rec_incore_t *rec, int *stat);
+
+/*
+ * Inode chunk initialisation routine
+ */
+int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct list_head *buffer_list, int icount,
+ xfs_agnumber_t agno, xfs_agblock_t agbno,
+ xfs_agblock_t length, unsigned int gen);
+
+
+union xfs_btree_rec;
+void xfs_inobt_btrec_to_irec(struct xfs_mount *mp,
+ const union xfs_btree_rec *rec,
+ struct xfs_inobt_rec_incore *irec);
+int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur,
+ xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
+int xfs_ialloc_has_inode_record(struct xfs_btree_cur *cur, xfs_agino_t low,
+ xfs_agino_t high, bool *exists);
+int xfs_ialloc_count_inodes(struct xfs_btree_cur *cur, xfs_agino_t *count,
+ xfs_agino_t *freecount);
+int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask,
+ uint8_t count, int32_t freecount, xfs_inofree_t free,
+ int *stat);
+
+int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
+void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
+xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit);
+
+int xfs_ialloc_check_shrink(struct xfs_trans *tp, xfs_agnumber_t agno,
+ struct xfs_buf *agibp, xfs_agblock_t new_length);
+
+#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
new file mode 100644
index 000000000..8c83e2657
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -0,0 +1,833 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+
+static struct kmem_cache *xfs_inobt_cur_cache;
+
+STATIC int
+xfs_inobt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return M_IGEO(cur->bc_mp)->inobt_mnr[level != 0];
+}
+
+STATIC struct xfs_btree_cur *
+xfs_inobt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum);
+}
+
+STATIC void
+xfs_inobt_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *nptr,
+ int inc) /* level change */
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
+
+ agi->agi_root = nptr->s;
+ be32_add_cpu(&agi->agi_level, inc);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
+}
+
+STATIC void
+xfs_finobt_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *nptr,
+ int inc) /* level change */
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
+
+ agi->agi_free_root = nptr->s;
+ be32_add_cpu(&agi->agi_free_level, inc);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp,
+ XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
+}
+
+/* Update the inode btree block counter for this btree. */
+static inline void
+xfs_inobt_mod_blockcount(
+ struct xfs_btree_cur *cur,
+ int howmuch)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
+
+ if (!xfs_has_inobtcounts(cur->bc_mp))
+ return;
+
+ if (cur->bc_btnum == XFS_BTNUM_FINO)
+ be32_add_cpu(&agi->agi_fblocks, howmuch);
+ else if (cur->bc_btnum == XFS_BTNUM_INO)
+ be32_add_cpu(&agi->agi_iblocks, howmuch);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS);
+}
+
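+/*
+ * Allocate a single block for the inode btree, as close as possible to the
+ * hinted block, accounted against the given per-AG space reservation type.
+ */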
+STATIC int
+__xfs_inobt_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat,
+ enum xfs_ag_resv_type resv)
+{
+ xfs_alloc_arg_t args; /* block allocation args */
+ int error; /* error return value */
+ xfs_agblock_t sbno = be32_to_cpu(start->s);
+
+ memset(&args, 0, sizeof(args));
+ args.tp = cur->bc_tp;
+ args.mp = cur->bc_mp;
+ args.oinfo = XFS_RMAP_OINFO_INOBT;
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.pag->pag_agno, sbno);
+ args.minlen = 1;
+ args.maxlen = 1;
+ args.prod = 1;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.resv = resv;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+
+ if (args.fsbno == NULLFSBLOCK) {
+ *stat = 0;
+ return 0;
+ }
+ ASSERT(args.len == 1);
+
+ new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
+ *stat = 1;
+ xfs_inobt_mod_blockcount(cur, 1);
+ return 0;
+}
+
+STATIC int
+xfs_inobt_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
+}
+
+STATIC int
+xfs_finobt_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ if (cur->bc_mp->m_finobt_nores)
+ return xfs_inobt_alloc_block(cur, start, new, stat);
+ return __xfs_inobt_alloc_block(cur, start, new, stat,
+ XFS_AG_RESV_METADATA);
+}
+
+STATIC int
+__xfs_inobt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ enum xfs_ag_resv_type resv)
+{
+ xfs_inobt_mod_blockcount(cur, -1);
+ return xfs_free_extent(cur->bc_tp,
+ XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)), 1,
+ &XFS_RMAP_OINFO_INOBT, resv);
+}
+
+STATIC int
+xfs_inobt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_NONE);
+}
+
+STATIC int
+xfs_finobt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ if (cur->bc_mp->m_finobt_nores)
+ return xfs_inobt_free_block(cur, bp);
+ return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA);
+}
+
+STATIC int
+xfs_inobt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return M_IGEO(cur->bc_mp)->inobt_mxr[level != 0];
+}
+
+STATIC void
+xfs_inobt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->inobt.ir_startino = rec->inobt.ir_startino;
+}
+
+STATIC void
+xfs_inobt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ __u32 x;
+
+ x = be32_to_cpu(rec->inobt.ir_startino);
+ x += XFS_INODES_PER_CHUNK - 1;
+ key->inobt.ir_startino = cpu_to_be32(x);
+}
+
+STATIC void
+xfs_inobt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
+ if (xfs_has_sparseinodes(cur->bc_mp)) {
+ rec->inobt.ir_u.sp.ir_holemask =
+ cpu_to_be16(cur->bc_rec.i.ir_holemask);
+ rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count;
+ rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount;
+ } else {
+ /* ir_holemask/ir_count not supported on-disk */
+ rec->inobt.ir_u.f.ir_freecount =
+ cpu_to_be32(cur->bc_rec.i.ir_freecount);
+ }
+ rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
+}
+
+/*
+ * initial value of ptr for lookup
+ */
+STATIC void
+xfs_inobt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
+
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno));
+
+ ptr->s = agi->agi_root;
+}
+
+STATIC void
+xfs_finobt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
+
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno));
+ ptr->s = agi->agi_free_root;
+}
+
+STATIC int64_t
+xfs_inobt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ return (int64_t)be32_to_cpu(key->inobt.ir_startino) -
+ cur->bc_rec.i.ir_startino;
+}
+
+STATIC int64_t
+xfs_inobt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return (int64_t)be32_to_cpu(k1->inobt.ir_startino) -
+ be32_to_cpu(k2->inobt.ir_startino);
+}
+
+static xfs_failaddr_t
+xfs_inobt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ unsigned int level;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ /*
+ * During growfs operations, we can't verify the exact owner as the
+ * perag is not fully initialised and hence not attached to the buffer.
+ *
+ * Similarly, during log recovery we will have a perag structure
+ * attached, but the agi information will not yet have been initialised
+ * from the on disk AGI. We don't currently use any of this information,
+ * but beware of the landmine (i.e. need to check pag->pagi_init) if we
+ * ever do.
+ */
+ if (xfs_has_crc(mp)) {
+ fa = xfs_btree_sblock_v5hdr_verify(bp);
+ if (fa)
+ return fa;
+ }
+
+ /* level verification */
+ level = be16_to_cpu(block->bb_level);
+ if (level >= M_IGEO(mp)->inobt_maxlevels)
+ return __this_address;
+
+ return xfs_btree_sblock_verify(bp,
+ M_IGEO(mp)->inobt_mxr[level != 0]);
+}
+
+static void
+xfs_inobt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_inobt_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+
+ if (bp->b_error)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_inobt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_inobt_verify(bp);
+ if (fa) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+ xfs_btree_sblock_calc_crc(bp);
+}
+
+const struct xfs_buf_ops xfs_inobt_buf_ops = {
+ .name = "xfs_inobt",
+ .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) },
+ .verify_read = xfs_inobt_read_verify,
+ .verify_write = xfs_inobt_write_verify,
+ .verify_struct = xfs_inobt_verify,
+};
+
+const struct xfs_buf_ops xfs_finobt_buf_ops = {
+ .name = "xfs_finobt",
+ .magic = { cpu_to_be32(XFS_FIBT_MAGIC),
+ cpu_to_be32(XFS_FIBT_CRC_MAGIC) },
+ .verify_read = xfs_inobt_read_verify,
+ .verify_write = xfs_inobt_write_verify,
+ .verify_struct = xfs_inobt_verify,
+};
+
+STATIC int
+xfs_inobt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->inobt.ir_startino) <
+ be32_to_cpu(k2->inobt.ir_startino);
+}
+
+STATIC int
+xfs_inobt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
+ be32_to_cpu(r2->inobt.ir_startino);
+}
+
+static const struct xfs_btree_ops xfs_inobt_ops = {
+ .rec_len = sizeof(xfs_inobt_rec_t),
+ .key_len = sizeof(xfs_inobt_key_t),
+
+ .dup_cursor = xfs_inobt_dup_cursor,
+ .set_root = xfs_inobt_set_root,
+ .alloc_block = xfs_inobt_alloc_block,
+ .free_block = xfs_inobt_free_block,
+ .get_minrecs = xfs_inobt_get_minrecs,
+ .get_maxrecs = xfs_inobt_get_maxrecs,
+ .init_key_from_rec = xfs_inobt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_inobt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
+ .key_diff = xfs_inobt_key_diff,
+ .buf_ops = &xfs_inobt_buf_ops,
+ .diff_two_keys = xfs_inobt_diff_two_keys,
+ .keys_inorder = xfs_inobt_keys_inorder,
+ .recs_inorder = xfs_inobt_recs_inorder,
+};
+
+static const struct xfs_btree_ops xfs_finobt_ops = {
+ .rec_len = sizeof(xfs_inobt_rec_t),
+ .key_len = sizeof(xfs_inobt_key_t),
+
+ .dup_cursor = xfs_inobt_dup_cursor,
+ .set_root = xfs_finobt_set_root,
+ .alloc_block = xfs_finobt_alloc_block,
+ .free_block = xfs_finobt_free_block,
+ .get_minrecs = xfs_inobt_get_minrecs,
+ .get_maxrecs = xfs_inobt_get_maxrecs,
+ .init_key_from_rec = xfs_inobt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_inobt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
+ .key_diff = xfs_inobt_key_diff,
+ .buf_ops = &xfs_finobt_buf_ops,
+ .diff_two_keys = xfs_inobt_diff_two_keys,
+ .keys_inorder = xfs_inobt_keys_inorder,
+ .recs_inorder = xfs_inobt_recs_inorder,
+};
+
+/*
+ * Initialize a new inode btree cursor.
+ */
+static struct xfs_btree_cur *
+xfs_inobt_init_common(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum) /* ialloc or free ino btree */
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_btree_alloc_cursor(mp, tp, btnum,
+ M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
+ if (btnum == XFS_BTNUM_INO) {
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
+ cur->bc_ops = &xfs_inobt_ops;
+ } else {
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2);
+ cur->bc_ops = &xfs_finobt_ops;
+ }
+
+ if (xfs_has_crc(mp))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
+ return cur;
+}
+
+/* Create an inode btree cursor. */
+struct xfs_btree_cur *
+xfs_inobt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = agbp->b_addr;
+
+ cur = xfs_inobt_init_common(mp, tp, pag, btnum);
+ if (btnum == XFS_BTNUM_INO)
+ cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+ else
+ cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+ cur->bc_ag.agbp = agbp;
+ return cur;
+}
+
+/* Create an inode btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_inobt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_inobt_init_common(mp, NULL, pag, btnum);
+ xfs_btree_stage_afakeroot(cur, afake);
+ return cur;
+}
+
+/*
+ * Install a new inobt btree root. Caller is responsible for invalidating
+ * and freeing the old btree blocks.
+ */
+void
+xfs_inobt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agi *agi = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+ int fields;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ if (cur->bc_btnum == XFS_BTNUM_INO) {
+ fields = XFS_AGI_ROOT | XFS_AGI_LEVEL;
+ agi->agi_root = cpu_to_be32(afake->af_root);
+ agi->agi_level = cpu_to_be32(afake->af_levels);
+ if (xfs_has_inobtcounts(cur->bc_mp)) {
+ agi->agi_iblocks = cpu_to_be32(afake->af_blocks);
+ fields |= XFS_AGI_IBLOCKS;
+ }
+ xfs_ialloc_log_agi(tp, agbp, fields);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops);
+ } else {
+ fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL;
+ agi->agi_free_root = cpu_to_be32(afake->af_root);
+ agi->agi_free_level = cpu_to_be32(afake->af_levels);
+ if (xfs_has_inobtcounts(cur->bc_mp)) {
+ agi->agi_fblocks = cpu_to_be32(afake->af_blocks);
+ fields |= XFS_AGI_IBLOCKS;
+ }
+ xfs_ialloc_log_agi(tp, agbp, fields);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops);
+ }
+}
+
+/* Calculate number of records in an inode btree block. */
+static inline unsigned int
+xfs_inobt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_inobt_rec_t);
+ return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
+}
+
+/*
+ * Calculate number of records in an inobt btree block.
+ */
+int
+xfs_inobt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
+{
+ blocklen -= XFS_INOBT_BLOCK_LEN(mp);
+ return xfs_inobt_block_maxrecs(blocklen, leaf);
+}
+
+/*
+ * Maximum number of inode btree records per AG. Pretend that we can fill an
+ * entire AG completely full of inodes except for the AG headers.
+ */
+#define XFS_MAX_INODE_RECORDS \
+ ((XFS_MAX_AG_BYTES - (4 * BBSIZE)) / XFS_DINODE_MIN_SIZE) / \
+ XFS_INODES_PER_CHUNK
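+
+/*
+ * With the largest supported AG (1TB) and the smallest inode size (256
+ * bytes) this works out to roughly 2^26 records, which keeps the maximum
+ * btree heights computed below small.
+ */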
+
+/* Compute the max possible height for the inode btree. */
+static inline unsigned int
+xfs_inobt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS);
+}
+
+/* Compute the max possible height for the free inode btree. */
+static inline unsigned int
+xfs_finobt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS);
+}
+
+/* Compute the max possible height for either inode btree. */
+unsigned int
+xfs_iallocbt_maxlevels_ondisk(void)
+{
+ return max(xfs_inobt_maxlevels_ondisk(),
+ xfs_finobt_maxlevels_ondisk());
+}
+
+/*
+ * Convert the inode record holemask to an inode allocation bitmap. The inode
+ * allocation bitmap is inode granularity and specifies whether an inode is
+ * physically allocated on disk (not whether the inode is considered allocated
+ * or free by the fs).
+ *
+ * A bit value of 1 means the inode is allocated, a value of 0 means it is free.
+ */
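+/*
+ * For example, a sparse record with an ir_holemask of 0xff00 has holemask
+ * bits 8-15 set, i.e. inodes 32-63 of the chunk are not physically
+ * allocated.  Inverting the mask gives 0x00ff, and expanding each set bit
+ * to XFS_INODES_PER_HOLEMASK_BIT (4) inode bits yields an allocmask of
+ * 0x00000000ffffffff: only inodes 0-31 exist on disk.
+ */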
+uint64_t
+xfs_inobt_irec_to_allocmask(
+ struct xfs_inobt_rec_incore *rec)
+{
+ uint64_t bitmap = 0;
+ uint64_t inodespbit;
+ int nextbit;
+ uint allocbitmap;
+
+ /*
+ * The holemask has 16-bits for a 64 inode record. Therefore each
+ * holemask bit represents multiple inodes. Create a mask of bits to set
+ * in the allocmask for each holemask bit.
+ */
+ inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1;
+
+ /*
+ * Allocated inodes are represented by 0 bits in holemask. Invert the 0
+ * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask
+ * anything beyond the 16 holemask bits since this casts to a larger
+ * type.
+ */
+ allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1);
+
+ /*
+ * allocbitmap is the inverted holemask so every set bit represents
+ * allocated inodes. To expand from 16-bit holemask granularity to
+ * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target
+ * bitmap for every holemask bit.
+ */
+ nextbit = xfs_next_bit(&allocbitmap, 1, 0);
+ while (nextbit != -1) {
+ ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY));
+
+ bitmap |= (inodespbit <<
+ (nextbit * XFS_INODES_PER_HOLEMASK_BIT));
+
+ nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1);
+ }
+
+ return bitmap;
+}
+
+#if defined(DEBUG) || defined(XFS_WARN)
+/*
+ * Verify that an in-core inode record has a valid inode count.
+ */
+int
+xfs_inobt_rec_check_count(
+ struct xfs_mount *mp,
+ struct xfs_inobt_rec_incore *rec)
+{
+ int inocount = 0;
+ int nextbit = 0;
+ uint64_t allocbmap;
+ int wordsz;
+
+ wordsz = sizeof(allocbmap) / sizeof(unsigned int);
+ allocbmap = xfs_inobt_irec_to_allocmask(rec);
+
+ nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit);
+ while (nextbit != -1) {
+ inocount++;
+ nextbit = xfs_next_bit((uint *) &allocbmap, wordsz,
+ nextbit + 1);
+ }
+
+ if (inocount != rec->ir_count)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+#endif /* DEBUG || XFS_WARN */
+
+static xfs_extlen_t
+xfs_inobt_max_size(
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ xfs_agblock_t agblocks = pag->block_count;
+
+ /* Bail out if we're uninitialized, which can happen in mkfs. */
+ if (M_IGEO(mp)->inobt_mxr[0] == 0)
+ return 0;
+
+ /*
+ * The log is permanently allocated, so the space it occupies will
+ * never be available for the kinds of things that would require btree
+ * expansion. We therefore can pretend the space isn't there.
+ */
+ if (xfs_ag_contains_log(mp, pag->pag_agno))
+ agblocks -= mp->m_sb.sb_logblocks;
+
+ return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr,
+ (uint64_t)agblocks * mp->m_sb.sb_inopblock /
+ XFS_INODES_PER_CHUNK);
+}
+
+/* Read AGI and create inobt cursor. */
+int
+xfs_inobt_cur(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_btnum_t which,
+ struct xfs_btree_cur **curpp,
+ struct xfs_buf **agi_bpp)
+{
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ASSERT(*agi_bpp == NULL);
+ ASSERT(*curpp == NULL);
+
+ error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
+ if (error)
+ return error;
+
+ cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, pag, which);
+ *curpp = cur;
+ return 0;
+}
+
+static int
+xfs_inobt_count_blocks(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum,
+ xfs_extlen_t *tree_blocks)
+{
+ struct xfs_buf *agbp = NULL;
+ struct xfs_btree_cur *cur = NULL;
+ int error;
+
+ error = xfs_inobt_cur(mp, tp, pag, btnum, &cur, &agbp);
+ if (error)
+ return error;
+
+ error = xfs_btree_count_blocks(cur, tree_blocks);
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(tp, agbp);
+
+ return error;
+}
+
+/* Read finobt block count from AGI header. */
+static int
+xfs_finobt_read_blocks(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ xfs_extlen_t *tree_blocks)
+{
+ struct xfs_buf *agbp;
+ struct xfs_agi *agi;
+ int error;
+
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
+ if (error)
+ return error;
+
+ agi = agbp->b_addr;
+ *tree_blocks = be32_to_cpu(agi->agi_fblocks);
+ xfs_trans_brelse(tp, agbp);
+ return 0;
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_finobt_calc_reserves(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_extlen_t *ask,
+ xfs_extlen_t *used)
+{
+ xfs_extlen_t tree_len = 0;
+ int error;
+
+ if (!xfs_has_finobt(mp))
+ return 0;
+
+ if (xfs_has_inobtcounts(mp))
+ error = xfs_finobt_read_blocks(pag, tp, &tree_len);
+ else
+ error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO,
+ &tree_len);
+ if (error)
+ return error;
+
+ *ask += xfs_inobt_max_size(pag);
+ *used += tree_len;
+ return 0;
+}
+
+/* Calculate the inobt btree size for some records. */
+xfs_extlen_t
+xfs_iallocbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, len);
+}
+
+int __init
+xfs_inobt_init_cur_cache(void)
+{
+ xfs_inobt_cur_cache = kmem_cache_create("xfs_inobt_cur",
+ xfs_btree_cur_sizeof(xfs_inobt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_inobt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_inobt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_inobt_cur_cache);
+ xfs_inobt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
new file mode 100644
index 000000000..26451cb76
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_IALLOC_BTREE_H__
+#define __XFS_IALLOC_BTREE_H__
+
+/*
+ * Inode map on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xfs_perag;
+
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+#define XFS_INOBT_BLOCK_LEN(mp) \
+ (xfs_has_crc(((mp))) ? \
+ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_INOBT_REC_ADDR(mp, block, index) \
+ ((xfs_inobt_rec_t *) \
+ ((char *)(block) + \
+ XFS_INOBT_BLOCK_LEN(mp) + \
+ (((index) - 1) * sizeof(xfs_inobt_rec_t))))
+
+#define XFS_INOBT_KEY_ADDR(mp, block, index) \
+ ((xfs_inobt_key_t *) \
+ ((char *)(block) + \
+ XFS_INOBT_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_inobt_key_t)))
+
+#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
+ ((xfs_inobt_ptr_t *) \
+ ((char *)(block) + \
+ XFS_INOBT_BLOCK_LEN(mp) + \
+ (maxrecs) * sizeof(xfs_inobt_key_t) + \
+ ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *agbp,
+ struct xfs_perag *pag, xfs_btnum_t btnum);
+struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag,
+ xfs_btnum_t btnum);
+extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
+
+/* ir_holemask to inode allocation bitmap conversion */
+uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *);
+
+#if defined(DEBUG) || defined(XFS_WARN)
+int xfs_inobt_rec_check_count(struct xfs_mount *,
+ struct xfs_inobt_rec_incore *);
+#else
+#define xfs_inobt_rec_check_count(mp, rec) 0
+#endif /* DEBUG || XFS_WARN */
+
+int xfs_finobt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used);
+extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct xfs_perag *pag, xfs_btnum_t btnum,
+ struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp);
+
+void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+
+unsigned int xfs_iallocbt_maxlevels_ondisk(void);
+
+int __init xfs_inobt_init_cur_cache(void);
+void xfs_inobt_destroy_cur_cache(void);
+
+#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
new file mode 100644
index 000000000..773cf4349
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -0,0 +1,1050 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2017 Christoph Hellwig.
+ */
+
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trace.h"
+
+/*
+ * In-core extent record layout:
+ *
+ * +-------+----------------------------+
+ * | 00:53 | all 54 bits of startoff |
+ * | 54:63 | low 10 bits of startblock |
+ * +-------+----------------------------+
+ * | 00:20 | all 21 bits of length |
+ * | 21 | unwritten extent bit |
+ * | 22:63 | high 42 bits of startblock |
+ * +-------+----------------------------+
+ */
+#define XFS_IEXT_STARTOFF_MASK xfs_mask64lo(BMBT_STARTOFF_BITLEN)
+#define XFS_IEXT_LENGTH_MASK xfs_mask64lo(BMBT_BLOCKCOUNT_BITLEN)
+#define XFS_IEXT_STARTBLOCK_MASK xfs_mask64lo(BMBT_STARTBLOCK_BITLEN)
+
+struct xfs_iext_rec {
+ uint64_t lo;
+ uint64_t hi;
+};
+
+/*
+ * Given that the length can't be zero, only an empty hi value indicates an
+ * unused record.
+ */
+static bool xfs_iext_rec_is_empty(struct xfs_iext_rec *rec)
+{
+ return rec->hi == 0;
+}
+
+static inline void xfs_iext_rec_clear(struct xfs_iext_rec *rec)
+{
+ rec->lo = 0;
+ rec->hi = 0;
+}
+
+static void
+xfs_iext_set(
+ struct xfs_iext_rec *rec,
+ struct xfs_bmbt_irec *irec)
+{
+ ASSERT((irec->br_startoff & ~XFS_IEXT_STARTOFF_MASK) == 0);
+ ASSERT((irec->br_blockcount & ~XFS_IEXT_LENGTH_MASK) == 0);
+ ASSERT((irec->br_startblock & ~XFS_IEXT_STARTBLOCK_MASK) == 0);
+
+ rec->lo = irec->br_startoff & XFS_IEXT_STARTOFF_MASK;
+ rec->hi = irec->br_blockcount & XFS_IEXT_LENGTH_MASK;
+
+ rec->lo |= (irec->br_startblock << 54);
+ rec->hi |= ((irec->br_startblock & ~xfs_mask64lo(10)) << (22 - 10));
+
+ if (irec->br_state == XFS_EXT_UNWRITTEN)
+ rec->hi |= (1 << 21);
+}
+
+static void
+xfs_iext_get(
+ struct xfs_bmbt_irec *irec,
+ struct xfs_iext_rec *rec)
+{
+ irec->br_startoff = rec->lo & XFS_IEXT_STARTOFF_MASK;
+ irec->br_blockcount = rec->hi & XFS_IEXT_LENGTH_MASK;
+
+ irec->br_startblock = rec->lo >> 54;
+ irec->br_startblock |= (rec->hi & xfs_mask64hi(42)) >> (22 - 10);
+
+ if (rec->hi & (1 << 21))
+ irec->br_state = XFS_EXT_UNWRITTEN;
+ else
+ irec->br_state = XFS_EXT_NORM;
+}
+
+enum {
+ NODE_SIZE = 256,
+ KEYS_PER_NODE = NODE_SIZE / (sizeof(uint64_t) + sizeof(void *)),
+ RECS_PER_LEAF = (NODE_SIZE - (2 * sizeof(struct xfs_iext_leaf *))) /
+ sizeof(struct xfs_iext_rec),
+};
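+
+/*
+ * With the 256 byte NODE_SIZE above and 8 byte pointers (64-bit kernels)
+ * this works out to 16 keys/pointers per inner node and 15 records per
+ * leaf, with the remaining 16 bytes of a leaf holding the prev/next leaf
+ * pointers.
+ */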
+
+/*
+ * In-core extent btree block layout:
+ *
+ * There are two types of blocks in the btree: leaf and inner (non-leaf) blocks.
+ *
+ * The leaf blocks are made up of %RECS_PER_LEAF extent records, each of which
+ * contains the startoffset, blockcount, startblock and unwritten extent flag
+ * in the format described above, followed by pointers to the previous and
+ * next leaf blocks (if there are any).
+ *
+ * The inner (non-leaf) blocks first contain KEYS_PER_NODE lookup keys, followed
+ * by an equal number of pointers to the btree blocks at the next lower level.
+ *
+ * +-------+-------+-------+-------+-------+----------+----------+
+ * Leaf: | rec 1 | rec 2 | rec 3 | rec 4 | rec N | prev-ptr | next-ptr |
+ * +-------+-------+-------+-------+-------+----------+----------+
+ *
+ * +-------+-------+-------+-------+-------+-------+------+-------+
+ * Inner: | key 1 | key 2 | key 3 | key N | ptr 1 | ptr 2 | ptr3 | ptr N |
+ * +-------+-------+-------+-------+-------+-------+------+-------+
+ */
+struct xfs_iext_node {
+ uint64_t keys[KEYS_PER_NODE];
+#define XFS_IEXT_KEY_INVALID (1ULL << 63)
+ void *ptrs[KEYS_PER_NODE];
+};
+
+struct xfs_iext_leaf {
+ struct xfs_iext_rec recs[RECS_PER_LEAF];
+ struct xfs_iext_leaf *prev;
+ struct xfs_iext_leaf *next;
+};
+
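+/*
+ * if_bytes counts only the 16 byte extent records themselves, not the node
+ * and leaf overhead, so the number of extents is a plain division.
+ */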
+inline xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp)
+{
+ return ifp->if_bytes / sizeof(struct xfs_iext_rec);
+}
+
+static inline int xfs_iext_max_recs(struct xfs_ifork *ifp)
+{
+ if (ifp->if_height == 1)
+ return xfs_iext_count(ifp);
+ return RECS_PER_LEAF;
+}
+
+static inline struct xfs_iext_rec *cur_rec(struct xfs_iext_cursor *cur)
+{
+ return &cur->leaf->recs[cur->pos];
+}
+
+static inline bool xfs_iext_valid(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ if (!cur->leaf)
+ return false;
+ if (cur->pos < 0 || cur->pos >= xfs_iext_max_recs(ifp))
+ return false;
+ if (xfs_iext_rec_is_empty(cur_rec(cur)))
+ return false;
+ return true;
+}
+
+static void *
+xfs_iext_find_first_leaf(
+ struct xfs_ifork *ifp)
+{
+ struct xfs_iext_node *node = ifp->if_u1.if_root;
+ int height;
+
+ if (!ifp->if_height)
+ return NULL;
+
+ for (height = ifp->if_height; height > 1; height--) {
+ node = node->ptrs[0];
+ ASSERT(node);
+ }
+
+ return node;
+}
+
+static void *
+xfs_iext_find_last_leaf(
+ struct xfs_ifork *ifp)
+{
+ struct xfs_iext_node *node = ifp->if_u1.if_root;
+ int height, i;
+
+ if (!ifp->if_height)
+ return NULL;
+
+ for (height = ifp->if_height; height > 1; height--) {
+ for (i = 1; i < KEYS_PER_NODE; i++)
+ if (!node->ptrs[i])
+ break;
+ node = node->ptrs[i - 1];
+ ASSERT(node);
+ }
+
+ return node;
+}
+
+void
+xfs_iext_first(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ cur->pos = 0;
+ cur->leaf = xfs_iext_find_first_leaf(ifp);
+}
+
+void
+xfs_iext_last(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ int i;
+
+ cur->leaf = xfs_iext_find_last_leaf(ifp);
+ if (!cur->leaf) {
+ cur->pos = 0;
+ return;
+ }
+
+ for (i = 1; i < xfs_iext_max_recs(ifp); i++) {
+ if (xfs_iext_rec_is_empty(&cur->leaf->recs[i]))
+ break;
+ }
+ cur->pos = i - 1;
+}
+
+void
+xfs_iext_next(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ if (!cur->leaf) {
+ ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
+ xfs_iext_first(ifp, cur);
+ return;
+ }
+
+ ASSERT(cur->pos >= 0);
+ ASSERT(cur->pos < xfs_iext_max_recs(ifp));
+
+ cur->pos++;
+ if (ifp->if_height > 1 && !xfs_iext_valid(ifp, cur) &&
+ cur->leaf->next) {
+ cur->leaf = cur->leaf->next;
+ cur->pos = 0;
+ }
+}
+
+void
+xfs_iext_prev(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ if (!cur->leaf) {
+ ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
+ xfs_iext_last(ifp, cur);
+ return;
+ }
+
+ ASSERT(cur->pos >= 0);
+ ASSERT(cur->pos <= RECS_PER_LEAF);
+
+recurse:
+ do {
+ cur->pos--;
+ if (xfs_iext_valid(ifp, cur))
+ return;
+ } while (cur->pos > 0);
+
+ if (ifp->if_height > 1 && cur->leaf->prev) {
+ cur->leaf = cur->leaf->prev;
+ cur->pos = RECS_PER_LEAF;
+ goto recurse;
+ }
+}
+
+static inline int
+xfs_iext_key_cmp(
+ struct xfs_iext_node *node,
+ int n,
+ xfs_fileoff_t offset)
+{
+ if (node->keys[n] > offset)
+ return 1;
+ if (node->keys[n] < offset)
+ return -1;
+ return 0;
+}
+
+static inline int
+xfs_iext_rec_cmp(
+ struct xfs_iext_rec *rec,
+ xfs_fileoff_t offset)
+{
+ uint64_t rec_offset = rec->lo & XFS_IEXT_STARTOFF_MASK;
+ uint32_t rec_len = rec->hi & XFS_IEXT_LENGTH_MASK;
+
+ if (rec_offset > offset)
+ return 1;
+ if (rec_offset + rec_len <= offset)
+ return -1;
+ return 0;
+}
+
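+/*
+ * Walk down from the root to the node at the given level (a leaf for level 1)
+ * whose key range covers offset; returns NULL if there is no such node.
+ */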
+static void *
+xfs_iext_find_level(
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t offset,
+ int level)
+{
+ struct xfs_iext_node *node = ifp->if_u1.if_root;
+ int height, i;
+
+ if (!ifp->if_height)
+ return NULL;
+
+ for (height = ifp->if_height; height > level; height--) {
+ for (i = 1; i < KEYS_PER_NODE; i++)
+ if (xfs_iext_key_cmp(node, i, offset) > 0)
+ break;
+
+ node = node->ptrs[i - 1];
+ if (!node)
+ break;
+ }
+
+ return node;
+}
+
+static int
+xfs_iext_node_pos(
+ struct xfs_iext_node *node,
+ xfs_fileoff_t offset)
+{
+ int i;
+
+ for (i = 1; i < KEYS_PER_NODE; i++) {
+ if (xfs_iext_key_cmp(node, i, offset) > 0)
+ break;
+ }
+
+ return i - 1;
+}
+
+static int
+xfs_iext_node_insert_pos(
+ struct xfs_iext_node *node,
+ xfs_fileoff_t offset)
+{
+ int i;
+
+ for (i = 0; i < KEYS_PER_NODE; i++) {
+ if (xfs_iext_key_cmp(node, i, offset) > 0)
+ return i;
+ }
+
+ return KEYS_PER_NODE;
+}
+
+static int
+xfs_iext_node_nr_entries(
+ struct xfs_iext_node *node,
+ int start)
+{
+ int i;
+
+ for (i = start; i < KEYS_PER_NODE; i++) {
+ if (node->keys[i] == XFS_IEXT_KEY_INVALID)
+ break;
+ }
+
+ return i;
+}
+
+static int
+xfs_iext_leaf_nr_entries(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_leaf *leaf,
+ int start)
+{
+ int i;
+
+ for (i = start; i < xfs_iext_max_recs(ifp); i++) {
+ if (xfs_iext_rec_is_empty(&leaf->recs[i]))
+ break;
+ }
+
+ return i;
+}
+
+static inline uint64_t
+xfs_iext_leaf_key(
+ struct xfs_iext_leaf *leaf,
+ int n)
+{
+ return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK;
+}
+
+static void
+xfs_iext_grow(
+ struct xfs_ifork *ifp)
+{
+ struct xfs_iext_node *node = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ int i;
+
+ if (ifp->if_height == 1) {
+ struct xfs_iext_leaf *prev = ifp->if_u1.if_root;
+
+ node->keys[0] = xfs_iext_leaf_key(prev, 0);
+ node->ptrs[0] = prev;
+ } else {
+ struct xfs_iext_node *prev = ifp->if_u1.if_root;
+
+ ASSERT(ifp->if_height > 1);
+
+ node->keys[0] = prev->keys[0];
+ node->ptrs[0] = prev;
+ }
+
+ for (i = 1; i < KEYS_PER_NODE; i++)
+ node->keys[i] = XFS_IEXT_KEY_INVALID;
+
+ ifp->if_u1.if_root = node;
+ ifp->if_height++;
+}
+
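+/*
+ * The lowest offset in the subtree rooted at ptr changed from old_offset to
+ * new_offset; walk down towards ptr and update every separator key that
+ * still carries the old value.
+ */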
+static void
+xfs_iext_update_node(
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t old_offset,
+ xfs_fileoff_t new_offset,
+ int level,
+ void *ptr)
+{
+ struct xfs_iext_node *node = ifp->if_u1.if_root;
+ int height, i;
+
+ for (height = ifp->if_height; height > level; height--) {
+ for (i = 0; i < KEYS_PER_NODE; i++) {
+ if (i > 0 && xfs_iext_key_cmp(node, i, old_offset) > 0)
+ break;
+ if (node->keys[i] == old_offset)
+ node->keys[i] = new_offset;
+ }
+ node = node->ptrs[i - 1];
+ ASSERT(node);
+ }
+
+ ASSERT(node == ptr);
+}
+
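+/*
+ * Split a full inner node.  Normally the upper half of the entries moves into
+ * a newly allocated node, which is returned to the caller for insertion one
+ * level up; *nodep, *pos and *nr_entries are updated to describe whichever of
+ * the two nodes the new entry now belongs to.  For a pure append (insertion
+ * at the very end) the new node simply starts out empty.
+ */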
+static struct xfs_iext_node *
+xfs_iext_split_node(
+ struct xfs_iext_node **nodep,
+ int *pos,
+ int *nr_entries)
+{
+ struct xfs_iext_node *node = *nodep;
+ struct xfs_iext_node *new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ const int nr_move = KEYS_PER_NODE / 2;
+ int nr_keep = nr_move + (KEYS_PER_NODE & 1);
+ int i = 0;
+
+ /* for sequential append operations just spill over into the new node */
+ if (*pos == KEYS_PER_NODE) {
+ *nodep = new;
+ *pos = 0;
+ *nr_entries = 0;
+ goto done;
+ }
+
+ for (i = 0; i < nr_move; i++) {
+ new->keys[i] = node->keys[nr_keep + i];
+ new->ptrs[i] = node->ptrs[nr_keep + i];
+
+ node->keys[nr_keep + i] = XFS_IEXT_KEY_INVALID;
+ node->ptrs[nr_keep + i] = NULL;
+ }
+
+ if (*pos >= nr_keep) {
+ *nodep = new;
+ *pos -= nr_keep;
+ *nr_entries = nr_move;
+ } else {
+ *nr_entries = nr_keep;
+ }
+done:
+ for (; i < KEYS_PER_NODE; i++)
+ new->keys[i] = XFS_IEXT_KEY_INVALID;
+ return new;
+}
+
+static void
+xfs_iext_insert_node(
+ struct xfs_ifork *ifp,
+ uint64_t offset,
+ void *ptr,
+ int level)
+{
+ struct xfs_iext_node *node, *new;
+ int i, pos, nr_entries;
+
+again:
+ if (ifp->if_height < level)
+ xfs_iext_grow(ifp);
+
+ new = NULL;
+ node = xfs_iext_find_level(ifp, offset, level);
+ pos = xfs_iext_node_insert_pos(node, offset);
+ nr_entries = xfs_iext_node_nr_entries(node, pos);
+
+ ASSERT(pos >= nr_entries || xfs_iext_key_cmp(node, pos, offset) != 0);
+ ASSERT(nr_entries <= KEYS_PER_NODE);
+
+ if (nr_entries == KEYS_PER_NODE)
+ new = xfs_iext_split_node(&node, &pos, &nr_entries);
+
+ /*
+ * Update the pointers in higher levels if the first entry changes
+ * in an existing node.
+ */
+ if (node != new && pos == 0 && nr_entries > 0)
+ xfs_iext_update_node(ifp, node->keys[0], offset, level, node);
+
+ for (i = nr_entries; i > pos; i--) {
+ node->keys[i] = node->keys[i - 1];
+ node->ptrs[i] = node->ptrs[i - 1];
+ }
+ node->keys[pos] = offset;
+ node->ptrs[pos] = ptr;
+
+ if (new) {
+ offset = new->keys[0];
+ ptr = new;
+ level++;
+ goto again;
+ }
+}
+
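+/*
+ * Split a full leaf in the same way as an inner node, moving the upper half
+ * of the records into a new leaf that is linked into the prev/next chain
+ * right after the old one.  The cursor is adjusted so that it keeps pointing
+ * at the slot where the new record will be inserted.
+ */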
+static struct xfs_iext_leaf *
+xfs_iext_split_leaf(
+ struct xfs_iext_cursor *cur,
+ int *nr_entries)
+{
+ struct xfs_iext_leaf *leaf = cur->leaf;
+ struct xfs_iext_leaf *new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ const int nr_move = RECS_PER_LEAF / 2;
+ int nr_keep = nr_move + (RECS_PER_LEAF & 1);
+ int i;
+
+ /* for sequential append operations just spill over into the new node */
+ if (cur->pos == RECS_PER_LEAF) {
+ cur->leaf = new;
+ cur->pos = 0;
+ *nr_entries = 0;
+ goto done;
+ }
+
+ for (i = 0; i < nr_move; i++) {
+ new->recs[i] = leaf->recs[nr_keep + i];
+ xfs_iext_rec_clear(&leaf->recs[nr_keep + i]);
+ }
+
+ if (cur->pos >= nr_keep) {
+ cur->leaf = new;
+ cur->pos -= nr_keep;
+ *nr_entries = nr_move;
+ } else {
+ *nr_entries = nr_keep;
+ }
+done:
+ if (leaf->next)
+ leaf->next->prev = new;
+ new->next = leaf->next;
+ new->prev = leaf;
+ leaf->next = new;
+ return new;
+}
+
+static void
+xfs_iext_alloc_root(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ ASSERT(ifp->if_bytes == 0);
+
+ ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS);
+ ifp->if_height = 1;
+
+ /* now that we have a node step into it */
+ cur->leaf = ifp->if_u1.if_root;
+ cur->pos = 0;
+}
+
+static void
+xfs_iext_realloc_root(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ int64_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec);
+ void *new;
+
+ /* account for the prev/next pointers */
+ if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
+ new_size = NODE_SIZE;
+
+ new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL);
+ memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
+ ifp->if_u1.if_root = new;
+ cur->leaf = new;
+}
+
+/*
+ * Increment the sequence counter on extent tree changes. If we are on a COW
+ * fork, this allows the writeback code to skip looking for a COW extent if the
+ * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the
+ * sequence counter is seen before the modifications to the extent tree itself
+ * take effect.
+ */
+static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp)
+{
+ WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
+}
+
+void
+xfs_iext_insert(
+ struct xfs_inode *ip,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *irec,
+ int state)
+{
+ struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
+ xfs_fileoff_t offset = irec->br_startoff;
+ struct xfs_iext_leaf *new = NULL;
+ int nr_entries, i;
+
+ xfs_iext_inc_seq(ifp);
+
+ if (ifp->if_height == 0)
+ xfs_iext_alloc_root(ifp, cur);
+ else if (ifp->if_height == 1)
+ xfs_iext_realloc_root(ifp, cur);
+
+ nr_entries = xfs_iext_leaf_nr_entries(ifp, cur->leaf, cur->pos);
+ ASSERT(nr_entries <= RECS_PER_LEAF);
+ ASSERT(cur->pos >= nr_entries ||
+ xfs_iext_rec_cmp(cur_rec(cur), irec->br_startoff) != 0);
+
+ if (nr_entries == RECS_PER_LEAF)
+ new = xfs_iext_split_leaf(cur, &nr_entries);
+
+ /*
+ * Update the pointers in higher levels if the first entry changes
+ * in an existing node.
+ */
+ if (cur->leaf != new && cur->pos == 0 && nr_entries > 0) {
+ xfs_iext_update_node(ifp, xfs_iext_leaf_key(cur->leaf, 0),
+ offset, 1, cur->leaf);
+ }
+
+ for (i = nr_entries; i > cur->pos; i--)
+ cur->leaf->recs[i] = cur->leaf->recs[i - 1];
+ xfs_iext_set(cur_rec(cur), irec);
+ ifp->if_bytes += sizeof(struct xfs_iext_rec);
+
+ trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
+
+ if (new)
+ xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2);
+}
+
+static struct xfs_iext_node *
+xfs_iext_rebalance_node(
+ struct xfs_iext_node *parent,
+ int *pos,
+ struct xfs_iext_node *node,
+ int nr_entries)
+{
+ /*
+ * If the neighbouring nodes are completely full, or have different
+ * parents, we might never be able to merge our node, and will only
+ * delete it once the number of entries hits zero.
+ */
+ if (nr_entries == 0)
+ return node;
+
+ if (*pos > 0) {
+ struct xfs_iext_node *prev = parent->ptrs[*pos - 1];
+ int nr_prev = xfs_iext_node_nr_entries(prev, 0), i;
+
+ if (nr_prev + nr_entries <= KEYS_PER_NODE) {
+ for (i = 0; i < nr_entries; i++) {
+ prev->keys[nr_prev + i] = node->keys[i];
+ prev->ptrs[nr_prev + i] = node->ptrs[i];
+ }
+ return node;
+ }
+ }
+
+ if (*pos + 1 < xfs_iext_node_nr_entries(parent, *pos)) {
+ struct xfs_iext_node *next = parent->ptrs[*pos + 1];
+ int nr_next = xfs_iext_node_nr_entries(next, 0), i;
+
+ if (nr_entries + nr_next <= KEYS_PER_NODE) {
+ /*
+ * Merge the next node into this node so that we don't
+ * have to do an additional update of the keys in the
+ * higher levels.
+ */
+ for (i = 0; i < nr_next; i++) {
+ node->keys[nr_entries + i] = next->keys[i];
+ node->ptrs[nr_entries + i] = next->ptrs[i];
+ }
+
+ ++*pos;
+ return next;
+ }
+ }
+
+ return NULL;
+}
+
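+/*
+ * Remove the pointer to victim from its parent node at level 2 and free it,
+ * then merge or free nodes further up the tree as they drop below half full,
+ * collapsing the root once it is down to a single entry.
+ */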
+static void
+xfs_iext_remove_node(
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t offset,
+ void *victim)
+{
+ struct xfs_iext_node *node, *parent;
+ int level = 2, pos, nr_entries, i;
+
+ ASSERT(level <= ifp->if_height);
+ node = xfs_iext_find_level(ifp, offset, level);
+ pos = xfs_iext_node_pos(node, offset);
+again:
+ ASSERT(node->ptrs[pos]);
+ ASSERT(node->ptrs[pos] == victim);
+ kmem_free(victim);
+
+ nr_entries = xfs_iext_node_nr_entries(node, pos) - 1;
+ offset = node->keys[0];
+ for (i = pos; i < nr_entries; i++) {
+ node->keys[i] = node->keys[i + 1];
+ node->ptrs[i] = node->ptrs[i + 1];
+ }
+ node->keys[nr_entries] = XFS_IEXT_KEY_INVALID;
+ node->ptrs[nr_entries] = NULL;
+
+ if (pos == 0 && nr_entries > 0) {
+ xfs_iext_update_node(ifp, offset, node->keys[0], level, node);
+ offset = node->keys[0];
+ }
+
+ if (nr_entries >= KEYS_PER_NODE / 2)
+ return;
+
+ if (level < ifp->if_height) {
+ /*
+ * If we aren't at the root yet try to find a neighbour node to
+ * merge with (or delete the node if it is empty), and then
+ * recurse up to the next level.
+ */
+ level++;
+ parent = xfs_iext_find_level(ifp, offset, level);
+ pos = xfs_iext_node_pos(parent, offset);
+
+ ASSERT(pos != KEYS_PER_NODE);
+ ASSERT(parent->ptrs[pos] == node);
+
+ node = xfs_iext_rebalance_node(parent, &pos, node, nr_entries);
+ if (node) {
+ victim = node;
+ node = parent;
+ goto again;
+ }
+ } else if (nr_entries == 1) {
+ /*
+ * If we are at the root and only one entry is left we can just
+ * free this node and update the root pointer.
+ */
+ ASSERT(node == ifp->if_u1.if_root);
+ ifp->if_u1.if_root = node->ptrs[0];
+ ifp->if_height--;
+ kmem_free(node);
+ }
+}
+
+static void
+xfs_iext_rebalance_leaf(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur,
+ struct xfs_iext_leaf *leaf,
+ xfs_fileoff_t offset,
+ int nr_entries)
+{
+ /*
+ * If the neighbouring nodes are completely full we might never be able
+ * to merge our node, and will only delete it once the number of
+ * entries hits zero.
+ */
+ if (nr_entries == 0)
+ goto remove_node;
+
+ if (leaf->prev) {
+ int nr_prev = xfs_iext_leaf_nr_entries(ifp, leaf->prev, 0), i;
+
+ if (nr_prev + nr_entries <= RECS_PER_LEAF) {
+ for (i = 0; i < nr_entries; i++)
+ leaf->prev->recs[nr_prev + i] = leaf->recs[i];
+
+ if (cur->leaf == leaf) {
+ cur->leaf = leaf->prev;
+ cur->pos += nr_prev;
+ }
+ goto remove_node;
+ }
+ }
+
+ if (leaf->next) {
+ int nr_next = xfs_iext_leaf_nr_entries(ifp, leaf->next, 0), i;
+
+ if (nr_entries + nr_next <= RECS_PER_LEAF) {
+ /*
+ * Merge the next node into this node so that we don't
+ * have to do an additional update of the keys in the
+ * higher levels.
+ */
+ for (i = 0; i < nr_next; i++) {
+ leaf->recs[nr_entries + i] =
+ leaf->next->recs[i];
+ }
+
+ if (cur->leaf == leaf->next) {
+ cur->leaf = leaf;
+ cur->pos += nr_entries;
+ }
+
+ offset = xfs_iext_leaf_key(leaf->next, 0);
+ leaf = leaf->next;
+ goto remove_node;
+ }
+ }
+
+ return;
+remove_node:
+ if (leaf->prev)
+ leaf->prev->next = leaf->next;
+ if (leaf->next)
+ leaf->next->prev = leaf->prev;
+ xfs_iext_remove_node(ifp, offset, leaf);
+}
+
+static void
+xfs_iext_free_last_leaf(
+ struct xfs_ifork *ifp)
+{
+ ifp->if_height--;
+ kmem_free(ifp->if_u1.if_root);
+ ifp->if_u1.if_root = NULL;
+}
+
+void
+xfs_iext_remove(
+ struct xfs_inode *ip,
+ struct xfs_iext_cursor *cur,
+ int state)
+{
+ struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
+ struct xfs_iext_leaf *leaf = cur->leaf;
+ xfs_fileoff_t offset = xfs_iext_leaf_key(leaf, 0);
+ int i, nr_entries;
+
+ trace_xfs_iext_remove(ip, cur, state, _RET_IP_);
+
+ ASSERT(ifp->if_height > 0);
+ ASSERT(ifp->if_u1.if_root != NULL);
+ ASSERT(xfs_iext_valid(ifp, cur));
+
+ xfs_iext_inc_seq(ifp);
+
+ nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
+ for (i = cur->pos; i < nr_entries; i++)
+ leaf->recs[i] = leaf->recs[i + 1];
+ xfs_iext_rec_clear(&leaf->recs[nr_entries]);
+ ifp->if_bytes -= sizeof(struct xfs_iext_rec);
+
+ if (cur->pos == 0 && nr_entries > 0) {
+ xfs_iext_update_node(ifp, offset, xfs_iext_leaf_key(leaf, 0), 1,
+ leaf);
+ offset = xfs_iext_leaf_key(leaf, 0);
+ } else if (cur->pos == nr_entries) {
+ if (ifp->if_height > 1 && leaf->next)
+ cur->leaf = leaf->next;
+ else
+ cur->leaf = NULL;
+ cur->pos = 0;
+ }
+
+ if (nr_entries >= RECS_PER_LEAF / 2)
+ return;
+
+ if (ifp->if_height > 1)
+ xfs_iext_rebalance_leaf(ifp, cur, leaf, offset, nr_entries);
+ else if (nr_entries == 0)
+ xfs_iext_free_last_leaf(ifp);
+}
+
+/*
+ * Lookup the extent covering bno.
+ *
+ * If there is an extent covering bno, return true and store the expanded
+ * extent structure in *gotp and the extent cursor in *cur.
+ * If there is no extent covering bno, but there is an extent after it (e.g.
+ * it lies in a hole) return that extent in *gotp and its cursor in *cur
+ * instead.
+ * If bno is beyond the last extent return false, and return an invalid
+ * cursor value.
+ */
+bool
+xfs_iext_lookup_extent(
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t offset,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp)
+{
+ XFS_STATS_INC(ip->i_mount, xs_look_exlist);
+
+ cur->leaf = xfs_iext_find_level(ifp, offset, 1);
+ if (!cur->leaf) {
+ cur->pos = 0;
+ return false;
+ }
+
+ for (cur->pos = 0; cur->pos < xfs_iext_max_recs(ifp); cur->pos++) {
+ struct xfs_iext_rec *rec = cur_rec(cur);
+
+ if (xfs_iext_rec_is_empty(rec))
+ break;
+ if (xfs_iext_rec_cmp(rec, offset) >= 0)
+ goto found;
+ }
+
+ /* Try looking in the next node for an entry > offset */
+ if (ifp->if_height == 1 || !cur->leaf->next)
+ return false;
+ cur->leaf = cur->leaf->next;
+ cur->pos = 0;
+ if (!xfs_iext_valid(ifp, cur))
+ return false;
+found:
+ xfs_iext_get(gotp, cur_rec(cur));
+ return true;
+}
+
+/*
+ * Return the last extent before end, and if this extent doesn't cover
+ * end, update end to the end of the extent.
+ */
+bool
+xfs_iext_lookup_extent_before(
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t *end,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp)
+{
+	/* could be optimized to not even look up the next extent on a match */
+ if (xfs_iext_lookup_extent(ip, ifp, *end - 1, cur, gotp) &&
+ gotp->br_startoff <= *end - 1)
+ return true;
+ if (!xfs_iext_prev_extent(ifp, cur, gotp))
+ return false;
+ *end = gotp->br_startoff + gotp->br_blockcount;
+ return true;
+}
+
+void
+xfs_iext_update_extent(
+ struct xfs_inode *ip,
+ int state,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *new)
+{
+ struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
+
+ xfs_iext_inc_seq(ifp);
+
+ if (cur->pos == 0) {
+ struct xfs_bmbt_irec old;
+
+ xfs_iext_get(&old, cur_rec(cur));
+ if (new->br_startoff != old.br_startoff) {
+ xfs_iext_update_node(ifp, old.br_startoff,
+ new->br_startoff, 1, cur->leaf);
+ }
+ }
+
+ trace_xfs_bmap_pre_update(ip, cur, state, _RET_IP_);
+ xfs_iext_set(cur_rec(cur), new);
+ trace_xfs_bmap_post_update(ip, cur, state, _RET_IP_);
+}
+
+/*
+ * If the cursor points at an extent, return true and copy the extent
+ * structure to gotp; otherwise return false.
+ */
+bool
+xfs_iext_get_extent(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp)
+{
+ if (!xfs_iext_valid(ifp, cur))
+ return false;
+ xfs_iext_get(gotp, cur_rec(cur));
+ return true;
+}
+
+/*
+ * This is a recursive function, so we need to be extremely careful with
+ * stack usage.
+ */
+static void
+xfs_iext_destroy_node(
+ struct xfs_iext_node *node,
+ int level)
+{
+ int i;
+
+ if (level > 1) {
+ for (i = 0; i < KEYS_PER_NODE; i++) {
+ if (node->keys[i] == XFS_IEXT_KEY_INVALID)
+ break;
+ xfs_iext_destroy_node(node->ptrs[i], level - 1);
+ }
+ }
+
+ kmem_free(node);
+}
+
+void
+xfs_iext_destroy(
+ struct xfs_ifork *ifp)
+{
+ xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height);
+
+ ifp->if_bytes = 0;
+ ifp->if_height = 0;
+ ifp->if_u1.if_root = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
new file mode 100644
index 000000000..758aacd81
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -0,0 +1,773 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
+#include "xfs_inode.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_icache.h"
+#include "xfs_trans.h"
+#include "xfs_ialloc.h"
+#include "xfs_dir2.h"
+
+#include <linux/iversion.h>
+
+/*
+ * If we are doing readahead on an inode buffer, we might be in log recovery
+ * reading an inode allocation buffer that hasn't yet been replayed, and hence
+ * has not had the inode cores stamped into it. Hence for readahead, the buffer
+ * may be invalid.
+ *
+ * If the readahead buffer is invalid, we need to mark it with an error and
+ * clear the DONE status of the buffer so that a followup read will re-read it
+ * from disk. We don't report the error otherwise, so that we avoid warnings
+ * during log recovery and unnecessary panics on debug kernels. We use EIO here
+ * because all we want to do is say readahead failed; there is no-one to report
+ * the error to, so this will distinguish it from a non-ra verifier failure.
+ * Changes to this readahead error behaviour also need to be reflected in
+ * xfs_dquot_buf_readahead_verify().
+ */
+static void
+xfs_inode_buf_verify(
+ struct xfs_buf *bp,
+ bool readahead)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ int i;
+ int ni;
+
+ /*
+ * Validate the magic number and version of every inode in the buffer
+ */
+ ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+ for (i = 0; i < ni; i++) {
+ struct xfs_dinode *dip;
+ xfs_agino_t unlinked_ino;
+ int di_ok;
+
+ dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
+ unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
+ di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
+ xfs_dinode_good_version(mp, dip->di_version) &&
+ xfs_verify_agino_or_null(bp->b_pag, unlinked_ino);
+ if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+ XFS_ERRTAG_ITOBP_INOTOBP))) {
+ if (readahead) {
+ bp->b_flags &= ~XBF_DONE;
+ xfs_buf_ioerror(bp, -EIO);
+ return;
+ }
+
+#ifdef DEBUG
+ xfs_alert(mp,
+ "bad inode magic/vsn daddr %lld #%d (magic=%x)",
+ (unsigned long long)xfs_buf_daddr(bp), i,
+ be16_to_cpu(dip->di_magic));
+#endif
+ xfs_buf_verifier_error(bp, -EFSCORRUPTED,
+ __func__, dip, sizeof(*dip),
+ NULL);
+ return;
+ }
+ }
+}
+
+
+static void
+xfs_inode_buf_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inode_buf_verify(bp, false);
+}
+
+static void
+xfs_inode_buf_readahead_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inode_buf_verify(bp, true);
+}
+
+static void
+xfs_inode_buf_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inode_buf_verify(bp, false);
+}
+
+const struct xfs_buf_ops xfs_inode_buf_ops = {
+ .name = "xfs_inode",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
+ .verify_read = xfs_inode_buf_read_verify,
+ .verify_write = xfs_inode_buf_write_verify,
+};
+
+const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
+ .name = "xfs_inode_ra",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
+ .verify_read = xfs_inode_buf_readahead_verify,
+ .verify_write = xfs_inode_buf_write_verify,
+};
+
+
+/*
+ * This routine is called to map an inode to the buffer containing the on-disk
+ * version of the inode. It returns a pointer to the buffer containing the
+ * on-disk inode in the bpp parameter.
+ */
+int
+xfs_imap_to_bp(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_imap *imap,
+ struct xfs_buf **bpp)
+{
+ return xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+ imap->im_len, XBF_UNMAPPED, bpp,
+ &xfs_inode_buf_ops);
+}
+
+static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts)
+{
+ struct timespec64 tv;
+ uint32_t n;
+
+ tv.tv_sec = xfs_bigtime_to_unix(div_u64_rem(ts, NSEC_PER_SEC, &n));
+ tv.tv_nsec = n;
+
+ return tv;
+}
+
+/* Convert an ondisk timestamp to an incore timestamp. */
+struct timespec64
+xfs_inode_from_disk_ts(
+ struct xfs_dinode *dip,
+ const xfs_timestamp_t ts)
+{
+ struct timespec64 tv;
+ struct xfs_legacy_timestamp *lts;
+
+ if (xfs_dinode_has_bigtime(dip))
+ return xfs_inode_decode_bigtime(be64_to_cpu(ts));
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ tv.tv_sec = (int)be32_to_cpu(lts->t_sec);
+ tv.tv_nsec = (int)be32_to_cpu(lts->t_nsec);
+
+ return tv;
+}
+
+int
+xfs_inode_from_disk(
+ struct xfs_inode *ip,
+ struct xfs_dinode *from)
+{
+ struct inode *inode = VFS_I(ip);
+ int error;
+ xfs_failaddr_t fa;
+
+ ASSERT(ip->i_cowfp == NULL);
+
+ fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from);
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", from,
+ sizeof(*from), fa);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * First get the permanent information that is needed to allocate an
+ * inode. If the inode is unused, mode is zero and we shouldn't mess
+ * with the uninitialized part of it.
+ */
+ if (!xfs_has_v3inodes(ip->i_mount))
+ ip->i_flushiter = be16_to_cpu(from->di_flushiter);
+ inode->i_generation = be32_to_cpu(from->di_gen);
+ inode->i_mode = be16_to_cpu(from->di_mode);
+ if (!inode->i_mode)
+ return 0;
+
+ /*
+ * Convert v1 inodes immediately to v2 inode format as this is the
+ * minimum inode version format we support in the rest of the code.
+ * They will also be unconditionally written back to disk as v2 inodes.
+ */
+ if (unlikely(from->di_version == 1)) {
+ set_nlink(inode, be16_to_cpu(from->di_onlink));
+ ip->i_projid = 0;
+ } else {
+ set_nlink(inode, be32_to_cpu(from->di_nlink));
+ ip->i_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 |
+ be16_to_cpu(from->di_projid_lo);
+ }
+
+ i_uid_write(inode, be32_to_cpu(from->di_uid));
+ i_gid_write(inode, be32_to_cpu(from->di_gid));
+
+ /*
+	 * Time is signed, so we need to convert to signed 32 bit before
+	 * storing in an inode timestamp which may be 64 bit. Otherwise
+	 * a time before the epoch is converted to a time long after the
+	 * epoch on 64 bit systems.
+ */
+ inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime);
+ inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime);
+ inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime);
+
+ ip->i_disk_size = be64_to_cpu(from->di_size);
+ ip->i_nblocks = be64_to_cpu(from->di_nblocks);
+ ip->i_extsize = be32_to_cpu(from->di_extsize);
+ ip->i_forkoff = from->di_forkoff;
+ ip->i_diflags = be16_to_cpu(from->di_flags);
+ ip->i_next_unlinked = be32_to_cpu(from->di_next_unlinked);
+
+ if (from->di_dmevmask || from->di_dmstate)
+ xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS);
+
+ if (xfs_has_v3inodes(ip->i_mount)) {
+ inode_set_iversion_queried(inode,
+ be64_to_cpu(from->di_changecount));
+ ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
+ ip->i_diflags2 = be64_to_cpu(from->di_flags2);
+ ip->i_cowextsize = be32_to_cpu(from->di_cowextsize);
+ }
+
+ error = xfs_iformat_data_fork(ip, from);
+ if (error)
+ return error;
+ if (from->di_forkoff) {
+ error = xfs_iformat_attr_fork(ip, from);
+ if (error)
+ goto out_destroy_data_fork;
+ }
+ if (xfs_is_reflink_inode(ip))
+ xfs_ifork_init_cow(ip);
+ return 0;
+
+out_destroy_data_fork:
+ xfs_idestroy_fork(&ip->i_df);
+ return error;
+}
+
+/* Convert an incore timestamp to an ondisk timestamp. */
+static inline xfs_timestamp_t
+xfs_inode_to_disk_ts(
+ struct xfs_inode *ip,
+ const struct timespec64 tv)
+{
+ struct xfs_legacy_timestamp *lts;
+ xfs_timestamp_t ts;
+
+ if (xfs_inode_has_bigtime(ip))
+ return cpu_to_be64(xfs_inode_encode_bigtime(tv));
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ lts->t_sec = cpu_to_be32(tv.tv_sec);
+ lts->t_nsec = cpu_to_be32(tv.tv_nsec);
+
+ return ts;
+}
+
+static inline void
+xfs_inode_to_disk_iext_counters(
+ struct xfs_inode *ip,
+ struct xfs_dinode *to)
+{
+ if (xfs_inode_has_large_extent_counts(ip)) {
+ to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df));
+ to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_af));
+ /*
+ * We might be upgrading the inode to use larger extent counters
+ * than it previously used. Hence zero the unused field.
+ */
+ to->di_nrext64_pad = cpu_to_be16(0);
+ } else {
+ to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
+ to->di_anextents = cpu_to_be16(xfs_ifork_nextents(&ip->i_af));
+ }
+}
+
+void
+xfs_inode_to_disk(
+ struct xfs_inode *ip,
+ struct xfs_dinode *to,
+ xfs_lsn_t lsn)
+{
+ struct inode *inode = VFS_I(ip);
+
+ to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ to->di_onlink = 0;
+
+ to->di_format = xfs_ifork_format(&ip->i_df);
+ to->di_uid = cpu_to_be32(i_uid_read(inode));
+ to->di_gid = cpu_to_be32(i_gid_read(inode));
+ to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff);
+ to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16);
+
+ to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
+ to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
+ to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime);
+ to->di_nlink = cpu_to_be32(inode->i_nlink);
+ to->di_gen = cpu_to_be32(inode->i_generation);
+ to->di_mode = cpu_to_be16(inode->i_mode);
+
+ to->di_size = cpu_to_be64(ip->i_disk_size);
+ to->di_nblocks = cpu_to_be64(ip->i_nblocks);
+ to->di_extsize = cpu_to_be32(ip->i_extsize);
+ to->di_forkoff = ip->i_forkoff;
+ to->di_aformat = xfs_ifork_format(&ip->i_af);
+ to->di_flags = cpu_to_be16(ip->i_diflags);
+
+ if (xfs_has_v3inodes(ip->i_mount)) {
+ to->di_version = 3;
+ to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
+ to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime);
+ to->di_flags2 = cpu_to_be64(ip->i_diflags2);
+ to->di_cowextsize = cpu_to_be32(ip->i_cowextsize);
+ to->di_ino = cpu_to_be64(ip->i_ino);
+ to->di_lsn = cpu_to_be64(lsn);
+ memset(to->di_pad2, 0, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
+ to->di_v3_pad = 0;
+ } else {
+ to->di_version = 2;
+ to->di_flushiter = cpu_to_be16(ip->i_flushiter);
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
+ }
+
+ xfs_inode_to_disk_iext_counters(ip, to);
+}
+
+static xfs_failaddr_t
+xfs_dinode_verify_fork(
+ struct xfs_dinode *dip,
+ struct xfs_mount *mp,
+ int whichfork)
+{
+ xfs_extnum_t di_nextents;
+ xfs_extnum_t max_extents;
+ mode_t mode = be16_to_cpu(dip->di_mode);
+ uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork);
+ uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork);
+
+ di_nextents = xfs_dfork_nextents(dip, whichfork);
+
+ /*
+ * For fork types that can contain local data, check that the fork
+ * format matches the size of local data contained within the fork.
+ *
+ * For all types, check that when the size says the fork should be in extent
+ * or btree format, the inode isn't claiming it is in local format.
+ */
+ if (whichfork == XFS_DATA_FORK) {
+ if (S_ISDIR(mode) || S_ISLNK(mode)) {
+ if (be64_to_cpu(dip->di_size) <= fork_size &&
+ fork_format != XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
+
+ if (be64_to_cpu(dip->di_size) > fork_size &&
+ fork_format == XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
+
+ switch (fork_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ /*
+ * No local regular files yet.
+ */
+ if (S_ISREG(mode) && whichfork == XFS_DATA_FORK)
+ return __this_address;
+ if (di_nextents)
+ return __this_address;
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (di_nextents > XFS_DFORK_MAXEXT(dip, mp, whichfork))
+ return __this_address;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ max_extents = xfs_iext_max_nextents(
+ xfs_dinode_has_large_extent_counts(dip),
+ whichfork);
+ if (di_nextents > max_extents)
+ return __this_address;
+ break;
+ default:
+ return __this_address;
+ }
+ return NULL;
+}
+
+static xfs_failaddr_t
+xfs_dinode_verify_forkoff(
+ struct xfs_dinode *dip,
+ struct xfs_mount *mp)
+{
+ if (!dip->di_forkoff)
+ return NULL;
+
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3))
+ return __this_address;
+ break;
+ case XFS_DINODE_FMT_LOCAL: /* fall through ... */
+ case XFS_DINODE_FMT_EXTENTS: /* fall through ... */
+ case XFS_DINODE_FMT_BTREE:
+ if (dip->di_forkoff >= (XFS_LITINO(mp) >> 3))
+ return __this_address;
+ break;
+ default:
+ return __this_address;
+ }
+ return NULL;
+}
+
+static xfs_failaddr_t
+xfs_dinode_verify_nrext64(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip)) {
+ if (!xfs_has_large_extent_counts(mp))
+ return __this_address;
+ if (dip->di_nrext64_pad != 0)
+ return __this_address;
+ } else if (dip->di_version >= 3) {
+ if (dip->di_v3_pad != 0)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+xfs_failaddr_t
+xfs_dinode_verify(
+ struct xfs_mount *mp,
+ xfs_ino_t ino,
+ struct xfs_dinode *dip)
+{
+ xfs_failaddr_t fa;
+ uint16_t mode;
+ uint16_t flags;
+ uint64_t flags2;
+ uint64_t di_size;
+ xfs_extnum_t nextents;
+ xfs_extnum_t naextents;
+ xfs_filblks_t nblocks;
+
+ if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
+ return __this_address;
+
+ /* Verify v3 integrity information first */
+ if (dip->di_version >= 3) {
+ if (!xfs_has_v3inodes(mp))
+ return __this_address;
+ if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+ XFS_DINODE_CRC_OFF))
+ return __this_address;
+ if (be64_to_cpu(dip->di_ino) != ino)
+ return __this_address;
+ if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ }
+
+ /* don't allow invalid i_size */
+ di_size = be64_to_cpu(dip->di_size);
+ if (di_size & (1ULL << 63))
+ return __this_address;
+
+ mode = be16_to_cpu(dip->di_mode);
+ if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN)
+ return __this_address;
+
+ /* No zero-length symlinks/dirs. */
+ if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0)
+ return __this_address;
+
+ fa = xfs_dinode_verify_nrext64(mp, dip);
+ if (fa)
+ return fa;
+
+ nextents = xfs_dfork_data_extents(dip);
+ naextents = xfs_dfork_attr_extents(dip);
+ nblocks = be64_to_cpu(dip->di_nblocks);
+
+ /* Fork checks carried over from xfs_iformat_fork */
+ if (mode && nextents + naextents > nblocks)
+ return __this_address;
+
+ if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
+ return __this_address;
+
+ if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize)
+ return __this_address;
+
+ flags = be16_to_cpu(dip->di_flags);
+
+ if (mode && (flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp)
+ return __this_address;
+
+ /* check for illegal values of forkoff */
+ fa = xfs_dinode_verify_forkoff(dip, mp);
+ if (fa)
+ return fa;
+
+ /* Do we have appropriate data fork formats for the mode? */
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ if (dip->di_format != XFS_DINODE_FMT_DEV)
+ return __this_address;
+ break;
+ case S_IFREG:
+ case S_IFLNK:
+ case S_IFDIR:
+ fa = xfs_dinode_verify_fork(dip, mp, XFS_DATA_FORK);
+ if (fa)
+ return fa;
+ break;
+ case 0:
+ /* Uninitialized inode ok. */
+ break;
+ default:
+ return __this_address;
+ }
+
+ if (dip->di_forkoff) {
+ fa = xfs_dinode_verify_fork(dip, mp, XFS_ATTR_FORK);
+ if (fa)
+ return fa;
+ } else {
+ /*
+ * If there is no fork offset, this may be a freshly-made inode
+ * in a new disk cluster, in which case di_aformat is zeroed.
+ * Otherwise, such an inode must be in EXTENTS format; this goes
+ * for freed inodes as well.
+ */
+ switch (dip->di_aformat) {
+ case 0:
+ case XFS_DINODE_FMT_EXTENTS:
+ break;
+ default:
+ return __this_address;
+ }
+ if (naextents)
+ return __this_address;
+ }
+
+ /* extent size hint validation */
+ fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
+ mode, flags);
+ if (fa)
+ return fa;
+
+ /* only version 3 or greater inodes are extensively verified here */
+ if (dip->di_version < 3)
+ return NULL;
+
+ flags2 = be64_to_cpu(dip->di_flags2);
+
+ /* don't allow reflink/cowextsize if we don't have reflink */
+ if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) &&
+ !xfs_has_reflink(mp))
+ return __this_address;
+
+ /* only regular files get reflink */
+ if ((flags2 & XFS_DIFLAG2_REFLINK) && (mode & S_IFMT) != S_IFREG)
+ return __this_address;
+
+ /* don't let reflink and realtime mix */
+ if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME))
+ return __this_address;
+
+ /* COW extent size hint validation */
+ fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
+ mode, flags, flags2);
+ if (fa)
+ return fa;
+
+ /* bigtime iflag can only happen on bigtime filesystems */
+ if (xfs_dinode_has_bigtime(dip) &&
+ !xfs_has_bigtime(mp))
+ return __this_address;
+
+ return NULL;
+}
+
+void
+xfs_dinode_calc_crc(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip)
+{
+ uint32_t crc;
+
+ if (dip->di_version < 3)
+ return;
+
+ ASSERT(xfs_has_crc(mp));
+ crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize,
+ XFS_DINODE_CRC_OFF);
+ dip->di_crc = xfs_end_cksum(crc);
+}
+
+/*
+ * Validate di_extsize hint.
+ *
+ * 1. Extent size hint is only valid for directories and regular files.
+ * 2. FS_XFLAG_EXTSIZE is only valid for regular files.
+ * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories.
+ * 4. Hint cannot be larger than XFS_MAX_BMBT_EXTLEN.
+ * 5. Can be changed on directories at any time.
+ * 6. Hint value of 0 turns off hints, clears inode flags.
+ * 7. Extent size must be a multiple of the appropriate block size.
+ * For realtime files, this is the rt extent size.
+ * 8. For non-realtime files, the extent size hint must be limited
+ * to half the AG size to avoid alignment extending the extent beyond the
+ * limits of the AG.
+ */
+xfs_failaddr_t
+xfs_inode_validate_extsize(
+ struct xfs_mount *mp,
+ uint32_t extsize,
+ uint16_t mode,
+ uint16_t flags)
+{
+ bool rt_flag;
+ bool hint_flag;
+ bool inherit_flag;
+ uint32_t extsize_bytes;
+ uint32_t blocksize_bytes;
+
+ rt_flag = (flags & XFS_DIFLAG_REALTIME);
+ hint_flag = (flags & XFS_DIFLAG_EXTSIZE);
+ inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT);
+ extsize_bytes = XFS_FSB_TO_B(mp, extsize);
+
+ /*
+ * This comment describes a historic gap in this verifier function.
+ *
+ * For a directory with both RTINHERIT and EXTSZINHERIT flags set, this
+ * function has never checked that the extent size hint is an integer
+ * multiple of the realtime extent size. Since we allow users to set
+ * this combination on non-rt filesystems /and/ to change the rt
+ * extent size when adding a rt device to a filesystem, the net effect
+ * is that users can configure a filesystem anticipating one rt
+ * geometry and change their minds later. Directories do not use the
+ * extent size hint, so this is harmless for them.
+ *
+ * If a directory with a misaligned extent size hint is allowed to
+ * propagate that hint into a new regular realtime file, the result
+ * is that the inode cluster buffer verifier will trigger a corruption
+ * shutdown the next time it is run, because the verifier has always
+ * enforced the alignment rule for regular files.
+ *
+ * Because we allow administrators to set a new rt extent size when
+ * adding a rt section, we cannot add a check to this verifier because
+ * that will result in a new source of directory corruption errors when
+ * reading an existing filesystem. Instead, we rely on callers to
+ * decide when alignment checks are appropriate, and fix things up as
+ * needed.
+ */
+
+ if (rt_flag)
+ blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+ else
+ blocksize_bytes = mp->m_sb.sb_blocksize;
+
+ if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode)))
+ return __this_address;
+
+ if (hint_flag && !S_ISREG(mode))
+ return __this_address;
+
+ if (inherit_flag && !S_ISDIR(mode))
+ return __this_address;
+
+ if ((hint_flag || inherit_flag) && extsize == 0)
+ return __this_address;
+
+ /* free inodes get flags set to zero but extsize remains */
+ if (mode && !(hint_flag || inherit_flag) && extsize != 0)
+ return __this_address;
+
+ if (extsize_bytes % blocksize_bytes)
+ return __this_address;
+
+ if (extsize > XFS_MAX_BMBT_EXTLEN)
+ return __this_address;
+
+ if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2)
+ return __this_address;
+
+ return NULL;
+}
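+
+/*
+ * Worked example (illustrative only, assuming a 4k-block filesystem with
+ * a realtime extent size of 16 blocks): for a realtime file,
+ * blocksize_bytes above is 64KiB, so an extent size hint of 24 blocks
+ * (96KiB) fails the modulo check while a hint of 32 blocks (128KiB)
+ * passes, provided it also stays within XFS_MAX_BMBT_EXTLEN. For
+ * non-realtime files the hint only has to be a multiple of the 4k block
+ * size and no more than half the AG size.
+ */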
+
+/*
+ * Validate di_cowextsize hint.
+ *
+ * 1. CoW extent size hint can only be set if reflink is enabled on the fs.
+ * The inode does not have to have any shared blocks, but it must be a v3.
+ * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files;
+ * for a directory, the hint is propagated to new files.
+ * 3. Can be changed on files & directories at any time.
+ * 4. Hint value of 0 turns off hints, clears inode flags.
+ * 5. Extent size must be a multiple of the appropriate block size.
+ * 6. The extent size hint must be limited to half the AG size to avoid
+ * alignment extending the extent beyond the limits of the AG.
+ */
+xfs_failaddr_t
+xfs_inode_validate_cowextsize(
+ struct xfs_mount *mp,
+ uint32_t cowextsize,
+ uint16_t mode,
+ uint16_t flags,
+ uint64_t flags2)
+{
+ bool rt_flag;
+ bool hint_flag;
+ uint32_t cowextsize_bytes;
+
+ rt_flag = (flags & XFS_DIFLAG_REALTIME);
+ hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE);
+ cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize);
+
+ if (hint_flag && !xfs_has_reflink(mp))
+ return __this_address;
+
+ if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode)))
+ return __this_address;
+
+ if (hint_flag && cowextsize == 0)
+ return __this_address;
+
+ /* free inodes get flags set to zero but cowextsize remains */
+ if (mode && !hint_flag && cowextsize != 0)
+ return __this_address;
+
+ if (hint_flag && rt_flag)
+ return __this_address;
+
+ if (cowextsize_bytes % mp->m_sb.sb_blocksize)
+ return __this_address;
+
+ if (cowextsize > XFS_MAX_BMBT_EXTLEN)
+ return __this_address;
+
+ if (cowextsize > mp->m_sb.sb_agblocks / 2)
+ return __this_address;
+
+ return NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
new file mode 100644
index 000000000..585ed5a11
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_INODE_BUF_H__
+#define __XFS_INODE_BUF_H__
+
+struct xfs_inode;
+struct xfs_dinode;
+
+/*
+ * Inode location information. Stored in the inode and passed to
+ * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
+ */
+struct xfs_imap {
+ xfs_daddr_t im_blkno; /* starting BB of inode chunk */
+ unsigned short im_len; /* length in BBs of inode chunk */
+ unsigned short im_boffset; /* inode offset in block in bytes */
+};
+
+int xfs_imap_to_bp(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct xfs_imap *imap, struct xfs_buf **bpp);
+void xfs_dinode_calc_crc(struct xfs_mount *mp, struct xfs_dinode *dip);
+void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to,
+ xfs_lsn_t lsn);
+int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
+
+xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino,
+ struct xfs_dinode *dip);
+xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp,
+ uint32_t extsize, uint16_t mode, uint16_t flags);
+xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp,
+ uint32_t cowextsize, uint16_t mode, uint16_t flags,
+ uint64_t flags2);
+
+static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv)
+{
+ return xfs_unix_to_bigtime(tv.tv_sec) * NSEC_PER_SEC + tv.tv_nsec;
+}
+
+struct timespec64 xfs_inode_from_disk_ts(struct xfs_dinode *dip,
+ const xfs_timestamp_t ts);
+
+static inline bool
+xfs_dinode_good_version(struct xfs_mount *mp, uint8_t version)
+{
+ if (xfs_has_v3inodes(mp))
+ return version == 3;
+ return version == 1 || version == 2;
+}
+
+
+#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
new file mode 100644
index 000000000..6b2176018
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -0,0 +1,779 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_types.h"
+#include "xfs_errortag.h"
+
+struct kmem_cache *xfs_ifork_cache;
+
+void
+xfs_init_local_fork(
+ struct xfs_inode *ip,
+ int whichfork,
+ const void *data,
+ int64_t size)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ int mem_size = size;
+ bool zero_terminate;
+
+ /*
+ * If we are using the local fork to store a symlink body we need to
+ * zero-terminate it so that we can pass it back to the VFS directly.
+ * Overallocate the in-memory fork by one for that and add a zero
+ * to terminate it below.
+ */
+ zero_terminate = S_ISLNK(VFS_I(ip)->i_mode);
+ if (zero_terminate)
+ mem_size++;
+
+ if (size) {
+ ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS);
+ memcpy(ifp->if_u1.if_data, data, size);
+ if (zero_terminate)
+ ifp->if_u1.if_data[size] = '\0';
+ } else {
+ ifp->if_u1.if_data = NULL;
+ }
+
+ ifp->if_bytes = size;
+}
+
+/*
+ * The file is in-lined in the on-disk inode.
+ */
+STATIC int
+xfs_iformat_local(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ int whichfork,
+ int size)
+{
+ /*
+ * If the size is unreasonable, then something
+ * is wrong and we just bail out rather than crash in
+ * kmem_alloc() or memcpy() below.
+ */
+ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+ xfs_warn(ip->i_mount,
+ "corrupt inode %llu (bad size %d for local fork, size = %zd).",
+ (unsigned long long) ip->i_ino, size,
+ XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iformat_local", dip, sizeof(*dip),
+ __this_address);
+ return -EFSCORRUPTED;
+ }
+
+ xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size);
+ return 0;
+}
+
+/*
+ * The file consists of a set of extents all of which fit into the on-disk
+ * inode.
+ */
+STATIC int
+xfs_iformat_extents(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ int whichfork)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ int state = xfs_bmap_fork_to_state(whichfork);
+ xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork);
+ int size = nex * sizeof(xfs_bmbt_rec_t);
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_rec *dp;
+ struct xfs_bmbt_irec new;
+ int i;
+
+ /*
+ * If the number of extents is unreasonable, then something is wrong and
+ * we just bail out rather than crash in kmem_alloc() or memcpy() below.
+ */
+ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
+ xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).",
+ ip->i_ino, nex);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iformat_extents(1)", dip, sizeof(*dip),
+ __this_address);
+ return -EFSCORRUPTED;
+ }
+
+ ifp->if_bytes = 0;
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
+ if (size) {
+ dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
+
+ xfs_iext_first(ifp, &icur);
+ for (i = 0; i < nex; i++, dp++) {
+ xfs_failaddr_t fa;
+
+ xfs_bmbt_disk_get_all(dp, &new);
+ fa = xfs_bmap_validate_extent(ip, whichfork, &new);
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iformat_extents(2)",
+ dp, sizeof(*dp), fa);
+ return -EFSCORRUPTED;
+ }
+
+ xfs_iext_insert(ip, &icur, &new, state);
+ trace_xfs_read_extent(ip, &icur, state, _THIS_IP_);
+ xfs_iext_next(ifp, &icur);
+ }
+ }
+ return 0;
+}
+
+/*
+ * The file has too many extents to fit into
+ * the inode, so they are in B-tree format.
+ * Allocate a buffer for the root of the B-tree
+ * and copy the root into it. The i_extents
+ * field will remain NULL until all of the
+ * extents are read in (when they are needed).
+ */
+STATIC int
+xfs_iformat_btree(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ int whichfork)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_bmdr_block_t *dfp;
+ struct xfs_ifork *ifp;
+ /* REFERENCED */
+ int nrecs;
+ int size;
+ int level;
+
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
+ size = XFS_BMAP_BROOT_SPACE(mp, dfp);
+ nrecs = be16_to_cpu(dfp->bb_numrecs);
+ level = be16_to_cpu(dfp->bb_level);
+
+ /*
+ * Blow out if the fork has fewer extents than can fit in the
+ * fork (in which case the fork shouldn't be in btree format),
+ * the root btree block has more records than can fit into the
+ * fork, or the number of extents is greater than the number
+ * of blocks.
+ */
+ if (unlikely(ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork) ||
+ nrecs == 0 ||
+ XFS_BMDR_SPACE_CALC(nrecs) >
+ XFS_DFORK_SIZE(dip, mp, whichfork) ||
+ ifp->if_nextents > ip->i_nblocks) ||
+ level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) {
+ xfs_warn(mp, "corrupt inode %llu (btree).",
+ (unsigned long long) ip->i_ino);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iformat_btree", dfp, size,
+ __this_address);
+ return -EFSCORRUPTED;
+ }
+
+ ifp->if_broot_bytes = size;
+ ifp->if_broot = kmem_alloc(size, KM_NOFS);
+ ASSERT(ifp->if_broot != NULL);
+ /*
+ * Copy and convert from the on-disk structure
+ * to the in-memory structure.
+ */
+ xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+ ifp->if_broot, size);
+
+ ifp->if_bytes = 0;
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
+ return 0;
+}
+
+int
+xfs_iformat_data_fork(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ struct inode *inode = VFS_I(ip);
+ int error;
+
+ /*
+ * Initialize the extent count early, as the per-format routines may
+ * depend on it.
+ */
+ ip->i_df.if_format = dip->di_format;
+ ip->i_df.if_nextents = xfs_dfork_data_extents(dip);
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ ip->i_disk_size = 0;
+ inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip));
+ return 0;
+ case S_IFREG:
+ case S_IFLNK:
+ case S_IFDIR:
+ switch (ip->i_df.if_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ error = xfs_iformat_local(ip, dip, XFS_DATA_FORK,
+ be64_to_cpu(dip->di_size));
+ if (!error)
+ error = xfs_ifork_verify_local_data(ip);
+ return error;
+ case XFS_DINODE_FMT_EXTENTS:
+ return xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
+ case XFS_DINODE_FMT_BTREE:
+ return xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+ default:
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
+ dip, sizeof(*dip), __this_address);
+ return -EFSCORRUPTED;
+ }
+ break;
+ default:
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+ sizeof(*dip), __this_address);
+ return -EFSCORRUPTED;
+ }
+}
+
+static uint16_t
+xfs_dfork_attr_shortform_size(
+ struct xfs_dinode *dip)
+{
+ struct xfs_attr_shortform *atp =
+ (struct xfs_attr_shortform *)XFS_DFORK_APTR(dip);
+
+ return be16_to_cpu(atp->hdr.totsize);
+}
+
+void
+xfs_ifork_init_attr(
+ struct xfs_inode *ip,
+ enum xfs_dinode_fmt format,
+ xfs_extnum_t nextents)
+{
+ ip->i_af.if_format = format;
+ ip->i_af.if_nextents = nextents;
+}
+
+void
+xfs_ifork_zap_attr(
+ struct xfs_inode *ip)
+{
+ xfs_idestroy_fork(&ip->i_af);
+ memset(&ip->i_af, 0, sizeof(struct xfs_ifork));
+ ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
+}
+
+int
+xfs_iformat_attr_fork(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ xfs_extnum_t naextents = xfs_dfork_attr_extents(dip);
+ int error = 0;
+
+ /*
+ * Initialize the extent count early, as the per-format routines may
+ * depend on it.
+ */
+ xfs_ifork_init_attr(ip, dip->di_aformat, naextents);
+
+ switch (ip->i_af.if_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK,
+ xfs_dfork_attr_shortform_size(dip));
+ if (!error)
+ error = xfs_ifork_verify_local_attr(ip);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
+ break;
+ default:
+ xfs_inode_verifier_error(ip, error, __func__, dip,
+ sizeof(*dip), __this_address);
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ if (error)
+ xfs_ifork_zap_attr(ip);
+ return error;
+}
+
+/*
+ * Reallocate the space for if_broot based on the number of records
+ * being added or deleted as indicated in rec_diff. Move the records
+ * and pointers in if_broot to fit the new size. When shrinking this
+ * will eliminate holes between the records and pointers created by
+ * the caller. When growing this will create holes to be filled in
+ * by the caller.
+ *
+ * The caller must not request to add more records than would fit in
+ * the on-disk inode root. If the if_broot is currently NULL, then
+ * if we are adding records, one will be allocated. The caller must also
+ * not request that the number of records go below zero, although
+ * it can go to zero.
+ *
+ * ip -- the inode whose if_broot area is changing
+ * rec_diff -- the change in the number of records, positive or negative,
+ * requested for the if_broot array.
+ */
+void
+xfs_iroot_realloc(
+ xfs_inode_t *ip,
+ int rec_diff,
+ int whichfork)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int cur_max;
+ struct xfs_ifork *ifp;
+ struct xfs_btree_block *new_broot;
+ int new_max;
+ size_t new_size;
+ char *np;
+ char *op;
+
+ /*
+ * Handle the degenerate case quietly.
+ */
+ if (rec_diff == 0) {
+ return;
+ }
+
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ if (rec_diff > 0) {
+ /*
+ * If there wasn't any memory allocated before, just
+ * allocate it now and get out.
+ */
+ if (ifp->if_broot_bytes == 0) {
+ new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
+ ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
+ ifp->if_broot_bytes = (int)new_size;
+ return;
+ }
+
+ /*
+ * If there is already an existing if_broot, then we need
+ * to realloc() it and shift the pointers to their new
+ * location. The records don't change location because
+ * they are kept butted up against the btree block header.
+ */
+ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+ new_max = cur_max + rec_diff;
+ new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+ ifp->if_broot = krealloc(ifp->if_broot, new_size,
+ GFP_NOFS | __GFP_NOFAIL);
+ op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ ifp->if_broot_bytes);
+ np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ (int)new_size);
+ ifp->if_broot_bytes = (int)new_size;
+ ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ xfs_inode_fork_size(ip, whichfork));
+ memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
+ return;
+ }
+
+ /*
+ * rec_diff is less than 0. In this case, we are shrinking the
+ * if_broot buffer. It must already exist. If we go to zero
+ * records, just get rid of the root and clear the status bit.
+ */
+ ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
+ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+ new_max = cur_max + rec_diff;
+ ASSERT(new_max >= 0);
+ if (new_max > 0)
+ new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+ else
+ new_size = 0;
+ if (new_size > 0) {
+ new_broot = kmem_alloc(new_size, KM_NOFS);
+ /*
+ * First copy over the btree block header.
+ */
+ memcpy(new_broot, ifp->if_broot,
+ XFS_BMBT_BLOCK_LEN(ip->i_mount));
+ } else {
+ new_broot = NULL;
+ }
+
+ /*
+ * Only copy the records and pointers if there are any.
+ */
+ if (new_max > 0) {
+ /*
+ * First copy the records.
+ */
+ op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
+ np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
+ memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+
+ /*
+ * Then copy the pointers.
+ */
+ op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ ifp->if_broot_bytes);
+ np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
+ (int)new_size);
+ memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
+ }
+ kmem_free(ifp->if_broot);
+ ifp->if_broot = new_broot;
+ ifp->if_broot_bytes = (int)new_size;
+ if (ifp->if_broot)
+ ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ xfs_inode_fork_size(ip, whichfork));
+ return;
+}
+
+
+/*
+ * This is called when the amount of space needed for if_data
+ * is increased or decreased. The change in size is indicated by
+ * the number of bytes that need to be added or deleted in the
+ * byte_diff parameter.
+ *
+ * If the amount of space needed has decreased below the size of the
+ * inline buffer, then switch to using the inline buffer. Otherwise,
+ * use krealloc() to adjust the size of the buffer to what is needed.
+ *
+ * ip -- the inode whose if_data area is changing
+ * byte_diff -- the change in the number of bytes, positive or negative,
+ * requested for the if_data array.
+ */
+void
+xfs_idata_realloc(
+ struct xfs_inode *ip,
+ int64_t byte_diff,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ int64_t new_size = ifp->if_bytes + byte_diff;
+
+ ASSERT(new_size >= 0);
+ ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork));
+
+ if (byte_diff == 0)
+ return;
+
+ if (new_size == 0) {
+ kmem_free(ifp->if_u1.if_data);
+ ifp->if_u1.if_data = NULL;
+ ifp->if_bytes = 0;
+ return;
+ }
+
+ ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size,
+ GFP_NOFS | __GFP_NOFAIL);
+ ifp->if_bytes = new_size;
+}
+
+void
+xfs_idestroy_fork(
+ struct xfs_ifork *ifp)
+{
+ if (ifp->if_broot != NULL) {
+ kmem_free(ifp->if_broot);
+ ifp->if_broot = NULL;
+ }
+
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ kmem_free(ifp->if_u1.if_data);
+ ifp->if_u1.if_data = NULL;
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ if (ifp->if_height)
+ xfs_iext_destroy(ifp);
+ break;
+ }
+}
+
+/*
+ * Convert in-core extents to on-disk form
+ *
+ * In the case of the data fork, the in-core and on-disk fork sizes can be
+ * different due to delayed allocation extents. We only copy on-disk extents
+ * here, so callers must always use the physical fork size to determine the
+ * size of the buffer passed to this routine. We will return the size actually
+ * used.
+ */
+int
+xfs_iextents_copy(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_rec *dp,
+ int whichfork)
+{
+ int state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec rec;
+ int64_t copied = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
+ ASSERT(ifp->if_bytes > 0);
+
+ for_each_xfs_iext(ifp, &icur, &rec) {
+ if (isnullstartblock(rec.br_startblock))
+ continue;
+ ASSERT(xfs_bmap_validate_extent(ip, whichfork, &rec) == NULL);
+ xfs_bmbt_disk_set_all(dp, &rec);
+ trace_xfs_write_extent(ip, &icur, state, _RET_IP_);
+ copied += sizeof(struct xfs_bmbt_rec);
+ dp++;
+ }
+
+ ASSERT(copied > 0);
+ ASSERT(copied <= ifp->if_bytes);
+ return copied;
+}
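+
+/*
+ * Illustrative example (not part of the original change): if a data fork
+ * holds three incore extents of which one is a delayed allocation, the
+ * loop above skips the delalloc record and copies the other two, so the
+ * function returns 2 * sizeof(struct xfs_bmbt_rec) even though
+ * ifp->if_bytes still accounts for all three incore records.
+ */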
+
+/*
+ * Each of the following cases stores data into the same region
+ * of the on-disk inode, so only one of them can be valid at
+ * any given time. While it is possible to have conflicting formats
+ * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
+ * in EXTENTS format, this can only happen when the fork has
+ * changed formats after being modified but before being flushed.
+ * In these cases, the format always takes precedence, because the
+ * format indicates the current state of the fork.
+ */
+void
+xfs_iflush_fork(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ struct xfs_inode_log_item *iip,
+ int whichfork)
+{
+ char *cp;
+ struct xfs_ifork *ifp;
+ xfs_mount_t *mp;
+ static const short brootflag[2] =
+ { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
+ static const short dataflag[2] =
+ { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
+ static const short extflag[2] =
+ { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
+
+ if (!iip)
+ return;
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ /*
+ * This can happen if we gave up in iformat in an error path,
+ * for the attribute fork.
+ */
+ if (!ifp) {
+ ASSERT(whichfork == XFS_ATTR_FORK);
+ return;
+ }
+ cp = XFS_DFORK_PTR(dip, whichfork);
+ mp = ip->i_mount;
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ if ((iip->ili_fields & dataflag[whichfork]) &&
+ (ifp->if_bytes > 0)) {
+ ASSERT(ifp->if_u1.if_data != NULL);
+ ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork));
+ memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+ }
+ break;
+
+ case XFS_DINODE_FMT_EXTENTS:
+ if ((iip->ili_fields & extflag[whichfork]) &&
+ (ifp->if_bytes > 0)) {
+ ASSERT(ifp->if_nextents > 0);
+ (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
+ whichfork);
+ }
+ break;
+
+ case XFS_DINODE_FMT_BTREE:
+ if ((iip->ili_fields & brootflag[whichfork]) &&
+ (ifp->if_broot_bytes > 0)) {
+ ASSERT(ifp->if_broot != NULL);
+ ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ xfs_inode_fork_size(ip, whichfork));
+ xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
+ (xfs_bmdr_block_t *)cp,
+ XFS_DFORK_SIZE(dip, mp, whichfork));
+ }
+ break;
+
+ case XFS_DINODE_FMT_DEV:
+ if (iip->ili_fields & XFS_ILOG_DEV) {
+ ASSERT(whichfork == XFS_DATA_FORK);
+ xfs_dinode_put_rdev(dip,
+ linux_to_xfs_dev_t(VFS_I(ip)->i_rdev));
+ }
+ break;
+
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+
+/* Convert bmap state flags to an inode fork. */
+struct xfs_ifork *
+xfs_iext_state_to_fork(
+ struct xfs_inode *ip,
+ int state)
+{
+ if (state & BMAP_COWFORK)
+ return ip->i_cowfp;
+ else if (state & BMAP_ATTRFORK)
+ return &ip->i_af;
+ return &ip->i_df;
+}
+
+/*
+ * Initialize an inode's copy-on-write fork.
+ */
+void
+xfs_ifork_init_cow(
+ struct xfs_inode *ip)
+{
+ if (ip->i_cowfp)
+ return;
+
+ ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS;
+}
+
+/* Verify the inline contents of the data fork of an inode. */
+int
+xfs_ifork_verify_local_data(
+ struct xfs_inode *ip)
+{
+ xfs_failaddr_t fa = NULL;
+
+ switch (VFS_I(ip)->i_mode & S_IFMT) {
+ case S_IFDIR:
+ fa = xfs_dir2_sf_verify(ip);
+ break;
+ case S_IFLNK:
+ fa = xfs_symlink_shortform_verify(ip);
+ break;
+ default:
+ break;
+ }
+
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
+ ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/* Verify the inline contents of the attr fork of an inode. */
+int
+xfs_ifork_verify_local_attr(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *ifp = &ip->i_af;
+ xfs_failaddr_t fa;
+
+ if (!xfs_inode_has_attr_fork(ip))
+ fa = __this_address;
+ else
+ fa = xfs_attr_shortform_verify(ip);
+
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
+ ifp->if_u1.if_data, ifp->if_bytes, fa);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+int
+xfs_iext_count_may_overflow(
+ struct xfs_inode *ip,
+ int whichfork,
+ int nr_to_add)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ uint64_t max_exts;
+ uint64_t nr_exts;
+
+ if (whichfork == XFS_COW_FORK)
+ return 0;
+
+ max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip),
+ whichfork);
+
+ if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+ max_exts = 10;
+
+ nr_exts = ifp->if_nextents + nr_to_add;
+ if (nr_exts < ifp->if_nextents || nr_exts > max_exts)
+ return -EFBIG;
+
+ return 0;
+}
+
+/*
+ * Upgrade this inode's extent counter fields to be able to handle a potential
+ * increase in the extent count by nr_to_add. Normally this is the same
+ * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG.
+ */
+int
+xfs_iext_count_upgrade(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ uint nr_to_add)
+{
+ ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR);
+
+ if (!xfs_has_large_extent_counts(ip->i_mount) ||
+ xfs_inode_has_large_extent_counts(ip) ||
+ XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+ return -EFBIG;
+
+ ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ return 0;
+}
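+
+/*
+ * Typical caller pattern (an illustrative sketch, not code added by this
+ * change): before an operation that may add extents, check for overflow
+ * and, if that fails, try to upgrade to the large extent counter format:
+ *
+ *	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ *			XFS_IEXT_PUNCH_HOLE_CNT);
+ *	if (error == -EFBIG)
+ *		error = xfs_iext_count_upgrade(tp, ip,
+ *				XFS_IEXT_PUNCH_HOLE_CNT);
+ *	if (error)
+ *		return error;
+ */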
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
new file mode 100644
index 000000000..d3943d6ad
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -0,0 +1,268 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_INODE_FORK_H__
+#define __XFS_INODE_FORK_H__
+
+struct xfs_inode_log_item;
+struct xfs_dinode;
+
+/*
+ * File incore extent information, present for each of data & attr forks.
+ */
+struct xfs_ifork {
+ int64_t if_bytes; /* bytes in if_u1 */
+ struct xfs_btree_block *if_broot; /* file's incore btree root */
+ unsigned int if_seq; /* fork mod counter */
+ int if_height; /* height of the extent tree */
+ union {
+ void *if_root; /* extent tree root */
+ char *if_data; /* inline file data */
+ } if_u1;
+ xfs_extnum_t if_nextents; /* # of extents in this fork */
+ short if_broot_bytes; /* bytes allocated for root */
+ int8_t if_format; /* format of this fork */
+};
+
+/*
+ * Worst-case increase in the fork extent count when we're adding a single
+ * extent to a fork and there's no possibility of splitting an existing mapping.
+ */
+#define XFS_IEXT_ADD_NOSPLIT_CNT (1)
+
+/*
+ * Punching out an extent from the middle of an existing extent can cause the
+ * extent count to increase by 1.
+ * i.e. | Old extent | Hole | Old extent |
+ */
+#define XFS_IEXT_PUNCH_HOLE_CNT (1)
+
+/*
+ * Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to
+ * be added. One extra extent for dabtree in case a local attr is
+ * large enough to cause a double split. It can also cause extent
+ * count to increase proportional to the size of a remote xattr's
+ * value.
+ */
+#define XFS_IEXT_ATTR_MANIP_CNT(rmt_blks) \
+ (XFS_DA_NODE_MAXDEPTH + max(1, rmt_blks))
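+
+/*
+ * Worked example (illustrative): manipulating an xattr whose remote value
+ * spans two blocks reserves XFS_IEXT_ATTR_MANIP_CNT(2), i.e. room for
+ * XFS_DA_NODE_MAXDEPTH dabtree extents plus max(1, 2) == 2 extents for
+ * the remote value blocks.
+ */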
+
+/*
+ * A write to a sub-interval of an existing unwritten extent causes the original
+ * extent to be split into 3 extents
+ * i.e. | Unwritten | Real | Unwritten |
+ * Hence extent count can increase by 2.
+ */
+#define XFS_IEXT_WRITE_UNWRITTEN_CNT (2)
+
+
+/*
+ * Moving an extent to data fork can cause a sub-interval of an existing extent
+ * to be unmapped. This will increase extent count by 1. Mapping in the new
+ * extent can increase the extent count by 1 again i.e.
+ * | Old extent | New extent | Old extent |
+ * Hence number of extents increases by 2.
+ */
+#define XFS_IEXT_REFLINK_END_COW_CNT (2)
+
+/*
+ * Removing an initial range of source/donor file's extent and adding a new
+ * extent (from donor/source file) in its place will cause extent count to
+ * increase by 1.
+ */
+#define XFS_IEXT_SWAP_RMAP_CNT (1)
+
+/*
+ * Fork handling.
+ */
+#define XFS_IFORK_MAXEXT(ip, w) \
+ (xfs_inode_fork_size(ip, w) / sizeof(xfs_bmbt_rec_t))
+
+static inline bool xfs_ifork_has_extents(struct xfs_ifork *ifp)
+{
+ return ifp->if_format == XFS_DINODE_FMT_EXTENTS ||
+ ifp->if_format == XFS_DINODE_FMT_BTREE;
+}
+
+static inline xfs_extnum_t xfs_ifork_nextents(struct xfs_ifork *ifp)
+{
+ if (!ifp)
+ return 0;
+ return ifp->if_nextents;
+}
+
+static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp)
+{
+ if (!ifp)
+ return XFS_DINODE_FMT_EXTENTS;
+ return ifp->if_format;
+}
+
+static inline xfs_extnum_t xfs_iext_max_nextents(bool has_large_extent_counts,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ case XFS_COW_FORK:
+ if (has_large_extent_counts)
+ return XFS_MAX_EXTCNT_DATA_FORK_LARGE;
+ return XFS_MAX_EXTCNT_DATA_FORK_SMALL;
+
+ case XFS_ATTR_FORK:
+ if (has_large_extent_counts)
+ return XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+ return XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
+ default:
+ ASSERT(0);
+ return 0;
+ }
+}
+
+static inline xfs_extnum_t
+xfs_dfork_data_extents(
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ return be64_to_cpu(dip->di_big_nextents);
+
+ return be32_to_cpu(dip->di_nextents);
+}
+
+static inline xfs_extnum_t
+xfs_dfork_attr_extents(
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ return be32_to_cpu(dip->di_big_anextents);
+
+ return be16_to_cpu(dip->di_anextents);
+}
+
+static inline xfs_extnum_t
+xfs_dfork_nextents(
+ struct xfs_dinode *dip,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ return xfs_dfork_data_extents(dip);
+ case XFS_ATTR_FORK:
+ return xfs_dfork_attr_extents(dip);
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ return 0;
+}
+
+void xfs_ifork_zap_attr(struct xfs_inode *ip);
+void xfs_ifork_init_attr(struct xfs_inode *ip, enum xfs_dinode_fmt format,
+ xfs_extnum_t nextents);
+struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
+
+int xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *);
+int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *);
+void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
+ struct xfs_inode_log_item *, int);
+void xfs_idestroy_fork(struct xfs_ifork *ifp);
+void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
+ int whichfork);
+void xfs_iroot_realloc(struct xfs_inode *, int, int);
+int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
+int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
+ int);
+void xfs_init_local_fork(struct xfs_inode *ip, int whichfork,
+ const void *data, int64_t size);
+
+xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp);
+void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *, int);
+void xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *,
+ int);
+void xfs_iext_destroy(struct xfs_ifork *);
+
+bool xfs_iext_lookup_extent(struct xfs_inode *ip,
+ struct xfs_ifork *ifp, xfs_fileoff_t bno,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp);
+bool xfs_iext_lookup_extent_before(struct xfs_inode *ip,
+ struct xfs_ifork *ifp, xfs_fileoff_t *end,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp);
+bool xfs_iext_get_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp);
+void xfs_iext_update_extent(struct xfs_inode *ip, int state,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp);
+
+void xfs_iext_first(struct xfs_ifork *, struct xfs_iext_cursor *);
+void xfs_iext_last(struct xfs_ifork *, struct xfs_iext_cursor *);
+void xfs_iext_next(struct xfs_ifork *, struct xfs_iext_cursor *);
+void xfs_iext_prev(struct xfs_ifork *, struct xfs_iext_cursor *);
+
+static inline bool xfs_iext_next_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ xfs_iext_next(ifp, cur);
+ return xfs_iext_get_extent(ifp, cur, gotp);
+}
+
+static inline bool xfs_iext_prev_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ xfs_iext_prev(ifp, cur);
+ return xfs_iext_get_extent(ifp, cur, gotp);
+}
+
+/*
+ * Return the extent after cur in gotp without updating the cursor.
+ */
+static inline bool xfs_iext_peek_next_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ struct xfs_iext_cursor ncur = *cur;
+
+ xfs_iext_next(ifp, &ncur);
+ return xfs_iext_get_extent(ifp, &ncur, gotp);
+}
+
+/*
+ * Return the extent before cur in gotp without updating the cursor.
+ */
+static inline bool xfs_iext_peek_prev_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ struct xfs_iext_cursor ncur = *cur;
+
+ xfs_iext_prev(ifp, &ncur);
+ return xfs_iext_get_extent(ifp, &ncur, gotp);
+}
+
+#define for_each_xfs_iext(ifp, ext, got) \
+ for (xfs_iext_first((ifp), (ext)); \
+ xfs_iext_get_extent((ifp), (ext), (got)); \
+ xfs_iext_next((ifp), (ext)))
+
+extern struct kmem_cache *xfs_ifork_cache;
+
+extern void xfs_ifork_init_cow(struct xfs_inode *ip);
+
+int xfs_ifork_verify_local_data(struct xfs_inode *ip);
+int xfs_ifork_verify_local_attr(struct xfs_inode *ip);
+int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
+ int nr_to_add);
+int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
+ uint nr_to_add);
+
+/* returns true if the fork has extents but they are not read in yet. */
+static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
+{
+ return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_height == 0;
+}
+
+#endif /* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
new file mode 100644
index 000000000..f13e0809d
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -0,0 +1,993 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_LOG_FORMAT_H__
+#define __XFS_LOG_FORMAT_H__
+
+struct xfs_mount;
+struct xfs_trans_res;
+
+/*
+ * On-disk Log Format definitions.
+ *
+ * This file contains all the on-disk format definitions used within the log. It
+ * includes the physical log structure itself, as well as all the log item
+ * format structures that are written into the log and interpreted by log
+ * recovery. We start with the physical log format definitions, and then work
+ * through all the log item definitions and everything they encode into the
+ * log.
+ */
+typedef uint32_t xlog_tid_t;
+
+#define XLOG_MIN_ICLOGS 2
+#define XLOG_MAX_ICLOGS 8
+#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */
+#define XLOG_VERSION_1 1
+#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */
+#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2)
+#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */
+#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */
+#define XLOG_MAX_RECORD_BSIZE (256*1024)
+#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */
+#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */
+#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */
+#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */
+
+#define XLOG_HEADER_SIZE 512
+
+/* Minimum number of transactions that must fit in the log (defined by mkfs) */
+#define XFS_MIN_LOG_FACTOR 3
+
+#define XLOG_REC_SHIFT(log) \
+ BTOBB(1 << (xfs_has_logv2(log->l_mp) ? \
+ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+#define XLOG_TOTAL_REC_SHIFT(log) \
+ BTOBB(XLOG_MAX_ICLOGS << (xfs_has_logv2(log->l_mp) ? \
+ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+
+/* get lsn fields */
+#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
+#define BLOCK_LSN(lsn) ((uint)(lsn))
+
+/* this is used in a spot where we might otherwise double-endian-flip */
+#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
+
+static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
+{
+ return ((xfs_lsn_t)cycle << 32) | block;
+}
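+
+/*
+ * Illustrative round-trip of the LSN helpers above (not part of the
+ * original change): an LSN packs a 32-bit cycle number into the high
+ * word and a 32-bit block number into the low word, so
+ *
+ *	xfs_lsn_t lsn = xlog_assign_lsn(5, 100);
+ *	CYCLE_LSN(lsn);	// == 5
+ *	BLOCK_LSN(lsn);	// == 100
+ */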
+
+static inline uint xlog_get_cycle(char *ptr)
+{
+ if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
+ return be32_to_cpu(*((__be32 *)ptr + 1));
+ else
+ return be32_to_cpu(*(__be32 *)ptr);
+}
+
+/* Log Clients */
+#define XFS_TRANSACTION 0x69
+#define XFS_LOG 0xaa
+
+#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
+
+/*
+ * Log item for unmount records.
+ *
+ * The unmount record used to have a string "Unmount filesystem--" in the
+ * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
+ * We just write the magic number now; see xfs_log_unmount_write.
+ */
+struct xfs_unmount_log_format {
+ uint16_t magic; /* XLOG_UNMOUNT_TYPE */
+ uint16_t pad1;
+ uint32_t pad2; /* may as well make it 64 bits */
+};
+
+/* Region types for iovec's i_type */
+#define XLOG_REG_TYPE_BFORMAT 1
+#define XLOG_REG_TYPE_BCHUNK 2
+#define XLOG_REG_TYPE_EFI_FORMAT 3
+#define XLOG_REG_TYPE_EFD_FORMAT 4
+#define XLOG_REG_TYPE_IFORMAT 5
+#define XLOG_REG_TYPE_ICORE 6
+#define XLOG_REG_TYPE_IEXT 7
+#define XLOG_REG_TYPE_IBROOT 8
+#define XLOG_REG_TYPE_ILOCAL 9
+#define XLOG_REG_TYPE_IATTR_EXT 10
+#define XLOG_REG_TYPE_IATTR_BROOT 11
+#define XLOG_REG_TYPE_IATTR_LOCAL 12
+#define XLOG_REG_TYPE_QFORMAT 13
+#define XLOG_REG_TYPE_DQUOT 14
+#define XLOG_REG_TYPE_QUOTAOFF 15
+#define XLOG_REG_TYPE_LRHEADER 16
+#define XLOG_REG_TYPE_UNMOUNT 17
+#define XLOG_REG_TYPE_COMMIT 18
+#define XLOG_REG_TYPE_TRANSHDR 19
+#define XLOG_REG_TYPE_ICREATE 20
+#define XLOG_REG_TYPE_RUI_FORMAT 21
+#define XLOG_REG_TYPE_RUD_FORMAT 22
+#define XLOG_REG_TYPE_CUI_FORMAT 23
+#define XLOG_REG_TYPE_CUD_FORMAT 24
+#define XLOG_REG_TYPE_BUI_FORMAT 25
+#define XLOG_REG_TYPE_BUD_FORMAT 26
+#define XLOG_REG_TYPE_ATTRI_FORMAT 27
+#define XLOG_REG_TYPE_ATTRD_FORMAT 28
+#define XLOG_REG_TYPE_ATTR_NAME 29
+#define XLOG_REG_TYPE_ATTR_VALUE 30
+#define XLOG_REG_TYPE_MAX 30
+
+
+/*
+ * Flags to log operation header
+ *
+ * The first write of a new transaction will be preceded with a start
+ * record, XLOG_START_TRANS. Once a transaction is committed, a commit
+ * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into
+ * the remainder of the current active in-core log, it is split up into
+ * multiple regions. Each partial region will be marked with a
+ * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
+ *
+ */
+#define XLOG_START_TRANS 0x01 /* Start a new transaction */
+#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */
+#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */
+#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */
+#define XLOG_END_TRANS 0x10 /* End a continued transaction */
+#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
+
+
+typedef struct xlog_op_header {
+ __be32 oh_tid; /* transaction id of operation : 4 b */
+ __be32 oh_len; /* bytes in data region : 4 b */
+ __u8 oh_clientid; /* who sent me this : 1 b */
+ __u8 oh_flags; /* : 1 b */
+ __u16 oh_res2; /* 32 bit align : 2 b */
+} xlog_op_header_t;
+
+/* valid values for h_fmt */
+#define XLOG_FMT_UNKNOWN 0
+#define XLOG_FMT_LINUX_LE 1
+#define XLOG_FMT_LINUX_BE 2
+#define XLOG_FMT_IRIX_BE 3
+
+/* our fmt */
+#ifdef XFS_NATIVE_HOST
+#define XLOG_FMT XLOG_FMT_LINUX_BE
+#else
+#define XLOG_FMT XLOG_FMT_LINUX_LE
+#endif
+
+typedef struct xlog_rec_header {
+ __be32 h_magicno; /* log record (LR) identifier : 4 */
+ __be32 h_cycle; /* write cycle of log : 4 */
+ __be32 h_version; /* LR version : 4 */
+ __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
+ __be64 h_lsn; /* lsn of this LR : 8 */
+ __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
+ __le32 h_crc; /* crc of log record : 4 */
+ __be32 h_prev_block; /* block number to previous LR : 4 */
+ __be32 h_num_logops; /* number of log operations in this LR : 4 */
+ __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
+ /* new fields */
+ __be32 h_fmt; /* format of log record : 4 */
+ uuid_t h_fs_uuid; /* uuid of FS : 16 */
+ __be32 h_size; /* iclog size : 4 */
+} xlog_rec_header_t;
+
+typedef struct xlog_rec_ext_header {
+ __be32 xh_cycle; /* write cycle of log : 4 */
+ __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
+} xlog_rec_ext_header_t;
+
+/*
+ * Quite misnamed, because this union lays out the actual on-disk log buffer.
+ */
+typedef union xlog_in_core2 {
+ xlog_rec_header_t hic_header;
+ xlog_rec_ext_header_t hic_xheader;
+ char hic_sector[XLOG_HEADER_SIZE];
+} xlog_in_core_2_t;
+
+/* not an on-disk structure, but needed by log recovery in userspace */
+typedef struct xfs_log_iovec {
+ void *i_addr; /* beginning address of region */
+ int i_len; /* length in bytes of region */
+ uint i_type; /* type of region */
+} xfs_log_iovec_t;
+
+
+/*
+ * Transaction Header definitions.
+ *
+ * This is the structure written in the log at the head of every transaction. It
+ * identifies the type and id of the transaction, and contains the number of
+ * items logged by the transaction so we know how many to expect during
+ * recovery.
+ *
+ * Do not change the below structure without redoing the code in
+ * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
+ */
+typedef struct xfs_trans_header {
+ uint th_magic; /* magic number */
+ uint th_type; /* transaction type */
+ int32_t th_tid; /* transaction id (unused) */
+ uint th_num_items; /* num items logged by trans */
+} xfs_trans_header_t;
+
+#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
+
+/*
+ * The only type valid for th_type in CIL-enabled file system logs:
+ */
+#define XFS_TRANS_CHECKPOINT 40
+
+/*
+ * Log item types.
+ */
+#define XFS_LI_EFI 0x1236
+#define XFS_LI_EFD 0x1237
+#define XFS_LI_IUNLINK 0x1238
+#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */
+#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
+#define XFS_LI_DQUOT 0x123d
+#define XFS_LI_QUOTAOFF 0x123e
+#define XFS_LI_ICREATE 0x123f
+#define XFS_LI_RUI 0x1240 /* rmap update intent */
+#define XFS_LI_RUD 0x1241
+#define XFS_LI_CUI 0x1242 /* refcount update intent */
+#define XFS_LI_CUD 0x1243
+#define XFS_LI_BUI 0x1244 /* bmbt update intent */
+#define XFS_LI_BUD 0x1245
+#define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/
+#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */
+
+#define XFS_LI_TYPE_DESC \
+ { XFS_LI_EFI, "XFS_LI_EFI" }, \
+ { XFS_LI_EFD, "XFS_LI_EFD" }, \
+ { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \
+ { XFS_LI_INODE, "XFS_LI_INODE" }, \
+ { XFS_LI_BUF, "XFS_LI_BUF" }, \
+ { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
+ { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \
+ { XFS_LI_ICREATE, "XFS_LI_ICREATE" }, \
+ { XFS_LI_RUI, "XFS_LI_RUI" }, \
+ { XFS_LI_RUD, "XFS_LI_RUD" }, \
+ { XFS_LI_CUI, "XFS_LI_CUI" }, \
+ { XFS_LI_CUD, "XFS_LI_CUD" }, \
+ { XFS_LI_BUI, "XFS_LI_BUI" }, \
+ { XFS_LI_BUD, "XFS_LI_BUD" }, \
+ { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \
+ { XFS_LI_ATTRD, "XFS_LI_ATTRD" }
+
+/*
+ * Inode Log Item Format definitions.
+ *
+ * This is the structure used to lay out an inode log item in the
+ * log. The size of the inline data/extents/b-tree root to be logged
+ * (if any) is indicated in the ilf_dsize field. Changes to this structure
+ * must be added on to the end.
+ */
+struct xfs_inode_log_format {
+ uint16_t ilf_type; /* inode log item type */
+ uint16_t ilf_size; /* size of this item */
+ uint32_t ilf_fields; /* flags for fields logged */
+ uint16_t ilf_asize; /* size of attr d/ext/root */
+ uint16_t ilf_dsize; /* size of data/ext/root */
+ uint32_t ilf_pad; /* pad for 64 bit boundary */
+ uint64_t ilf_ino; /* inode number */
+ union {
+ uint32_t ilfu_rdev; /* rdev value for dev inode*/
+ uint8_t __pad[16]; /* unused */
+ } ilf_u;
+ int64_t ilf_blkno; /* blkno of inode buffer */
+ int32_t ilf_len; /* len of inode buffer */
+ int32_t ilf_boffset; /* off of inode in buffer */
+};
+
+/*
+ * Old 32 bit systems will log in this format without the 64 bit
+ * alignment padding. Recovery will detect this and convert it to the
+ * correct format.
+ */
+struct xfs_inode_log_format_32 {
+ uint16_t ilf_type; /* inode log item type */
+ uint16_t ilf_size; /* size of this item */
+ uint32_t ilf_fields; /* flags for fields logged */
+ uint16_t ilf_asize; /* size of attr d/ext/root */
+ uint16_t ilf_dsize; /* size of data/ext/root */
+ uint64_t ilf_ino; /* inode number */
+ union {
+ uint32_t ilfu_rdev; /* rdev value for dev inode*/
+ uint8_t __pad[16]; /* unused */
+ } ilf_u;
+ int64_t ilf_blkno; /* blkno of inode buffer */
+ int32_t ilf_len; /* len of inode buffer */
+ int32_t ilf_boffset; /* off of inode in buffer */
+} __attribute__((packed));
+
+
+/*
+ * Flags for xfs_trans_log_inode flags field.
+ */
+#define XFS_ILOG_CORE 0x001 /* log standard inode fields */
+#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */
+#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */
+#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */
+#define XFS_ILOG_DEV 0x010 /* log the dev field */
+#define XFS_ILOG_UUID 0x020 /* added long ago, but never used */
+#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
+#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
+#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
+#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */
+#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */
+
+
+/*
+ * The timestamps are dirty, but not necessarily anything else in the inode
+ * core. Unlike the other fields above, this one must never make it to disk
+ * in the ilf_fields of the inode_log_format, but is purely stored in memory in
+ * ili_fields in the inode_log_item.
+ */
+#define XFS_ILOG_TIMESTAMP 0x4000
+
+#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
+ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+ XFS_ILOG_ABROOT | XFS_ILOG_DOWNER | \
+ XFS_ILOG_AOWNER)
+
+#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+ XFS_ILOG_DBROOT)
+
+#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+ XFS_ILOG_ABROOT)
+
+#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
+ XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
+ XFS_ILOG_DEV | XFS_ILOG_ADATA | \
+ XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
+ XFS_ILOG_TIMESTAMP | XFS_ILOG_DOWNER | \
+ XFS_ILOG_AOWNER)
+
+static inline int xfs_ilog_fbroot(int w)
+{
+ return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
+}
+
+static inline int xfs_ilog_fext(int w)
+{
+ return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
+}
+
+static inline int xfs_ilog_fdata(int w)
+{
+ return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
+}
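+
+/*
+ * Illustration only: a caller logging the inode core plus the data fork of
+ * an extents-format inode combines the flags above, using the helpers to
+ * pick the fork-specific bit:
+ *
+ *    int flags = XFS_ILOG_CORE | xfs_ilog_fext(XFS_DATA_FORK);
+ *
+ * which evaluates to XFS_ILOG_CORE | XFS_ILOG_DEXT.
+ */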
+
+/*
+ * Incore version of the on-disk inode core structures. We log this directly
+ * into the journal in host CPU format (for better or worse), and as such it
+ * directly mirrors the xfs_dinode structure, since it must contain all the
+ * same information.
+ */
+typedef uint64_t xfs_log_timestamp_t;
+
+/* Legacy timestamp encoding format. */
+struct xfs_log_legacy_timestamp {
+ int32_t t_sec; /* timestamp seconds */
+ int32_t t_nsec; /* timestamp nanoseconds */
+};
+
+/*
+ * Define the format of the inode core that is logged. This structure must be
+ * kept identical to struct xfs_dinode except for the endianness annotations.
+ */
+struct xfs_log_dinode {
+ uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
+ uint16_t di_mode; /* mode and type of file */
+ int8_t di_version; /* inode version */
+ int8_t di_format; /* format of di_c data */
+ uint8_t di_pad3[2]; /* unused in v2/3 inodes */
+ uint32_t di_uid; /* owner's user id */
+ uint32_t di_gid; /* owner's group id */
+ uint32_t di_nlink; /* number of links to file */
+ uint16_t di_projid_lo; /* lower part of owner's project id */
+ uint16_t di_projid_hi; /* higher part of owner's project id */
+ union {
+ /* Number of data fork extents if NREXT64 is set */
+ uint64_t di_big_nextents;
+
+ /* Padding for V3 inodes without NREXT64 set. */
+ uint64_t di_v3_pad;
+
+ /* Padding and inode flush counter for V2 inodes. */
+ struct {
+ uint8_t di_v2_pad[6]; /* V2 inode zeroed space */
+ uint16_t di_flushiter; /* V2 inode incremented on flush */
+ };
+ };
+ xfs_log_timestamp_t di_atime; /* time last accessed */
+ xfs_log_timestamp_t di_mtime; /* time last modified */
+ xfs_log_timestamp_t di_ctime; /* time created/inode modified */
+ xfs_fsize_t di_size; /* number of bytes in file */
+ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
+ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
+ union {
+ /*
+ * For V2 inodes and V3 inodes without NREXT64 set, this
+ * is the number of data and attr fork extents.
+ */
+ struct {
+ uint32_t di_nextents;
+ uint16_t di_anextents;
+ } __packed;
+
+ /* Number of attr fork extents if NREXT64 is set. */
+ struct {
+ uint32_t di_big_anextents;
+ uint16_t di_nrext64_pad;
+ } __packed;
+ } __packed;
+ uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
+ int8_t di_aformat; /* format of attr fork's data */
+ uint32_t di_dmevmask; /* DMIG event mask */
+ uint16_t di_dmstate; /* DMIG state info */
+ uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
+ uint32_t di_gen; /* generation number */
+
+ /* di_next_unlinked is the only non-core field in the old dinode */
+ xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */
+
+ /* start of the extended dinode, writable fields */
+ uint32_t di_crc; /* CRC of the inode */
+ uint64_t di_changecount; /* number of attribute changes */
+
+ /*
+ * The LSN we write to this field during formatting is not a reflection
+ * of the current on-disk LSN. It should never be used for recovery
+ * sequencing, nor should it be recovered into the on-disk inode at all.
+ * See xlog_recover_inode_commit_pass2() and xfs_log_dinode_to_disk()
+ * for details.
+ */
+ xfs_lsn_t di_lsn;
+
+ uint64_t di_flags2; /* more random flags */
+ uint32_t di_cowextsize; /* basic cow extent size for file */
+ uint8_t di_pad2[12]; /* more padding for future expansion */
+
+ /* fields only written to during inode creation */
+ xfs_log_timestamp_t di_crtime; /* time created */
+ xfs_ino_t di_ino; /* inode number */
+ uuid_t di_uuid; /* UUID of the filesystem */
+
+ /* structure must be padded to 64 bit alignment */
+};
+
+#define xfs_log_dinode_size(mp) \
+ (xfs_has_v3inodes((mp)) ? \
+ sizeof(struct xfs_log_dinode) : \
+ offsetof(struct xfs_log_dinode, di_next_unlinked))
+
+/*
+ * Buffer Log Format definitions
+ *
+ * These are the physical dirty bitmap definitions for the log format structure.
+ */
+#define XFS_BLF_CHUNK 128
+#define XFS_BLF_SHIFT 7
+#define BIT_TO_WORD_SHIFT 5
+#define NBWORD (NBBY * sizeof(unsigned int))
+
+/*
+ * This flag indicates that the buffer contains on disk inodes
+ * and requires special recovery handling.
+ */
+#define XFS_BLF_INODE_BUF (1<<0)
+
+/*
+ * This flag indicates that the buffer should not be replayed
+ * during recovery because its blocks are being freed.
+ */
+#define XFS_BLF_CANCEL (1<<1)
+
+/*
+ * This flag indicates that the buffer contains on-disk
+ * user, project, or group dquots and may require special recovery handling.
+ */
+#define XFS_BLF_UDQUOT_BUF (1<<2)
+#define XFS_BLF_PDQUOT_BUF (1<<3)
+#define XFS_BLF_GDQUOT_BUF (1<<4)
+
+/*
+ * This is the structure used to lay out a buf log item in the log. The data
+ * map describes which 128 byte chunks of the buffer have been logged.
+ *
+ * The placement of blf_map_size causes blf_data_map to start at an odd
+ * multiple of sizeof(unsigned int) offset within the struct. Because the data
+ * bitmap size will always be an even number, the end of the data_map (and
+ * therefore the structure) will also be at an odd multiple of sizeof(unsigned
+ * int). Some 64-bit compilers will insert padding at the end of the struct to
+ * ensure 64-bit alignment of blf_blkno, but 32-bit ones will not. Therefore,
+ * XFS_BLF_DATAMAP_SIZE must be an odd number to make the padding explicit and
+ * keep the structure size consistent between 32-bit and 64-bit platforms.
+ */
+#define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
+#define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1)
+
+typedef struct xfs_buf_log_format {
+ unsigned short blf_type; /* buf log item type indicator */
+ unsigned short blf_size; /* size of this item */
+ unsigned short blf_flags; /* misc state */
+ unsigned short blf_len; /* number of blocks in this buf */
+ int64_t blf_blkno; /* starting blkno of this buf */
+ unsigned int blf_map_size; /* used size of data bitmap in words */
+ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
+} xfs_buf_log_format_t;
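+
+/*
+ * Illustration only: each bit in blf_data_map covers one XFS_BLF_CHUNK
+ * (128 byte) region of the buffer, so a dirty byte offset maps to a bitmap
+ * position roughly like this:
+ *
+ *    unsigned int bit  = offset >> XFS_BLF_SHIFT;     // 128-byte chunk index
+ *    unsigned int word = bit >> BIT_TO_WORD_SHIFT;    // word in blf_data_map
+ *
+ *    blf->blf_data_map[word] |= 1U << (bit & (NBWORD - 1));
+ */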
+
+/*
+ * All buffers now need to tell recovery where the magic number
+ * is so that it can verify and calculate the CRCs on the buffer correctly
+ * once the changes have been replayed into the buffer.
+ *
+ * The type value is held in the upper 5 bits of the blf_flags field, which is
+ * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
+ */
+#define XFS_BLFT_BITS 5
+#define XFS_BLFT_SHIFT 11
+#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
+
+enum xfs_blft {
+ XFS_BLFT_UNKNOWN_BUF = 0,
+ XFS_BLFT_UDQUOT_BUF,
+ XFS_BLFT_PDQUOT_BUF,
+ XFS_BLFT_GDQUOT_BUF,
+ XFS_BLFT_BTREE_BUF,
+ XFS_BLFT_AGF_BUF,
+ XFS_BLFT_AGFL_BUF,
+ XFS_BLFT_AGI_BUF,
+ XFS_BLFT_DINO_BUF,
+ XFS_BLFT_SYMLINK_BUF,
+ XFS_BLFT_DIR_BLOCK_BUF,
+ XFS_BLFT_DIR_DATA_BUF,
+ XFS_BLFT_DIR_FREE_BUF,
+ XFS_BLFT_DIR_LEAF1_BUF,
+ XFS_BLFT_DIR_LEAFN_BUF,
+ XFS_BLFT_DA_NODE_BUF,
+ XFS_BLFT_ATTR_LEAF_BUF,
+ XFS_BLFT_ATTR_RMT_BUF,
+ XFS_BLFT_SB_BUF,
+ XFS_BLFT_RTBITMAP_BUF,
+ XFS_BLFT_RTSUMMARY_BUF,
+ XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
+};
+
+static inline void
+xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
+{
+ ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
+ blf->blf_flags &= ~XFS_BLFT_MASK;
+ blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
+}
+
+static inline uint16_t
+xfs_blft_from_flags(struct xfs_buf_log_format *blf)
+{
+ return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
+}
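+
+/*
+ * Illustration only: tagging a buffer that holds on-disk inodes and reading
+ * the type back out with the helpers above:
+ *
+ *    struct xfs_buf_log_format blf = { .blf_type = XFS_LI_BUF };
+ *
+ *    xfs_blft_to_flags(&blf, XFS_BLFT_DINO_BUF);
+ *    // xfs_blft_from_flags(&blf) now returns XFS_BLFT_DINO_BUF
+ */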
+
+/*
+ * EFI/EFD log format definitions
+ */
+typedef struct xfs_extent {
+ xfs_fsblock_t ext_start;
+ xfs_extlen_t ext_len;
+} xfs_extent_t;
+
+/*
+ * Since an xfs_extent_t has a 64 bit start and a 32 bit length,
+ * its alignment differs between 32 bit and 64 bit kernels,
+ * so we provide both variants for use by a
+ * conversion routine.
+ */
+typedef struct xfs_extent_32 {
+ uint64_t ext_start;
+ uint32_t ext_len;
+} __attribute__((packed)) xfs_extent_32_t;
+
+typedef struct xfs_extent_64 {
+ uint64_t ext_start;
+ uint32_t ext_len;
+ uint32_t ext_pad;
+} xfs_extent_64_t;
+
+/*
+ * This is the structure used to lay out an efi log item in the
+ * log. The efi_extents field is a variable size array whose
+ * size is given by efi_nextents.
+ */
+typedef struct xfs_efi_log_format {
+ uint16_t efi_type; /* efi log item type */
+ uint16_t efi_size; /* size of this item */
+ uint32_t efi_nextents; /* # extents to free */
+ uint64_t efi_id; /* efi identifier */
+ xfs_extent_t efi_extents[]; /* array of extents to free */
+} xfs_efi_log_format_t;
+
+static inline size_t
+xfs_efi_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efi_log_format) +
+ nr * sizeof(struct xfs_extent);
+}
+
+typedef struct xfs_efi_log_format_32 {
+ uint16_t efi_type; /* efi log item type */
+ uint16_t efi_size; /* size of this item */
+ uint32_t efi_nextents; /* # extents to free */
+ uint64_t efi_id; /* efi identifier */
+ xfs_extent_32_t efi_extents[]; /* array of extents to free */
+} __attribute__((packed)) xfs_efi_log_format_32_t;
+
+static inline size_t
+xfs_efi_log_format32_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efi_log_format_32) +
+ nr * sizeof(struct xfs_extent_32);
+}
+
+typedef struct xfs_efi_log_format_64 {
+ uint16_t efi_type; /* efi log item type */
+ uint16_t efi_size; /* size of this item */
+ uint32_t efi_nextents; /* # extents to free */
+ uint64_t efi_id; /* efi identifier */
+ xfs_extent_64_t efi_extents[]; /* array of extents to free */
+} xfs_efi_log_format_64_t;
+
+static inline size_t
+xfs_efi_log_format64_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efi_log_format_64) +
+ nr * sizeof(struct xfs_extent_64);
+}
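+
+/*
+ * Illustration only: the EFI format structures end in a flexible array, so a
+ * region holding an EFI with 'nr' extents is sized with the helpers above
+ * rather than with sizeof() alone, e.g.:
+ *
+ *    size_t len = xfs_efi_log_format_sizeof(4);
+ *    // == sizeof(struct xfs_efi_log_format) + 4 * sizeof(struct xfs_extent)
+ */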
+
+/*
+ * This is the structure used to lay out an efd log item in the
+ * log. The efd_extents array is a variable size array whose
+ * size is given by efd_nextents.
+ */
+typedef struct xfs_efd_log_format {
+ uint16_t efd_type; /* efd log item type */
+ uint16_t efd_size; /* size of this item */
+ uint32_t efd_nextents; /* # of extents freed */
+ uint64_t efd_efi_id; /* id of corresponding efi */
+ xfs_extent_t efd_extents[]; /* array of extents freed */
+} xfs_efd_log_format_t;
+
+static inline size_t
+xfs_efd_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efd_log_format) +
+ nr * sizeof(struct xfs_extent);
+}
+
+typedef struct xfs_efd_log_format_32 {
+ uint16_t efd_type; /* efd log item type */
+ uint16_t efd_size; /* size of this item */
+ uint32_t efd_nextents; /* # of extents freed */
+ uint64_t efd_efi_id; /* id of corresponding efi */
+ xfs_extent_32_t efd_extents[]; /* array of extents freed */
+} __attribute__((packed)) xfs_efd_log_format_32_t;
+
+static inline size_t
+xfs_efd_log_format32_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efd_log_format_32) +
+ nr * sizeof(struct xfs_extent_32);
+}
+
+typedef struct xfs_efd_log_format_64 {
+ uint16_t efd_type; /* efd log item type */
+ uint16_t efd_size; /* size of this item */
+ uint32_t efd_nextents; /* # of extents freed */
+ uint64_t efd_efi_id; /* id of corresponding efi */
+ xfs_extent_64_t efd_extents[]; /* array of extents freed */
+} xfs_efd_log_format_64_t;
+
+static inline size_t
+xfs_efd_log_format64_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efd_log_format_64) +
+ nr * sizeof(struct xfs_extent_64);
+}
+
+/*
+ * RUI/RUD (reverse mapping) log format definitions
+ */
+struct xfs_map_extent {
+ uint64_t me_owner;
+ uint64_t me_startblock;
+ uint64_t me_startoff;
+ uint32_t me_len;
+ uint32_t me_flags;
+};
+
+/* rmap me_flags: upper bits are flags, lower byte is type code */
+#define XFS_RMAP_EXTENT_MAP 1
+#define XFS_RMAP_EXTENT_MAP_SHARED 2
+#define XFS_RMAP_EXTENT_UNMAP 3
+#define XFS_RMAP_EXTENT_UNMAP_SHARED 4
+#define XFS_RMAP_EXTENT_CONVERT 5
+#define XFS_RMAP_EXTENT_CONVERT_SHARED 6
+#define XFS_RMAP_EXTENT_ALLOC 7
+#define XFS_RMAP_EXTENT_FREE 8
+#define XFS_RMAP_EXTENT_TYPE_MASK 0xFF
+
+#define XFS_RMAP_EXTENT_ATTR_FORK (1U << 31)
+#define XFS_RMAP_EXTENT_BMBT_BLOCK (1U << 30)
+#define XFS_RMAP_EXTENT_UNWRITTEN (1U << 29)
+
+#define XFS_RMAP_EXTENT_FLAGS (XFS_RMAP_EXTENT_TYPE_MASK | \
+ XFS_RMAP_EXTENT_ATTR_FORK | \
+ XFS_RMAP_EXTENT_BMBT_BLOCK | \
+ XFS_RMAP_EXTENT_UNWRITTEN)
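+
+/*
+ * Illustration only: me_flags packs the type code into the low byte and the
+ * attribute bits into the high bits, so an rmap intent for an attr fork
+ * mapping can be encoded and decoded like this:
+ *
+ *    uint32_t flags = XFS_RMAP_EXTENT_MAP | XFS_RMAP_EXTENT_ATTR_FORK;
+ *
+ *    int type  = flags & XFS_RMAP_EXTENT_TYPE_MASK;   // XFS_RMAP_EXTENT_MAP
+ *    bool attr = flags & XFS_RMAP_EXTENT_ATTR_FORK;   // true
+ */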
+
+/*
+ * This is the structure used to lay out an rui log item in the
+ * log. The rui_extents field is a variable size array whose
+ * size is given by rui_nextents.
+ */
+struct xfs_rui_log_format {
+ uint16_t rui_type; /* rui log item type */
+ uint16_t rui_size; /* size of this item */
+ uint32_t rui_nextents; /* # extents to free */
+ uint64_t rui_id; /* rui identifier */
+ struct xfs_map_extent rui_extents[]; /* array of extents to rmap */
+};
+
+static inline size_t
+xfs_rui_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_rui_log_format) +
+ nr * sizeof(struct xfs_map_extent);
+}
+
+/*
+ * This is the structure used to lay out an rud log item in the
+ * log. The rud is the "done" companion to the rui and carries only
+ * the id of the corresponding rui; it has no extent array of its own.
+ */
+struct xfs_rud_log_format {
+ uint16_t rud_type; /* rud log item type */
+ uint16_t rud_size; /* size of this item */
+ uint32_t __pad;
+ uint64_t rud_rui_id; /* id of corresponding rui */
+};
+
+/*
+ * CUI/CUD (refcount update) log format definitions
+ */
+struct xfs_phys_extent {
+ uint64_t pe_startblock;
+ uint32_t pe_len;
+ uint32_t pe_flags;
+};
+
+/* refcount pe_flags: upper bits are flags, lower byte is type code */
+/* Type codes are taken directly from enum xfs_refcount_intent_type. */
+#define XFS_REFCOUNT_EXTENT_TYPE_MASK 0xFF
+
+#define XFS_REFCOUNT_EXTENT_FLAGS (XFS_REFCOUNT_EXTENT_TYPE_MASK)
+
+/*
+ * This is the structure used to lay out a cui log item in the
+ * log. The cui_extents field is a variable size array whose
+ * size is given by cui_nextents.
+ */
+struct xfs_cui_log_format {
+ uint16_t cui_type; /* cui log item type */
+ uint16_t cui_size; /* size of this item */
+ uint32_t cui_nextents; /* # extents to free */
+ uint64_t cui_id; /* cui identifier */
+ struct xfs_phys_extent cui_extents[]; /* array of extents */
+};
+
+static inline size_t
+xfs_cui_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_cui_log_format) +
+ nr * sizeof(struct xfs_phys_extent);
+}
+
+/*
+ * This is the structure used to lay out a cud log item in the
+ * log. The cud is the "done" companion to the cui and carries only
+ * the id of the corresponding cui; it has no extent array of its own.
+ */
+struct xfs_cud_log_format {
+ uint16_t cud_type; /* cud log item type */
+ uint16_t cud_size; /* size of this item */
+ uint32_t __pad;
+ uint64_t cud_cui_id; /* id of corresponding cui */
+};
+
+/*
+ * BUI/BUD (inode block mapping) log format definitions
+ */
+
+/* bmbt me_flags: upper bits are flags, lower byte is type code */
+/* Type codes are taken directly from enum xfs_bmap_intent_type. */
+#define XFS_BMAP_EXTENT_TYPE_MASK 0xFF
+
+#define XFS_BMAP_EXTENT_ATTR_FORK (1U << 31)
+#define XFS_BMAP_EXTENT_UNWRITTEN (1U << 30)
+
+#define XFS_BMAP_EXTENT_FLAGS (XFS_BMAP_EXTENT_TYPE_MASK | \
+ XFS_BMAP_EXTENT_ATTR_FORK | \
+ XFS_BMAP_EXTENT_UNWRITTEN)
+
+/*
+ * This is the structure used to lay out a bui log item in the
+ * log. The bui_extents field is a variable size array whose
+ * size is given by bui_nextents.
+ */
+struct xfs_bui_log_format {
+ uint16_t bui_type; /* bui log item type */
+ uint16_t bui_size; /* size of this item */
+ uint32_t bui_nextents; /* # extents to free */
+ uint64_t bui_id; /* bui identifier */
+ struct xfs_map_extent bui_extents[]; /* array of extents to bmap */
+};
+
+static inline size_t
+xfs_bui_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_bui_log_format) +
+ nr * sizeof(struct xfs_map_extent);
+}
+
+/*
+ * This is the structure used to lay out a bud log item in the
+ * log. The bud is the "done" companion to the bui and carries only
+ * the id of the corresponding bui; it has no extent array of its own.
+ */
+struct xfs_bud_log_format {
+ uint16_t bud_type; /* bud log item type */
+ uint16_t bud_size; /* size of this item */
+ uint32_t __pad;
+ uint64_t bud_bui_id; /* id of corresponding bui */
+};
+
+/*
+ * Dquot Log format definitions.
+ *
+ * The first two fields must be the type and size fitting into
+ * 32 bits: the log recovery code assumes that.
+ */
+typedef struct xfs_dq_logformat {
+ uint16_t qlf_type; /* dquot log item type */
+ uint16_t qlf_size; /* size of this item */
+ xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */
+ int64_t qlf_blkno; /* blkno of dquot buffer */
+ int32_t qlf_len; /* len of dquot buffer */
+ uint32_t qlf_boffset; /* off of dquot in buffer */
+} xfs_dq_logformat_t;
+
+/*
+ * log format struct for QUOTAOFF records.
+ * The first two fields must be the type and size fitting into
+ * 32 bits: the log recovery code assumes that.
+ * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
+ * to the first and ensures that the first logitem is taken out of the AIL
+ * only when the last one is securely committed.
+ */
+typedef struct xfs_qoff_logformat {
+ unsigned short qf_type; /* quotaoff log item type */
+ unsigned short qf_size; /* size of this item */
+ unsigned int qf_flags; /* USR and/or GRP */
+ char qf_pad[12]; /* padding for future */
+} xfs_qoff_logformat_t;
+
+/*
+ * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
+ */
+#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */
+#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */
+#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */
+#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */
+#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */
+#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */
+#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
+
+/*
+ * Conversion to and from the combined OQUOTA flag (if necessary)
+ * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
+ */
+#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */
+#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */
+#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */
+#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */
+
+#define XFS_ALL_QUOTA_ACCT \
+ (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
+#define XFS_ALL_QUOTA_ENFD \
+ (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD \
+ (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
+
+#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
+ XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+ XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
+ XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
+ XFS_PQUOTA_CHKD)
+
+/*
+ * Inode create log item structure
+ *
+ * Log recovery assumes the first two entries are the type and size, that
+ * together they fit in 32 bits, and that they are stored in host order (ugh),
+ * so they have to be 32 bit aligned for decoding to work correctly.
+ */
+struct xfs_icreate_log {
+ uint16_t icl_type; /* type of log format structure */
+ uint16_t icl_size; /* size of log format structure */
+ __be32 icl_ag; /* ag being allocated in */
+ __be32 icl_agbno; /* start block of inode range */
+ __be32 icl_count; /* number of inodes to initialise */
+ __be32 icl_isize; /* size of inodes */
+ __be32 icl_length; /* length of extent to initialise */
+ __be32 icl_gen; /* inode generation number to use */
+};
+
+/*
+ * Flags for deferred attribute operations.
+ * Upper bits are flags, lower byte is type code
+ */
+#define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */
+#define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */
+#define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */
+#define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */
+
+/*
+ * alfi_attr_filter captures the state of xfs_da_args.attr_filter, so it should
+ * never have any other bits set.
+ */
+#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \
+ XFS_ATTR_SECURE | \
+ XFS_ATTR_INCOMPLETE)
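+
+/*
+ * Illustration only: alfi_op_flags keeps the operation code in its low byte
+ * and alfi_attr_filter may only carry bits from XFS_ATTRI_FILTER_MASK, so a
+ * recovered attri item could be sanity checked with something like:
+ *
+ *    unsigned int op = attrp->alfi_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+ *
+ *    if (op < XFS_ATTRI_OP_FLAGS_SET || op > XFS_ATTRI_OP_FLAGS_REPLACE)
+ *        return false;                        // unknown operation
+ *    if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK)
+ *        return false;                        // unexpected filter bits
+ *
+ * where attrp points at the struct xfs_attri_log_format defined below.
+ */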
+
+/*
+ * This is the structure used to lay out an attr log item in the
+ * log.
+ */
+struct xfs_attri_log_format {
+ uint16_t alfi_type; /* attri log item type */
+ uint16_t alfi_size; /* size of this item */
+ uint32_t __pad; /* pad to 64 bit aligned */
+ uint64_t alfi_id; /* attri identifier */
+ uint64_t alfi_ino; /* the inode for this attr operation */
+ uint32_t alfi_op_flags; /* marks the op as a set or remove */
+ uint32_t alfi_name_len; /* attr name length */
+ uint32_t alfi_value_len; /* attr value length */
+ uint32_t alfi_attr_filter;/* attr filter flags */
+};
+
+struct xfs_attrd_log_format {
+ uint16_t alfd_type; /* attrd log item type */
+ uint16_t alfd_size; /* size of this item */
+ uint32_t __pad; /* pad to 64 bit aligned */
+ uint64_t alfd_alf_id; /* id of corresponding attri */
+};
+
+#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
new file mode 100644
index 000000000..2420865f3
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_LOG_RECOVER_H__
+#define __XFS_LOG_RECOVER_H__
+
+/*
+ * Each log item type (XFS_LI_*) gets its own xlog_recover_item_ops to
+ * define how recovery should work for that type of log item.
+ */
+struct xlog_recover_item;
+
+/* Sorting hat for log items as they're read in. */
+enum xlog_recover_reorder {
+ XLOG_REORDER_BUFFER_LIST,
+ XLOG_REORDER_ITEM_LIST,
+ XLOG_REORDER_INODE_BUFFER_LIST,
+ XLOG_REORDER_CANCEL_LIST,
+};
+
+struct xlog_recover_item_ops {
+ uint16_t item_type; /* XFS_LI_* type code. */
+
+ /*
+ * Help sort recovered log items into the order required to replay them
+ * correctly. Log item types that always use XLOG_REORDER_ITEM_LIST do
+ * not have to supply a function here. See the comment preceding
+ * xlog_recover_reorder_trans for more details about what the return
+ * values mean.
+ */
+ enum xlog_recover_reorder (*reorder)(struct xlog_recover_item *item);
+
+ /* Start readahead for pass2, if provided. */
+ void (*ra_pass2)(struct xlog *log, struct xlog_recover_item *item);
+
+ /* Do whatever work we need to do for pass1, if provided. */
+ int (*commit_pass1)(struct xlog *log, struct xlog_recover_item *item);
+
+ /*
+ * This function should do whatever work is needed for pass2 of log
+ * recovery, if provided.
+ *
+ * If the recovered item is an intent item, this function should parse
+ * the recovered item to construct an in-core log intent item and
+ * insert it into the AIL. The in-core log intent item should have 1
+ * refcount so that the item is freed either (a) when we commit the
+ * recovered log item for the intent-done item; (b) when we replay the work
+ * and log a new intent-done item; or (c) when recovery fails and we have to
+ * abort.
+ *
+ * If the recovered item is an intent-done item, this function should
+ * parse the recovered item to find the id of the corresponding intent
+ * log item. Next, it should find the in-core log intent item in the
+ * AIL and release it.
+ */
+ int (*commit_pass2)(struct xlog *log, struct list_head *buffer_list,
+ struct xlog_recover_item *item, xfs_lsn_t lsn);
+};
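+
+/*
+ * Purely illustrative sketch (not one of the real tables below): a log item
+ * type wires itself into recovery by providing one of these ops structures,
+ * roughly along the lines of:
+ *
+ *    const struct xlog_recover_item_ops xlog_foo_item_ops = {
+ *        .item_type    = XFS_LI_FOO,          // hypothetical item type
+ *        .ra_pass2     = xlog_recover_foo_ra_pass2,
+ *        .commit_pass2 = xlog_recover_foo_commit_pass2,
+ *    };
+ *
+ * Items that need no readahead or pass1 work simply leave those callbacks
+ * NULL.
+ */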
+
+extern const struct xlog_recover_item_ops xlog_icreate_item_ops;
+extern const struct xlog_recover_item_ops xlog_buf_item_ops;
+extern const struct xlog_recover_item_ops xlog_inode_item_ops;
+extern const struct xlog_recover_item_ops xlog_dquot_item_ops;
+extern const struct xlog_recover_item_ops xlog_quotaoff_item_ops;
+extern const struct xlog_recover_item_ops xlog_bui_item_ops;
+extern const struct xlog_recover_item_ops xlog_bud_item_ops;
+extern const struct xlog_recover_item_ops xlog_efi_item_ops;
+extern const struct xlog_recover_item_ops xlog_efd_item_ops;
+extern const struct xlog_recover_item_ops xlog_rui_item_ops;
+extern const struct xlog_recover_item_ops xlog_rud_item_ops;
+extern const struct xlog_recover_item_ops xlog_cui_item_ops;
+extern const struct xlog_recover_item_ops xlog_cud_item_ops;
+extern const struct xlog_recover_item_ops xlog_attri_item_ops;
+extern const struct xlog_recover_item_ops xlog_attrd_item_ops;
+
+/*
+ * Macros, structures, prototypes for internal log manager use.
+ */
+
+#define XLOG_RHASH_BITS 4
+#define XLOG_RHASH_SIZE 16
+#define XLOG_RHASH_SHIFT 2
+#define XLOG_RHASH(tid) \
+ ((((uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
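+
+/*
+ * Illustration only: XLOG_RHASH() folds a transaction id down to one of the
+ * XLOG_RHASH_SIZE (16) recovery hash buckets, e.g.
+ * XLOG_RHASH(0x12345678) == (0x12345678 >> 2) & 15 == 14.
+ */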
+
+#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
+
+
+/*
+ * item headers are in ri_buf[0]. Additional buffers follow.
+ */
+struct xlog_recover_item {
+ struct list_head ri_list;
+ int ri_cnt; /* count of regions found */
+ int ri_total; /* total regions */
+ struct xfs_log_iovec *ri_buf; /* ptr to regions buffer */
+ const struct xlog_recover_item_ops *ri_ops;
+};
+
+struct xlog_recover {
+ struct hlist_node r_list;
+ xlog_tid_t r_log_tid; /* log's transaction id */
+ xfs_trans_header_t r_theader; /* trans header for partial */
+ int r_state; /* not needed */
+ xfs_lsn_t r_lsn; /* xact lsn */
+ struct list_head r_itemq; /* q for items */
+};
+
+#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr)
+
+#define XLOG_RECOVER_CRCPASS 0
+#define XLOG_RECOVER_PASS1 1
+#define XLOG_RECOVER_PASS2 2
+
+void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len,
+ const struct xfs_buf_ops *ops);
+bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len);
+
+int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino,
+ struct xfs_inode **ipp);
+void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type,
+ uint64_t intent_id);
+int xlog_alloc_buf_cancel_table(struct xlog *log);
+void xlog_free_buf_cancel_table(struct xlog *log);
+
+#ifdef DEBUG
+void xlog_check_buf_cancel_table(struct xlog *log);
+#else
+#define xlog_check_buf_cancel_table(log) do { } while (0)
+#endif
+
+#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
new file mode 100644
index 000000000..9975b93a7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2013 Jie Liu.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_trans_space.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trace.h"
+
+/*
+ * Calculate the maximum length in bytes that would be required for a local
+ * attribute value, since large out-of-line attributes are not logged.
+ */
+STATIC int
+xfs_log_calc_max_attrsetm_res(
+ struct xfs_mount *mp)
+{
+ int size;
+ int nblks;
+
+ size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) -
+ MAXNAMELEN - 1;
+ nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+ nblks += XFS_B_TO_FSB(mp, size);
+ nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK);
+
+ return M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres * nblks;
+}
+
+/*
+ * Compute an alternate set of log reservation sizes for use exclusively with
+ * minimum log size calculations.
+ */
+static void
+xfs_log_calc_trans_resv_for_minlogblocks(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resv)
+{
+ unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
+
+ /*
+ * In the early days of rmap+reflink, we always set the rmap maxlevels
+ * to 9 even if the AG was small enough that it would never grow to
+ * that height. Transaction reservation sizes influence the minimum
+ * log size calculation, which influences the size of the log that mkfs
+ * creates. Use the old value here to ensure that newly formatted
+ * small filesystems will mount on older kernels.
+ */
+ if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
+ mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+
+ xfs_trans_resv_calc(mp, resv);
+
+ if (xfs_has_reflink(mp)) {
+ /*
+ * In the early days of reflink, typical log operation counts
+ * were greatly overestimated.
+ */
+ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ resv->tr_itruncate.tr_logcount =
+ XFS_ITRUNCATE_LOG_COUNT_REFLINK;
+ resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ } else if (xfs_has_rmapbt(mp)) {
+ /*
+ * In the early days of non-reflink rmap, the impact of rmapbt
+ * updates on log counts were not taken into account at all.
+ */
+ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ }
+
+ /*
+ * In the early days of reflink, we did not use deferred refcount
+ * update log items, so log reservations must be recomputed using the
+ * old calculations.
+ */
+ resv->tr_write.tr_logres =
+ xfs_calc_write_reservation_minlogsize(mp);
+ resv->tr_itruncate.tr_logres =
+ xfs_calc_itruncate_reservation_minlogsize(mp);
+ resv->tr_qm_dqalloc.tr_logres =
+ xfs_calc_qm_dqalloc_reservation_minlogsize(mp);
+
+ /* Put everything back the way it was. This goes at the end. */
+ mp->m_rmap_maxlevels = rmap_maxlevels;
+}
+
+/*
+ * Iterate over the log space reservation table and return the largest
+ * reservation, based on the values pre-calculated at mount time.
+ */
+void
+xfs_log_get_max_trans_res(
+ struct xfs_mount *mp,
+ struct xfs_trans_res *max_resp)
+{
+ struct xfs_trans_resv resv = {};
+ struct xfs_trans_res *resp;
+ struct xfs_trans_res *end_resp;
+ unsigned int i;
+ int log_space = 0;
+ int attr_space;
+
+ attr_space = xfs_log_calc_max_attrsetm_res(mp);
+
+ xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv);
+
+ resp = (struct xfs_trans_res *)&resv;
+ end_resp = (struct xfs_trans_res *)(&resv + 1);
+ for (i = 0; resp < end_resp; i++, resp++) {
+ int tmp = resp->tr_logcount > 1 ?
+ resp->tr_logres * resp->tr_logcount :
+ resp->tr_logres;
+
+ trace_xfs_trans_resv_calc_minlogsize(mp, i, resp);
+ if (log_space < tmp) {
+ log_space = tmp;
+ *max_resp = *resp; /* struct copy */
+ }
+ }
+
+ if (attr_space > log_space) {
+ *max_resp = resv.tr_attrsetm; /* struct copy */
+ max_resp->tr_logres = attr_space;
+ }
+ trace_xfs_log_get_max_trans_res(mp, max_resp);
+}
+
+/*
+ * Calculate the minimum valid log size for the given superblock configuration.
+ * Used to calculate the minimum log size at mkfs time, and to determine if
+ * the log is large enough or not at mount time. Returns the minimum size in
+ * filesystem block size units.
+ */
+int
+xfs_log_calc_minimum_size(
+ struct xfs_mount *mp)
+{
+ struct xfs_trans_res tres = {0};
+ int max_logres;
+ int min_logblks = 0;
+ int lsunit = 0;
+
+ xfs_log_get_max_trans_res(mp, &tres);
+
+ max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres);
+ if (tres.tr_logcount > 1)
+ max_logres *= tres.tr_logcount;
+
+ if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1)
+ lsunit = BTOBB(mp->m_sb.sb_logsunit);
+
+ /*
+ * Two factors should be taken into account for calculating the minimum
+ * log space.
+ * 1) The fundamental limitation is that no single transaction can be
+ * larger than half size of the log.
+ *
+ * From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR
+ * define, which is set to 3. That means we can definitely fit
+ * two maximally sized transactions in the log. We'll use this same
+ * value here.
+ *
+ * 2) If the lsunit option is specified, a transaction requires 2 LSU
+ * for the reservation because there are two log writes that can
+ * require padding - the transaction data and the commit record, which
+ * are written separately and can each require padding to the LSU.
+ * Consider that we can have an active CIL reservation holding 2*LSU,
+ * but the CIL is not over a push threshold. In this case, if we
+ * don't have enough log space for at least one new transaction, which
+ * includes another 2*LSU in the reservation, we will deadlock in the
+ * log space grant procedure, i.e. xlog_grant_head_wait().
+ *
+ * Hence the log size needs to be able to contain two maximally sized
+ * and padded transactions, which is (2 * (2 * LSU + maxlres)).
+ *
+ * Also, the log size should be a multiple of the log stripe unit; round
+ * it up to the lsunit boundary if lsunit is specified.
+ */
+ if (lsunit) {
+ min_logblks = roundup_64(BTOBB(max_logres), lsunit) +
+ 2 * lsunit;
+ } else {
+ min_logblks = BTOBB(max_logres) + 2 * BBSIZE;
+ }
+ min_logblks *= XFS_MIN_LOG_FACTOR;
+
+ return XFS_BB_TO_FSB(mp, min_logblks);
+}
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
new file mode 100644
index 000000000..cb035da3f
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_QUOTA_DEFS_H__
+#define __XFS_QUOTA_DEFS_H__
+
+/*
+ * Quota definitions shared between user and kernel source trees.
+ */
+
+/*
+ * Even though users may not have quota limits occupying all 64-bits,
+ * they may need 64-bit accounting. Hence, 64-bit quota-counters,
+ * and quota-limits. This is a waste in the common case, but hey ...
+ */
+typedef uint64_t xfs_qcnt_t;
+
+typedef uint8_t xfs_dqtype_t;
+
+#define XFS_DQTYPE_STRINGS \
+ { XFS_DQTYPE_USER, "USER" }, \
+ { XFS_DQTYPE_PROJ, "PROJ" }, \
+ { XFS_DQTYPE_GROUP, "GROUP" }, \
+ { XFS_DQTYPE_BIGTIME, "BIGTIME" }
+
+/*
+ * flags for q_flags field in the dquot.
+ */
+#define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */
+#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */
+
+#define XFS_DQFLAG_STRINGS \
+ { XFS_DQFLAG_DIRTY, "DIRTY" }, \
+ { XFS_DQFLAG_FREEING, "FREEING" }
+
+/*
+ * We have the possibility of all three quota types being active at once, and
+ * hence free space modification requires modification of all three current
+ * dquots in a single transaction. For this case we need to have a reservation
+ * of at least 3 dquots.
+ *
+ * However, a chmod operation can change both UID and GID in a single
+ * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
+ * modified. Hence for this case we need to reserve space for at least 4 dquots.
+ *
+ * And in the worst case, there's a rename operation that can be modifying up to
+ * 4 inodes with dquots attached to them. In reality, the only inodes that can
+ * have their dquots modified are the source and destination directory inodes
+ * due to directory name creation and removal. That can require space allocation
+ * and/or freeing on both directory inodes, and hence all three dquots on each
+ * inode can be modified. And if the directories are world writeable, all the
+ * dquots can be unique and so 6 dquots can be modified....
+ *
+ * And, of course, we also need to take into account the dquot log format item
+ * used to describe each dquot.
+ */
+#define XFS_DQUOT_LOGRES(mp) \
+ ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
+
+#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
+#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
+#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
+#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
+#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
+#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
+#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
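+
+/*
+ * Illustration only: the accounting and enforcement bits are independent,
+ * so callers typically test both, e.g.:
+ *
+ *    if (XFS_IS_UQUOTA_ON(mp) && XFS_IS_UQUOTA_ENFORCED(mp))
+ *        enforce_user_limits(mp);             // hypothetical helper
+ */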
+
+/*
+ * Flags to tell various functions what to do. Not all of these are meaningful
+ * to a single function. None of these XFS_QMOPT_* flags are meant to have
+ * persistent values (ie. their values can and will change between versions)
+ */
+#define XFS_QMOPT_UQUOTA (1u << 0) /* user dquot requested */
+#define XFS_QMOPT_GQUOTA (1u << 1) /* group dquot requested */
+#define XFS_QMOPT_PQUOTA (1u << 2) /* project dquot requested */
+#define XFS_QMOPT_FORCE_RES (1u << 3) /* ignore quota limits */
+#define XFS_QMOPT_SBVERSION (1u << 4) /* change superblock version num */
+
+/*
+ * flags to xfs_trans_mod_dquot to indicate which field needs to be
+ * modified.
+ */
+#define XFS_QMOPT_RES_REGBLKS (1u << 7)
+#define XFS_QMOPT_RES_RTBLKS (1u << 8)
+#define XFS_QMOPT_BCOUNT (1u << 9)
+#define XFS_QMOPT_ICOUNT (1u << 10)
+#define XFS_QMOPT_RTBCOUNT (1u << 11)
+#define XFS_QMOPT_DELBCOUNT (1u << 12)
+#define XFS_QMOPT_DELRTBCOUNT (1u << 13)
+#define XFS_QMOPT_RES_INOS (1u << 14)
+
+/*
+ * flags for dqalloc.
+ */
+#define XFS_QMOPT_INHERIT (1u << 31)
+
+#define XFS_QMOPT_FLAGS \
+ { XFS_QMOPT_UQUOTA, "UQUOTA" }, \
+ { XFS_QMOPT_PQUOTA, "PQUOTA" }, \
+ { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \
+ { XFS_QMOPT_SBVERSION, "SBVERSION" }, \
+ { XFS_QMOPT_GQUOTA, "GQUOTA" }, \
+ { XFS_QMOPT_INHERIT, "INHERIT" }, \
+ { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \
+ { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \
+ { XFS_QMOPT_BCOUNT, "BCOUNT" }, \
+ { XFS_QMOPT_ICOUNT, "ICOUNT" }, \
+ { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \
+ { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \
+ { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \
+ { XFS_QMOPT_RES_INOS, "RES_INOS" }
+
+/*
+ * flags to xfs_trans_mod_dquot.
+ */
+#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS
+#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
+#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS
+#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT
+#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
+#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT
+#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT
+#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
+
+
+#define XFS_QMOPT_QUOTALL \
+ (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
+#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+
+
+extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
+ struct xfs_disk_dquot *ddq, xfs_dqid_t id);
+extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
+ struct xfs_dqblk *dqb, xfs_dqid_t id);
+extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
+extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
+ xfs_dqid_t id, xfs_dqtype_t type);
+
+struct xfs_dquot;
+time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq,
+ __be32 dtimer);
+__be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer);
+
+#endif /* __XFS_QUOTA_DEFS_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
new file mode 100644
index 000000000..3f34bafe1
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -0,0 +1,1915 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_refcount.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+
+struct kmem_cache *xfs_refcount_intent_cache;
+
+/* Allowable refcount adjustment amounts. */
+enum xfs_refc_adjust_op {
+ XFS_REFCOUNT_ADJUST_INCREASE = 1,
+ XFS_REFCOUNT_ADJUST_DECREASE = -1,
+ XFS_REFCOUNT_ADJUST_COW_ALLOC = 0,
+ XFS_REFCOUNT_ADJUST_COW_FREE = -1,
+};
+
+STATIC int __xfs_refcount_cow_alloc(struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno, xfs_extlen_t aglen);
+STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno, xfs_extlen_t aglen);
+
+/*
+ * Look up the first record less than or equal to [bno, len] in the btree
+ * given by cur.
+ */
+int
+xfs_refcount_lookup_le(
+ struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t bno,
+ int *stat)
+{
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refcount_encode_startblock(bno, domain),
+ XFS_LOOKUP_LE);
+ cur->bc_rec.rc.rc_startblock = bno;
+ cur->bc_rec.rc.rc_blockcount = 0;
+ cur->bc_rec.rc.rc_domain = domain;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Look up the first record greater than or equal to [bno, len] in the btree
+ * given by cur.
+ */
+int
+xfs_refcount_lookup_ge(
+ struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t bno,
+ int *stat)
+{
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refcount_encode_startblock(bno, domain),
+ XFS_LOOKUP_GE);
+ cur->bc_rec.rc.rc_startblock = bno;
+ cur->bc_rec.rc.rc_blockcount = 0;
+ cur->bc_rec.rc.rc_domain = domain;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Look up the first record equal to [bno, len] in the btree
+ * given by cur.
+ */
+int
+xfs_refcount_lookup_eq(
+ struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t bno,
+ int *stat)
+{
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refcount_encode_startblock(bno, domain),
+ XFS_LOOKUP_EQ);
+ cur->bc_rec.rc.rc_startblock = bno;
+ cur->bc_rec.rc.rc_blockcount = 0;
+ cur->bc_rec.rc.rc_domain = domain;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/* Convert on-disk record to in-core format. */
+void
+xfs_refcount_btrec_to_irec(
+ const union xfs_btree_rec *rec,
+ struct xfs_refcount_irec *irec)
+{
+ uint32_t start;
+
+ start = be32_to_cpu(rec->refc.rc_startblock);
+ if (start & XFS_REFC_COWFLAG) {
+ start &= ~XFS_REFC_COWFLAG;
+ irec->rc_domain = XFS_REFC_DOMAIN_COW;
+ } else {
+ irec->rc_domain = XFS_REFC_DOMAIN_SHARED;
+ }
+
+ irec->rc_startblock = start;
+ irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount);
+ irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount);
+}
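+
+/*
+ * Illustrative note: the on-disk rc_startblock borrows its high bit
+ * (XFS_REFC_COWFLAG) to mark COW staging extents, which is why the decode
+ * above strips the flag and records the domain separately. For example, an
+ * on-disk start of (XFS_REFC_COWFLAG | 100) becomes rc_startblock == 100
+ * with rc_domain == XFS_REFC_DOMAIN_COW.
+ */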
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_refcount_get_rec(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec,
+ int *stat)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (error || !*stat)
+ return error;
+
+ xfs_refcount_btrec_to_irec(rec, irec);
+ if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
+ goto out_bad_rec;
+
+ if (!xfs_refcount_check_domain(irec))
+ goto out_bad_rec;
+
+ /* check for valid extent range, including overflow */
+ if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount))
+ goto out_bad_rec;
+
+ if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
+ goto out_bad_rec;
+
+ trace_xfs_refcount_get(cur->bc_mp, pag->pag_agno, irec);
+ return 0;
+
+out_bad_rec:
+ xfs_warn(mp,
+ "Refcount BTree record corruption in AG %d detected!",
+ pag->pag_agno);
+ xfs_warn(mp,
+ "Start block 0x%x, block count 0x%x, references 0x%x",
+ irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len, refcount].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_refcount_update(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec)
+{
+ union xfs_btree_rec rec;
+ uint32_t start;
+ int error;
+
+ trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ rec.refc.rc_startblock = cpu_to_be32(start);
+ rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount);
+ rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount);
+
+ error = xfs_btree_update(cur, &rec);
+ if (error)
+ trace_xfs_refcount_update_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Insert the record referred to by cur to the value given
+ * by [bno, len, refcount].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+int
+xfs_refcount_insert(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec,
+ int *i)
+{
+ int error;
+
+ trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+
+ cur->bc_rec.rc.rc_startblock = irec->rc_startblock;
+ cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
+ cur->bc_rec.rc.rc_refcount = irec->rc_refcount;
+ cur->bc_rec.rc.rc_domain = irec->rc_domain;
+
+ error = xfs_btree_insert(cur, i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+out_error:
+ if (error)
+ trace_xfs_refcount_insert_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Remove the record referred to by cur, then set the pointer to the spot
+ * where the record could be re-inserted, in case we want to increment or
+ * decrement the cursor.
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_refcount_delete(
+ struct xfs_btree_cur *cur,
+ int *i)
+{
+ struct xfs_refcount_irec irec;
+ int found_rec;
+ int error;
+
+ error = xfs_refcount_get_rec(cur, &irec, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec);
+ error = xfs_btree_delete(cur, i);
+ if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (error)
+ goto out_error;
+ error = xfs_refcount_lookup_ge(cur, irec.rc_domain, irec.rc_startblock,
+ &found_rec);
+out_error:
+ if (error)
+ trace_xfs_refcount_delete_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Adjusting the Reference Count
+ *
+ * As stated elsewhere, the reference count btree (refcbt) stores
+ * >1 reference counts for extents of physical blocks. In this
+ * operation, we're either raising or lowering the reference count of
+ * some subrange stored in the tree:
+ *
+ * <------ adjustment range ------>
+ * ----+ +---+-----+ +--+--------+---------
+ * 2 | | 3 | 4 | |17| 55 | 10
+ * ----+ +---+-----+ +--+--------+---------
+ * X axis is physical blocks number;
+ * reference counts are the numbers inside the rectangles
+ *
+ * The first thing we need to do is to ensure that there are no
+ * refcount extents crossing either boundary of the range to be
+ * adjusted. For any extent that does cross a boundary, split it into
+ * two extents so that we can increment the refcount of one of the
+ * pieces later:
+ *
+ * <------ adjustment range ------>
+ * ----+ +---+-----+ +--+--------+----+----
+ * 2 | | 3 | 2 | |17| 55 | 10 | 10
+ * ----+ +---+-----+ +--+--------+----+----
+ *
+ * For this next step, let's assume that all the physical blocks in
+ * the adjustment range are mapped to a file and are therefore in use
+ * at least once. Therefore, we can infer that any gap in the
+ * refcount tree within the adjustment range represents a physical
+ * extent with refcount == 1:
+ *
+ * <------ adjustment range ------>
+ * ----+---+---+-----+-+--+--------+----+----
+ * 2 |"1"| 3 | 2 |1|17| 55 | 10 | 10
+ * ----+---+---+-----+-+--+--------+----+----
+ * ^
+ *
+ * For each extent that falls within the interval range, figure out
+ * which extent is to the left or the right of that extent. Now we
+ * have a left, current, and right extent. If the new reference count
+ * of the center extent enables us to merge left, center, and right
+ * into one record covering all three, do so. If the center extent is
+ * at the left end of the range, abuts the left extent, and its new
+ * reference count matches the left extent's record, then merge them.
+ * If the center extent is at the right end of the range, abuts the
+ * right extent, and the reference counts match, merge those. In the
+ * example, we can left merge (assuming an increment operation):
+ *
+ * <------ adjustment range ------>
+ * --------+---+-----+-+--+--------+----+----
+ * 2 | 3 | 2 |1|17| 55 | 10 | 10
+ * --------+---+-----+-+--+--------+----+----
+ * ^
+ *
+ * For all other extents within the range, adjust the reference count
+ * or delete it if the refcount falls below 2. If we were
+ * incrementing, the end result looks like this:
+ *
+ * <------ adjustment range ------>
+ * --------+---+-----+-+--+--------+----+----
+ * 2 | 4 | 3 |2|18| 56 | 11 | 10
+ * --------+---+-----+-+--+--------+----+----
+ *
+ * The result of a decrement operation looks as such:
+ *
+ * <------ adjustment range ------>
+ * ----+ +---+ +--+--------+----+----
+ * 2 | | 2 | |16| 54 | 9 | 10
+ * ----+ +---+ +--+--------+----+----
+ * DDDD 111111DD
+ *
+ * The blocks marked "D" are freed; the blocks marked "1" are only
+ * referenced once and therefore the record is removed from the
+ * refcount btree.
+ */
+
+/* Next block after this extent. */
+static inline xfs_agblock_t
+xfs_refc_next(
+ struct xfs_refcount_irec *rc)
+{
+ return rc->rc_startblock + rc->rc_blockcount;
+}
+
+/*
+ * Split a refcount extent that crosses agbno.
+ */
+STATIC int
+xfs_refcount_split_extent(
+ struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t agbno,
+ bool *shape_changed)
+{
+ struct xfs_refcount_irec rcext, tmp;
+ int found_rec;
+ int error;
+
+ *shape_changed = false;
+ error = xfs_refcount_lookup_le(cur, domain, agbno, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec)
+ return 0;
+
+ error = xfs_refcount_get_rec(cur, &rcext, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (rcext.rc_domain != domain)
+ return 0;
+ if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno)
+ return 0;
+
+ *shape_changed = true;
+ trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ &rcext, agbno);
+
+ /* Establish the right extent. */
+ tmp = rcext;
+ tmp.rc_startblock = agbno;
+ tmp.rc_blockcount -= (agbno - rcext.rc_startblock);
+ error = xfs_refcount_update(cur, &tmp);
+ if (error)
+ goto out_error;
+
+ /* Insert the left extent. */
+ tmp = rcext;
+ tmp.rc_blockcount = agbno - rcext.rc_startblock;
+ error = xfs_refcount_insert(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ return error;
+
+out_error:
+ trace_xfs_refcount_split_extent_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Merge the left, center, and right extents.
+ */
+STATIC int
+xfs_refcount_merge_center_extents(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *left,
+ struct xfs_refcount_irec *center,
+ struct xfs_refcount_irec *right,
+ unsigned long long extlen,
+ xfs_extlen_t *aglen)
+{
+ int error;
+ int found_rec;
+
+ trace_xfs_refcount_merge_center_extents(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, left, center, right);
+
+ ASSERT(left->rc_domain == center->rc_domain);
+ ASSERT(right->rc_domain == center->rc_domain);
+
+ /*
+ * Make sure the center and right extents are not in the btree.
+ * If the center extent was synthesized, the first delete call
+ * removes the right extent and we skip the second deletion.
+ * If center and right were in the btree, then the first delete
+ * call removes the center and the second one removes the right
+ * extent.
+ */
+ error = xfs_refcount_lookup_ge(cur, center->rc_domain,
+ center->rc_startblock, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ if (center->rc_refcount > 1) {
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ }
+
+ /* Enlarge the left extent. */
+ error = xfs_refcount_lookup_le(cur, left->rc_domain,
+ left->rc_startblock, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ left->rc_blockcount = extlen;
+ error = xfs_refcount_update(cur, left);
+ if (error)
+ goto out_error;
+
+ *aglen = 0;
+ return error;
+
+out_error:
+ trace_xfs_refcount_merge_center_extents_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Merge with the left extent.
+ */
+STATIC int
+xfs_refcount_merge_left_extent(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *left,
+ struct xfs_refcount_irec *cleft,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen)
+{
+ int error;
+ int found_rec;
+
+ trace_xfs_refcount_merge_left_extent(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, left, cleft);
+
+ ASSERT(left->rc_domain == cleft->rc_domain);
+
+ /* If the extent at agbno (cleft) wasn't synthesized, remove it. */
+ if (cleft->rc_refcount > 1) {
+ error = xfs_refcount_lookup_le(cur, cleft->rc_domain,
+ cleft->rc_startblock, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ }
+
+ /* Enlarge the left extent. */
+ error = xfs_refcount_lookup_le(cur, left->rc_domain,
+ left->rc_startblock, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ left->rc_blockcount += cleft->rc_blockcount;
+ error = xfs_refcount_update(cur, left);
+ if (error)
+ goto out_error;
+
+ *agbno += cleft->rc_blockcount;
+ *aglen -= cleft->rc_blockcount;
+ return error;
+
+out_error:
+ trace_xfs_refcount_merge_left_extent_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Merge with the right extent.
+ */
+STATIC int
+xfs_refcount_merge_right_extent(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *right,
+ struct xfs_refcount_irec *cright,
+ xfs_extlen_t *aglen)
+{
+ int error;
+ int found_rec;
+
+ trace_xfs_refcount_merge_right_extent(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, cright, right);
+
+ ASSERT(right->rc_domain == cright->rc_domain);
+
+ /*
+ * If the extent ending at agbno+aglen (cright) wasn't synthesized,
+ * remove it.
+ */
+ if (cright->rc_refcount > 1) {
+ error = xfs_refcount_lookup_le(cur, cright->rc_domain,
+ cright->rc_startblock, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ }
+
+ /* Enlarge the right extent. */
+ error = xfs_refcount_lookup_le(cur, right->rc_domain,
+ right->rc_startblock, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ right->rc_startblock -= cright->rc_blockcount;
+ right->rc_blockcount += cright->rc_blockcount;
+ error = xfs_refcount_update(cur, right);
+ if (error)
+ goto out_error;
+
+ *aglen -= cright->rc_blockcount;
+ return error;
+
+out_error:
+ trace_xfs_refcount_merge_right_extent_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Find the left extent and the one after it (cleft). This function assumes
+ * that we've already split any extent crossing agbno.
+ */
+STATIC int
+xfs_refcount_find_left_extents(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *left,
+ struct xfs_refcount_irec *cleft,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen)
+{
+ struct xfs_refcount_irec tmp;
+ int error;
+ int found_rec;
+
+ left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK;
+ error = xfs_refcount_lookup_le(cur, domain, agbno - 1, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec)
+ return 0;
+
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ if (tmp.rc_domain != domain)
+ return 0;
+ if (xfs_refc_next(&tmp) != agbno)
+ return 0;
+ /* We have a left extent; retrieve (or invent) the next right one */
+ *left = tmp;
+
+ error = xfs_btree_increment(cur, 0, &found_rec);
+ if (error)
+ goto out_error;
+ if (found_rec) {
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ if (tmp.rc_domain != domain)
+ goto not_found;
+
+ /* if tmp starts at the end of our range, just use that */
+ if (tmp.rc_startblock == agbno)
+ *cleft = tmp;
+ else {
+ /*
+ * There's a gap in the refcntbt at the start of the
+ * range we're interested in (refcount == 1) so
+ * synthesize the implied extent and pass it back.
+ * We assume here that the agbno/aglen range was
+ * passed in from a data fork extent mapping and
+ * therefore is allocated to exactly one owner.
+ */
+ cleft->rc_startblock = agbno;
+ cleft->rc_blockcount = min(aglen,
+ tmp.rc_startblock - agbno);
+ cleft->rc_refcount = 1;
+ cleft->rc_domain = domain;
+ }
+ } else {
+not_found:
+ /*
+ * No extents, so pretend that there's one covering the whole
+ * range.
+ */
+ cleft->rc_startblock = agbno;
+ cleft->rc_blockcount = aglen;
+ cleft->rc_refcount = 1;
+ cleft->rc_domain = domain;
+ }
+ trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ left, cleft, agbno);
+ return error;
+
+out_error:
+ trace_xfs_refcount_find_left_extent_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Find the right extent and the one before it (cright). This function
+ * assumes that we've already split any extents crossing agbno + aglen.
+ */
+STATIC int
+xfs_refcount_find_right_extents(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *right,
+ struct xfs_refcount_irec *cright,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen)
+{
+ struct xfs_refcount_irec tmp;
+ int error;
+ int found_rec;
+
+ right->rc_startblock = cright->rc_startblock = NULLAGBLOCK;
+ error = xfs_refcount_lookup_ge(cur, domain, agbno + aglen, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec)
+ return 0;
+
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ if (tmp.rc_domain != domain)
+ return 0;
+ if (tmp.rc_startblock != agbno + aglen)
+ return 0;
+ /* We have a right extent; retrieve (or invent) the next left one */
+ *right = tmp;
+
+ error = xfs_btree_decrement(cur, 0, &found_rec);
+ if (error)
+ goto out_error;
+ if (found_rec) {
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ if (tmp.rc_domain != domain)
+ goto not_found;
+
+ /* if tmp ends at the end of our range, just use that */
+ if (xfs_refc_next(&tmp) == agbno + aglen)
+ *cright = tmp;
+ else {
+ /*
+ * There's a gap in the refcntbt at the end of the
+ * range we're interested in (refcount == 1) so
+ * create the implied extent and pass it back.
+ * We assume here that the agbno/aglen range was
+ * passed in from a data fork extent mapping and
+ * therefore is allocated to exactly one owner.
+ */
+ cright->rc_startblock = max(agbno, xfs_refc_next(&tmp));
+ cright->rc_blockcount = right->rc_startblock -
+ cright->rc_startblock;
+ cright->rc_refcount = 1;
+ cright->rc_domain = domain;
+ }
+ } else {
+not_found:
+ /*
+ * No extents, so pretend that there's one covering the whole
+ * range.
+ */
+ cright->rc_startblock = agbno;
+ cright->rc_blockcount = aglen;
+ cright->rc_refcount = 1;
+ cright->rc_domain = domain;
+ }
+ trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ cright, right, agbno + aglen);
+ return error;
+
+out_error:
+ trace_xfs_refcount_find_right_extent_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/* Is this extent valid? */
+static inline bool
+xfs_refc_valid(
+ struct xfs_refcount_irec *rc)
+{
+ return rc->rc_startblock != NULLAGBLOCK;
+}
+
+/*
+ * Try to merge with any extents on the boundaries of the adjustment range.
+ */
+STATIC int
+xfs_refcount_merge_extents(
+ struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen,
+ enum xfs_refc_adjust_op adjust,
+ bool *shape_changed)
+{
+ struct xfs_refcount_irec left = {0}, cleft = {0};
+ struct xfs_refcount_irec cright = {0}, right = {0};
+ int error;
+ unsigned long long ulen;
+ bool cequal;
+
+ *shape_changed = false;
+ /*
+ * Find the extent just below agbno [left], just above agbno [cleft],
+ * just below (agbno + aglen) [cright], and just above (agbno + aglen)
+ * [right].
+ */
+ error = xfs_refcount_find_left_extents(cur, &left, &cleft, domain,
+ *agbno, *aglen);
+ if (error)
+ return error;
+ error = xfs_refcount_find_right_extents(cur, &right, &cright, domain,
+ *agbno, *aglen);
+ if (error)
+ return error;
+
+ /* No left or right extent to merge; exit. */
+ if (!xfs_refc_valid(&left) && !xfs_refc_valid(&right))
+ return 0;
+
+ cequal = (cleft.rc_startblock == cright.rc_startblock) &&
+ (cleft.rc_blockcount == cright.rc_blockcount);
+
+ /* Try to merge left, cleft, and right. cleft must == cright. */
+ ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount +
+ right.rc_blockcount;
+ if (xfs_refc_valid(&left) && xfs_refc_valid(&right) &&
+ xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal &&
+ left.rc_refcount == cleft.rc_refcount + adjust &&
+ right.rc_refcount == cleft.rc_refcount + adjust &&
+ ulen < MAXREFCEXTLEN) {
+ *shape_changed = true;
+ return xfs_refcount_merge_center_extents(cur, &left, &cleft,
+ &right, ulen, aglen);
+ }
+
+ /* Try to merge left and cleft. */
+ ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount;
+ if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) &&
+ left.rc_refcount == cleft.rc_refcount + adjust &&
+ ulen < MAXREFCEXTLEN) {
+ *shape_changed = true;
+ error = xfs_refcount_merge_left_extent(cur, &left, &cleft,
+ agbno, aglen);
+ if (error)
+ return error;
+
+ /*
+ * If we just merged left + cleft and cleft == cright,
+ * we no longer have a cright to merge with right. We're done.
+ */
+ if (cequal)
+ return 0;
+ }
+
+ /* Try to merge cright and right. */
+ ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount;
+ if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) &&
+ right.rc_refcount == cright.rc_refcount + adjust &&
+ ulen < MAXREFCEXTLEN) {
+ *shape_changed = true;
+ return xfs_refcount_merge_right_extent(cur, &right, &cright,
+ aglen);
+ }
+
+ return 0;
+}
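
The four-way merge decision above is easier to see with a concrete case. Below is a minimal user-space sketch of just the candidate selection, using made-up stand-in types and a made-up length limit (struct ext, MAX_EXT_LEN) in place of the kernel's xfs_refcount_irec and MAXREFCEXTLEN; it mirrors the checks in xfs_refcount_merge_extents() but is not the kernel code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NULL_BLOCK	((uint32_t)-1)		/* stand-in for NULLAGBLOCK */
#define MAX_EXT_LEN	0xFFFFFFFFULL		/* stand-in for MAXREFCEXTLEN */

struct ext { uint32_t start, len, refcount; };	/* simplified irec */

static bool valid(const struct ext *e) { return e->start != NULL_BLOCK; }

/* adjust is +1 for a refcount increase, -1 for a decrease */
static const char *pick_merge(struct ext *l, struct ext *cl, struct ext *cr,
			      struct ext *r, int adjust)
{
	bool cequal = cl->start == cr->start && cl->len == cr->len;
	unsigned long long ulen;

	ulen = (unsigned long long)l->len + cl->len + r->len;
	if (valid(l) && valid(r) && valid(cl) && valid(cr) && cequal &&
	    l->refcount == cl->refcount + adjust &&
	    r->refcount == cl->refcount + adjust && ulen < MAX_EXT_LEN)
		return "center merge: left+cleft+right become one record";

	ulen = (unsigned long long)l->len + cl->len;
	if (valid(l) && valid(cl) && l->refcount == cl->refcount + adjust &&
	    ulen < MAX_EXT_LEN)
		return cequal ? "left merge only (cleft == cright)" :
				"left merge, then try the right side";

	ulen = (unsigned long long)r->len + cr->len;
	if (valid(r) && valid(cr) && r->refcount == cr->refcount + adjust &&
	    ulen < MAX_EXT_LEN)
		return "right merge";

	return "no merge";
}

int main(void)
{
	/* Increasing [20, 30): left [10,20) is already at refcount 2 and the
	 * synthesized cleft [20,30)@1 will be at 2 after the increase, so the
	 * two can be merged into a single record up front. */
	struct ext left = { 10, 10, 2 }, cleft = { 20, 10, 1 };
	struct ext cright = { 20, 10, 1 }, right = { NULL_BLOCK, 0, 0 };

	printf("%s\n", pick_merge(&left, &cleft, &cright, &right, +1));
	return 0;
}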
+
+/*
+ * XXX: This is a pretty hand-wavy estimate. The penalty for guessing
+ * true incorrectly is a shutdown FS; the penalty for guessing false
+ * incorrectly is more transaction rolls than might be necessary.
+ * Be conservative here.
+ */
+static bool
+xfs_refcount_still_have_space(
+ struct xfs_btree_cur *cur)
+{
+ unsigned long overhead;
+
+ /*
+ * Worst case estimate: full splits of the free space and rmap btrees
+ * to handle each of the shape changes to the refcount btree.
+ */
+ overhead = xfs_allocfree_block_count(cur->bc_mp,
+ cur->bc_ag.refc.shape_changes);
+ overhead += cur->bc_mp->m_refc_maxlevels;
+ overhead *= cur->bc_mp->m_sb.sb_blocksize;
+
+ /*
+ * Only allow 2 refcount extent updates per transaction if the
+ * refcount continue update "error" has been injected.
+ */
+ if (cur->bc_ag.refc.nr_ops > 2 &&
+ XFS_TEST_ERROR(false, cur->bc_mp,
+ XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
+ return false;
+
+ if (cur->bc_ag.refc.nr_ops == 0)
+ return true;
+ else if (overhead > cur->bc_tp->t_log_res)
+ return false;
+ return cur->bc_tp->t_log_res - overhead >
+ cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
+}
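
The space check above boils down to: keep going only while the transaction's log reservation, minus a worst-case allowance for btree splits, still covers roughly 32 bytes per refcount record touched so far. Here is a tiny stand-alone sketch of that arithmetic; every number is invented for illustration (the real inputs come from xfs_allocfree_block_count(), the mount geometry and the transaction reservation).

#include <stdbool.h>
#include <stdio.h>

#define ITEM_OVERHEAD	32	/* mirrors XFS_REFCOUNT_ITEM_OVERHEAD */

/* All inputs below are hypothetical stand-ins, not real XFS values. */
static bool still_have_space(unsigned long t_log_res,
			     unsigned long split_blocks,	/* worst-case alloc/rmap splits */
			     unsigned long refc_maxlevels,
			     unsigned long blocksize,
			     unsigned long nr_ops)
{
	unsigned long overhead = (split_blocks + refc_maxlevels) * blocksize;

	if (nr_ops == 0)
		return true;
	if (overhead > t_log_res)
		return false;
	return t_log_res - overhead > nr_ops * ITEM_OVERHEAD;
}

int main(void)
{
	/* 200000-byte reservation, (20 split blocks + 5 levels) * 4096 bytes:
	 * overhead = 102400, leaving room for roughly 3050 record updates. */
	printf("continue? %d\n",
	       still_have_space(200000, 20, 5, 4096, 100));	/* prints 1 */
	return 0;
}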
+
+/*
+ * Adjust the refcounts of middle extents. At this point we should have
+ * split extents that crossed the adjustment range; merged with adjacent
+ * extents; and updated agbno/aglen to reflect the merges. Therefore,
+ * all we have to do is update the extents inside [agbno, agbno + aglen].
+ */
+STATIC int
+xfs_refcount_adjust_extents(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen,
+ enum xfs_refc_adjust_op adj)
+{
+ struct xfs_refcount_irec ext, tmp;
+ int error;
+ int found_rec, found_tmp;
+ xfs_fsblock_t fsbno;
+
+ /* Merging did all the work already. */
+ if (*aglen == 0)
+ return 0;
+
+ error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_SHARED, *agbno,
+ &found_rec);
+ if (error)
+ goto out_error;
+
+ while (*aglen > 0 && xfs_refcount_still_have_space(cur)) {
+ error = xfs_refcount_get_rec(cur, &ext, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) {
+ ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+ ext.rc_blockcount = 0;
+ ext.rc_refcount = 0;
+ ext.rc_domain = XFS_REFC_DOMAIN_SHARED;
+ }
+
+ /*
+ * Deal with a hole in the refcount tree; if a file maps to
+ * these blocks and there's no refcountbt record, pretend that
+ * there is one with refcount == 1.
+ */
+ if (ext.rc_startblock != *agbno) {
+ tmp.rc_startblock = *agbno;
+ tmp.rc_blockcount = min(*aglen,
+ ext.rc_startblock - *agbno);
+ tmp.rc_refcount = 1 + adj;
+ tmp.rc_domain = XFS_REFC_DOMAIN_SHARED;
+
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, &tmp);
+
+ /*
+ * Either cover the hole (increment) or
+ * delete the range (decrement).
+ */
+ cur->bc_ag.refc.nr_ops++;
+ if (tmp.rc_refcount) {
+ error = xfs_refcount_insert(cur, &tmp,
+ &found_tmp);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ found_tmp != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ } else {
+ fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno,
+ tmp.rc_startblock);
+ xfs_free_extent_later(cur->bc_tp, fsbno,
+ tmp.rc_blockcount, NULL);
+ }
+
+ (*agbno) += tmp.rc_blockcount;
+ (*aglen) -= tmp.rc_blockcount;
+
+ /* Stop if there's nothing left to modify */
+ if (*aglen == 0 || !xfs_refcount_still_have_space(cur))
+ break;
+
+ /* Move the cursor to the start of ext. */
+ error = xfs_refcount_lookup_ge(cur,
+ XFS_REFC_DOMAIN_SHARED, *agbno,
+ &found_rec);
+ if (error)
+ goto out_error;
+ }
+
+ /*
+ * A previous step trimmed agbno/aglen such that the end of the
+ * range would not be in the middle of the record. If this is
+ * no longer the case, something is seriously wrong with the
+ * btree. Make sure we never feed the synthesized record into
+ * the processing loop below.
+ */
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) ||
+ XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ /*
+ * Adjust the reference count and either update the tree
+ * (incr) or free the blocks (decr).
+ */
+ if (ext.rc_refcount == MAXREFCOUNT)
+ goto skip;
+ ext.rc_refcount += adj;
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, &ext);
+ cur->bc_ag.refc.nr_ops++;
+ if (ext.rc_refcount > 1) {
+ error = xfs_refcount_update(cur, &ext);
+ if (error)
+ goto out_error;
+ } else if (ext.rc_refcount == 1) {
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ goto advloop;
+ } else {
+ fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno,
+ ext.rc_startblock);
+ xfs_free_extent_later(cur->bc_tp, fsbno,
+ ext.rc_blockcount, NULL);
+ }
+
+skip:
+ error = xfs_btree_increment(cur, 0, &found_rec);
+ if (error)
+ goto out_error;
+
+advloop:
+ (*agbno) += ext.rc_blockcount;
+ (*aglen) -= ext.rc_blockcount;
+ }
+
+ return error;
+out_error:
+ trace_xfs_refcount_modify_extent_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/* Adjust the reference count of a range of AG blocks. */
+STATIC int
+xfs_refcount_adjust(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ xfs_agblock_t *new_agbno,
+ xfs_extlen_t *new_aglen,
+ enum xfs_refc_adjust_op adj)
+{
+ bool shape_changed;
+ int shape_changes = 0;
+ int error;
+
+ *new_agbno = agbno;
+ *new_aglen = aglen;
+ if (adj == XFS_REFCOUNT_ADJUST_INCREASE)
+ trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ agbno, aglen);
+ else
+ trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ agbno, aglen);
+
+ /*
+ * Ensure that no rcextents cross the boundary of the adjustment range.
+ */
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED,
+ agbno, &shape_changed);
+ if (error)
+ goto out_error;
+ if (shape_changed)
+ shape_changes++;
+
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED,
+ agbno + aglen, &shape_changed);
+ if (error)
+ goto out_error;
+ if (shape_changed)
+ shape_changes++;
+
+ /*
+ * Try to merge with the left or right extents of the range.
+ */
+ error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED,
+ new_agbno, new_aglen, adj, &shape_changed);
+ if (error)
+ goto out_error;
+ if (shape_changed)
+ shape_changes++;
+ if (shape_changes)
+ cur->bc_ag.refc.shape_changes++;
+
+ /* Now that we've taken care of the ends, adjust the middle extents */
+ error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj);
+ if (error)
+ goto out_error;
+
+ return 0;
+
+out_error:
+ trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/* Clean up after calling xfs_refcount_finish_one. */
+void
+xfs_refcount_finish_one_cleanup(
+ struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur,
+ int error)
+{
+ struct xfs_buf *agbp;
+
+ if (rcur == NULL)
+ return;
+ agbp = rcur->bc_ag.agbp;
+ xfs_btree_del_cursor(rcur, error);
+ if (error)
+ xfs_trans_brelse(tp, agbp);
+}
+
+/*
+ * Set up a continuation of a deferred refcount operation by updating the intent.
+ * Checks to make sure we're not going to run off the end of the AG.
+ */
+static inline int
+xfs_refcount_continue_op(
+ struct xfs_btree_cur *cur,
+ xfs_fsblock_t startblock,
+ xfs_agblock_t new_agbno,
+ xfs_extlen_t new_len,
+ xfs_fsblock_t *new_fsbno)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len)))
+ return -EFSCORRUPTED;
+
+ *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
+
+ ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len));
+ ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno));
+
+ return 0;
+}
+
+/*
+ * Process one of the deferred refcount operations. We pass back the
+ * btree cursor to maintain our lock on the btree between calls.
+ * This saves time and eliminates a buffer deadlock between the
+ * superblock and the AGF because we'll always grab them in the same
+ * order.
+ */
+int
+xfs_refcount_finish_one(
+ struct xfs_trans *tp,
+ enum xfs_refcount_intent_type type,
+ xfs_fsblock_t startblock,
+ xfs_extlen_t blockcount,
+ xfs_fsblock_t *new_fsb,
+ xfs_extlen_t *new_len,
+ struct xfs_btree_cur **pcur)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *rcur;
+ struct xfs_buf *agbp = NULL;
+ int error = 0;
+ xfs_agblock_t bno;
+ xfs_agblock_t new_agbno;
+ unsigned long nr_ops = 0;
+ int shape_changes = 0;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock));
+ bno = XFS_FSB_TO_AGBNO(mp, startblock);
+
+ trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock),
+ type, XFS_FSB_TO_AGBNO(mp, startblock),
+ blockcount);
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) {
+ error = -EIO;
+ goto out_drop;
+ }
+
+ /*
+ * If we haven't gotten a cursor or the cursor AG doesn't match
+ * the startblock, get one now.
+ */
+ rcur = *pcur;
+ if (rcur != NULL && rcur->bc_ag.pag != pag) {
+ nr_ops = rcur->bc_ag.refc.nr_ops;
+ shape_changes = rcur->bc_ag.refc.shape_changes;
+ xfs_refcount_finish_one_cleanup(tp, rcur, 0);
+ rcur = NULL;
+ *pcur = NULL;
+ }
+ if (rcur == NULL) {
+ error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING,
+ &agbp);
+ if (error)
+ goto out_drop;
+
+ rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+ rcur->bc_ag.refc.nr_ops = nr_ops;
+ rcur->bc_ag.refc.shape_changes = shape_changes;
+ }
+ *pcur = rcur;
+
+ switch (type) {
+ case XFS_REFCOUNT_INCREASE:
+ error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
+ new_len, XFS_REFCOUNT_ADJUST_INCREASE);
+ if (error)
+ goto out_drop;
+ if (*new_len > 0)
+ error = xfs_refcount_continue_op(rcur, startblock,
+ new_agbno, *new_len, new_fsb);
+ break;
+ case XFS_REFCOUNT_DECREASE:
+ error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
+ new_len, XFS_REFCOUNT_ADJUST_DECREASE);
+ if (error)
+ goto out_drop;
+ if (*new_len > 0)
+ error = xfs_refcount_continue_op(rcur, startblock,
+ new_agbno, *new_len, new_fsb);
+ break;
+ case XFS_REFCOUNT_ALLOC_COW:
+ *new_fsb = startblock + blockcount;
+ *new_len = 0;
+ error = __xfs_refcount_cow_alloc(rcur, bno, blockcount);
+ break;
+ case XFS_REFCOUNT_FREE_COW:
+ *new_fsb = startblock + blockcount;
+ *new_len = 0;
+ error = __xfs_refcount_cow_free(rcur, bno, blockcount);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ }
+ if (!error && *new_len > 0)
+ trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type,
+ bno, blockcount, new_agbno, *new_len);
+out_drop:
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Record a refcount intent for later processing.
+ */
+static void
+__xfs_refcount_add(
+ struct xfs_trans *tp,
+ enum xfs_refcount_intent_type type,
+ xfs_fsblock_t startblock,
+ xfs_extlen_t blockcount)
+{
+ struct xfs_refcount_intent *ri;
+
+ trace_xfs_refcount_defer(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, startblock),
+ type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
+ blockcount);
+
+ ri = kmem_cache_alloc(xfs_refcount_intent_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ INIT_LIST_HEAD(&ri->ri_list);
+ ri->ri_type = type;
+ ri->ri_startblock = startblock;
+ ri->ri_blockcount = blockcount;
+
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
+}
+
+/*
+ * Increase the reference count of the blocks backing a file's extent.
+ */
+void
+xfs_refcount_increase_extent(
+ struct xfs_trans *tp,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_has_reflink(tp->t_mountp))
+ return;
+
+ __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
+ PREV->br_blockcount);
+}
+
+/*
+ * Decrease the reference count of the blocks backing a file's extent.
+ */
+void
+xfs_refcount_decrease_extent(
+ struct xfs_trans *tp,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_has_reflink(tp->t_mountp))
+ return;
+
+ __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
+ PREV->br_blockcount);
+}
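
Both helpers above only queue work: they allocate a small intent item, attach it to the transaction's deferred-ops list, and leave the actual btree updates to xfs_refcount_finish_one() later on. The following rough user-space model of that queue-then-finish pattern uses hypothetical types and a plain linked list in place of the kernel's deferred-ops machinery (and processes items LIFO for brevity, whereas the real list is handled in FIFO order).

#include <stdio.h>
#include <stdlib.h>

enum intent_type { INTENT_INCREASE = 1, INTENT_DECREASE };

struct intent {			/* toy stand-in for xfs_refcount_intent */
	enum intent_type	type;
	unsigned long long	startblock;
	unsigned int		blockcount;
	struct intent		*next;
};

static struct intent *pending;	/* stand-in for the deferred-ops list */

/* like __xfs_refcount_add(): record the work, do not touch the btree yet */
static void refcount_add(enum intent_type type, unsigned long long start,
			 unsigned int count)
{
	struct intent *ri = malloc(sizeof(*ri));

	ri->type = type;
	ri->startblock = start;
	ri->blockcount = count;
	ri->next = pending;
	pending = ri;
}

/* like the deferred-ops code calling xfs_refcount_finish_one() per item */
static void finish_all(void)
{
	while (pending) {
		struct intent *ri = pending;

		pending = ri->next;
		printf("finish %s: start %llu len %u\n",
		       ri->type == INTENT_INCREASE ? "increase" : "decrease",
		       ri->startblock, ri->blockcount);
		free(ri);
	}
}

int main(void)
{
	refcount_add(INTENT_INCREASE, 1000, 8);
	refcount_add(INTENT_DECREASE, 5000, 4);
	finish_all();
	return 0;
}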
+
+/*
+ * Given an AG extent, find the lowest-numbered run of shared blocks
+ * within that range and return the range in fbno/flen. If
+ * find_end_of_shared is set, return the longest contiguous extent of
+ * shared blocks; if not, just return the first extent we find. If no
+ * shared blocks are found, fbno and flen will be set to NULLAGBLOCK
+ * and 0, respectively.
+ */
+int
+xfs_refcount_find_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ xfs_agblock_t *fbno,
+ xfs_extlen_t *flen,
+ bool find_end_of_shared)
+{
+ struct xfs_refcount_irec tmp;
+ int i;
+ int have;
+ int error;
+
+ trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ agbno, aglen);
+
+ /* By default, skip the whole range */
+ *fbno = NULLAGBLOCK;
+ *flen = 0;
+
+ /* Try to find a refcount extent that crosses the start */
+ error = xfs_refcount_lookup_le(cur, XFS_REFC_DOMAIN_SHARED, agbno,
+ &have);
+ if (error)
+ goto out_error;
+ if (!have) {
+ /* No left extent, look at the next one */
+ error = xfs_btree_increment(cur, 0, &have);
+ if (error)
+ goto out_error;
+ if (!have)
+ goto done;
+ }
+ error = xfs_refcount_get_rec(cur, &tmp, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED)
+ goto done;
+
+ /* If the extent ends before the start, look at the next one */
+ if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) {
+ error = xfs_btree_increment(cur, 0, &have);
+ if (error)
+ goto out_error;
+ if (!have)
+ goto done;
+ error = xfs_refcount_get_rec(cur, &tmp, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED)
+ goto done;
+ }
+
+ /* If the extent starts after the range we want, bail out */
+ if (tmp.rc_startblock >= agbno + aglen)
+ goto done;
+
+ /* We found the start of a shared extent! */
+ if (tmp.rc_startblock < agbno) {
+ tmp.rc_blockcount -= (agbno - tmp.rc_startblock);
+ tmp.rc_startblock = agbno;
+ }
+
+ *fbno = tmp.rc_startblock;
+ *flen = min(tmp.rc_blockcount, agbno + aglen - *fbno);
+ if (!find_end_of_shared)
+ goto done;
+
+ /* Otherwise, find the end of this shared extent */
+ while (*fbno + *flen < agbno + aglen) {
+ error = xfs_btree_increment(cur, 0, &have);
+ if (error)
+ goto out_error;
+ if (!have)
+ break;
+ error = xfs_refcount_get_rec(cur, &tmp, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED ||
+ tmp.rc_startblock >= agbno + aglen ||
+ tmp.rc_startblock != *fbno + *flen)
+ break;
+ *flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno);
+ }
+
+done:
+ trace_xfs_refcount_find_shared_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, *fbno, *flen);
+
+out_error:
+ if (error)
+ trace_xfs_refcount_find_shared_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
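
To make the contract above concrete, here is a small user-space sketch that applies the same rules to an in-memory, sorted array of (start, len, refcount) records instead of the btree; the types and helper names are invented for illustration and only model the semantics of the function above.

#include <stdint.h>
#include <stdio.h>

#define NULL_BLOCK	((uint32_t)-1)	/* stand-in for NULLAGBLOCK */

struct rec { uint32_t start, len, refcount; };

static uint32_t min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }

static void find_shared(const struct rec *r, int nr, uint32_t bno,
			uint32_t len, int find_end, uint32_t *fbno,
			uint32_t *flen)
{
	*fbno = NULL_BLOCK;
	*flen = 0;
	for (int i = 0; i < nr; i++) {
		uint32_t end = r[i].start + r[i].len;

		if (r[i].refcount < 2 || end <= bno)
			continue;		/* unshared, or before the range */
		if (r[i].start >= bno + len)
			break;			/* past the range */
		if (*fbno == NULL_BLOCK) {
			*fbno = r[i].start > bno ? r[i].start : bno;
			*flen = min_u32(end, bno + len) - *fbno;
			if (!find_end)
				return;		/* first hit is enough */
		} else if (r[i].start == *fbno + *flen) {
			*flen = min_u32(end, bno + len) - *fbno;
		} else {
			break;			/* gap ends the contiguous run */
		}
	}
}

int main(void)
{
	/* Shared runs [20,25)@2 and [26,36)@3 with a one-block hole at 25. */
	struct rec recs[] = { { 20, 5, 2 }, { 26, 10, 3 } };
	uint32_t fbno, flen;

	find_shared(recs, 2, 18, 20, 1, &fbno, &flen);
	printf("shared run: [%u, %u)\n", fbno, fbno + flen);	/* [20, 25) */
	return 0;
}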
+
+/*
+ * Recovering CoW Blocks After a Crash
+ *
+ * Due to the way that the copy on write mechanism works, there's a window of
+ * opportunity in which we can lose track of allocated blocks during a crash.
+ * Because CoW uses delayed allocation in the in-core CoW fork, writeback
+ * causes blocks to be allocated and stored in the CoW fork. The blocks are
+ * no longer in the free space btree but are not otherwise recorded anywhere
+ * until the write completes and the blocks are mapped into the file. A crash
+ * in between allocation and remapping results in the replacement blocks being
+ * lost. This situation is exacerbated by the CoW extent size hint because
+ * allocations can hang around for a long time.
+ *
+ * However, there is a place where we can record these allocations before they
+ * become mappings -- the reference count btree. The btree does not record
+ * extents with refcount == 1, so we can record allocations with a refcount of
+ * 1. Blocks being used for CoW writeout cannot be shared, so there should be
+ * no conflict with shared block records. These mappings should be created
+ * when we allocate blocks to the CoW fork and deleted when they're removed
+ * from the CoW fork.
+ *
+ * Minor nit: records for in-progress CoW allocations and records for shared
+ * extents must never be merged, to preserve the property that (except for CoW
+ * allocations) there are no refcount btree entries with refcount == 1. The
+ * only time this could potentially happen is when unsharing a block that's
+ * adjacent to CoW allocations, so we must be careful to avoid this.
+ *
+ * At mount time we recover lost CoW allocations by searching the refcount
+ * btree for these refcount == 1 mappings. These represent CoW allocations
+ * that were in progress at the time the filesystem went down, so we can free
+ * them to get the space back.
+ *
+ * This mechanism is superior to creating EFIs for unmapped CoW extents for
+ * several reasons -- first, EFIs pin the tail of the log and would have to be
+ * periodically relogged to avoid filling up the log. Second, CoW completions
+ * will have to file an EFD and create new EFIs for whatever remains in the
+ * CoW fork; this partially takes care of (1) but extent-size reservations
+ * will have to periodically relog even if there's no writeout in progress.
+ * This can happen if the CoW extent size hint is set, which you really want.
+ * Third, EFIs cannot currently be automatically relogged into newer
+ * transactions to advance the log tail. Fourth, stuffing the log full of
+ * EFIs places an upper bound on the number of CoW allocations that can be
+ * held filesystem-wide at any given time. Recording them in the refcount
+ * btree doesn't require us to maintain any state in memory and doesn't pin
+ * the log.
+ */
+/*
+ * Adjust the refcounts of CoW allocations. These allocations are "magic"
+ * in that they're not referenced anywhere else in the filesystem, so we
+ * stash them in the refcount btree with a refcount of 1 until either file
+ * remapping or CoW cancellation happens.
+ */
+STATIC int
+xfs_refcount_adjust_cow_extents(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ enum xfs_refc_adjust_op adj)
+{
+ struct xfs_refcount_irec ext, tmp;
+ int error;
+ int found_rec, found_tmp;
+
+ if (aglen == 0)
+ return 0;
+
+ /* Find any overlapping refcount records */
+ error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_COW, agbno,
+ &found_rec);
+ if (error)
+ goto out_error;
+ error = xfs_refcount_get_rec(cur, &ext, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec &&
+ ext.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (!found_rec) {
+ ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+ ext.rc_blockcount = 0;
+ ext.rc_refcount = 0;
+ ext.rc_domain = XFS_REFC_DOMAIN_COW;
+ }
+
+ switch (adj) {
+ case XFS_REFCOUNT_ADJUST_COW_ALLOC:
+ /* Adding a CoW reservation, there should be nothing here. */
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ agbno + aglen > ext.rc_startblock)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ tmp.rc_startblock = agbno;
+ tmp.rc_blockcount = aglen;
+ tmp.rc_refcount = 1;
+ tmp.rc_domain = XFS_REFC_DOMAIN_COW;
+
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, &tmp);
+
+ error = xfs_refcount_insert(cur, &tmp,
+ &found_tmp);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ break;
+ case XFS_REFCOUNT_ADJUST_COW_FREE:
+ /* Removing a CoW reservation, there should be one extent. */
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_startblock != agbno)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount != aglen)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_refcount != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ ext.rc_refcount = 0;
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, &ext);
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ return error;
+out_error:
+ trace_xfs_refcount_modify_extent_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Add or remove refcount btree entries for CoW reservations.
+ */
+STATIC int
+xfs_refcount_adjust_cow(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ enum xfs_refc_adjust_op adj)
+{
+ bool shape_changed;
+ int error;
+
+ /*
+ * Ensure that no rcextents cross the boundary of the adjustment range.
+ */
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW,
+ agbno, &shape_changed);
+ if (error)
+ goto out_error;
+
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW,
+ agbno + aglen, &shape_changed);
+ if (error)
+ goto out_error;
+
+ /*
+ * Try to merge with the left or right extents of the range.
+ */
+ error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_COW, &agbno,
+ &aglen, adj, &shape_changed);
+ if (error)
+ goto out_error;
+
+ /* Now that we've taken care of the ends, adjust the middle extents */
+ error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj);
+ if (error)
+ goto out_error;
+
+ return 0;
+
+out_error:
+ trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Record a CoW allocation in the refcount btree.
+ */
+STATIC int
+__xfs_refcount_cow_alloc(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen)
+{
+ trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.pag->pag_agno,
+ agbno, aglen);
+
+ /* Add refcount btree reservation */
+ return xfs_refcount_adjust_cow(rcur, agbno, aglen,
+ XFS_REFCOUNT_ADJUST_COW_ALLOC);
+}
+
+/*
+ * Remove a CoW allocation from the refcount btree.
+ */
+STATIC int
+__xfs_refcount_cow_free(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen)
+{
+ trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.pag->pag_agno,
+ agbno, aglen);
+
+ /* Remove refcount btree reservation */
+ return xfs_refcount_adjust_cow(rcur, agbno, aglen,
+ XFS_REFCOUNT_ADJUST_COW_FREE);
+}
+
+/* Record a CoW staging extent in the refcount btree. */
+void
+xfs_refcount_alloc_cow_extent(
+ struct xfs_trans *tp,
+ xfs_fsblock_t fsb,
+ xfs_extlen_t len)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (!xfs_has_reflink(mp))
+ return;
+
+ __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
+
+ /* Add rmap entry */
+ xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
+ XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
+}
+
+/* Forget a CoW staging event in the refcount btree. */
+void
+xfs_refcount_free_cow_extent(
+ struct xfs_trans *tp,
+ xfs_fsblock_t fsb,
+ xfs_extlen_t len)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (!xfs_has_reflink(mp))
+ return;
+
+ /* Remove rmap entry */
+ xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
+ XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
+ __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
+}
+
+struct xfs_refcount_recovery {
+ struct list_head rr_list;
+ struct xfs_refcount_irec rr_rrec;
+};
+
+/* Stuff an extent on the recovery list. */
+STATIC int
+xfs_refcount_recover_extent(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct list_head *debris = priv;
+ struct xfs_refcount_recovery *rr;
+
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ be32_to_cpu(rec->refc.rc_refcount) != 1))
+ return -EFSCORRUPTED;
+
+ rr = kmalloc(sizeof(struct xfs_refcount_recovery),
+ GFP_KERNEL | __GFP_NOFAIL);
+ INIT_LIST_HEAD(&rr->rr_list);
+ xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
+
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ kfree(rr);
+ return -EFSCORRUPTED;
+ }
+
+ list_add_tail(&rr->rr_list, debris);
+ return 0;
+}
+
+/* Find and remove leftover CoW reservations. */
+int
+xfs_refcount_recover_cow_leftovers(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag)
+{
+ struct xfs_trans *tp;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp;
+ struct xfs_refcount_recovery *rr, *n;
+ struct list_head debris;
+ union xfs_btree_irec low;
+ union xfs_btree_irec high;
+ xfs_fsblock_t fsb;
+ int error;
+
+ /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */
+ BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG);
+ if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS)
+ return -EOPNOTSUPP;
+
+ INIT_LIST_HEAD(&debris);
+
+ /*
+ * In this first part, we use an empty transaction to gather up
+ * all the leftover CoW extents so that we can subsequently
+ * delete them. The empty transaction is used to avoid
+ * a buffer lock deadlock if there happens to be a loop in the
+ * refcountbt because we're allowed to re-grab a buffer that is
+ * already attached to our transaction. When we're done
+ * recording the CoW debris we cancel the (empty) transaction
+ * and everything goes away cleanly.
+ */
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+ if (error)
+ goto out_trans;
+ cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+
+ /* Find all the leftover CoW staging extents. */
+ memset(&low, 0, sizeof(low));
+ memset(&high, 0, sizeof(high));
+ low.rc.rc_domain = high.rc.rc_domain = XFS_REFC_DOMAIN_COW;
+ high.rc.rc_startblock = -1U;
+ error = xfs_btree_query_range(cur, &low, &high,
+ xfs_refcount_recover_extent, &debris);
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(tp, agbp);
+ xfs_trans_cancel(tp);
+ if (error)
+ goto out_free;
+
+ /* Now iterate the list to free the leftovers */
+ list_for_each_entry_safe(rr, n, &debris, rr_list) {
+ /* Set up transaction. */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+ if (error)
+ goto out_free;
+
+ trace_xfs_refcount_recover_extent(mp, pag->pag_agno,
+ &rr->rr_rrec);
+
+ /* Free the orphan record */
+ fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
+ rr->rr_rrec.rc_startblock);
+ xfs_refcount_free_cow_extent(tp, fsb,
+ rr->rr_rrec.rc_blockcount);
+
+ /* Free the block. */
+ xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_free;
+
+ list_del(&rr->rr_list);
+ kfree(rr);
+ }
+
+ return error;
+out_trans:
+ xfs_trans_cancel(tp);
+out_free:
+ /* Free the leftover list */
+ list_for_each_entry_safe(rr, n, &debris, rr_list) {
+ list_del(&rr->rr_list);
+ kfree(rr);
+ }
+ return error;
+}
+
+/* Is there a record covering a given extent? */
+int
+xfs_refcount_has_record(
+ struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool *exists)
+{
+ union xfs_btree_irec low;
+ union xfs_btree_irec high;
+
+ memset(&low, 0, sizeof(low));
+ low.rc.rc_startblock = bno;
+ memset(&high, 0xFF, sizeof(high));
+ high.rc.rc_startblock = bno + len - 1;
+ low.rc.rc_domain = high.rc.rc_domain = domain;
+
+ return xfs_btree_has_record(cur, &low, &high, exists);
+}
+
+int __init
+xfs_refcount_intent_init_cache(void)
+{
+ xfs_refcount_intent_cache = kmem_cache_create("xfs_refc_intent",
+ sizeof(struct xfs_refcount_intent),
+ 0, 0, NULL);
+
+ return xfs_refcount_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_refcount_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_refcount_intent_cache);
+ xfs_refcount_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
new file mode 100644
index 000000000..452f30556
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_REFCOUNT_H__
+#define __XFS_REFCOUNT_H__
+
+struct xfs_trans;
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_btree_cur;
+struct xfs_bmbt_irec;
+struct xfs_refcount_irec;
+
+extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec, int *stat);
+
+static inline uint32_t
+xfs_refcount_encode_startblock(
+ xfs_agblock_t startblock,
+ enum xfs_refc_domain domain)
+{
+ uint32_t start;
+
+ /*
+ * low level btree operations need to handle the generic btree range
+ * query functions (which set rc_domain == -1U), so we check that the
+ * domain is /not/ shared.
+ */
+ start = startblock & ~XFS_REFC_COWFLAG;
+ if (domain != XFS_REFC_DOMAIN_SHARED)
+ start |= XFS_REFC_COWFLAG;
+
+ return start;
+}
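
The helper packs the record domain into the on-disk startblock: the CoW domain is carried in the top bit of the 32-bit value (XFS_REFC_COWFLAG, defined as 1U << 31 in xfs_format.h), and decoding simply masks that bit back off. A stand-alone round-trip sketch with simplified names:

#include <stdint.h>
#include <stdio.h>

#define COWFLAG (1U << 31)	/* mirrors XFS_REFC_COWFLAG */

enum domain { DOMAIN_SHARED, DOMAIN_COW };

static uint32_t encode(uint32_t agbno, enum domain d)
{
	uint32_t start = agbno & ~COWFLAG;

	if (d != DOMAIN_SHARED)
		start |= COWFLAG;
	return start;
}

static void decode(uint32_t ondisk, uint32_t *agbno, enum domain *d)
{
	*d = (ondisk & COWFLAG) ? DOMAIN_COW : DOMAIN_SHARED;
	*agbno = ondisk & ~COWFLAG;
}

int main(void)
{
	uint32_t agbno;
	enum domain d;

	decode(encode(12345, DOMAIN_COW), &agbno, &d);
	printf("agbno=%u cow=%d\n", agbno, d == DOMAIN_COW);	/* 12345 1 */
	return 0;
}

Because the flag lives in the top bit, every CoW staging record sorts to the high end of the keyspace, which keeps the mount-time leftover scan in xfs_refcount_recover_cow_leftovers() to a single contiguous range query.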
+
+enum xfs_refcount_intent_type {
+ XFS_REFCOUNT_INCREASE = 1,
+ XFS_REFCOUNT_DECREASE,
+ XFS_REFCOUNT_ALLOC_COW,
+ XFS_REFCOUNT_FREE_COW,
+};
+
+struct xfs_refcount_intent {
+ struct list_head ri_list;
+ enum xfs_refcount_intent_type ri_type;
+ xfs_extlen_t ri_blockcount;
+ xfs_fsblock_t ri_startblock;
+};
+
+/* Check that the refcount is appropriate for the record domain. */
+static inline bool
+xfs_refcount_check_domain(
+ const struct xfs_refcount_irec *irec)
+{
+ if (irec->rc_domain == XFS_REFC_DOMAIN_COW && irec->rc_refcount != 1)
+ return false;
+ if (irec->rc_domain == XFS_REFC_DOMAIN_SHARED && irec->rc_refcount < 2)
+ return false;
+ return true;
+}
+
+void xfs_refcount_increase_extent(struct xfs_trans *tp,
+ struct xfs_bmbt_irec *irec);
+void xfs_refcount_decrease_extent(struct xfs_trans *tp,
+ struct xfs_bmbt_irec *irec);
+
+extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur, int error);
+extern int xfs_refcount_finish_one(struct xfs_trans *tp,
+ enum xfs_refcount_intent_type type, xfs_fsblock_t startblock,
+ xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb,
+ xfs_extlen_t *new_len, struct xfs_btree_cur **pcur);
+
+extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
+ xfs_extlen_t *flen, bool find_end_of_shared);
+
+void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
+ xfs_extlen_t len);
+void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
+ xfs_extlen_t len);
+extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
+ struct xfs_perag *pag);
+
+/*
+ * While we're adjusting the refcount records of an extent, we have
+ * to keep an eye on the number of extents we're dirtying -- run too
+ * many in a single transaction and we'll exceed the transaction's
+ * reservation and crash the fs. Each record adds 12 bytes to the
+ * log (plus any key updates) so we'll conservatively assume 32 bytes
+ * per record. We must also leave space for btree splits on both ends
+ * of the range and space for the CUD and a new CUI.
+ *
+ * Each EFI that we attach to the transaction is assumed to consume ~32 bytes.
+ * This is a low estimate for an EFI tracking a single extent (16 bytes for the
+ * EFI header, 16 for the extent, and 12 for the xlog op header), but the
+ * estimate is acceptable if there's more than one extent being freed.
+ * In the worst case of freeing every other block during a refcount decrease
+ * operation, we amortize the space used for one EFI log item across 16
+ * extents.
+ */
+#define XFS_REFCOUNT_ITEM_OVERHEAD 32
+
+extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain, xfs_agblock_t bno,
+ xfs_extlen_t len, bool *exists);
+union xfs_btree_rec;
+extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
+ struct xfs_refcount_irec *irec);
+extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec, int *stat);
+
+extern struct kmem_cache *xfs_refcount_intent_cache;
+
+int __init xfs_refcount_intent_init_cache(void);
+void xfs_refcount_intent_destroy_cache(void);
+
+#endif /* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
new file mode 100644
index 000000000..e1f789866
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -0,0 +1,545 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+
+static struct kmem_cache *xfs_refcountbt_cur_cache;
+
+static struct xfs_btree_cur *
+xfs_refcountbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_ag.agbp, cur->bc_ag.pag);
+}
+
+STATIC void
+xfs_refcountbt_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag = agbp->b_pag;
+
+ ASSERT(ptr->s != 0);
+
+ agf->agf_refcount_root = ptr->s;
+ be32_add_cpu(&agf->agf_refcount_level, inc);
+ pag->pagf_refcount_level += inc;
+
+ xfs_alloc_log_agf(cur->bc_tp, agbp,
+ XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL);
+}
+
+STATIC int
+xfs_refcountbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_alloc_arg args; /* block allocation args */
+ int error; /* error return value */
+
+ memset(&args, 0, sizeof(args));
+ args.tp = cur->bc_tp;
+ args.mp = cur->bc_mp;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refc_block(args.mp));
+ args.oinfo = XFS_RMAP_OINFO_REFC;
+ args.minlen = args.maxlen = args.prod = 1;
+ args.resv = XFS_AG_RESV_METADATA;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto out_error;
+ trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ args.agbno, 1);
+ if (args.fsbno == NULLFSBLOCK) {
+ *stat = 0;
+ return 0;
+ }
+ ASSERT(args.agno == cur->bc_ag.pag->pag_agno);
+ ASSERT(args.len == 1);
+
+ new->s = cpu_to_be32(args.agbno);
+ be32_add_cpu(&agf->agf_refcount_blocks, 1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
+
+ *stat = 1;
+ return 0;
+
+out_error:
+ return error;
+}
+
+STATIC int
+xfs_refcountbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ int error;
+
+ trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
+ be32_add_cpu(&agf->agf_refcount_blocks, -1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
+ error = xfs_free_extent(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC,
+ XFS_AG_RESV_METADATA);
+ if (error)
+ return error;
+
+ return error;
+}
+
+STATIC int
+xfs_refcountbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_refc_mnr[level != 0];
+}
+
+STATIC int
+xfs_refcountbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_refc_mxr[level != 0];
+}
+
+STATIC void
+xfs_refcountbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->refc.rc_startblock = rec->refc.rc_startblock;
+}
+
+STATIC void
+xfs_refcountbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ __u32 x;
+
+ x = be32_to_cpu(rec->refc.rc_startblock);
+ x += be32_to_cpu(rec->refc.rc_blockcount) - 1;
+ key->refc.rc_startblock = cpu_to_be32(x);
+}
+
+STATIC void
+xfs_refcountbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ uint32_t start;
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ rec->refc.rc_startblock = cpu_to_be32(start);
+ rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount);
+ rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount);
+}
+
+STATIC void
+xfs_refcountbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
+
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
+
+ ptr->s = agf->agf_refcount_root;
+}
+
+STATIC int64_t
+xfs_refcountbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ const struct xfs_refcount_key *kp = &key->refc;
+ const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ uint32_t start;
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ return (int64_t)be32_to_cpu(kp->rc_startblock) - start;
+}
+
+STATIC int64_t
+xfs_refcountbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
+ be32_to_cpu(k2->refc.rc_startblock);
+}
+
+STATIC xfs_failaddr_t
+xfs_refcountbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+ xfs_failaddr_t fa;
+ unsigned int level;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (!xfs_has_reflink(mp))
+ return __this_address;
+ fa = xfs_btree_sblock_v5hdr_verify(bp);
+ if (fa)
+ return fa;
+
+ level = be16_to_cpu(block->bb_level);
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_refcount_level)
+ return __this_address;
+ } else if (level >= mp->m_refc_maxlevels)
+ return __this_address;
+
+ return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]);
+}
+
+STATIC void
+xfs_refcountbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_refcountbt_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+
+ if (bp->b_error)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+STATIC void
+xfs_refcountbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_refcountbt_verify(bp);
+ if (fa) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+ xfs_btree_sblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
+ .name = "xfs_refcountbt",
+ .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) },
+ .verify_read = xfs_refcountbt_read_verify,
+ .verify_write = xfs_refcountbt_write_verify,
+ .verify_struct = xfs_refcountbt_verify,
+};
+
+STATIC int
+xfs_refcountbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->refc.rc_startblock) <
+ be32_to_cpu(k2->refc.rc_startblock);
+}
+
+STATIC int
+xfs_refcountbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ return be32_to_cpu(r1->refc.rc_startblock) +
+ be32_to_cpu(r1->refc.rc_blockcount) <=
+ be32_to_cpu(r2->refc.rc_startblock);
+}
+
+static const struct xfs_btree_ops xfs_refcountbt_ops = {
+ .rec_len = sizeof(struct xfs_refcount_rec),
+ .key_len = sizeof(struct xfs_refcount_key),
+
+ .dup_cursor = xfs_refcountbt_dup_cursor,
+ .set_root = xfs_refcountbt_set_root,
+ .alloc_block = xfs_refcountbt_alloc_block,
+ .free_block = xfs_refcountbt_free_block,
+ .get_minrecs = xfs_refcountbt_get_minrecs,
+ .get_maxrecs = xfs_refcountbt_get_maxrecs,
+ .init_key_from_rec = xfs_refcountbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_refcountbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_refcountbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_refcountbt_init_ptr_from_cur,
+ .key_diff = xfs_refcountbt_key_diff,
+ .buf_ops = &xfs_refcountbt_buf_ops,
+ .diff_two_keys = xfs_refcountbt_diff_two_keys,
+ .keys_inorder = xfs_refcountbt_keys_inorder,
+ .recs_inorder = xfs_refcountbt_recs_inorder,
+};
+
+/*
+ * Initialize a new refcount btree cursor.
+ */
+static struct xfs_btree_cur *
+xfs_refcountbt_init_common(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag)
+{
+ struct xfs_btree_cur *cur;
+
+ ASSERT(pag->pag_agno < mp->m_sb.sb_agcount);
+
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC,
+ mp->m_refc_maxlevels, xfs_refcountbt_cur_cache);
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
+
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
+
+ cur->bc_ag.refc.nr_ops = 0;
+ cur->bc_ag.refc.shape_changes = 0;
+ cur->bc_ops = &xfs_refcountbt_ops;
+ return cur;
+}
+
+/* Create a btree cursor. */
+struct xfs_btree_cur *
+xfs_refcountbt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_refcountbt_init_common(mp, tp, pag);
+ cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+ cur->bc_ag.agbp = agbp;
+ return cur;
+}
+
+/* Create a btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_refcountbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ struct xfs_perag *pag)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_refcountbt_init_common(mp, NULL, pag);
+ xfs_btree_stage_afakeroot(cur, afake);
+ return cur;
+}
+
+/*
+ * Swap in the new btree root. Once we pass this point the newly rebuilt btree
+ * is in place and we have to kill off all the old btree blocks.
+ */
+void
+xfs_refcountbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ agf->agf_refcount_root = cpu_to_be32(afake->af_root);
+ agf->agf_refcount_level = cpu_to_be32(afake->af_levels);
+ agf->agf_refcount_blocks = cpu_to_be32(afake->af_blocks);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS |
+ XFS_AGF_REFCOUNT_ROOT |
+ XFS_AGF_REFCOUNT_LEVEL);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops);
+}
+
+/* Calculate number of records in a refcount btree block. */
+static inline unsigned int
+xfs_refcountbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_refcount_rec);
+ return blocklen / (sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_refcount_ptr_t));
+}
+
+/*
+ * Calculate the number of records in a refcount btree block.
+ */
+int
+xfs_refcountbt_maxrecs(
+ int blocklen,
+ bool leaf)
+{
+ blocklen -= XFS_REFCOUNT_BLOCK_LEN;
+ return xfs_refcountbt_block_maxrecs(blocklen, leaf);
+}
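
As a worked example of the arithmetic: the short-form CRC block header is 56 bytes, a refcount record is 12 bytes (startblock, blockcount, refcount), and a key/pointer pair is 4 + 4 bytes, so a 4096-byte block comes out to 336 leaf records or 505 node entries (sizes taken from the on-disk format; verify against xfs_format.h if in doubt).

#include <stdio.h>

int main(void)
{
	unsigned int blocklen = 4096 - 56;	/* block minus XFS_REFCOUNT_BLOCK_LEN */
	unsigned int rec_size = 12;		/* struct xfs_refcount_rec */
	unsigned int key_size = 4, ptr_size = 4;

	printf("leaf maxrecs = %u\n", blocklen / rec_size);		 /* 336 */
	printf("node maxrecs = %u\n", blocklen / (key_size + ptr_size)); /* 505 */
	return 0;
}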
+
+/* Compute the max possible height of the maximally sized refcount btree. */
+unsigned int
+xfs_refcountbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_refcountbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_refcountbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_CRC_AG_BLOCKS);
+}
+
+/* Compute the maximum height of a refcount btree. */
+void
+xfs_refcountbt_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ if (!xfs_has_reflink(mp)) {
+ mp->m_refc_maxlevels = 0;
+ return;
+ }
+
+ mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(
+ mp->m_refc_mnr, mp->m_sb.sb_agblocks);
+ ASSERT(mp->m_refc_maxlevels <= xfs_refcountbt_maxlevels_ondisk());
+}
+
+/* Calculate the refcount btree size for some records. */
+xfs_extlen_t
+xfs_refcountbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_refc_mnr, len);
+}
+
+/*
+ * Calculate the maximum refcount btree size.
+ */
+xfs_extlen_t
+xfs_refcountbt_max_size(
+ struct xfs_mount *mp,
+ xfs_agblock_t agblocks)
+{
+ /* Bail out if we're uninitialized, which can happen in mkfs. */
+ if (mp->m_refc_mxr[0] == 0)
+ return 0;
+
+ return xfs_refcountbt_calc_size(mp, agblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_refcountbt_calc_reserves(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_extlen_t *ask,
+ xfs_extlen_t *used)
+{
+ struct xfs_buf *agbp;
+ struct xfs_agf *agf;
+ xfs_agblock_t agblocks;
+ xfs_extlen_t tree_len;
+ int error;
+
+ if (!xfs_has_reflink(mp))
+ return 0;
+
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+ if (error)
+ return error;
+
+ agf = agbp->b_addr;
+ agblocks = be32_to_cpu(agf->agf_length);
+ tree_len = be32_to_cpu(agf->agf_refcount_blocks);
+ xfs_trans_brelse(tp, agbp);
+
+ /*
+ * The log is permanently allocated, so the space it occupies will
+ * never be available for the kinds of things that would require btree
+ * expansion. We therefore can pretend the space isn't there.
+ */
+ if (xfs_ag_contains_log(mp, pag->pag_agno))
+ agblocks -= mp->m_sb.sb_logblocks;
+
+ *ask += xfs_refcountbt_max_size(mp, agblocks);
+ *used += tree_len;
+
+ return error;
+}
+
+int __init
+xfs_refcountbt_init_cur_cache(void)
+{
+ xfs_refcountbt_cur_cache = kmem_cache_create("xfs_refcbt_cur",
+ xfs_btree_cur_sizeof(xfs_refcountbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_refcountbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_refcountbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_refcountbt_cur_cache);
+ xfs_refcountbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
new file mode 100644
index 000000000..d66b37259
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_REFCOUNT_BTREE_H__
+#define __XFS_REFCOUNT_BTREE_H__
+
+/*
+ * Reference Count Btree on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xfs_perag;
+struct xbtree_afakeroot;
+
+/*
+ * Btree block header size
+ */
+#define XFS_REFCOUNT_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_REFCOUNT_REC_ADDR(block, index) \
+ ((struct xfs_refcount_rec *) \
+ ((char *)(block) + \
+ XFS_REFCOUNT_BLOCK_LEN + \
+ (((index) - 1) * sizeof(struct xfs_refcount_rec))))
+
+#define XFS_REFCOUNT_KEY_ADDR(block, index) \
+ ((struct xfs_refcount_key *) \
+ ((char *)(block) + \
+ XFS_REFCOUNT_BLOCK_LEN + \
+ ((index) - 1) * sizeof(struct xfs_refcount_key)))
+
+#define XFS_REFCOUNT_PTR_ADDR(block, index, maxrecs) \
+ ((xfs_refcount_ptr_t *) \
+ ((char *)(block) + \
+ XFS_REFCOUNT_BLOCK_LEN + \
+ (maxrecs) * sizeof(struct xfs_refcount_key) + \
+ ((index) - 1) * sizeof(xfs_refcount_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *agbp,
+ struct xfs_perag *pag);
+struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag);
+extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf);
+extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
+
+extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp,
+ xfs_agblock_t agblocks);
+
+extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_perag *pag, xfs_extlen_t *ask,
+ xfs_extlen_t *used);
+
+void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+
+unsigned int xfs_refcountbt_maxlevels_ondisk(void);
+
+int __init xfs_refcountbt_init_cur_cache(void);
+void xfs_refcountbt_destroy_cur_cache(void);
+
+#endif /* __XFS_REFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
new file mode 100644
index 000000000..b56aca1e7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -0,0 +1,2826 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_sb.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_inode.h"
+#include "xfs_ag.h"
+
+struct kmem_cache *xfs_rmap_intent_cache;
+
+/*
+ * Lookup the first record less than or equal to [bno, len, owner, offset]
+ * in the btree given by cur.
+ */
+int
+xfs_rmap_lookup_le(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ struct xfs_rmap_irec *irec,
+ int *stat)
+{
+ int get_stat = 0;
+ int error;
+
+ cur->bc_rec.r.rm_startblock = bno;
+ cur->bc_rec.r.rm_blockcount = 0;
+ cur->bc_rec.r.rm_owner = owner;
+ cur->bc_rec.r.rm_offset = offset;
+ cur->bc_rec.r.rm_flags = flags;
+
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+ if (error || !(*stat) || !irec)
+ return error;
+
+ error = xfs_rmap_get_rec(cur, irec, &get_stat);
+ if (error)
+ return error;
+ if (!get_stat)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/*
+ * Lookup the record exactly matching [bno, len, owner, offset]
+ * in the btree given by cur.
+ */
+int
+xfs_rmap_lookup_eq(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ int *stat)
+{
+ cur->bc_rec.r.rm_startblock = bno;
+ cur->bc_rec.r.rm_blockcount = len;
+ cur->bc_rec.r.rm_owner = owner;
+ cur->bc_rec.r.rm_offset = offset;
+ cur->bc_rec.r.rm_flags = flags;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len, owner, offset].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_rmap_update(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *irec)
+{
+ union xfs_btree_rec rec;
+ int error;
+
+ trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ irec->rm_startblock, irec->rm_blockcount,
+ irec->rm_owner, irec->rm_offset, irec->rm_flags);
+
+ rec.rmap.rm_startblock = cpu_to_be32(irec->rm_startblock);
+ rec.rmap.rm_blockcount = cpu_to_be32(irec->rm_blockcount);
+ rec.rmap.rm_owner = cpu_to_be64(irec->rm_owner);
+ rec.rmap.rm_offset = cpu_to_be64(
+ xfs_rmap_irec_offset_pack(irec));
+ error = xfs_btree_update(cur, &rec);
+ if (error)
+ trace_xfs_rmap_update_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+int
+xfs_rmap_insert(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ int i;
+ int error;
+
+ trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno,
+ len, owner, offset, flags);
+
+ error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+
+ rcur->bc_rec.r.rm_startblock = agbno;
+ rcur->bc_rec.r.rm_blockcount = len;
+ rcur->bc_rec.r.rm_owner = owner;
+ rcur->bc_rec.r.rm_offset = offset;
+ rcur->bc_rec.r.rm_flags = flags;
+ error = xfs_btree_insert(rcur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+done:
+ if (error)
+ trace_xfs_rmap_insert_error(rcur->bc_mp,
+ rcur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+STATIC int
+xfs_rmap_delete(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ int i;
+ int error;
+
+ trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno,
+ len, owner, offset, flags);
+
+ error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+
+ error = xfs_btree_delete(rcur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+done:
+ if (error)
+ trace_xfs_rmap_delete_error(rcur->bc_mp,
+ rcur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/* Convert an internal btree record to an rmap record. */
+int
+xfs_rmap_btrec_to_irec(
+ const union xfs_btree_rec *rec,
+ struct xfs_rmap_irec *irec)
+{
+ irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock);
+ irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount);
+ irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner);
+ return xfs_rmap_irec_offset_unpack(be64_to_cpu(rec->rmap.rm_offset),
+ irec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_rmap_get_rec(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *irec,
+ int *stat)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (error || !*stat)
+ return error;
+
+ if (xfs_rmap_btrec_to_irec(rec, irec))
+ goto out_bad_rec;
+
+ if (irec->rm_blockcount == 0)
+ goto out_bad_rec;
+ if (irec->rm_startblock <= XFS_AGFL_BLOCK(mp)) {
+ if (irec->rm_owner != XFS_RMAP_OWN_FS)
+ goto out_bad_rec;
+ if (irec->rm_blockcount != XFS_AGFL_BLOCK(mp) + 1)
+ goto out_bad_rec;
+ } else {
+ /* check for valid extent range, including overflow */
+ if (!xfs_verify_agbext(pag, irec->rm_startblock,
+ irec->rm_blockcount))
+ goto out_bad_rec;
+ }
+
+ if (!(xfs_verify_ino(mp, irec->rm_owner) ||
+ (irec->rm_owner <= XFS_RMAP_OWN_FS &&
+ irec->rm_owner >= XFS_RMAP_OWN_MIN)))
+ goto out_bad_rec;
+
+ return 0;
+out_bad_rec:
+ xfs_warn(mp,
+ "Reverse Mapping BTree record corruption in AG %d detected!",
+ pag->pag_agno);
+ xfs_warn(mp,
+ "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x",
+ irec->rm_owner, irec->rm_flags, irec->rm_startblock,
+ irec->rm_blockcount);
+ return -EFSCORRUPTED;
+}
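+
+/*
+ * Illustrative note on the checks above: records that start within the AG
+ * header area are expected to be the single static XFS_RMAP_OWN_FS mapping
+ * that covers the superblock, AGF, AGI, and AGFL blocks.  On a typical
+ * filesystem where the AGFL is AG block 3, that record is
+ * [startblock 0, blockcount 4, owner OWN_FS]; anything else in that range is
+ * treated as corruption.
+ */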
+
+struct xfs_find_left_neighbor_info {
+ struct xfs_rmap_irec high;
+ struct xfs_rmap_irec *irec;
+};
+
+/* For each rmap given, figure out if it matches the key we want. */
+STATIC int
+xfs_rmap_find_left_neighbor_helper(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_find_left_neighbor_info *info = priv;
+
+ trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, rec->rm_startblock,
+ rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
+ rec->rm_flags);
+
+ if (rec->rm_owner != info->high.rm_owner)
+ return 0;
+ if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
+ !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+ rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset)
+ return 0;
+
+ *info->irec = *rec;
+ return -ECANCELED;
+}
+
+/*
+ * Find the record to the left of the given extent, being careful only to
+ * return a match with the same owner and adjacent physical and logical
+ * block ranges.
+ */
+STATIC int
+xfs_rmap_find_left_neighbor(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ struct xfs_rmap_irec *irec,
+ int *stat)
+{
+ struct xfs_find_left_neighbor_info info;
+ int found = 0;
+ int error;
+
+ *stat = 0;
+ if (bno == 0)
+ return 0;
+ info.high.rm_startblock = bno - 1;
+ info.high.rm_owner = owner;
+ if (!XFS_RMAP_NON_INODE_OWNER(owner) &&
+ !(flags & XFS_RMAP_BMBT_BLOCK)) {
+ if (offset == 0)
+ return 0;
+ info.high.rm_offset = offset - 1;
+ } else
+ info.high.rm_offset = 0;
+ info.high.rm_flags = flags;
+ info.high.rm_blockcount = 0;
+ info.irec = irec;
+
+ trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
+
+ /*
+ * Historically, we always used the range query to walk every reverse
+ * mapping that could possibly overlap the key that the caller asked
+ * for, and filter out the ones that don't. That is very slow when
+ * there are a lot of records.
+ *
+ * However, there are two scenarios where the classic btree search can
+ * produce correct results -- if the index contains a record that is an
+ * exact match for the lookup key; and if there are no other records
+ * between the record we want and the key we supplied.
+ *
+ * As an optimization, try a non-overlapped lookup first. This makes
+ * extent conversion and remap operations run a bit faster if the
+ * physical extents aren't being shared. If we don't find what we
+ * want, we fall back to the overlapped query.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+ &found);
+ if (error)
+ return error;
+ if (found)
+ error = xfs_rmap_find_left_neighbor_helper(cur, irec, &info);
+ if (!error)
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_find_left_neighbor_helper, &info);
+ if (error != -ECANCELED)
+ return error;
+
+ *stat = 1;
+ trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+ irec->rm_flags);
+ return 0;
+}
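+
+/*
+ * Usage sketch (hypothetical caller): test *stat before trusting *irec --
+ *
+ *	struct xfs_rmap_irec	left;
+ *	int			has_left;
+ *
+ *	error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags,
+ *			&left, &has_left);
+ *	if (error)
+ *		return error;
+ *	if (has_left) {
+ *		// 'left' is a same-owner mapping adjoining [bno, offset)
+ *	}
+ */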
+
+/* For each rmap given, figure out if it matches the key we want. */
+STATIC int
+xfs_rmap_lookup_le_range_helper(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_find_left_neighbor_info *info = priv;
+
+ trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, rec->rm_startblock,
+ rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
+ rec->rm_flags);
+
+ if (rec->rm_owner != info->high.rm_owner)
+ return 0;
+ if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
+ !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+ (rec->rm_offset > info->high.rm_offset ||
+ rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset))
+ return 0;
+
+ *info->irec = *rec;
+ return -ECANCELED;
+}
+
+/*
+ * Find the record to the left of the given extent, being careful only to
+ * return a match with the same owner and overlapping physical and logical
+ * block ranges. This is the overlapping-interval version of
+ * xfs_rmap_lookup_le.
+ */
+int
+xfs_rmap_lookup_le_range(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ struct xfs_rmap_irec *irec,
+ int *stat)
+{
+ struct xfs_find_left_neighbor_info info;
+ int found = 0;
+ int error;
+
+ info.high.rm_startblock = bno;
+ info.high.rm_owner = owner;
+ if (!XFS_RMAP_NON_INODE_OWNER(owner) && !(flags & XFS_RMAP_BMBT_BLOCK))
+ info.high.rm_offset = offset;
+ else
+ info.high.rm_offset = 0;
+ info.high.rm_flags = flags;
+ info.high.rm_blockcount = 0;
+ *stat = 0;
+ info.irec = irec;
+
+ trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ bno, 0, owner, offset, flags);
+
+ /*
+ * Historically, we always used the range query to walk every reverse
+ * mapping that could possibly overlap the key that the caller asked
+ * for, and filter out the ones that don't. That is very slow when
+ * there are a lot of records.
+ *
+ * However, there are two scenarios where the classic btree search can
+ * produce correct results -- if the index contains a record that is an
+ * exact match for the lookup key; and if there are no other records
+ * between the record we want and the key we supplied.
+ *
+ * As an optimization, try a non-overlapped lookup first. This makes
+ * scrub run much faster on most filesystems because bmbt records are
+ * usually an exact match for rmap records. If we don't find what we
+ * want, we fall back to the overlapped query.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+ &found);
+ if (error)
+ return error;
+ if (found)
+ error = xfs_rmap_lookup_le_range_helper(cur, irec, &info);
+ if (!error)
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_lookup_le_range_helper, &info);
+ if (error != -ECANCELED)
+ return error;
+
+ *stat = 1;
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+ irec->rm_flags);
+ return 0;
+}
+
+/*
+ * Perform all the relevant owner checks for a removal op. If we're doing an
+ * unknown-owner removal then we have no owner information to check.
+ */
+static int
+xfs_rmap_free_check_owner(
+ struct xfs_mount *mp,
+ uint64_t ltoff,
+ struct xfs_rmap_irec *rec,
+ xfs_filblks_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ int error = 0;
+
+ if (owner == XFS_RMAP_OWN_UNKNOWN)
+ return 0;
+
+ /* Make sure the unwritten flag matches. */
+ if (XFS_IS_CORRUPT(mp,
+ (flags & XFS_RMAP_UNWRITTEN) !=
+ (rec->rm_flags & XFS_RMAP_UNWRITTEN))) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* Make sure the owner matches what we expect to find in the tree. */
+ if (XFS_IS_CORRUPT(mp, owner != rec->rm_owner)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* Check the offset, if necessary. */
+ if (XFS_RMAP_NON_INODE_OWNER(owner))
+ goto out;
+
+ if (flags & XFS_RMAP_BMBT_BLOCK) {
+ if (XFS_IS_CORRUPT(mp,
+ !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ } else {
+ if (XFS_IS_CORRUPT(mp, rec->rm_offset > offset)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ if (XFS_IS_CORRUPT(mp,
+ offset + len > ltoff + rec->rm_blockcount)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ }
+
+out:
+ return error;
+}
+
+/*
+ * Find the extent in the rmap btree and remove it.
+ *
+ * The record we find should always be an exact match for the extent that we're
+ * looking for, since we insert them into the btree without modification.
+ *
+ * Special Case #1: when growing the filesystem, we "free" an extent when
+ * growing the last AG. This extent is new space and so it is not tracked as
+ * used space in the btree. The growfs code will pass in an owner of
+ * XFS_RMAP_OWN_NULL to indicate that it expects there to be no owner of this
+ * extent. We verify this by checking that the extent lookup results in a
+ * record that does not overlap.
+ *
+ * Special Case #2: EFIs do not record the owner of the extent, so when
+ * recovering EFIs from the log we pass in XFS_RMAP_OWN_UNKNOWN to tell the rmap
+ * btree to ignore the owner (i.e. wildcard match) so we don't trigger
+ * corruption checks during log recovery.
+ */
+STATIC int
+xfs_rmap_unmap(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ uint64_t ltoff;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+ bool ignore_off;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & XFS_RMAP_BMBT_BLOCK);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * We should always have a left record because there's a static record
+ * for the AG headers at rm_startblock == 0 created by mkfs/growfs that
+ * will not ever be removed from the tree.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ ltoff = ltrec.rm_offset;
+
+ /*
+ * For growfs, the incoming extent must be beyond the left record we
+ * just found as it is new space and won't be used by anyone. This is
+ * just a corruption check as we don't actually do anything with this
+ * extent. Note that we need to use >= instead of > because it might
+ * be the case that the "left" extent goes all the way to EOFS.
+ */
+ if (owner == XFS_RMAP_OWN_NULL) {
+ if (XFS_IS_CORRUPT(mp,
+ bno <
+ ltrec.rm_startblock + ltrec.rm_blockcount)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ goto out_done;
+ }
+
+ /*
+ * If we're doing an unknown-owner removal for EFI recovery, we expect
+ * to find the full range in the rmapbt or nothing at all. If we
+ * don't find any rmaps overlapping either end of the range, we're
+ * done. Hopefully this means that the EFI creator already queued
+ * (and finished) a RUI to remove the rmap.
+ */
+ if (owner == XFS_RMAP_OWN_UNKNOWN &&
+ ltrec.rm_startblock + ltrec.rm_blockcount <= bno) {
+ struct xfs_rmap_irec rtrec;
+
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto out_error;
+ if (i == 0)
+ goto out_done;
+ error = xfs_rmap_get_rec(cur, &rtrec, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (rtrec.rm_startblock >= bno + len)
+ goto out_done;
+ }
+
+ /* Make sure the extent we found covers the entire freeing range. */
+ if (XFS_IS_CORRUPT(mp,
+ ltrec.rm_startblock > bno ||
+ ltrec.rm_startblock + ltrec.rm_blockcount <
+ bno + len)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ /* Check owner information. */
+ error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, len, owner,
+ offset, flags);
+ if (error)
+ goto out_error;
+
+ if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
+ /* exact match, simply remove the record from rmap tree */
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
+ ltrec.rm_startblock, ltrec.rm_blockcount,
+ ltrec.rm_owner, ltrec.rm_offset,
+ ltrec.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ } else if (ltrec.rm_startblock == bno) {
+ /*
+ * overlap left hand side of extent: move the start, trim the
+ * length and update the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_startblock += len;
+ ltrec.rm_blockcount -= len;
+ if (!ignore_off)
+ ltrec.rm_offset += len;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) {
+ /*
+ * overlap right hand side of extent: trim the length and update
+ * the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+	} else {
+ /*
+ * overlap middle of extent: trim the length of the existing
+ * record to the length of the new left-extent size, increment
+ * the insertion position so we can insert a new record
+ * containing the remaining right-extent space.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrr| |rrrr|
+ * bno len
+ */
+ xfs_extlen_t orig_len = ltrec.rm_blockcount;
+
+ ltrec.rm_blockcount = bno - ltrec.rm_startblock;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto out_error;
+
+ cur->bc_rec.r.rm_startblock = bno + len;
+ cur->bc_rec.r.rm_blockcount = orig_len - len -
+ ltrec.rm_blockcount;
+ cur->bc_rec.r.rm_owner = ltrec.rm_owner;
+ if (ignore_off)
+ cur->bc_rec.r.rm_offset = 0;
+ else
+ cur->bc_rec.r.rm_offset = offset + len;
+ cur->bc_rec.r.rm_flags = flags;
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno,
+ cur->bc_rec.r.rm_startblock,
+ cur->bc_rec.r.rm_blockcount,
+ cur->bc_rec.r.rm_owner,
+ cur->bc_rec.r.rm_offset,
+ cur->bc_rec.r.rm_flags);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto out_error;
+ }
+
+out_done:
+ trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ if (error)
+ trace_xfs_rmap_unmap_error(mp, cur->bc_ag.pag->pag_agno,
+ error, _RET_IP_);
+ return error;
+}
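+
+/*
+ * Worked example for the "overlap middle" case above: if the tree holds one
+ * mapping with rm_startblock 100, rm_blockcount 20, rm_offset 0 and we unmap
+ * bno 105, len 5 (so the caller's offset is 5), the existing record is
+ * trimmed to [100, 5, offset 0] and a new record [110, 10, offset 10] is
+ * inserted for the right-hand remainder.
+ */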
+
+/*
+ * Remove a reference to an extent in the rmap btree.
+ */
+int
+xfs_rmap_free(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ if (!xfs_has_rmapbt(mp))
+ return 0;
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
+
+ error = xfs_rmap_unmap(cur, bno, len, false, oinfo);
+
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * A mergeable rmap must have the same owner and the same values for
+ * the unwritten, attr_fork, and bmbt flags. The startblock and
+ * offset are checked separately.
+ */
+static bool
+xfs_rmap_is_mergeable(
+ struct xfs_rmap_irec *irec,
+ uint64_t owner,
+ unsigned int flags)
+{
+ if (irec->rm_owner == XFS_RMAP_OWN_NULL)
+ return false;
+ if (irec->rm_owner != owner)
+ return false;
+ if ((flags & XFS_RMAP_UNWRITTEN) ^
+ (irec->rm_flags & XFS_RMAP_UNWRITTEN))
+ return false;
+ if ((flags & XFS_RMAP_ATTR_FORK) ^
+ (irec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return false;
+ if ((flags & XFS_RMAP_BMBT_BLOCK) ^
+ (irec->rm_flags & XFS_RMAP_BMBT_BLOCK))
+ return false;
+ return true;
+}
+
+/*
+ * When we allocate a new block, the first thing we do is add a reference to
+ * the extent in the rmap btree. This takes the form of a [agbno, length,
+ * owner, offset] record. Flags are encoded in the high bits of the offset
+ * field.
+ */
+STATIC int
+xfs_rmap_map(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ struct xfs_rmap_irec gtrec;
+ int have_gt;
+ int have_lt;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags = 0;
+ bool ignore_off;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ASSERT(owner != 0);
+ ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & XFS_RMAP_BMBT_BLOCK);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+ ASSERT(!xfs_rmap_should_skip_owner_update(oinfo));
+
+ /*
+ * For the initial lookup, look for an exact match or the left-adjacent
+ * record for our insertion point. This will also give us the record for
+ * start block contiguity tests.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec,
+ &have_lt);
+ if (error)
+ goto out_error;
+ if (have_lt) {
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+
+ if (!xfs_rmap_is_mergeable(&ltrec, owner, flags))
+ have_lt = 0;
+ }
+
+ if (XFS_IS_CORRUPT(mp,
+ have_lt != 0 &&
+ ltrec.rm_startblock + ltrec.rm_blockcount > bno)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ /*
+ * Increment the cursor to see if we have a right-adjacent record to our
+ * insertion point. This will give us the record for end block
+ * contiguity tests.
+ */
+ error = xfs_btree_increment(cur, 0, &have_gt);
+ if (error)
+ goto out_error;
+ if (have_gt) {
+ error = xfs_rmap_get_rec(cur, &gtrec, &have_gt);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(mp, bno + len > gtrec.rm_startblock)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
+ have_gt = 0;
+ }
+
+ /*
+ * Note: cursor currently points one record to the right of ltrec, even
+ * if there is no record in the tree to the right.
+ */
+ if (have_lt &&
+ ltrec.rm_startblock + ltrec.rm_blockcount == bno &&
+ (ignore_off || ltrec.rm_offset + ltrec.rm_blockcount == offset)) {
+ /*
+ * left edge contiguous, merge into left record.
+ *
+ * ltbno ltlen
+ * orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_blockcount += len;
+ if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ (ignore_off || offset + len == gtrec.rm_offset) &&
+ (unsigned long)ltrec.rm_blockcount + len +
+ gtrec.rm_blockcount <= XFS_RMAP_LEN_MAX) {
+ /*
+ * right edge also contiguous, delete right record
+ * and merge into left record.
+ *
+ * ltbno ltlen gtbno gtlen
+ * orig: |ooooooooo| |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
+ */
+ ltrec.rm_blockcount += gtrec.rm_blockcount;
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
+ gtrec.rm_startblock,
+ gtrec.rm_blockcount,
+ gtrec.rm_owner,
+ gtrec.rm_offset,
+ gtrec.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ }
+
+ /* point the cursor back to the left record and update */
+ error = xfs_btree_decrement(cur, 0, &have_gt);
+ if (error)
+ goto out_error;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ (ignore_off || offset + len == gtrec.rm_offset)) {
+ /*
+ * right edge contiguous, merge into right record.
+ *
+ * gtbno gtlen
+ * Orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * Result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ gtrec.rm_startblock = bno;
+ gtrec.rm_blockcount += len;
+ if (!ignore_off)
+ gtrec.rm_offset = offset;
+ error = xfs_rmap_update(cur, &gtrec);
+ if (error)
+ goto out_error;
+ } else {
+ /*
+ * no contiguous edge with identical owner, insert
+ * new record at current cursor position.
+ */
+ cur->bc_rec.r.rm_startblock = bno;
+ cur->bc_rec.r.rm_blockcount = len;
+ cur->bc_rec.r.rm_owner = owner;
+ cur->bc_rec.r.rm_offset = offset;
+ cur->bc_rec.r.rm_flags = flags;
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ owner, offset, flags);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ }
+
+ trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ if (error)
+ trace_xfs_rmap_map_error(mp, cur->bc_ag.pag->pag_agno,
+ error, _RET_IP_);
+ return error;
+}
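+
+/*
+ * Worked example for the merge logic above: if the tree already holds
+ * same-owner mappings [100, 5, offset 0] and [108, 7, offset 8] and we map
+ * bno 105, len 3 at offset 5, both edges are contiguous, so the right record
+ * is deleted and the left record grows to [100, 15, offset 0].
+ */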
+
+/*
+ * Add a reference to an extent in the rmap btree.
+ */
+int
+xfs_rmap_alloc(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ if (!xfs_has_rmapbt(mp))
+ return 0;
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
+ error = xfs_rmap_map(cur, bno, len, false, oinfo);
+
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+#define RMAP_LEFT_CONTIG (1 << 0)
+#define RMAP_RIGHT_CONTIG (1 << 1)
+#define RMAP_LEFT_FILLING (1 << 2)
+#define RMAP_RIGHT_FILLING (1 << 3)
+#define RMAP_LEFT_VALID (1 << 6)
+#define RMAP_RIGHT_VALID (1 << 7)
+
+#define LEFT r[0]
+#define RIGHT r[1]
+#define PREV r[2]
+#define NEW r[3]
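+
+/*
+ * The FILLING/CONTIG bits above are OR'd into 'state' and the switches below
+ * mask off the VALID bits.  For example, a conversion that consumes the whole
+ * previous extent and can merge with both neighbours matches the case label
+ *
+ *	RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ *	RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG
+ *
+ * which deletes RIGHT and PREV and extends LEFT across all three ranges.
+ */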
+
+/*
+ * Convert an unwritten extent to a real extent or vice versa.
+ * Does not handle overlapping extents.
+ */
+STATIC int
+xfs_rmap_convert(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec r[4]; /* neighbor extent entries */
+ /* left is 0, right is 1, */
+ /* prev is 2, new is 3 */
+ uint64_t owner;
+ uint64_t offset;
+ uint64_t new_endoff;
+ unsigned int oldext;
+ unsigned int newext;
+ unsigned int flags = 0;
+ int i;
+ int state = 0;
+ int error;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
+ oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
+ new_endoff = offset + len;
+ trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * For the initial lookup, look for an exact match or the left-adjacent
+ * record for our insertion point. This will also give us the record for
+ * start block contiguity tests.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, &PREV, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, PREV.rm_startblock,
+ PREV.rm_blockcount, PREV.rm_owner,
+ PREV.rm_offset, PREV.rm_flags);
+
+ ASSERT(PREV.rm_offset <= offset);
+ ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
+ ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext);
+ newext = ~oldext & XFS_RMAP_UNWRITTEN;
+
+ /*
+ * Set flags determining what part of the previous oldext allocation
+ * extent is being replaced by a newext allocation.
+ */
+ if (PREV.rm_offset == offset)
+ state |= RMAP_LEFT_FILLING;
+ if (PREV.rm_offset + PREV.rm_blockcount == new_endoff)
+ state |= RMAP_RIGHT_FILLING;
+
+ /*
+ * Decrement the cursor to see if we have a left-adjacent record to our
+ * insertion point. This will give us the record for end block
+ * contiguity tests.
+ */
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_LEFT_VALID;
+ error = xfs_rmap_get_rec(cur, &LEFT, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if (XFS_IS_CORRUPT(mp,
+ LEFT.rm_startblock + LEFT.rm_blockcount >
+ bno)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, LEFT.rm_startblock,
+ LEFT.rm_blockcount, LEFT.rm_owner,
+ LEFT.rm_offset, LEFT.rm_flags);
+ if (LEFT.rm_startblock + LEFT.rm_blockcount == bno &&
+ LEFT.rm_offset + LEFT.rm_blockcount == offset &&
+ xfs_rmap_is_mergeable(&LEFT, owner, newext))
+ state |= RMAP_LEFT_CONTIG;
+ }
+
+ /*
+ * Increment the cursor to see if we have a right-adjacent record to our
+ * insertion point. This will give us the record for end block
+ * contiguity tests.
+ */
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_RIGHT_VALID;
+ error = xfs_rmap_get_rec(cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (bno + len == RIGHT.rm_startblock &&
+ offset + len == RIGHT.rm_offset &&
+ xfs_rmap_is_mergeable(&RIGHT, owner, newext))
+ state |= RMAP_RIGHT_CONTIG;
+ }
+
+ /* check that left + prev + right is not too long */
+ if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) ==
+ (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) &&
+ (unsigned long)LEFT.rm_blockcount + len +
+ RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
+ state &= ~RMAP_RIGHT_CONTIG;
+
+ trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state,
+ _RET_IP_);
+
+ /* reset the cursor back to PREV */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+
+ /*
+ * Switch out based on the FILLING and CONTIG state bits.
+ */
+ switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) {
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left and right neighbors are both contiguous with new.
+ */
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
+ RIGHT.rm_startblock, RIGHT.rm_blockcount,
+ RIGHT.rm_owner, RIGHT.rm_offset,
+ RIGHT.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
+ PREV.rm_startblock, PREV.rm_blockcount,
+ PREV.rm_owner, PREV.rm_offset,
+ PREV.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW = LEFT;
+ NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left neighbor is contiguous, the right is not.
+ */
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
+ PREV.rm_startblock, PREV.rm_blockcount,
+ PREV.rm_owner, PREV.rm_offset,
+ PREV.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW = LEFT;
+ NEW.rm_blockcount += PREV.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The right neighbor is contiguous, the left is not.
+ */
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
+ RIGHT.rm_startblock, RIGHT.rm_blockcount,
+ RIGHT.rm_owner, RIGHT.rm_offset,
+ RIGHT.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW = PREV;
+ NEW.rm_blockcount = len + RIGHT.rm_blockcount;
+ NEW.rm_flags = newext;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * Neither the left nor right neighbors are contiguous with
+ * the new one.
+ */
+ NEW = PREV;
+ NEW.rm_flags = newext;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is contiguous.
+ */
+ NEW = PREV;
+ NEW.rm_offset += len;
+ NEW.rm_startblock += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is not contiguous.
+ */
+ NEW = PREV;
+ NEW.rm_startblock += len;
+ NEW.rm_offset += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ NEW.rm_startblock = bno;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = offset;
+ NEW.rm_blockcount = len;
+ NEW.rm_flags = newext;
+ cur->bc_rec.r = NEW;
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno,
+ len, owner, offset, newext);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ break;
+
+ case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is contiguous with the new allocation.
+ */
+ NEW = PREV;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ NEW = RIGHT;
+ NEW.rm_offset = offset;
+ NEW.rm_startblock = bno;
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_RIGHT_FILLING:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is not contiguous.
+ */
+ NEW = PREV;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ error = xfs_rmap_lookup_eq(cur, bno, len, owner, offset,
+ oldext, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_startblock = bno;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = offset;
+ NEW.rm_blockcount = len;
+ NEW.rm_flags = newext;
+ cur->bc_rec.r = NEW;
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno,
+ len, owner, offset, newext);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ break;
+
+ case 0:
+ /*
+ * Setting the middle part of a previous oldext extent to
+ * newext. Contiguity is impossible here.
+ * One extent becomes three extents.
+ */
+ /* new right extent - oldext */
+ NEW.rm_startblock = bno + len;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = new_endoff;
+ NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount -
+ new_endoff;
+ NEW.rm_flags = PREV.rm_flags;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ /* new left extent - oldext */
+ NEW = PREV;
+ NEW.rm_blockcount = offset - PREV.rm_offset;
+ cur->bc_rec.r = NEW;
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno,
+ NEW.rm_startblock, NEW.rm_blockcount,
+ NEW.rm_owner, NEW.rm_offset,
+ NEW.rm_flags);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ /*
+ * Reset the cursor to the position of the new extent
+ * we are about to insert as we can't trust it after
+ * the previous insert.
+ */
+ error = xfs_rmap_lookup_eq(cur, bno, len, owner, offset,
+ oldext, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ /* new middle extent - newext */
+ cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN;
+ cur->bc_rec.r.rm_flags |= newext;
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ owner, offset, newext);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_CONTIG:
+ case RMAP_RIGHT_CONTIG:
+ /*
+ * These cases are all impossible.
+ */
+ ASSERT(0);
+ }
+
+ trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+done:
+ if (error)
+ trace_xfs_rmap_convert_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Convert an unwritten extent to a real extent or vice versa. If there is no
+ * possibility of overlapping extents, delegate to the simpler convert
+ * function.
+ */
+STATIC int
+xfs_rmap_convert_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec r[4]; /* neighbor extent entries */
+ /* left is 0, right is 1, */
+ /* prev is 2, new is 3 */
+ uint64_t owner;
+ uint64_t offset;
+ uint64_t new_endoff;
+ unsigned int oldext;
+ unsigned int newext;
+ unsigned int flags = 0;
+ int i;
+ int state = 0;
+ int error;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
+ oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
+ new_endoff = offset + len;
+ trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+	 * For the initial lookup, look for an exact match or the left-adjacent
+ * record for our insertion point. This will also give us the record for
+ * start block contiguity tests.
+ */
+ error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, oldext,
+ &PREV, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+
+ ASSERT(PREV.rm_offset <= offset);
+ ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
+ ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext);
+ newext = ~oldext & XFS_RMAP_UNWRITTEN;
+
+ /*
+ * Set flags determining what part of the previous oldext allocation
+ * extent is being replaced by a newext allocation.
+ */
+ if (PREV.rm_offset == offset)
+ state |= RMAP_LEFT_FILLING;
+ if (PREV.rm_offset + PREV.rm_blockcount == new_endoff)
+ state |= RMAP_RIGHT_FILLING;
+
+ /* Is there a left record that abuts our range? */
+ error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, newext,
+ &LEFT, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_LEFT_VALID;
+ if (XFS_IS_CORRUPT(mp,
+ LEFT.rm_startblock + LEFT.rm_blockcount >
+ bno)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if (xfs_rmap_is_mergeable(&LEFT, owner, newext))
+ state |= RMAP_LEFT_CONTIG;
+ }
+
+ /* Is there a right record that abuts our range? */
+ error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len,
+ newext, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_RIGHT_VALID;
+ error = xfs_rmap_get_rec(cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (xfs_rmap_is_mergeable(&RIGHT, owner, newext))
+ state |= RMAP_RIGHT_CONTIG;
+ }
+
+ /* check that left + prev + right is not too long */
+ if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) ==
+ (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) &&
+ (unsigned long)LEFT.rm_blockcount + len +
+ RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
+ state &= ~RMAP_RIGHT_CONTIG;
+
+ trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state,
+ _RET_IP_);
+ /*
+ * Switch out based on the FILLING and CONTIG state bits.
+ */
+ switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) {
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left and right neighbors are both contiguous with new.
+ */
+ error = xfs_rmap_delete(cur, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (error)
+ goto done;
+ error = xfs_rmap_delete(cur, PREV.rm_startblock,
+ PREV.rm_blockcount, PREV.rm_owner,
+ PREV.rm_offset, PREV.rm_flags);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left neighbor is contiguous, the right is not.
+ */
+ error = xfs_rmap_delete(cur, PREV.rm_startblock,
+ PREV.rm_blockcount, PREV.rm_owner,
+ PREV.rm_offset, PREV.rm_flags);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_blockcount += PREV.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The right neighbor is contiguous, the left is not.
+ */
+ error = xfs_rmap_delete(cur, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (error)
+ goto done;
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_blockcount += RIGHT.rm_blockcount;
+ NEW.rm_flags = RIGHT.rm_flags;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * Neither the left nor right neighbors are contiguous with
+ * the new one.
+ */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_flags = newext;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is contiguous.
+ */
+ NEW = PREV;
+ error = xfs_rmap_delete(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW.rm_offset += len;
+ NEW.rm_startblock += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is not contiguous.
+ */
+ NEW = PREV;
+ error = xfs_rmap_delete(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW.rm_offset += len;
+ NEW.rm_startblock += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ error = xfs_rmap_insert(cur, bno, len, owner, offset, newext);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is contiguous with the new allocation.
+ */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_blockcount = offset - NEW.rm_offset;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ NEW = RIGHT;
+ error = xfs_rmap_delete(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW.rm_offset = offset;
+ NEW.rm_startblock = bno;
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_RIGHT_FILLING:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is not contiguous.
+ */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ error = xfs_rmap_insert(cur, bno, len, owner, offset, newext);
+ if (error)
+ goto done;
+ break;
+
+ case 0:
+ /*
+ * Setting the middle part of a previous oldext extent to
+ * newext. Contiguity is impossible here.
+ * One extent becomes three extents.
+ */
+ /* new right extent - oldext */
+ NEW.rm_startblock = bno + len;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = new_endoff;
+ NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount -
+ new_endoff;
+ NEW.rm_flags = PREV.rm_flags;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset,
+ NEW.rm_flags);
+ if (error)
+ goto done;
+ /* new left extent - oldext */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_blockcount = offset - NEW.rm_offset;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ /* new middle extent - newext */
+ NEW.rm_startblock = bno;
+ NEW.rm_blockcount = len;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = offset;
+ NEW.rm_flags = newext;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset,
+ NEW.rm_flags);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_CONTIG:
+ case RMAP_RIGHT_CONTIG:
+ /*
+ * These cases are all impossible.
+ */
+ ASSERT(0);
+ }
+
+ trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+done:
+ if (error)
+ trace_xfs_rmap_convert_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+#undef NEW
+#undef LEFT
+#undef RIGHT
+#undef PREV
+
+/*
+ * Find an extent in the rmap btree and unmap it. For rmap extent types that
+ * can overlap (data fork rmaps on reflink filesystems) we must be careful
+ * that the prev/next records in the btree might belong to another owner.
+ * Therefore we must use delete+insert to alter any of the key fields.
+ *
+ * For every other situation there can only be one owner for a given extent,
+ * so we can call the regular _free function.
+ */
+STATIC int
+xfs_rmap_unmap_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ uint64_t ltoff;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * We should always have a left record because there's a static record
+ * for the AG headers at rm_startblock == 0 created by mkfs/growfs that
+ * will not ever be removed from the tree.
+ */
+ error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags,
+ &ltrec, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ ltoff = ltrec.rm_offset;
+
+ /* Make sure the extent we found covers the entire freeing range. */
+ if (XFS_IS_CORRUPT(mp,
+ ltrec.rm_startblock > bno ||
+ ltrec.rm_startblock + ltrec.rm_blockcount <
+ bno + len)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ /* Make sure the owner matches what we expect to find in the tree. */
+ if (XFS_IS_CORRUPT(mp, owner != ltrec.rm_owner)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ /* Make sure the unwritten flag matches. */
+ if (XFS_IS_CORRUPT(mp,
+ (flags & XFS_RMAP_UNWRITTEN) !=
+ (ltrec.rm_flags & XFS_RMAP_UNWRITTEN))) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ /* Check the offset. */
+ if (XFS_IS_CORRUPT(mp, ltrec.rm_offset > offset)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(mp, offset > ltoff + ltrec.rm_blockcount)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
+ /* Exact match, simply remove the record from rmap tree. */
+ error = xfs_rmap_delete(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ if (error)
+ goto out_error;
+ } else if (ltrec.rm_startblock == bno) {
+ /*
+ * Overlap left hand side of extent: move the start, trim the
+ * length and update the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+
+ /* Delete prev rmap. */
+ error = xfs_rmap_delete(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ if (error)
+ goto out_error;
+
+ /* Add an rmap at the new offset. */
+ ltrec.rm_startblock += len;
+ ltrec.rm_blockcount -= len;
+ ltrec.rm_offset += len;
+ error = xfs_rmap_insert(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ if (error)
+ goto out_error;
+ } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) {
+ /*
+ * Overlap right hand side of extent: trim the length and
+ * update the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+ error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ ltrec.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else {
+ /*
+ * Overlap middle of extent: trim the length of the existing
+ * record to the length of the new left-extent size, increment
+ * the insertion position so we can insert a new record
+ * containing the remaining right-extent space.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrr| |rrrr|
+ * bno len
+ */
+ xfs_extlen_t orig_len = ltrec.rm_blockcount;
+
+ /* Shrink the left side of the rmap */
+ error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ ltrec.rm_blockcount = bno - ltrec.rm_startblock;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+
+ /* Add an rmap at the new offset */
+ error = xfs_rmap_insert(cur, bno + len,
+ orig_len - len - ltrec.rm_blockcount,
+ ltrec.rm_owner, offset + len,
+ ltrec.rm_flags);
+ if (error)
+ goto out_error;
+ }
+
+ trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ if (error)
+ trace_xfs_rmap_unmap_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
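+
+/*
+ * Worked example for the "overlap left hand side" case above: if the tree
+ * holds a mapping [100, 20, offset 0] and we unmap bno 100, len 5, the old
+ * record is deleted and re-inserted as [105, 15, offset 5]; delete+insert is
+ * used because the key fields change, as the function comment explains.
+ */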
+
+/*
+ * Find an extent in the rmap btree and map it. For rmap extent types that
+ * can overlap (data fork rmaps on reflink filesystems) we must be careful
+ * that the prev/next records in the btree might belong to another owner.
+ * Therefore we must use delete+insert to alter any of the key fields.
+ *
+ * For every other situation there can only be one owner for a given extent,
+ * so we can call the regular _alloc function.
+ */
+STATIC int
+xfs_rmap_map_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ struct xfs_rmap_irec gtrec;
+ int have_gt;
+ int have_lt;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags = 0;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+
+ /* Is there a left record that abuts our range? */
+ error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags,
+ &ltrec, &have_lt);
+ if (error)
+ goto out_error;
+ if (have_lt &&
+ !xfs_rmap_is_mergeable(&ltrec, owner, flags))
+ have_lt = 0;
+
+ /* Is there a right record that abuts our range? */
+ error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len,
+ flags, &have_gt);
+ if (error)
+ goto out_error;
+ if (have_gt) {
+ error = xfs_rmap_get_rec(cur, &gtrec, &have_gt);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+
+ if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
+ have_gt = 0;
+ }
+
+ if (have_lt &&
+ ltrec.rm_startblock + ltrec.rm_blockcount == bno &&
+ ltrec.rm_offset + ltrec.rm_blockcount == offset) {
+ /*
+ * Left edge contiguous, merge into left record.
+ *
+ * ltbno ltlen
+ * orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_blockcount += len;
+ if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ offset + len == gtrec.rm_offset) {
+ /*
+ * Right edge also contiguous, delete right record
+ * and merge into left record.
+ *
+ * ltbno ltlen gtbno gtlen
+ * orig: |ooooooooo| |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
+ */
+ ltrec.rm_blockcount += gtrec.rm_blockcount;
+ error = xfs_rmap_delete(cur, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (error)
+ goto out_error;
+ }
+
+ /* Point the cursor back to the left record and update. */
+ error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ offset + len == gtrec.rm_offset) {
+ /*
+ * Right edge contiguous, merge into right record.
+ *
+ * gtbno gtlen
+ * Orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * Result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ /* Delete the old record. */
+ error = xfs_rmap_delete(cur, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (error)
+ goto out_error;
+
+ /* Move the start and re-add it. */
+ gtrec.rm_startblock = bno;
+ gtrec.rm_blockcount += len;
+ gtrec.rm_offset = offset;
+ error = xfs_rmap_insert(cur, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (error)
+ goto out_error;
+ } else {
+ /*
+ * No contiguous edge with identical owner, insert
+ * new record at current cursor position.
+ */
+ error = xfs_rmap_insert(cur, bno, len, owner, offset, flags);
+ if (error)
+ goto out_error;
+ }
+
+ trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ if (error)
+ trace_xfs_rmap_map_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/* Insert a raw rmap into the rmapbt. */
+int
+xfs_rmap_map_raw(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rmap)
+{
+ struct xfs_owner_info oinfo;
+
+ oinfo.oi_owner = rmap->rm_owner;
+ oinfo.oi_offset = rmap->rm_offset;
+ oinfo.oi_flags = 0;
+ if (rmap->rm_flags & XFS_RMAP_ATTR_FORK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (rmap->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+
+ if (rmap->rm_flags || XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+ return xfs_rmap_map(cur, rmap->rm_startblock,
+ rmap->rm_blockcount,
+ rmap->rm_flags & XFS_RMAP_UNWRITTEN,
+ &oinfo);
+
+ return xfs_rmap_map_shared(cur, rmap->rm_startblock,
+ rmap->rm_blockcount,
+ rmap->rm_flags & XFS_RMAP_UNWRITTEN,
+ &oinfo);
+}
+
+struct xfs_rmap_query_range_info {
+ xfs_rmap_query_range_fn fn;
+ void *priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_rmap_query_range_helper(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_rmap_query_range_info *query = priv;
+ struct xfs_rmap_irec irec;
+ int error;
+
+ error = xfs_rmap_btrec_to_irec(rec, &irec);
+ if (error)
+ return error;
+ return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all rmaps between two keys. */
+int
+xfs_rmap_query_range(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *low_rec,
+ const struct xfs_rmap_irec *high_rec,
+ xfs_rmap_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_irec low_brec;
+ union xfs_btree_irec high_brec;
+ struct xfs_rmap_query_range_info query;
+
+ low_brec.r = *low_rec;
+ high_brec.r = *high_rec;
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_range(cur, &low_brec, &high_brec,
+ xfs_rmap_query_range_helper, &query);
+}
+
+/* Find all rmaps. */
+int
+xfs_rmap_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_rmap_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_rmap_query_range_info query;
+
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_all(cur, xfs_rmap_query_range_helper, &query);
+}
+
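As a usage sketch only (not part of the patch itself), a caller might drive the query interface above with a callback that counts every rmap record whose startblock lands in an AG block range, mirroring the low/high key setup used by xfs_rmap_has_other_keys() later in this file. The helper names are assumptions made for the example.

/* Count every rmap record within [bno, bno + len) -- illustrative only. */
STATIC int
xfs_rmap_count_helper(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	unsigned long long		*count = priv;

	(*count)++;
	return 0;
}

STATIC int
xfs_rmap_count_range(
	struct xfs_btree_cur		*cur,
	xfs_agblock_t			bno,
	xfs_extlen_t			len,
	unsigned long long		*count)
{
	struct xfs_rmap_irec		low = { .rm_startblock = bno };
	struct xfs_rmap_irec		high;

	memset(&high, 0xFF, sizeof(high));
	high.rm_startblock = bno + len - 1;

	*count = 0;
	return xfs_rmap_query_range(cur, &low, &high,
			xfs_rmap_count_helper, count);
}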
+/* Clean up after calling xfs_rmap_finish_one. */
+void
+xfs_rmap_finish_one_cleanup(
+ struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur,
+ int error)
+{
+ struct xfs_buf *agbp;
+
+ if (rcur == NULL)
+ return;
+ agbp = rcur->bc_ag.agbp;
+ xfs_btree_del_cursor(rcur, error);
+ if (error)
+ xfs_trans_brelse(tp, agbp);
+}
+
+/*
+ * Process one of the deferred rmap operations. We pass back the
+ * btree cursor to maintain our lock on the rmapbt between calls.
+ * This saves time and eliminates a buffer deadlock between the
+ * superblock and the AGF because we'll always grab them in the same
+ * order.
+ */
+int
+xfs_rmap_finish_one(
+ struct xfs_trans *tp,
+ enum xfs_rmap_intent_type type,
+ uint64_t owner,
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ xfs_exntst_t state,
+ struct xfs_btree_cur **pcur)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *rcur;
+ struct xfs_buf *agbp = NULL;
+ int error = 0;
+ struct xfs_owner_info oinfo;
+ xfs_agblock_t bno;
+ bool unwritten;
+
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock));
+ bno = XFS_FSB_TO_AGBNO(mp, startblock);
+
+ trace_xfs_rmap_deferred(mp, pag->pag_agno, type, bno, owner, whichfork,
+ startoff, blockcount, state);
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) {
+ error = -EIO;
+ goto out_drop;
+ }
+
+ /*
+ * If we haven't gotten a cursor or the cursor AG doesn't match
+ * the startblock, get one now.
+ */
+ rcur = *pcur;
+ if (rcur != NULL && rcur->bc_ag.pag != pag) {
+ xfs_rmap_finish_one_cleanup(tp, rcur, 0);
+ rcur = NULL;
+ *pcur = NULL;
+ }
+ if (rcur == NULL) {
+ /*
+ * Refresh the freelist before we start changing the
+ * rmapbt, because a shape change could cause us to
+ * allocate blocks.
+ */
+ error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
+ if (error)
+ goto out_drop;
+ if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) {
+ error = -EFSCORRUPTED;
+ goto out_drop;
+ }
+
+ rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
+ }
+ *pcur = rcur;
+
+ xfs_rmap_ino_owner(&oinfo, owner, whichfork, startoff);
+ unwritten = state == XFS_EXT_UNWRITTEN;
+ bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, startblock);
+
+ switch (type) {
+ case XFS_RMAP_ALLOC:
+ case XFS_RMAP_MAP:
+ error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo);
+ break;
+ case XFS_RMAP_MAP_SHARED:
+ error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten,
+ &oinfo);
+ break;
+ case XFS_RMAP_FREE:
+ case XFS_RMAP_UNMAP:
+ error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten,
+ &oinfo);
+ break;
+ case XFS_RMAP_UNMAP_SHARED:
+ error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten,
+ &oinfo);
+ break;
+ case XFS_RMAP_CONVERT:
+ error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten,
+ &oinfo);
+ break;
+ case XFS_RMAP_CONVERT_SHARED:
+ error = xfs_rmap_convert_shared(rcur, bno, blockcount,
+ !unwritten, &oinfo);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ }
+out_drop:
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Don't defer an rmap if we aren't an rmap filesystem.
+ */
+static bool
+xfs_rmap_update_is_needed(
+ struct xfs_mount *mp,
+ int whichfork)
+{
+ return xfs_has_rmapbt(mp) && whichfork != XFS_COW_FORK;
+}
+
+/*
+ * Record an rmap intent; the list is kept sorted first by AG and then by
+ * increasing age.
+ */
+static void
+__xfs_rmap_add(
+ struct xfs_trans *tp,
+ enum xfs_rmap_intent_type type,
+ uint64_t owner,
+ int whichfork,
+ struct xfs_bmbt_irec *bmap)
+{
+ struct xfs_rmap_intent *ri;
+
+ trace_xfs_rmap_defer(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
+ type,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
+ owner, whichfork,
+ bmap->br_startoff,
+ bmap->br_blockcount,
+ bmap->br_state);
+
+ ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ INIT_LIST_HEAD(&ri->ri_list);
+ ri->ri_type = type;
+ ri->ri_owner = owner;
+ ri->ri_whichfork = whichfork;
+ ri->ri_bmap = *bmap;
+
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
+}
+
+/* Map an extent into a file. */
+void
+xfs_rmap_map_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *PREV)
+{
+ enum xfs_rmap_intent_type type = XFS_RMAP_MAP;
+
+ if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
+ return;
+
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_MAP_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+}
+
+/* Unmap an extent out of a file. */
+void
+xfs_rmap_unmap_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *PREV)
+{
+ enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP;
+
+ if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
+ return;
+
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_UNMAP_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+}
+
+/*
+ * Convert a data fork extent from unwritten to real or vice versa.
+ *
+ * Note that tp can be NULL here as no transaction is used for COW fork
+ * unwritten conversion.
+ */
+void
+xfs_rmap_convert_extent(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *PREV)
+{
+ enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT;
+
+ if (!xfs_rmap_update_is_needed(mp, whichfork))
+ return;
+
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_CONVERT_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+}
+
+/* Schedule the creation of an rmap for non-file data. */
+void
+xfs_rmap_alloc_extent(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ uint64_t owner)
+{
+ struct xfs_bmbt_irec bmap;
+
+ if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
+ return;
+
+ bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
+ bmap.br_blockcount = len;
+ bmap.br_startoff = 0;
+ bmap.br_state = XFS_EXT_NORM;
+
+ __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
+}
+
+/* Schedule the deletion of an rmap for non-file data. */
+void
+xfs_rmap_free_extent(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ uint64_t owner)
+{
+ struct xfs_bmbt_irec bmap;
+
+ if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
+ return;
+
+ bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
+ bmap.br_blockcount = len;
+ bmap.br_startoff = 0;
+ bmap.br_state = XFS_EXT_NORM;
+
+ __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
+}
+
+/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */
+int
+xfs_rmap_compare(
+ const struct xfs_rmap_irec *a,
+ const struct xfs_rmap_irec *b)
+{
+ __u64 oa;
+ __u64 ob;
+
+ oa = xfs_rmap_irec_offset_pack(a);
+ ob = xfs_rmap_irec_offset_pack(b);
+
+ if (a->rm_startblock < b->rm_startblock)
+ return -1;
+ else if (a->rm_startblock > b->rm_startblock)
+ return 1;
+ else if (a->rm_owner < b->rm_owner)
+ return -1;
+ else if (a->rm_owner > b->rm_owner)
+ return 1;
+ else if (oa < ob)
+ return -1;
+ else if (oa > ob)
+ return 1;
+ else
+ return 0;
+}
+
+/* Is there a record covering a given extent? */
+int
+xfs_rmap_has_record(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool *exists)
+{
+ union xfs_btree_irec low;
+ union xfs_btree_irec high;
+
+ memset(&low, 0, sizeof(low));
+ low.r.rm_startblock = bno;
+ memset(&high, 0xFF, sizeof(high));
+ high.r.rm_startblock = bno + len - 1;
+
+ return xfs_btree_has_record(cur, &low, &high, exists);
+}
+
+/*
+ * Is there a record for this owner completely covering a given physical
+ * extent? If so, *has_rmap will be set to true. If there is no record
+ * or the record only covers part of the range, we set *has_rmap to false.
+ * This function doesn't perform range lookups or offset checks, so it is
+ * not suitable for checking data fork blocks.
+ */
+int
+xfs_rmap_record_exists(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ bool *has_rmap)
+{
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+ int has_record;
+ struct xfs_rmap_irec irec;
+ int error;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & XFS_RMAP_BMBT_BLOCK));
+
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec,
+ &has_record);
+ if (error)
+ return error;
+ if (!has_record) {
+ *has_rmap = false;
+ return 0;
+ }
+
+ *has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno &&
+ irec.rm_startblock + irec.rm_blockcount >= bno + len);
+ return 0;
+}
+
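As a usage sketch only (not part of the patch), a metadata check might wrap xfs_rmap_record_exists() with one of the static owner-info constants defined at the end of this file; the wrapper name below is hypothetical.

/* Does an rmap owned by the inode btrees cover this single AG block? */
STATIC int
xfs_rmap_check_inobt_block(
	struct xfs_btree_cur	*cur,
	xfs_agblock_t		agbno,
	bool			*covered)
{
	return xfs_rmap_record_exists(cur, agbno, 1, &XFS_RMAP_OINFO_INOBT,
			covered);
}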
+struct xfs_rmap_key_state {
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+};
+
+/* For each rmap given, figure out if it doesn't match the key we want. */
+STATIC int
+xfs_rmap_has_other_keys_helper(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_rmap_key_state *rks = priv;
+
+ if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset &&
+ ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
+ return 0;
+ return -ECANCELED;
+}
+
+/*
+ * Given an extent and some owner info, can we find records overlapping
+ * the extent whose owner info does not match the given owner?
+ */
+int
+xfs_rmap_has_other_keys(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ bool *has_rmap)
+{
+ struct xfs_rmap_irec low = {0};
+ struct xfs_rmap_irec high;
+ struct xfs_rmap_key_state rks;
+ int error;
+
+ xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags);
+ *has_rmap = false;
+
+ low.rm_startblock = bno;
+ memset(&high, 0xFF, sizeof(high));
+ high.rm_startblock = bno + len - 1;
+
+ error = xfs_rmap_query_range(cur, &low, &high,
+ xfs_rmap_has_other_keys_helper, &rks);
+ if (error == -ECANCELED) {
+ *has_rmap = true;
+ return 0;
+ }
+
+ return error;
+}
+
+const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
+ .oi_owner = XFS_RMAP_OWN_NULL,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_ANY_OWNER = {
+ .oi_owner = XFS_RMAP_OWN_UNKNOWN,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_FS = {
+ .oi_owner = XFS_RMAP_OWN_FS,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_LOG = {
+ .oi_owner = XFS_RMAP_OWN_LOG,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_AG = {
+ .oi_owner = XFS_RMAP_OWN_AG,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_INOBT = {
+ .oi_owner = XFS_RMAP_OWN_INOBT,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_INODES = {
+ .oi_owner = XFS_RMAP_OWN_INODES,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_REFC = {
+ .oi_owner = XFS_RMAP_OWN_REFC,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_COW = {
+ .oi_owner = XFS_RMAP_OWN_COW,
+};
+
+int __init
+xfs_rmap_intent_init_cache(void)
+{
+ xfs_rmap_intent_cache = kmem_cache_create("xfs_rmap_intent",
+ sizeof(struct xfs_rmap_intent),
+ 0, 0, NULL);
+
+ return xfs_rmap_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_rmap_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_rmap_intent_cache);
+ xfs_rmap_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
new file mode 100644
index 000000000..54741a591
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -0,0 +1,220 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_RMAP_H__
+#define __XFS_RMAP_H__
+
+struct xfs_perag;
+
+static inline void
+xfs_rmap_ino_bmbt_owner(
+ struct xfs_owner_info *oi,
+ xfs_ino_t ino,
+ int whichfork)
+{
+ oi->oi_owner = ino;
+ oi->oi_offset = 0;
+ oi->oi_flags = XFS_OWNER_INFO_BMBT_BLOCK;
+ if (whichfork == XFS_ATTR_FORK)
+ oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+}
+
+static inline void
+xfs_rmap_ino_owner(
+ struct xfs_owner_info *oi,
+ xfs_ino_t ino,
+ int whichfork,
+ xfs_fileoff_t offset)
+{
+ oi->oi_owner = ino;
+ oi->oi_offset = offset;
+ oi->oi_flags = 0;
+ if (whichfork == XFS_ATTR_FORK)
+ oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+}
+
+static inline bool
+xfs_rmap_should_skip_owner_update(
+ const struct xfs_owner_info *oi)
+{
+ return oi->oi_owner == XFS_RMAP_OWN_NULL;
+}
+
+/* Reverse mapping functions. */
+
+struct xfs_buf;
+
+static inline __u64
+xfs_rmap_irec_offset_pack(
+ const struct xfs_rmap_irec *irec)
+{
+ __u64 x;
+
+ x = XFS_RMAP_OFF(irec->rm_offset);
+ if (irec->rm_flags & XFS_RMAP_ATTR_FORK)
+ x |= XFS_RMAP_OFF_ATTR_FORK;
+ if (irec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ x |= XFS_RMAP_OFF_BMBT_BLOCK;
+ if (irec->rm_flags & XFS_RMAP_UNWRITTEN)
+ x |= XFS_RMAP_OFF_UNWRITTEN;
+ return x;
+}
+
+static inline int
+xfs_rmap_irec_offset_unpack(
+ __u64 offset,
+ struct xfs_rmap_irec *irec)
+{
+ if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS))
+ return -EFSCORRUPTED;
+ irec->rm_offset = XFS_RMAP_OFF(offset);
+ irec->rm_flags = 0;
+ if (offset & XFS_RMAP_OFF_ATTR_FORK)
+ irec->rm_flags |= XFS_RMAP_ATTR_FORK;
+ if (offset & XFS_RMAP_OFF_BMBT_BLOCK)
+ irec->rm_flags |= XFS_RMAP_BMBT_BLOCK;
+ if (offset & XFS_RMAP_OFF_UNWRITTEN)
+ irec->rm_flags |= XFS_RMAP_UNWRITTEN;
+ return 0;
+}
+
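To make the flag encoding above concrete, here is a small round-trip sketch; it is illustrative only, not part of the header, and the function name is an assumption.

static inline int
xfs_rmap_offset_roundtrip_example(void)
{
	struct xfs_rmap_irec	in = {
		.rm_offset	= 42,
		.rm_flags	= XFS_RMAP_ATTR_FORK,
	};
	struct xfs_rmap_irec	out;
	__u64			ondisk;

	/* Packing folds the attr fork flag into the high offset bits. */
	ondisk = xfs_rmap_irec_offset_pack(&in);

	/* Unpacking recovers rm_offset == 42 and XFS_RMAP_ATTR_FORK. */
	return xfs_rmap_irec_offset_unpack(ondisk, &out);
}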
+static inline void
+xfs_owner_info_unpack(
+ const struct xfs_owner_info *oinfo,
+ uint64_t *owner,
+ uint64_t *offset,
+ unsigned int *flags)
+{
+ unsigned int r = 0;
+
+ *owner = oinfo->oi_owner;
+ *offset = oinfo->oi_offset;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+ r |= XFS_RMAP_ATTR_FORK;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+ r |= XFS_RMAP_BMBT_BLOCK;
+ *flags = r;
+}
+
+static inline void
+xfs_owner_info_pack(
+ struct xfs_owner_info *oinfo,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ oinfo->oi_owner = owner;
+ oinfo->oi_offset = XFS_RMAP_OFF(offset);
+ oinfo->oi_flags = 0;
+ if (flags & XFS_RMAP_ATTR_FORK)
+ oinfo->oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (flags & XFS_RMAP_BMBT_BLOCK)
+ oinfo->oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+}
+
+int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp,
+ struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo);
+int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp,
+ struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo);
+
+int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ uint64_t owner, uint64_t offset, unsigned int flags,
+ struct xfs_rmap_irec *irec, int *stat);
+int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, uint64_t owner, uint64_t offset,
+ unsigned int flags, int *stat);
+int xfs_rmap_insert(struct xfs_btree_cur *rcur, xfs_agblock_t agbno,
+ xfs_extlen_t len, uint64_t owner, uint64_t offset,
+ unsigned int flags);
+int xfs_rmap_get_rec(struct xfs_btree_cur *cur, struct xfs_rmap_irec *irec,
+ int *stat);
+
+typedef int (*xfs_rmap_query_range_fn)(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv);
+
+int xfs_rmap_query_range(struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *low_rec,
+ const struct xfs_rmap_irec *high_rec,
+ xfs_rmap_query_range_fn fn, void *priv);
+int xfs_rmap_query_all(struct xfs_btree_cur *cur, xfs_rmap_query_range_fn fn,
+ void *priv);
+
+enum xfs_rmap_intent_type {
+ XFS_RMAP_MAP,
+ XFS_RMAP_MAP_SHARED,
+ XFS_RMAP_UNMAP,
+ XFS_RMAP_UNMAP_SHARED,
+ XFS_RMAP_CONVERT,
+ XFS_RMAP_CONVERT_SHARED,
+ XFS_RMAP_ALLOC,
+ XFS_RMAP_FREE,
+};
+
+struct xfs_rmap_intent {
+ struct list_head ri_list;
+ enum xfs_rmap_intent_type ri_type;
+ int ri_whichfork;
+ uint64_t ri_owner;
+ struct xfs_bmbt_irec ri_bmap;
+};
+
+/* functions for updating the rmapbt based on bmbt map/unmap operations */
+void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, struct xfs_bmbt_irec *imap);
+void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, struct xfs_bmbt_irec *imap);
+void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_bmbt_irec *imap);
+void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
+void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
+
+void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur, int error);
+int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
+ uint64_t owner, int whichfork, xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock, xfs_filblks_t blockcount,
+ xfs_exntst_t state, struct xfs_btree_cur **pcur);
+
+int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ uint64_t owner, uint64_t offset, unsigned int flags,
+ struct xfs_rmap_irec *irec, int *stat);
+int xfs_rmap_compare(const struct xfs_rmap_irec *a,
+ const struct xfs_rmap_irec *b);
+union xfs_btree_rec;
+int xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
+ struct xfs_rmap_irec *irec);
+int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, bool *exists);
+int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo,
+ bool *has_rmap);
+int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo,
+ bool *has_rmap);
+int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap);
+
+extern const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_ANY_OWNER;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_FS;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_LOG;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_AG;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_INOBT;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_INODES;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_REFC;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_COW;
+
+extern struct kmem_cache *xfs_rmap_intent_cache;
+
+int __init xfs_rmap_intent_init_cache(void);
+void xfs_rmap_intent_destroy_cache(void);
+
+#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
new file mode 100644
index 000000000..7f83f62e5
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -0,0 +1,696 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+
+static struct kmem_cache *xfs_rmapbt_cur_cache;
+
+/*
+ * Reverse map btree.
+ *
+ * This is a per-ag tree used to track the owner(s) of a given extent. With
+ * reflink it is possible for there to be multiple owners, which is a departure
+ * from classic XFS. Owner records for data extents are inserted when the
+ * extent is mapped and removed when an extent is unmapped. Owner records for
+ * all other block types (i.e. metadata) are inserted when an extent is
+ * allocated and removed when an extent is freed. There can only be one owner
+ * of a metadata extent, usually an inode or some other metadata structure like
+ * an AG btree.
+ *
+ * The rmap btree is part of the free space management, so blocks for the tree
+ * are sourced from the agfl. Hence we need transaction reservation support for
+ * this tree so that the freelist is always large enough. This also impacts on
+ * the minimum space we need to leave free in the AG.
+ *
+ * The tree is ordered by [ag block, owner, offset]. This is a large key size,
+ * but it is the only way to enforce unique keys when a block can be owned by
+ * multiple files at any offset. There's no need to order/search by extent
+ * size for online updating/management of the tree. It is intended that most
+ * reverse lookups will be to find the owner(s) of a particular block, or to
+ * try to recover tree and file data from corrupt primary metadata.
+ */
+
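As an aside (illustrative only, not part of the patch), the effect of the [ag block, owner, offset] ordering is easy to see with xfs_rmap_compare(): two records for the same physical block, which reflink permits, tie on startblock and are ordered by owner. The helper name and the values below are assumptions for the example.

static inline int
xfs_rmapbt_order_example(void)
{
	struct xfs_rmap_irec	a = { .rm_startblock = 100, .rm_owner = 128 };
	struct xfs_rmap_irec	b = { .rm_startblock = 100, .rm_owner = 129 };

	/* Equal startblocks, so the owner breaks the tie: returns -1. */
	return xfs_rmap_compare(&a, &b);
}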
+static struct xfs_btree_cur *
+xfs_rmapbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_ag.agbp, cur->bc_ag.pag);
+}
+
+STATIC void
+xfs_rmapbt_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ int btnum = cur->bc_btnum;
+
+ ASSERT(ptr->s != 0);
+
+ agf->agf_roots[btnum] = ptr->s;
+ be32_add_cpu(&agf->agf_levels[btnum], inc);
+ cur->bc_ag.pag->pagf_levels[btnum] += inc;
+
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+}
+
+STATIC int
+xfs_rmapbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+ int error;
+ xfs_agblock_t bno;
+
+ /* Allocate the new block from the freelist. If we can't, give up. */
+ error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp,
+ &bno, 1);
+ if (error)
+ return error;
+
+ trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1);
+ if (bno == NULLAGBLOCK) {
+ *stat = 0;
+ return 0;
+ }
+
+ xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false);
+
+ new->s = cpu_to_be32(bno);
+ be32_add_cpu(&agf->agf_rmap_blocks, 1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
+
+ xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno);
+
+ *stat = 1;
+ return 0;
+}
+
+STATIC int
+xfs_rmapbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+ xfs_agblock_t bno;
+ int error;
+
+ bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
+ trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno,
+ bno, 1);
+ be32_add_cpu(&agf->agf_rmap_blocks, -1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
+ error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
+ if (error)
+ return error;
+
+ xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
+
+ xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
+ return 0;
+}
+
+STATIC int
+xfs_rmapbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_rmap_mnr[level != 0];
+}
+
+STATIC int
+xfs_rmapbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_rmap_mxr[level != 0];
+}
+
+STATIC void
+xfs_rmapbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->rmap.rm_startblock = rec->rmap.rm_startblock;
+ key->rmap.rm_owner = rec->rmap.rm_owner;
+ key->rmap.rm_offset = rec->rmap.rm_offset;
+}
+
+/*
+ * The high key for a reverse mapping record can be computed by shifting
+ * the startblock and offset to the highest value that would still map
+ * to that record. In practice this means that we add blockcount-1 to
+ * the startblock for all records, and if the record is for a data/attr
+ * fork mapping, we add blockcount-1 to the offset too.
+ */
+STATIC void
+xfs_rmapbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ uint64_t off;
+ int adj;
+
+ adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
+
+ key->rmap.rm_startblock = rec->rmap.rm_startblock;
+ be32_add_cpu(&key->rmap.rm_startblock, adj);
+ key->rmap.rm_owner = rec->rmap.rm_owner;
+ key->rmap.rm_offset = rec->rmap.rm_offset;
+ if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
+ XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
+ return;
+ off = be64_to_cpu(key->rmap.rm_offset);
+ off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
+ key->rmap.rm_offset = cpu_to_be64(off);
+}
+
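A quick worked example of the rule above, using incore values for readability (illustrative only, not part of the patch):

/*
 * For a data fork mapping with rm_startblock = 100, rm_blockcount = 8 and
 * rm_offset = 20, the adjustment is blockcount - 1 = 7, so the high key
 * describes startblock 107 at offset 27.  For a bmbt block or a non-inode
 * owner, only the startblock is adjusted and the offset is left untouched.
 */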
+STATIC void
+xfs_rmapbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
+ rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
+ rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
+ rec->rmap.rm_offset = cpu_to_be64(
+ xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
+}
+
+STATIC void
+xfs_rmapbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
+
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
+
+ ptr->s = agf->agf_roots[cur->bc_btnum];
+}
+
+STATIC int64_t
+xfs_rmapbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ struct xfs_rmap_irec *rec = &cur->bc_rec.r;
+ const struct xfs_rmap_key *kp = &key->rmap;
+ __u64 x, y;
+ int64_t d;
+
+ d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
+ if (d)
+ return d;
+
+ x = be64_to_cpu(kp->rm_owner);
+ y = rec->rm_owner;
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+
+ x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
+ y = rec->rm_offset;
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ return 0;
+}
+
+STATIC int64_t
+xfs_rmapbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ const struct xfs_rmap_key *kp1 = &k1->rmap;
+ const struct xfs_rmap_key *kp2 = &k2->rmap;
+ int64_t d;
+ __u64 x, y;
+
+ d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
+ be32_to_cpu(kp2->rm_startblock);
+ if (d)
+ return d;
+
+ x = be64_to_cpu(kp1->rm_owner);
+ y = be64_to_cpu(kp2->rm_owner);
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+
+ x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
+ y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ return 0;
+}
+
+static xfs_failaddr_t
+xfs_rmapbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+ xfs_failaddr_t fa;
+ unsigned int level;
+
+ /*
+ * magic number and level verification
+ *
+ * During growfs operations, we can't verify the exact level or owner as
+ * the perag is not fully initialised and hence not attached to the
+ * buffer. In this case, check against the maximum tree depth.
+ *
+ * Similarly, during log recovery we will have a perag structure
+ * attached, but the agf information will not yet have been initialised
+ * from the on disk AGF. Again, we can only check against maximum limits
+ * in this case.
+ */
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (!xfs_has_rmapbt(mp))
+ return __this_address;
+ fa = xfs_btree_sblock_v5hdr_verify(bp);
+ if (fa)
+ return fa;
+
+ level = be16_to_cpu(block->bb_level);
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
+ return __this_address;
+ } else if (level >= mp->m_rmap_maxlevels)
+ return __this_address;
+
+ return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
+}
+
+static void
+xfs_rmapbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_rmapbt_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+
+ if (bp->b_error)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_rmapbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_rmapbt_verify(bp);
+ if (fa) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+ xfs_btree_sblock_calc_crc(bp);
+}
+
+const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
+ .name = "xfs_rmapbt",
+ .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
+ .verify_read = xfs_rmapbt_read_verify,
+ .verify_write = xfs_rmapbt_write_verify,
+ .verify_struct = xfs_rmapbt_verify,
+};
+
+STATIC int
+xfs_rmapbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ uint32_t x;
+ uint32_t y;
+ uint64_t a;
+ uint64_t b;
+
+ x = be32_to_cpu(k1->rmap.rm_startblock);
+ y = be32_to_cpu(k2->rmap.rm_startblock);
+ if (x < y)
+ return 1;
+ else if (x > y)
+ return 0;
+ a = be64_to_cpu(k1->rmap.rm_owner);
+ b = be64_to_cpu(k2->rmap.rm_owner);
+ if (a < b)
+ return 1;
+ else if (a > b)
+ return 0;
+ a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
+ b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
+ if (a <= b)
+ return 1;
+ return 0;
+}
+
+STATIC int
+xfs_rmapbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ uint32_t x;
+ uint32_t y;
+ uint64_t a;
+ uint64_t b;
+
+ x = be32_to_cpu(r1->rmap.rm_startblock);
+ y = be32_to_cpu(r2->rmap.rm_startblock);
+ if (x < y)
+ return 1;
+ else if (x > y)
+ return 0;
+ a = be64_to_cpu(r1->rmap.rm_owner);
+ b = be64_to_cpu(r2->rmap.rm_owner);
+ if (a < b)
+ return 1;
+ else if (a > b)
+ return 0;
+ a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
+ b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
+ if (a <= b)
+ return 1;
+ return 0;
+}
+
+static const struct xfs_btree_ops xfs_rmapbt_ops = {
+ .rec_len = sizeof(struct xfs_rmap_rec),
+ .key_len = 2 * sizeof(struct xfs_rmap_key),
+
+ .dup_cursor = xfs_rmapbt_dup_cursor,
+ .set_root = xfs_rmapbt_set_root,
+ .alloc_block = xfs_rmapbt_alloc_block,
+ .free_block = xfs_rmapbt_free_block,
+ .get_minrecs = xfs_rmapbt_get_minrecs,
+ .get_maxrecs = xfs_rmapbt_get_maxrecs,
+ .init_key_from_rec = xfs_rmapbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur,
+ .key_diff = xfs_rmapbt_key_diff,
+ .buf_ops = &xfs_rmapbt_buf_ops,
+ .diff_two_keys = xfs_rmapbt_diff_two_keys,
+ .keys_inorder = xfs_rmapbt_keys_inorder,
+ .recs_inorder = xfs_rmapbt_recs_inorder,
+};
+
+static struct xfs_btree_cur *
+xfs_rmapbt_init_common(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag)
+{
+ struct xfs_btree_cur *cur;
+
+ /* Overlapping btree; 2 keys per pointer. */
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP,
+ mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
+ cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
+ cur->bc_ops = &xfs_rmapbt_ops;
+
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
+
+ return cur;
+}
+
+/* Create a new reverse mapping btree cursor. */
+struct xfs_btree_cur *
+xfs_rmapbt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_rmapbt_init_common(mp, tp, pag);
+ cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+ cur->bc_ag.agbp = agbp;
+ return cur;
+}
+
+/* Create a new reverse mapping btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_rmapbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ struct xfs_perag *pag)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_rmapbt_init_common(mp, NULL, pag);
+ xfs_btree_stage_afakeroot(cur, afake);
+ return cur;
+}
+
+/*
+ * Install a new reverse mapping btree root. Caller is responsible for
+ * invalidating and freeing the old btree blocks.
+ */
+void
+xfs_rmapbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
+ agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+ agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
+ XFS_AGF_RMAP_BLOCKS);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops);
+}
+
+/* Calculate number of records in a reverse mapping btree block. */
+static inline unsigned int
+xfs_rmapbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_rmap_rec);
+ return blocklen /
+ (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
+}
+
+/*
+ * Calculate number of records in an rmap btree block.
+ */
+int
+xfs_rmapbt_maxrecs(
+ int blocklen,
+ int leaf)
+{
+ blocklen -= XFS_RMAP_BLOCK_LEN;
+ return xfs_rmapbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Compute the max possible height for reverse mapping btrees. */
+unsigned int
+xfs_rmapbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_rmapbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_rmapbt_block_maxrecs(blocklen, false) / 2;
+
+ /*
+ * Compute the asymptotic maxlevels for an rmapbt on any reflink fs.
+ *
+ * On a reflink filesystem, each AG block can have up to 2^32 (per the
+ * refcount record format) owners, which means that theoretically we
+ * could face up to 2^64 rmap records. However, we're likely to run
+ * out of blocks in the AG long before that happens, which means that
+ * we must compute the max height based on what the btree will look
+ * like if it consumes almost all the blocks in the AG due to maximal
+ * sharing factor.
+ */
+ return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS);
+}
+
+/* Compute the maximum height of an rmap btree. */
+void
+xfs_rmapbt_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ if (!xfs_has_rmapbt(mp)) {
+ mp->m_rmap_maxlevels = 0;
+ return;
+ }
+
+ if (xfs_has_reflink(mp)) {
+ /*
+ * Compute the asymptotic maxlevels for an rmap btree on a
+ * filesystem that supports reflink.
+ *
+ * On a reflink filesystem, each AG block can have up to 2^32
+ * (per the refcount record format) owners, which means that
+ * theoretically we could face up to 2^64 rmap records.
+ * However, we're likely to run out of blocks in the AG long
+ * before that happens, which means that we must compute the
+ * max height based on what the btree will look like if it
+ * consumes almost all the blocks in the AG due to maximal
+ * sharing factor.
+ */
+ mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr,
+ mp->m_sb.sb_agblocks);
+ } else {
+ /*
+ * If there's no block sharing, compute the maximum rmapbt
+ * height assuming one rmap record per AG block.
+ */
+ mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
+ mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+ }
+ ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk());
+}
+
+/* Calculate the rmap btree size for some records. */
+xfs_extlen_t
+xfs_rmapbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_rmap_mnr, len);
+}
+
+/*
+ * Calculate the maximum rmap btree size.
+ */
+xfs_extlen_t
+xfs_rmapbt_max_size(
+ struct xfs_mount *mp,
+ xfs_agblock_t agblocks)
+{
+ /* Bail out if we're uninitialized, which can happen in mkfs. */
+ if (mp->m_rmap_mxr[0] == 0)
+ return 0;
+
+ return xfs_rmapbt_calc_size(mp, agblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_rmapbt_calc_reserves(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_extlen_t *ask,
+ xfs_extlen_t *used)
+{
+ struct xfs_buf *agbp;
+ struct xfs_agf *agf;
+ xfs_agblock_t agblocks;
+ xfs_extlen_t tree_len;
+ int error;
+
+ if (!xfs_has_rmapbt(mp))
+ return 0;
+
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+ if (error)
+ return error;
+
+ agf = agbp->b_addr;
+ agblocks = be32_to_cpu(agf->agf_length);
+ tree_len = be32_to_cpu(agf->agf_rmap_blocks);
+ xfs_trans_brelse(tp, agbp);
+
+ /*
+ * The log is permanently allocated, so the space it occupies will
+ * never be available for the kinds of things that would require btree
+ * expansion. We therefore can pretend the space isn't there.
+ */
+ if (xfs_ag_contains_log(mp, pag->pag_agno))
+ agblocks -= mp->m_sb.sb_logblocks;
+
+ /* Reserve 1% of the AG or enough for 1 block per record. */
+ *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
+ *used += tree_len;
+
+ return error;
+}
+
+int __init
+xfs_rmapbt_init_cur_cache(void)
+{
+ xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur",
+ xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_rmapbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_rmapbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_rmapbt_cur_cache);
+ xfs_rmapbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
new file mode 100644
index 000000000..3244715dd
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_RMAP_BTREE_H__
+#define __XFS_RMAP_BTREE_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xbtree_afakeroot;
+
+/* rmaps only exist on crc enabled filesystems */
+#define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_RMAP_REC_ADDR(block, index) \
+ ((struct xfs_rmap_rec *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ (((index) - 1) * sizeof(struct xfs_rmap_rec))))
+
+#define XFS_RMAP_KEY_ADDR(block, index) \
+ ((struct xfs_rmap_key *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ ((index) - 1) * 2 * sizeof(struct xfs_rmap_key)))
+
+#define XFS_RMAP_HIGH_KEY_ADDR(block, index) \
+ ((struct xfs_rmap_key *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ sizeof(struct xfs_rmap_key) + \
+ ((index) - 1) * 2 * sizeof(struct xfs_rmap_key)))
+
+#define XFS_RMAP_PTR_ADDR(block, index, maxrecs) \
+ ((xfs_rmap_ptr_t *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ (maxrecs) * 2 * sizeof(struct xfs_rmap_key) + \
+ ((index) - 1) * sizeof(xfs_rmap_ptr_t)))
+
+struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_perag *pag);
+struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag);
+void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+int xfs_rmapbt_maxrecs(int blocklen, int leaf);
+extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
+
+extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp,
+ xfs_agblock_t agblocks);
+
+extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used);
+
+unsigned int xfs_rmapbt_maxlevels_ondisk(void);
+
+int __init xfs_rmapbt_init_cur_cache(void);
+void xfs_rmapbt_destroy_cur_cache(void);
+
+#endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
new file mode 100644
index 000000000..fa180ab66
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -0,0 +1,1098 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+
+/*
+ * Realtime allocator bitmap functions shared with userspace.
+ */
+
+/*
+ * Real time buffers need verifiers to avoid runtime warnings during IO.
+ * We don't have anything to verify, however, so these are just dummy
+ * operations.
+ */
+static void
+xfs_rtbuf_verify_read(
+ struct xfs_buf *bp)
+{
+ return;
+}
+
+static void
+xfs_rtbuf_verify_write(
+ struct xfs_buf *bp)
+{
+ return;
+}
+
+const struct xfs_buf_ops xfs_rtbuf_ops = {
+ .name = "rtbuf",
+ .verify_read = xfs_rtbuf_verify_read,
+ .verify_write = xfs_rtbuf_verify_write,
+};
+
+/*
+ * Get a buffer for the bitmap or summary file block specified.
+ * The buffer is returned read and locked.
+ */
+int
+xfs_rtbuf_get(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t block, /* block number in bitmap or summary */
+ int issum, /* is summary not bitmap */
+ struct xfs_buf **bpp) /* output: buffer for the block */
+{
+ struct xfs_buf *bp; /* block buffer, result */
+ xfs_inode_t *ip; /* bitmap or summary inode */
+ xfs_bmbt_irec_t map;
+ int nmap = 1;
+ int error; /* error value */
+
+ ip = issum ? mp->m_rsumip : mp->m_rbmip;
+
+ error = xfs_bmapi_read(ip, block, 1, &map, &nmap, 0);
+ if (error)
+ return error;
+
+ if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map)))
+ return -EFSCORRUPTED;
+
+ ASSERT(map.br_startblock != NULLFSBLOCK);
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, map.br_startblock),
+ mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
+ if (error)
+ return error;
+
+ xfs_trans_buf_set_type(tp, bp, issum ? XFS_BLFT_RTSUMMARY_BUF
+ : XFS_BLFT_RTBITMAP_BUF);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Searching backward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_back(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to look at */
+ xfs_rtblock_t limit, /* last block to look at */
+ xfs_rtblock_t *rtblock) /* out: start block found */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ struct xfs_buf *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtblock_t firstbit; /* first useful bit in the word */
+ xfs_rtblock_t i; /* current bit number rel. to start */
+ xfs_rtblock_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute and read in starting bitmap block for starting block.
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Get the first word's index & point to it.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ len = start - limit + 1;
+ /*
+ * Compute match value, based on the bit at start: if 1 (free)
+ * then all-ones, else all-zeroes.
+ */
+ want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ /*
+ * If the starting position is not word-aligned, deal with the
+ * partial word.
+ */
+ if (bit < XFS_NBWORD - 1) {
+ /*
+ * Calculate first (leftmost) bit number to look at,
+ * and mask for all the relevant bits in this word.
+ */
+ firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
+ mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
+ firstbit;
+ /*
+ * Calculate the difference between the value there
+ * and what we're looking for.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different. Mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i = bit - XFS_RTHIBIT(wdiff);
+ *rtblock = start - i + 1;
+ return 0;
+ }
+ i = bit - firstbit + 1;
+ /*
+ * Go on to previous block if that's where the previous word is
+ * and we need the previous word.
+ */
+ if (--word == -1 && i < len) {
+ /*
+ * If done with this block, get the previous one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ word = XFS_BLOCKWMASK(mp);
+ b = &bufp[word];
+ } else {
+ /*
+ * Go on to the previous word in the buffer.
+ */
+ b--;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the previous one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = *b ^ want)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+ *rtblock = start - i + 1;
+ return 0;
+ }
+ i += XFS_NBWORD;
+ /*
+ * Go on to previous block if that's where the previous word is
+ * and we need the previous word.
+ */
+ if (--word == -1 && i < len) {
+ /*
+ * If done with this block, get the previous one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ word = XFS_BLOCKWMASK(mp);
+ b = &bufp[word];
+ } else {
+ /*
+ * Go on to the previous word in the buffer.
+ */
+ b--;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if (len - i) {
+ /*
+ * Calculate first (leftmost) bit number to look at,
+ * and mask for all the relevant bits in this word.
+ */
+ firstbit = XFS_NBWORD - (len - i);
+ mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+ *rtblock = start - i + 1;
+ return 0;
+ } else
+ i = len;
+ }
+ /*
+ * No match, return that we scanned the whole area.
+ */
+ xfs_trans_brelse(tp, bp);
+ *rtblock = start - i + 1;
+ return 0;
+}
+
+/*
+ * Searching forward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_forw(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to look at */
+ xfs_rtblock_t limit, /* last block to look at */
+ xfs_rtblock_t *rtblock) /* out: start block found */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ struct xfs_buf *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtblock_t i; /* current bit number rel. to start */
+ xfs_rtblock_t lastbit; /* last useful bit in the word */
+ xfs_rtblock_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute and read in starting bitmap block for starting block.
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Get the first word's index & point to it.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ len = limit - start + 1;
+ /*
+ * Compute match value, based on the bit at start: if 1 (free)
+ * then all-ones, else all-zeroes.
+ */
+ want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ /*
+ * If the starting position is not word-aligned, deal with the
+ * partial word.
+ */
+ if (bit) {
+ /*
+ * Calculate last (rightmost) bit number to look at,
+ * and mask for all the relevant bits in this word.
+ */
+ lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+ /*
+ * Calculate the difference between the value there
+ * and what we're looking for.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different. Mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i = XFS_RTLOBIT(wdiff) - bit;
+ *rtblock = start + i - 1;
+ return 0;
+ }
+ i = lastbit - bit;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the next one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = *b ^ want)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *rtblock = start + i - 1;
+ return 0;
+ }
+ i += XFS_NBWORD;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if ((lastbit = len - i)) {
+ /*
+ * Calculate mask for all the relevant bits in this word.
+ */
+ mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *rtblock = start + i - 1;
+ return 0;
+ } else
+ i = len;
+ }
+ /*
+ * No match, return that we scanned the whole area.
+ */
+ xfs_trans_brelse(tp, bp);
+ *rtblock = start + i - 1;
+ return 0;
+}
+
+/*
+ * Read and/or modify the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ *
+ * Summary information is returned in *sum if specified.
+ * If no delta is specified, returns summary only.
+ */
+int
+xfs_rtmodify_summary_int(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ int log, /* log2 of extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ int delta, /* change to make to summary info */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
+{
+ struct xfs_buf *bp; /* buffer for the summary block */
+ int error; /* error value */
+ xfs_fsblock_t sb; /* summary fsblock */
+ int so; /* index into the summary file */
+ xfs_suminfo_t *sp; /* pointer to returned data */
+
+ /*
+ * Compute entry number in the summary file.
+ */
+ so = XFS_SUMOFFS(mp, log, bbno);
+ /*
+ * Compute the block number in the summary file.
+ */
+ sb = XFS_SUMOFFSTOBLOCK(mp, so);
+ /*
+ * If we have an old buffer, and the block number matches, use that.
+ */
+ if (*rbpp && *rsb == sb)
+ bp = *rbpp;
+ /*
+ * Otherwise we have to get the buffer.
+ */
+ else {
+ /*
+ * If there was an old one, get rid of it first.
+ */
+ if (*rbpp)
+ xfs_trans_brelse(tp, *rbpp);
+ error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+ if (error) {
+ return error;
+ }
+ /*
+ * Remember this buffer and block for the next call.
+ */
+ *rbpp = bp;
+ *rsb = sb;
+ }
+ /*
+ * Point to the summary information, modify/log it, and/or copy it out.
+ */
+ sp = XFS_SUMPTR(mp, bp, so);
+ if (delta) {
+ uint first = (uint)((char *)sp - (char *)bp->b_addr);
+
+ *sp += delta;
+ if (mp->m_rsum_cache) {
+ if (*sp == 0 && log == mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno]++;
+ if (*sp != 0 && log < mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log;
+ }
+ xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1);
+ }
+ if (sum)
+ *sum = *sp;
+ return 0;
+}
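
For readers new to the realtime summary, it is conceptually a two-dimensional table of free-extent counts indexed by log2(extent size) and bitmap block number, flattened into a regular file. The sketch below is an editor's illustration that assumes a row-major layout and uses made-up field names; the authoritative definitions are the XFS_SUMOFFS and XFS_SUMOFFSTOBLOCK macros in xfs_format.h.

#include <stdint.h>

struct rtsummary {
	uint32_t	*suminfo;		/* flattened counter table */
	uint32_t	rbmblocks;		/* realtime bitmap blocks (row width) */
	uint32_t	infos_per_block;	/* summary entries per fs block */
};

/* Entry number for (log2 size, bitmap block), mirroring XFS_SUMOFFS. */
static uint32_t sum_offset(const struct rtsummary *s, int log, uint32_t bbno)
{
	return (uint32_t)log * s->rbmblocks + bbno;
}

/* Summary-file block holding that entry, mirroring XFS_SUMOFFSTOBLOCK. */
static uint32_t sum_block(const struct rtsummary *s, uint32_t so)
{
	return so / s->infos_per_block;
}
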
+
+int
+xfs_rtmodify_summary(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ int log, /* log2 of extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ int delta, /* change to make to summary info */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb) /* in/out: summary block number */
+{
+ return xfs_rtmodify_summary_int(mp, tp, log, bbno,
+ delta, rbpp, rsb, NULL);
+}
+
+/*
+ * Set the given range of bitmap bits to the given value.
+ * Do whatever I/O and logging is required.
+ */
+int
+xfs_rtmodify_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to modify */
+ xfs_extlen_t len, /* length of extent to modify */
+ int val) /* 1 for free, 0 for allocated */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ struct xfs_buf *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtword_t *first; /* first used word in the buffer */
+ int i; /* current bit number rel. to start */
+ int lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute starting bitmap block number.
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ /*
+ * Read the bitmap block, and point to its data.
+ */
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Compute the starting word's address, and starting bit.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ first = b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ /*
+ * 0 (allocated) => all zeroes; 1 (free) => all ones.
+ */
+ val = -val;
+ /*
+ * If not starting on a word boundary, deal with the first
+ * (partial) word.
+ */
+ if (bit) {
+ /*
+ * Compute first bit not changed and mask of relevant bits.
+ */
+ lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+ /*
+ * Set/clear the active bits.
+ */
+ if (val)
+ *b |= mask;
+ else
+ *b &= ~mask;
+ i = lastbit - bit;
+ /*
+ * Go on to the next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * Log the changed part of this block.
+ * Get the next one.
+ */
+ xfs_trans_log_buf(tp, bp,
+ (uint)((char *)first - (char *)bufp),
+ (uint)((char *)b - (char *)bufp));
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ first = b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer
+ */
+ b++;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the next one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Set the word value correctly.
+ */
+ *b = val;
+ i += XFS_NBWORD;
+ /*
+ * Go on to the next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * Log the changed part of this block.
+ * Get the next one.
+ */
+ xfs_trans_log_buf(tp, bp,
+ (uint)((char *)first - (char *)bufp),
+ (uint)((char *)b - (char *)bufp));
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ first = b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer
+ */
+ b++;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if ((lastbit = len - i)) {
+ /*
+ * Compute a mask of relevant bits.
+ */
+ mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ /*
+ * Set/clear the active bits.
+ */
+ if (val)
+ *b |= mask;
+ else
+ *b &= ~mask;
+ b++;
+ }
+ /*
+ * Log any remaining changed bytes.
+ */
+ if (b > first)
+ xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
+ (uint)((char *)b - (char *)bufp - 1));
+ return 0;
+}
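
The partial-word cases above build a mask of (lastbit - bit) one-bits shifted up to position bit and then OR it in or AND its complement out. A standalone sketch of that bit-range update on a 32-bit word, illustrative only and not the kernel helper:

#include <assert.h>
#include <stdint.h>

/* Set (val != 0) or clear (val == 0) bits [bit, bit + n) of *word. */
static void modify_bits(uint32_t *word, int bit, int n, int val)
{
	uint32_t mask;

	assert(bit >= 0 && n > 0 && bit + n <= 32);
	mask = (n == 32 ? ~0u : (1u << n) - 1) << bit;
	if (val)
		*word |= mask;
	else
		*word &= ~mask;
}
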
+
+/*
+ * Mark an extent specified by start and len freed.
+ * Updates all the summary information as well as the bitmap.
+ */
+int
+xfs_rtfree_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to free */
+ xfs_extlen_t len, /* length to free */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb) /* in/out: summary block number */
+{
+ xfs_rtblock_t end; /* end of the freed extent */
+ int error; /* error value */
+ xfs_rtblock_t postblock; /* first block freed > end */
+ xfs_rtblock_t preblock; /* first block freed < start */
+
+ end = start + len - 1;
+ /*
+ * Modify the bitmap to mark this extent freed.
+ */
+ error = xfs_rtmodify_range(mp, tp, start, len, 1);
+ if (error) {
+ return error;
+ }
+ /*
+ * Assume we're freeing out of the middle of an allocated extent.
+ * We need to find the beginning and end of the extent so we can
+ * properly update the summary.
+ */
+ error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ if (error) {
+ return error;
+ }
+ /*
+ * Find the next allocated block (end of allocated extent).
+ */
+ error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
+ if (error)
+ return error;
+ /*
+ * If there are blocks not being freed at the front of the
+ * old extent, add summary data for them to be allocated.
+ */
+ if (preblock < start) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(start - preblock),
+ XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * If there are blocks not being freed at the end of the
+ * old extent, add summary data for them to be allocated.
+ */
+ if (postblock > end) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock - end),
+ XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * Increment the summary information corresponding to the entire
+ * (new) free extent.
+ */
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+ return error;
+}
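
A worked example of the summary bookkeeping above, using made-up numbers and assuming XFS_RTBLOCKLOG is the index of the highest set bit (floor of log2): suppose realtime extents 6-9 and 14-15 are already free and extents 10-13 are now freed. xfs_rtfind_back returns preblock = 6 and xfs_rtfind_forw returns postblock = 15, so the three xfs_rtmodify_summary calls adjust:

	front chunk : start - preblock         = 10 - 6  = 4 blocks  -> log2 bucket 2, delta -1
	back chunk  : postblock - end          = 15 - 13 = 2 blocks  -> log2 bucket 1, delta -1
	merged run  : postblock + 1 - preblock = 16 - 6  = 10 blocks -> log2 bucket 3, delta +1

That is, the two pre-existing free runs leave their size buckets and the single merged run 6-15 enters bucket 3.
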
+
+/*
+ * Check that the given range is either all allocated (val = 0) or
+ * all free (val = 1).
+ */
+int
+xfs_rtcheck_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block number of extent */
+ xfs_extlen_t len, /* length of extent */
+ int val, /* 1 for free, 0 for allocated */
+ xfs_rtblock_t *new, /* out: first block not matching */
+ int *stat) /* out: 1 for matches, 0 for not */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ struct xfs_buf *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtblock_t i; /* current bit number rel. to start */
+ xfs_rtblock_t lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute starting bitmap block number
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ /*
+ * Read the bitmap block.
+ */
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Compute the starting word's address, and starting bit.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ /*
+ * 0 (allocated) => all zeroes; 1 (free) => all ones.
+ */
+ val = -val;
+ /*
+ * If not starting on a word boundary, deal with the first
+ * (partial) word.
+ */
+ if (bit) {
+ /*
+ * Compute first bit not examined.
+ */
+ lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ /*
+ * Mask of relevant bits.
+ */
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ val) & mask)) {
+ /*
+ * Different, compute first wrong bit and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i = XFS_RTLOBIT(wdiff) - bit;
+ *new = start + i;
+ *stat = 0;
+ return 0;
+ }
+ i = lastbit - bit;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the next one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = *b ^ val)) {
+ /*
+ * Different, compute first wrong bit and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *new = start + i;
+ *stat = 0;
+ return 0;
+ }
+ i += XFS_NBWORD;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if ((lastbit = len - i)) {
+ /*
+ * Mask of relevant bits.
+ */
+ mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ val) & mask)) {
+ /*
+ * Different, compute first wrong bit and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *new = start + i;
+ *stat = 0;
+ return 0;
+ } else
+ i = len;
+ }
+ /*
+ * Successful, return.
+ */
+ xfs_trans_brelse(tp, bp);
+ *new = start + i;
+ *stat = 1;
+ return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that the given extent (block range) is allocated already.
+ */
+STATIC int /* error */
+xfs_rtcheck_alloc_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number of extent */
+ xfs_extlen_t len) /* length of extent */
+{
+ xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */
+ int stat;
+ int error;
+
+ error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat);
+ if (error)
+ return error;
+ ASSERT(stat);
+ return 0;
+}
+#else
+#define xfs_rtcheck_alloc_range(m,t,b,l) (0)
+#endif
+/*
+ * Free an extent in the realtime subvolume. Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int /* error */
+xfs_rtfree_extent(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to free */
+ xfs_extlen_t len) /* length of extent freed */
+{
+ int error; /* error value */
+ xfs_mount_t *mp; /* file system mount structure */
+ xfs_fsblock_t sb; /* summary file block number */
+ struct xfs_buf *sumbp = NULL; /* summary file block buffer */
+
+ mp = tp->t_mountp;
+
+ ASSERT(mp->m_rbmip->i_itemp != NULL);
+ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+
+ error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
+ if (error)
+ return error;
+
+ /*
+ * Free the range of realtime blocks.
+ */
+ error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
+ if (error) {
+ return error;
+ }
+ /*
+ * Mark more blocks free in the superblock.
+ */
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
+ /*
+ * If we've now freed all the blocks, reset the file sequence
+ * number to 0.
+ */
+ if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
+ mp->m_sb.sb_rextents) {
+ if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM))
+ mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
+ *(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0;
+ xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+ }
+ return 0;
+}
+
+/* Find all the free records within a given range. */
+int
+xfs_rtalloc_query_range(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *low_rec,
+ const struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_rtalloc_rec rec;
+ xfs_rtblock_t rtstart;
+ xfs_rtblock_t rtend;
+ xfs_rtblock_t high_key;
+ int is_free;
+ int error = 0;
+
+ if (low_rec->ar_startext > high_rec->ar_startext)
+ return -EINVAL;
+ if (low_rec->ar_startext >= mp->m_sb.sb_rextents ||
+ low_rec->ar_startext == high_rec->ar_startext)
+ return 0;
+
+ high_key = min(high_rec->ar_startext, mp->m_sb.sb_rextents - 1);
+
+ /* Iterate the bitmap, looking for discrepancies. */
+ rtstart = low_rec->ar_startext;
+ while (rtstart <= high_key) {
+ /* Is the first block free? */
+ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
+ &is_free);
+ if (error)
+ break;
+
+ /* How long does the extent go for? */
+ error = xfs_rtfind_forw(mp, tp, rtstart, high_key, &rtend);
+ if (error)
+ break;
+
+ if (is_free) {
+ rec.ar_startext = rtstart;
+ rec.ar_extcount = rtend - rtstart + 1;
+
+ error = fn(mp, tp, &rec, priv);
+ if (error)
+ break;
+ }
+
+ rtstart = rtend + 1;
+ }
+
+ return error;
+}
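
The query loop above alternates between asking whether the current extent is free (xfs_rtcheck_range on a single block) and finding how far the run extends (xfs_rtfind_forw), then reports each free run to the callback. A self-contained userspace sketch of the same run-enumeration pattern over a plain bit array; every name here is hypothetical, not an XFS API:

#include <stdint.h>
#include <stdio.h>

static int test_bit(const uint32_t *map, uint64_t bit)
{
	return (map[bit / 32] >> (bit % 32)) & 1;
}

/* Print every maximal run of set (free) bits as (start, count). */
static void report_free_runs(const uint32_t *map, uint64_t nbits)
{
	uint64_t i = 0;

	while (i < nbits) {
		uint64_t start = i;
		int is_free = test_bit(map, i);

		while (i < nbits && test_bit(map, i) == is_free)
			i++;
		if (is_free)
			printf("free run: start %llu count %llu\n",
			       (unsigned long long)start,
			       (unsigned long long)(i - start));
	}
}
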
+
+/* Find all the free records. */
+int
+xfs_rtalloc_query_all(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_rtalloc_rec keys[2];
+
+ keys[0].ar_startext = 0;
+ keys[1].ar_startext = mp->m_sb.sb_rextents - 1;
+ keys[0].ar_extcount = keys[1].ar_extcount = 0;
+
+ return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv);
+}
+
+/* Is the given extent all free? */
+int
+xfs_rtalloc_extent_is_free(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_rtblock_t start,
+ xfs_extlen_t len,
+ bool *is_free)
+{
+ xfs_rtblock_t end;
+ int matches;
+ int error;
+
+ error = xfs_rtcheck_range(mp, tp, start, len, 1, &end, &matches);
+ if (error)
+ return error;
+
+ *is_free = matches;
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
new file mode 100644
index 000000000..b6a584e04
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -0,0 +1,1317 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_log.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_health.h"
+#include "xfs_ag.h"
+
+/*
+ * Physical superblock buffer manipulations. Shared with libxfs in userspace.
+ */
+
+/*
+ * Check that all the V4 feature bits that the V5 filesystem format requires are
+ * correctly set.
+ */
+static bool
+xfs_sb_validate_v5_features(
+ struct xfs_sb *sbp)
+{
+ /* We must not have any unknown V4 feature bits set */
+ if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS)
+ return false;
+
+ /*
+ * The CRC bit is considered an invalid V4 flag, so we have to add it
+ * manually to the OKBITS mask.
+ */
+ if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS |
+ XFS_SB_VERSION2_CRCBIT))
+ return false;
+
+ /* Now check all the required V4 feature flags are set. */
+
+#define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \
+ XFS_SB_VERSION_ALIGNBIT | \
+ XFS_SB_VERSION_LOGV2BIT | \
+ XFS_SB_VERSION_EXTFLGBIT | \
+ XFS_SB_VERSION_DIRV2BIT | \
+ XFS_SB_VERSION_MOREBITSBIT)
+
+#define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
+ XFS_SB_VERSION2_ATTR2BIT | \
+ XFS_SB_VERSION2_PROJID32BIT | \
+ XFS_SB_VERSION2_CRCBIT)
+
+ if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS)
+ return false;
+ if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS)
+ return false;
+ return true;
+}
+
+/*
+ * We currently support XFS v5 formats with known features and v4 superblocks with
+ * at least V2 directories.
+ */
+bool
+xfs_sb_good_version(
+ struct xfs_sb *sbp)
+{
+ /*
+ * All v5 filesystems are supported, but we must check that all the
+ * required v4 feature flags are enabled correctly as the code checks
+ * those flags and not for v5 support.
+ */
+ if (xfs_sb_is_v5(sbp))
+ return xfs_sb_validate_v5_features(sbp);
+
+ /* versions prior to v4 are not supported */
+ if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_4)
+ return false;
+
+ /* We must not have any unknown v4 feature bits set */
+ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
+ ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+ (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
+ return false;
+
+ /* V4 filesystems need v2 directories and unwritten extents */
+ if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
+ return false;
+ if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
+ return false;
+
+ /* It's a supported v4 filesystem */
+ return true;
+}
+
+uint64_t
+xfs_sb_version_to_features(
+ struct xfs_sb *sbp)
+{
+ uint64_t features = 0;
+
+ /* optional V4 features */
+ if (sbp->sb_rblocks > 0)
+ features |= XFS_FEAT_REALTIME;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)
+ features |= XFS_FEAT_NLINK;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)
+ features |= XFS_FEAT_ATTR;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT)
+ features |= XFS_FEAT_QUOTA;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)
+ features |= XFS_FEAT_ALIGN;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT)
+ features |= XFS_FEAT_LOGV2;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT)
+ features |= XFS_FEAT_DALIGN;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)
+ features |= XFS_FEAT_EXTFLG;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT)
+ features |= XFS_FEAT_SECTOR;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT)
+ features |= XFS_FEAT_ASCIICI;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) {
+ if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)
+ features |= XFS_FEAT_LAZYSBCOUNT;
+ if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
+ features |= XFS_FEAT_ATTR2;
+ if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)
+ features |= XFS_FEAT_PROJID32;
+ if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)
+ features |= XFS_FEAT_FTYPE;
+ }
+
+ if (!xfs_sb_is_v5(sbp))
+ return features;
+
+ /* Always on V5 features */
+ features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG |
+ XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 |
+ XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO;
+
+ /* Optional V5 features */
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT)
+ features |= XFS_FEAT_FINOBT;
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT)
+ features |= XFS_FEAT_RMAPBT;
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK)
+ features |= XFS_FEAT_REFLINK;
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
+ features |= XFS_FEAT_INOBTCNT;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE)
+ features |= XFS_FEAT_FTYPE;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES)
+ features |= XFS_FEAT_SPINODES;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)
+ features |= XFS_FEAT_META_UUID;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME)
+ features |= XFS_FEAT_BIGTIME;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
+ features |= XFS_FEAT_NEEDSREPAIR;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64)
+ features |= XFS_FEAT_NREXT64;
+
+ return features;
+}
+
+/* Check all the superblock fields we care about when reading one in. */
+STATIC int
+xfs_validate_sb_read(
+ struct xfs_mount *mp,
+ struct xfs_sb *sbp)
+{
+ if (!xfs_sb_is_v5(sbp))
+ return 0;
+
+ /*
+ * Version 5 superblock feature mask validation. Reject combinations
+ * the kernel cannot support up front before checking anything else.
+ */
+ if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+"Superblock has unknown compatible features (0x%x) enabled.",
+ (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN));
+ xfs_warn(mp,
+"Using a more recent kernel is recommended.");
+ }
+
+ if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ xfs_alert(mp,
+"Superblock has unknown read-only compatible features (0x%x) enabled.",
+ (sbp->sb_features_ro_compat &
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+ if (!xfs_is_readonly(mp)) {
+ xfs_warn(mp,
+"Attempted to mount read-only compatible filesystem read-write.");
+ xfs_warn(mp,
+"Filesystem can only be safely mounted read only.");
+
+ return -EINVAL;
+ }
+ }
+ if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+"Superblock has unknown incompatible features (0x%x) enabled.",
+ (sbp->sb_features_incompat &
+ XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+ xfs_warn(mp,
+"Filesystem cannot be safely mounted by this kernel.");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Check all the superblock fields we care about when writing one out. */
+STATIC int
+xfs_validate_sb_write(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct xfs_sb *sbp)
+{
+ /*
+ * Carry out additional sb summary counter sanity checks when we write
+ * the superblock. We skip this in the read validator because there
+ * could be newer superblocks in the log and if the values are garbage
+ * even after replay we'll recalculate them at the end of log mount.
+ *
+ * mkfs has traditionally written zeroed counters to inprogress and
+ * secondary superblocks, so allow this usage to continue because
+ * we never read counters from such superblocks.
+ */
+ if (xfs_buf_daddr(bp) == XFS_SB_DADDR && !sbp->sb_inprogress &&
+ (sbp->sb_fdblocks > sbp->sb_dblocks ||
+ !xfs_verify_icount(mp, sbp->sb_icount) ||
+ sbp->sb_ifree > sbp->sb_icount)) {
+ xfs_warn(mp, "SB summary counter sanity check failed");
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_sb_is_v5(sbp))
+ return 0;
+
+ /*
+ * Version 5 superblock feature mask validation. Reject combinations
+ * the kernel cannot support since we checked for unsupported bits in
+ * the read verifier, which means that memory is corrupt.
+ */
+ if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+"Corruption detected in superblock compatible features (0x%x)!",
+ (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN));
+ return -EFSCORRUPTED;
+ }
+
+ if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ xfs_alert(mp,
+"Corruption detected in superblock read-only compatible features (0x%x)!",
+ (sbp->sb_features_ro_compat &
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+ return -EFSCORRUPTED;
+ }
+ if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+"Corruption detected in superblock incompatible features (0x%x)!",
+ (sbp->sb_features_incompat &
+ XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+ return -EFSCORRUPTED;
+ }
+ if (xfs_sb_has_incompat_log_feature(sbp,
+ XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
+ xfs_warn(mp,
+"Corruption detected in superblock incompatible log features (0x%x)!",
+ (sbp->sb_features_log_incompat &
+ XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * We can't read verify the sb LSN because the read verifier is called
+ * before the log is allocated and processed. We know the log is set up
+ * before write verifier calls, so check it here.
+ */
+ if (!xfs_log_check_lsn(mp, sbp->sb_lsn))
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Check the validity of the SB. */
+STATIC int
+xfs_validate_sb_common(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct xfs_sb *sbp)
+{
+ struct xfs_dsb *dsb = bp->b_addr;
+ uint32_t agcount = 0;
+ uint32_t rem;
+ bool has_dalign;
+
+ if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
+ xfs_warn(mp,
+"Superblock has bad magic number 0x%x. Not an XFS filesystem?",
+ be32_to_cpu(dsb->sb_magicnum));
+ return -EWRONGFS;
+ }
+
+ if (!xfs_sb_good_version(sbp)) {
+ xfs_warn(mp,
+"Superblock has unknown features enabled or corrupted feature masks.");
+ return -EWRONGFS;
+ }
+
+ /*
+ * Validate feature flags and state
+ */
+ if (xfs_sb_is_v5(sbp)) {
+ if (sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) {
+ xfs_notice(mp,
+"Block size (%u bytes) too small for Version 5 superblock (minimum %d bytes)",
+ sbp->sb_blocksize, XFS_MIN_CRC_BLOCKSIZE);
+ return -EFSCORRUPTED;
+ }
+
+ /* V5 has a separate project quota inode */
+ if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
+ xfs_notice(mp,
+ "Version 5 of Super block has XFS_OQUOTA bits.");
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Full inode chunks must be aligned to inode chunk size when
+ * sparse inodes are enabled to support the sparse chunk
+ * allocation algorithm and prevent overlapping inode records.
+ */
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) {
+ uint32_t align;
+
+ align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
+ >> sbp->sb_blocklog;
+ if (sbp->sb_inoalignmt != align) {
+ xfs_warn(mp,
+"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.",
+ sbp->sb_inoalignmt, align);
+ return -EINVAL;
+ }
+ }
+ } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
+ XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
+ xfs_notice(mp,
+"Superblock earlier than Version 5 has XFS_{P|G}QUOTA_{ENFD|CHKD} bits.");
+ return -EFSCORRUPTED;
+ }
+
+ if (unlikely(
+ sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
+ xfs_warn(mp,
+ "filesystem is marked as having an external log; "
+ "specify logdev on the mount command line.");
+ return -EINVAL;
+ }
+
+ if (unlikely(
+ sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
+ xfs_warn(mp,
+ "filesystem is marked as having an internal log; "
+ "do not specify logdev on the mount command line.");
+ return -EINVAL;
+ }
+
+ /* Compute agcount for this number of dblocks and agblocks */
+ if (sbp->sb_agblocks) {
+ agcount = div_u64_rem(sbp->sb_dblocks, sbp->sb_agblocks, &rem);
+ if (rem)
+ agcount++;
+ }
+
+ /*
+ * More sanity checking. Most of these were stolen directly from
+ * xfs_repair.
+ */
+ if (unlikely(
+ sbp->sb_agcount <= 0 ||
+ sbp->sb_sectsize < XFS_MIN_SECTORSIZE ||
+ sbp->sb_sectsize > XFS_MAX_SECTORSIZE ||
+ sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG ||
+ sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG ||
+ sbp->sb_sectsize != (1 << sbp->sb_sectlog) ||
+ sbp->sb_blocksize < XFS_MIN_BLOCKSIZE ||
+ sbp->sb_blocksize > XFS_MAX_BLOCKSIZE ||
+ sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
+ sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
+ sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
+ sbp->sb_dirblklog + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
+ sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
+ sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
+ sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
+ sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
+ sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
+ sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE ||
+ sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
+ XFS_FSB_TO_B(mp, sbp->sb_agblocks) < XFS_MIN_AG_BYTES ||
+ XFS_FSB_TO_B(mp, sbp->sb_agblocks) > XFS_MAX_AG_BYTES ||
+ sbp->sb_agblklog != xfs_highbit32(sbp->sb_agblocks - 1) + 1 ||
+ agcount == 0 || agcount != sbp->sb_agcount ||
+ (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
+ (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
+ (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
+ (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) ||
+ sbp->sb_dblocks == 0 ||
+ sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
+ sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) ||
+ sbp->sb_shared_vn != 0)) {
+ xfs_notice(mp, "SB sanity check failed");
+ return -EFSCORRUPTED;
+ }
+
+ /* Validate the realtime geometry; stolen from xfs_repair */
+ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
+ sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) {
+ xfs_notice(mp,
+ "realtime extent sanity check failed");
+ return -EFSCORRUPTED;
+ }
+
+ if (sbp->sb_rblocks == 0) {
+ if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
+ sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) {
+ xfs_notice(mp,
+ "realtime zeroed geometry check failed");
+ return -EFSCORRUPTED;
+ }
+ } else {
+ uint64_t rexts;
+ uint64_t rbmblocks;
+
+ rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize);
+ rbmblocks = howmany_64(sbp->sb_rextents,
+ NBBY * sbp->sb_blocksize);
+
+ if (sbp->sb_rextents != rexts ||
+ sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) ||
+ sbp->sb_rbmblocks != rbmblocks) {
+ xfs_notice(mp,
+ "realtime geometry sanity check failed");
+ return -EFSCORRUPTED;
+ }
+ }
+
+ /*
+ * Either (sb_unit and !hasdalign) or (!sb_unit and hasdalign)
+ * would imply the image is corrupted.
+ */
+ has_dalign = sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT;
+ if (!!sbp->sb_unit ^ has_dalign) {
+ xfs_notice(mp, "SB stripe alignment sanity check failed");
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit),
+ XFS_FSB_TO_B(mp, sbp->sb_width), 0, false))
+ return -EFSCORRUPTED;
+
+ /*
+ * Currently only very few inode sizes are supported.
+ */
+ switch (sbp->sb_inodesize) {
+ case 256:
+ case 512:
+ case 1024:
+ case 2048:
+ break;
+ default:
+ xfs_warn(mp, "inode size of %d bytes not supported",
+ sbp->sb_inodesize);
+ return -ENOSYS;
+ }
+
+ return 0;
+}
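
One detail worth calling out from the sanity checks above: the expected AG count is recomputed as the ceiling of sb_dblocks / sb_agblocks and compared against sb_agcount, so a trailing short AG still counts as one. A minimal standalone restatement of that arithmetic (hypothetical helper, not kernel code):

#include <stdint.h>

static uint32_t expected_agcount(uint64_t dblocks, uint32_t agblocks)
{
	if (agblocks == 0)
		return 0;	/* geometry already invalid */
	return (uint32_t)((dblocks + agblocks - 1) / agblocks);
}
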
+
+void
+xfs_sb_quota_from_disk(struct xfs_sb *sbp)
+{
+ /*
+ * older mkfs doesn't initialize quota inodes to NULLFSINO. This
+ * means an invalid quota inode can show up in core as either of
+ * two values, 0 and NULLFSINO. Normalize it to the single value
+ * NULLFSINO.
+ *
+ * Note that this change affects only the in-core values. These
+ * values are not written back to disk unless any quota information
+ * is written to the disk. Even in that case, sb_pquotino field is
+ * not written to disk unless the superblock supports pquotino.
+ */
+ if (sbp->sb_uquotino == 0)
+ sbp->sb_uquotino = NULLFSINO;
+ if (sbp->sb_gquotino == 0)
+ sbp->sb_gquotino = NULLFSINO;
+ if (sbp->sb_pquotino == 0)
+ sbp->sb_pquotino = NULLFSINO;
+
+ /*
+ * We need to do these manipulations only if we are working
+ * with an older version of on-disk superblock.
+ */
+ if (xfs_sb_is_v5(sbp))
+ return;
+
+ if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
+ sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+ XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
+ if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
+ sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+ XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
+ sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
+
+ if (sbp->sb_qflags & XFS_PQUOTA_ACCT &&
+ sbp->sb_gquotino != NULLFSINO) {
+ /*
+ * In older version of superblock, on-disk superblock only
+ * has sb_gquotino, and in-core superblock has both sb_gquotino
+ * and sb_pquotino. But, only one of them is supported at any
+ * point in time. So, if PQUOTA is set in the disk superblock,
+ * copy over sb_gquotino to sb_pquotino. The NULLFSINO test
+ * above is to make sure we don't do this twice and wipe them
+ * both out!
+ */
+ sbp->sb_pquotino = sbp->sb_gquotino;
+ sbp->sb_gquotino = NULLFSINO;
+ }
+}
+
+static void
+__xfs_sb_from_disk(
+ struct xfs_sb *to,
+ struct xfs_dsb *from,
+ bool convert_xquota)
+{
+ to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
+ to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
+ to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
+ to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
+ to->sb_rextents = be64_to_cpu(from->sb_rextents);
+ memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
+ to->sb_logstart = be64_to_cpu(from->sb_logstart);
+ to->sb_rootino = be64_to_cpu(from->sb_rootino);
+ to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
+ to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
+ to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
+ to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
+ to->sb_agcount = be32_to_cpu(from->sb_agcount);
+ to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
+ to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
+ to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
+ to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
+ to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
+ to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
+ memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
+ to->sb_blocklog = from->sb_blocklog;
+ to->sb_sectlog = from->sb_sectlog;
+ to->sb_inodelog = from->sb_inodelog;
+ to->sb_inopblog = from->sb_inopblog;
+ to->sb_agblklog = from->sb_agblklog;
+ to->sb_rextslog = from->sb_rextslog;
+ to->sb_inprogress = from->sb_inprogress;
+ to->sb_imax_pct = from->sb_imax_pct;
+ to->sb_icount = be64_to_cpu(from->sb_icount);
+ to->sb_ifree = be64_to_cpu(from->sb_ifree);
+ to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
+ to->sb_frextents = be64_to_cpu(from->sb_frextents);
+ to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
+ to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
+ to->sb_qflags = be16_to_cpu(from->sb_qflags);
+ to->sb_flags = from->sb_flags;
+ to->sb_shared_vn = from->sb_shared_vn;
+ to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
+ to->sb_unit = be32_to_cpu(from->sb_unit);
+ to->sb_width = be32_to_cpu(from->sb_width);
+ to->sb_dirblklog = from->sb_dirblklog;
+ to->sb_logsectlog = from->sb_logsectlog;
+ to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
+ to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
+ to->sb_features2 = be32_to_cpu(from->sb_features2);
+ to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
+ to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
+ to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
+ to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
+ to->sb_features_log_incompat =
+ be32_to_cpu(from->sb_features_log_incompat);
+ /* crc is only used on disk, not in memory; just init to 0 here. */
+ to->sb_crc = 0;
+ to->sb_spino_align = be32_to_cpu(from->sb_spino_align);
+ to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
+ to->sb_lsn = be64_to_cpu(from->sb_lsn);
+ /*
+ * sb_meta_uuid is only on disk if it differs from sb_uuid and the
+ * feature flag is set; if not set we keep it only in memory.
+ */
+ if (xfs_sb_is_v5(to) &&
+ (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID))
+ uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
+ else
+ uuid_copy(&to->sb_meta_uuid, &from->sb_uuid);
+ /* Convert on-disk flags to in-memory flags? */
+ if (convert_xquota)
+ xfs_sb_quota_from_disk(to);
+}
+
+void
+xfs_sb_from_disk(
+ struct xfs_sb *to,
+ struct xfs_dsb *from)
+{
+ __xfs_sb_from_disk(to, from, true);
+}
+
+static void
+xfs_sb_quota_to_disk(
+ struct xfs_dsb *to,
+ struct xfs_sb *from)
+{
+ uint16_t qflags = from->sb_qflags;
+
+ to->sb_uquotino = cpu_to_be64(from->sb_uquotino);
+
+ /*
+ * The in-memory superblock quota state matches the v5 on-disk format,
+ * so just write them out and return
+ */
+ if (xfs_sb_is_v5(from)) {
+ to->sb_qflags = cpu_to_be16(from->sb_qflags);
+ to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
+ to->sb_pquotino = cpu_to_be64(from->sb_pquotino);
+ return;
+ }
+
+ /*
+ * For older superblocks (v4), the in-core version of sb_qflags does not
+ * have XFS_OQUOTA_* flags, whereas the on-disk version does. So,
+ * convert incore XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags.
+ */
+ qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
+ XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
+
+ if (from->sb_qflags &
+ (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
+ qflags |= XFS_OQUOTA_ENFD;
+ if (from->sb_qflags &
+ (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
+ qflags |= XFS_OQUOTA_CHKD;
+ to->sb_qflags = cpu_to_be16(qflags);
+
+ /*
+ * GQUOTINO and PQUOTINO cannot be used together in versions
+ * of superblock that do not have pquotino. from->sb_qflags
+ * tells us which quota is active and should be copied to
+ * disk. If neither are active, we should NULL the inode.
+ *
+ * In all cases, the separate pquotino must remain 0 because it
+ * is beyond the "end" of the valid non-pquotino superblock.
+ */
+ if (from->sb_qflags & XFS_GQUOTA_ACCT)
+ to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
+ else if (from->sb_qflags & XFS_PQUOTA_ACCT)
+ to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
+ else {
+ /*
+ * We can't rely on just the fields being logged to tell us
+ * that it is safe to write NULLFSINO - we should only do that
+ * if quotas are not actually enabled. Hence only write
+ * NULLFSINO if both in-core quota inodes are NULL.
+ */
+ if (from->sb_gquotino == NULLFSINO &&
+ from->sb_pquotino == NULLFSINO)
+ to->sb_gquotino = cpu_to_be64(NULLFSINO);
+ }
+
+ to->sb_pquotino = 0;
+}
+
+void
+xfs_sb_to_disk(
+ struct xfs_dsb *to,
+ struct xfs_sb *from)
+{
+ xfs_sb_quota_to_disk(to, from);
+
+ to->sb_magicnum = cpu_to_be32(from->sb_magicnum);
+ to->sb_blocksize = cpu_to_be32(from->sb_blocksize);
+ to->sb_dblocks = cpu_to_be64(from->sb_dblocks);
+ to->sb_rblocks = cpu_to_be64(from->sb_rblocks);
+ to->sb_rextents = cpu_to_be64(from->sb_rextents);
+ memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
+ to->sb_logstart = cpu_to_be64(from->sb_logstart);
+ to->sb_rootino = cpu_to_be64(from->sb_rootino);
+ to->sb_rbmino = cpu_to_be64(from->sb_rbmino);
+ to->sb_rsumino = cpu_to_be64(from->sb_rsumino);
+ to->sb_rextsize = cpu_to_be32(from->sb_rextsize);
+ to->sb_agblocks = cpu_to_be32(from->sb_agblocks);
+ to->sb_agcount = cpu_to_be32(from->sb_agcount);
+ to->sb_rbmblocks = cpu_to_be32(from->sb_rbmblocks);
+ to->sb_logblocks = cpu_to_be32(from->sb_logblocks);
+ to->sb_versionnum = cpu_to_be16(from->sb_versionnum);
+ to->sb_sectsize = cpu_to_be16(from->sb_sectsize);
+ to->sb_inodesize = cpu_to_be16(from->sb_inodesize);
+ to->sb_inopblock = cpu_to_be16(from->sb_inopblock);
+ memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
+ to->sb_blocklog = from->sb_blocklog;
+ to->sb_sectlog = from->sb_sectlog;
+ to->sb_inodelog = from->sb_inodelog;
+ to->sb_inopblog = from->sb_inopblog;
+ to->sb_agblklog = from->sb_agblklog;
+ to->sb_rextslog = from->sb_rextslog;
+ to->sb_inprogress = from->sb_inprogress;
+ to->sb_imax_pct = from->sb_imax_pct;
+ to->sb_icount = cpu_to_be64(from->sb_icount);
+ to->sb_ifree = cpu_to_be64(from->sb_ifree);
+ to->sb_fdblocks = cpu_to_be64(from->sb_fdblocks);
+ to->sb_frextents = cpu_to_be64(from->sb_frextents);
+
+ to->sb_flags = from->sb_flags;
+ to->sb_shared_vn = from->sb_shared_vn;
+ to->sb_inoalignmt = cpu_to_be32(from->sb_inoalignmt);
+ to->sb_unit = cpu_to_be32(from->sb_unit);
+ to->sb_width = cpu_to_be32(from->sb_width);
+ to->sb_dirblklog = from->sb_dirblklog;
+ to->sb_logsectlog = from->sb_logsectlog;
+ to->sb_logsectsize = cpu_to_be16(from->sb_logsectsize);
+ to->sb_logsunit = cpu_to_be32(from->sb_logsunit);
+
+ /*
+ * We need to ensure that bad_features2 always matches features2.
+ * Hence we enforce that here rather than having to remember to do it
+ * everywhere else that updates features2.
+ */
+ from->sb_bad_features2 = from->sb_features2;
+ to->sb_features2 = cpu_to_be32(from->sb_features2);
+ to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2);
+
+ if (!xfs_sb_is_v5(from))
+ return;
+
+ to->sb_features_compat = cpu_to_be32(from->sb_features_compat);
+ to->sb_features_ro_compat =
+ cpu_to_be32(from->sb_features_ro_compat);
+ to->sb_features_incompat =
+ cpu_to_be32(from->sb_features_incompat);
+ to->sb_features_log_incompat =
+ cpu_to_be32(from->sb_features_log_incompat);
+ to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
+ to->sb_lsn = cpu_to_be64(from->sb_lsn);
+ if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)
+ uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
+}
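
Both converters above follow the same discipline: every on-disk superblock field is big-endian and every in-core field is host-endian, so each field is byte-swapped explicitly in both directions. A minimal round-trip sketch of that pattern, assuming a glibc-style <endian.h> and using made-up single-field structures rather than the real superblock:

#include <endian.h>
#include <stdint.h>

struct disk_rec { uint32_t d_blocksize; };	/* big-endian, as on disk */
struct core_rec { uint32_t c_blocksize; };	/* host-endian, in memory */

static void rec_from_disk(struct core_rec *to, const struct disk_rec *from)
{
	to->c_blocksize = be32toh(from->d_blocksize);
}

static void rec_to_disk(struct disk_rec *to, const struct core_rec *from)
{
	to->d_blocksize = htobe32(from->c_blocksize);
}
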
+
+/*
+ * If the superblock has the CRC feature bit set or the CRC field is non-null,
+ * check that the CRC is valid. We check the CRC field is non-null because a
+ * single bit error could clear the feature bit and unused parts of the
+ * superblock are supposed to be zero. Hence a non-null crc field indicates that
+ * we've potentially lost a feature bit and we should check it anyway.
+ *
+ * However, past bugs (e.g. in growfs) left non-zeroed regions beyond the
+ * last field in V4 secondary superblocks. So for secondary superblocks,
+ * we are more forgiving, and ignore CRC failures if the primary doesn't
+ * indicate that the fs version is V5.
+ */
+static void
+xfs_sb_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_sb sb;
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dsb *dsb = bp->b_addr;
+ int error;
+
+ /*
+ * open code the version check to avoid needing to convert the entire
+ * superblock from disk order just to check the version number
+ */
+ if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
+ (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
+ XFS_SB_VERSION_5) ||
+ dsb->sb_crc != 0)) {
+
+ if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) {
+ /* Only fail bad secondaries on a known V5 filesystem */
+ if (xfs_buf_daddr(bp) == XFS_SB_DADDR ||
+ xfs_has_crc(mp)) {
+ error = -EFSBADCRC;
+ goto out_error;
+ }
+ }
+ }
+
+ /*
+ * Check all the superblock fields. Don't byteswap the xquota flags
+ * because _verify_common checks the on-disk values.
+ */
+ __xfs_sb_from_disk(&sb, dsb, false);
+ error = xfs_validate_sb_common(mp, bp, &sb);
+ if (error)
+ goto out_error;
+ error = xfs_validate_sb_read(mp, &sb);
+
+out_error:
+ if (error == -EFSCORRUPTED || error == -EFSBADCRC)
+ xfs_verifier_error(bp, error, __this_address);
+ else if (error)
+ xfs_buf_ioerror(bp, error);
+}
+
+/*
+ * We may be probed for a filesystem match, so we may not want to emit
+ * messages when the superblock buffer is not actually an XFS superblock.
+ * If we find an XFS superblock, then run a normal, noisy mount because we are
+ * really going to mount it and want to know about errors.
+ */
+static void
+xfs_sb_quiet_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_dsb *dsb = bp->b_addr;
+
+ if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
+ /* XFS filesystem, verify noisily! */
+ xfs_sb_read_verify(bp);
+ return;
+ }
+ /* quietly fail */
+ xfs_buf_ioerror(bp, -EWRONGFS);
+}
+
+static void
+xfs_sb_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_sb sb;
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_dsb *dsb = bp->b_addr;
+ int error;
+
+ /*
+ * Check all the superblock fields. Don't byteswap the xquota flags
+ * because _verify_common checks the on-disk values.
+ */
+ __xfs_sb_from_disk(&sb, dsb, false);
+ error = xfs_validate_sb_common(mp, bp, &sb);
+ if (error)
+ goto out_error;
+ error = xfs_validate_sb_write(mp, bp, &sb);
+ if (error)
+ goto out_error;
+
+ if (!xfs_sb_is_v5(&sb))
+ return;
+
+ if (bip)
+ dsb->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
+ return;
+
+out_error:
+ xfs_verifier_error(bp, error, __this_address);
+}
+
+const struct xfs_buf_ops xfs_sb_buf_ops = {
+ .name = "xfs_sb",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
+ .verify_read = xfs_sb_read_verify,
+ .verify_write = xfs_sb_write_verify,
+};
+
+const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+ .name = "xfs_sb_quiet",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
+ .verify_read = xfs_sb_quiet_read_verify,
+ .verify_write = xfs_sb_write_verify,
+};
+
+/*
+ * xfs_sb_mount_common
+ *
+ * Mount initialization code establishing various mount
+ * fields from the superblock associated with the given
+ * mount structure.
+ *
+ * Inode geometry is calculated in xfs_ialloc_setup_geometry.
+ */
+void
+xfs_sb_mount_common(
+ struct xfs_mount *mp,
+ struct xfs_sb *sbp)
+{
+ mp->m_agfrotor = mp->m_agirotor = 0;
+ mp->m_maxagi = mp->m_sb.sb_agcount;
+ mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
+ mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
+ mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
+ mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
+ mp->m_blockmask = sbp->sb_blocksize - 1;
+ mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
+ mp->m_blockwmask = mp->m_blockwsize - 1;
+
+ mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
+ mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
+ mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
+
+ mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
+ mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
+ mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
+
+ mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 1);
+ mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 0);
+ mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
+ mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
+
+ mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, true);
+ mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, false);
+ mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
+ mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
+
+ mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
+ mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
+ mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp);
+}
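
For concreteness, here is how the derived shift values above work out for one common geometry, assuming XFS_NBBYLOG = 3 (log2 of bits per byte), BBSHIFT = 9 (512-byte basic blocks) and XFS_WORDLOG = 2 (log2 of bytes per bitmap word). The numbers are an editor's worked example, not taken from the patch:

	sb_blocklog   = 12  (4096-byte blocks)
	m_blkbit_log  = 12 + 3 = 15   -> 32768 bitmap bits per block
	m_blkbb_log   = 12 - 9 = 3    -> 8 basic blocks per fs block
	m_blockwsize  = 4096 >> 2 = 1024 words per block, m_blockwmask = 1023
	m_agno_log    = highbit(agcount - 1) + 1, e.g. 3 for an 8-AG filesystem
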
+
+/*
+ * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock
+ * into the superblock buffer to be logged. It does not provide the higher
+ * level of locking that is needed to protect the in-core superblock from
+ * concurrent access.
+ */
+void
+xfs_log_sb(
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *bp = xfs_trans_getsb(tp);
+
+ /*
+ * Lazy sb counters don't update the in-core superblock so do that now.
+ * If this is at unmount, the counters will be exactly correct, but at
+ * any other time they will only be ballpark correct because of
+ * reservations that have been taken out of the percpu counters. If we have an
+ * unclean shutdown, this will be corrected by log recovery rebuilding
+ * the counters from the AGF block counts.
+ *
+ * Do not update sb_frextents here because it is not part of the lazy
+ * sb counters, despite having a percpu counter. It is always kept
+ * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas()
+ * and hence we don't need to update it here.
+ */
+ if (xfs_has_lazysbcount(mp)) {
+ mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+ mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
+ mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+ }
+
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
+}
+
+/*
+ * xfs_sync_sb
+ *
+ * Sync the superblock to disk.
+ *
+ * Note that the caller is responsible for checking the frozen state of the
+ * filesystem. This procedure uses the non-blocking transaction allocator and
+ * thus will allow modifications to a frozen fs. This is required because this
+ * code can be called during the process of freezing where use of the high-level
+ * allocator would deadlock.
+ */
+int
+xfs_sync_sb(
+ struct xfs_mount *mp,
+ bool wait)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0,
+ XFS_TRANS_NO_WRITECOUNT, &tp);
+ if (error)
+ return error;
+
+ xfs_log_sb(tp);
+ if (wait)
+ xfs_trans_set_sync(tp);
+ return xfs_trans_commit(tp);
+}
+
+/*
+ * Update all the secondary superblocks to match the new state of the primary.
+ * Because we are completely overwriting all the existing fields in the
+ * secondary superblock buffers, there is no need to read them in from disk.
+ * Just get a new buffer, stamp it and write it.
+ *
+ * The sb buffers need to be cached here so that we serialise against other
+ * operations that access the secondary superblocks, but we don't want to keep
+ * them in memory once they are written, so we mark them as one-shot buffers.
+ */
+int
+xfs_update_secondary_sbs(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno = 1;
+ int saved_error = 0;
+ int error = 0;
+ LIST_HEAD (buffer_list);
+
+ /* update secondary superblocks. */
+ for_each_perag_from(mp, agno, pag) {
+ struct xfs_buf *bp;
+
+ error = xfs_buf_get(mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_SB_DADDR),
+ XFS_FSS_TO_BB(mp, 1), &bp);
+ /*
+ * If we get an error reading or writing alternate superblocks,
+ * continue. xfs_repair chooses the "best" superblock based
+ * on most matches; if we break early, we'll leave more
+ * superblocks un-updated than updated, and xfs_repair may
+ * pick them over the properly-updated primary.
+ */
+ if (error) {
+ xfs_warn(mp,
+ "error allocating secondary superblock for ag %d",
+ pag->pag_agno);
+ if (!saved_error)
+ saved_error = error;
+ continue;
+ }
+
+ bp->b_ops = &xfs_sb_buf_ops;
+ xfs_buf_oneshot(bp);
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+ xfs_buf_delwri_queue(bp, &buffer_list);
+ xfs_buf_relse(bp);
+
+ /* don't hold too many buffers at once */
+ if (agno % 16)
+ continue;
+
+ error = xfs_buf_delwri_submit(&buffer_list);
+ if (error) {
+ xfs_warn(mp,
+ "write error %d updating a secondary superblock near ag %d",
+ error, pag->pag_agno);
+ if (!saved_error)
+ saved_error = error;
+ continue;
+ }
+ }
+ error = xfs_buf_delwri_submit(&buffer_list);
+ if (error) {
+ xfs_warn(mp,
+ "write error %d updating a secondary superblock near ag %d",
+ error, agno);
+ }
+
+ return saved_error ? saved_error : error;
+}
+
+/*
+ * Same behavior as xfs_sync_sb, except that it is always synchronous and it
+ * also writes the superblock buffer to disk sector 0 immediately.
+ */
+int
+xfs_sync_sb_buf(
+ struct xfs_mount *mp)
+{
+ struct xfs_trans *tp;
+ struct xfs_buf *bp;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ bp = xfs_trans_getsb(tp);
+ xfs_log_sb(tp);
+ xfs_trans_bhold(tp, bp);
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out;
+ /*
+ * write out the sb buffer to get the changes to disk
+ */
+ error = xfs_bwrite(bp);
+out:
+ xfs_buf_relse(bp);
+ return error;
+}
+
+void
+xfs_fs_geometry(
+ struct xfs_mount *mp,
+ struct xfs_fsop_geom *geo,
+ int struct_version)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+
+ memset(geo, 0, sizeof(struct xfs_fsop_geom));
+
+ geo->blocksize = sbp->sb_blocksize;
+ geo->rtextsize = sbp->sb_rextsize;
+ geo->agblocks = sbp->sb_agblocks;
+ geo->agcount = sbp->sb_agcount;
+ geo->logblocks = sbp->sb_logblocks;
+ geo->sectsize = sbp->sb_sectsize;
+ geo->inodesize = sbp->sb_inodesize;
+ geo->imaxpct = sbp->sb_imax_pct;
+ geo->datablocks = sbp->sb_dblocks;
+ geo->rtblocks = sbp->sb_rblocks;
+ geo->rtextents = sbp->sb_rextents;
+ geo->logstart = sbp->sb_logstart;
+ BUILD_BUG_ON(sizeof(geo->uuid) != sizeof(sbp->sb_uuid));
+ memcpy(geo->uuid, &sbp->sb_uuid, sizeof(sbp->sb_uuid));
+
+ if (struct_version < 2)
+ return;
+
+ geo->sunit = sbp->sb_unit;
+ geo->swidth = sbp->sb_width;
+
+ if (struct_version < 3)
+ return;
+
+ geo->version = XFS_FSOP_GEOM_VERSION;
+ geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
+ XFS_FSOP_GEOM_FLAGS_DIRV2 |
+ XFS_FSOP_GEOM_FLAGS_EXTFLG;
+ if (xfs_has_attr(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR;
+ if (xfs_has_quota(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_QUOTA;
+ if (xfs_has_align(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN;
+ if (xfs_has_dalign(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN;
+ if (xfs_has_asciici(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI;
+ if (xfs_has_lazysbcount(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB;
+ if (xfs_has_attr2(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2;
+ if (xfs_has_projid32(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32;
+ if (xfs_has_crc(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_V5SB;
+ if (xfs_has_ftype(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_FTYPE;
+ if (xfs_has_finobt(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_FINOBT;
+ if (xfs_has_sparseinodes(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_SPINODES;
+ if (xfs_has_rmapbt(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT;
+ if (xfs_has_reflink(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK;
+ if (xfs_has_bigtime(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME;
+ if (xfs_has_inobtcounts(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT;
+ if (xfs_has_sector(mp)) {
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR;
+ geo->logsectsize = sbp->sb_logsectsize;
+ } else {
+ geo->logsectsize = BBSIZE;
+ }
+ if (xfs_has_large_extent_counts(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
+ geo->rtsectsize = sbp->sb_blocksize;
+ geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
+
+ if (struct_version < 4)
+ return;
+
+ if (xfs_has_logv2(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2;
+
+ geo->logsunit = sbp->sb_logsunit;
+
+ if (struct_version < 5)
+ return;
+
+ geo->version = XFS_FSOP_GEOM_VERSION_V5;
+}
+
+/* Read a secondary superblock. */
+int
+xfs_sb_read_secondary(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(agno != 0 && agno != NULLAGNUMBER);
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
+ if (error)
+ return error;
+ xfs_buf_set_ref(bp, XFS_SSB_REF);
+ *bpp = bp;
+ return 0;
+}
+
+/* Get an uninitialised secondary superblock buffer. */
+int
+xfs_sb_get_secondary(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(agno != 0 && agno != NULLAGNUMBER);
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_sb_buf_ops;
+ xfs_buf_oneshot(bp);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * sunit, swidth and sectorsize (optional, 0 to skip that check) should all be in bytes,
+ * so users won't be confused by values in error messages.
+ */
+bool
+xfs_validate_stripe_geometry(
+ struct xfs_mount *mp,
+ __s64 sunit,
+ __s64 swidth,
+ int sectorsize,
+ bool silent)
+{
+ if (swidth > INT_MAX) {
+ if (!silent)
+ xfs_notice(mp,
+"stripe width (%lld) is too large", swidth);
+ return false;
+ }
+
+ if (sunit > swidth) {
+ if (!silent)
+ xfs_notice(mp,
+"stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth);
+ return false;
+ }
+
+ if (sectorsize && (int)sunit % sectorsize) {
+ if (!silent)
+ xfs_notice(mp,
+"stripe unit (%lld) must be a multiple of the sector size (%d)",
+ sunit, sectorsize);
+ return false;
+ }
+
+ if (sunit && !swidth) {
+ if (!silent)
+ xfs_notice(mp,
+"invalid stripe unit (%lld) and stripe width of 0", sunit);
+ return false;
+ }
+
+ if (!sunit && swidth) {
+ if (!silent)
+ xfs_notice(mp,
+"invalid stripe width (%lld) and stripe unit of 0", swidth);
+ return false;
+ }
+
+ if (sunit && (int)swidth % (int)sunit) {
+ if (!silent)
+ xfs_notice(mp,
+"stripe width (%lld) must be a multiple of the stripe unit (%lld)",
+ swidth, sunit);
+ return false;
+ }
+ return true;
+}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
new file mode 100644
index 000000000..a5e14740e
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_SB_H__
+#define __XFS_SB_H__
+
+struct xfs_mount;
+struct xfs_sb;
+struct xfs_dsb;
+struct xfs_trans;
+struct xfs_fsop_geom;
+struct xfs_perag;
+
+extern void xfs_log_sb(struct xfs_trans *tp);
+extern int xfs_sync_sb(struct xfs_mount *mp, bool wait);
+extern int xfs_sync_sb_buf(struct xfs_mount *mp);
+extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
+extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
+extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
+extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
+extern bool xfs_sb_good_version(struct xfs_sb *sbp);
+extern uint64_t xfs_sb_version_to_features(struct xfs_sb *sbp);
+
+extern int xfs_update_secondary_sbs(struct xfs_mount *mp);
+
+#define XFS_FS_GEOM_MAX_STRUCT_VER (4)
+extern void xfs_fs_geometry(struct xfs_mount *mp, struct xfs_fsop_geom *geo,
+ int struct_version);
+extern int xfs_sb_read_secondary(struct xfs_mount *mp,
+ struct xfs_trans *tp, xfs_agnumber_t agno,
+ struct xfs_buf **bpp);
+extern int xfs_sb_get_secondary(struct xfs_mount *mp,
+ struct xfs_trans *tp, xfs_agnumber_t agno,
+ struct xfs_buf **bpp);
+
+extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
+ __s64 sunit, __s64 swidth, int sectorsize, bool silent);
+
+#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
new file mode 100644
index 000000000..c4381388c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_SHARED_H__
+#define __XFS_SHARED_H__
+
+/*
+ * Definitions shared between kernel and userspace that don't fit into any other
+ * header file that is shared with userspace.
+ */
+struct xfs_ifork;
+struct xfs_buf;
+struct xfs_buf_ops;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+
+/*
+ * Buffer verifier operations are widely used, including by userspace tools.
+ */
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agfl_buf_ops;
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
+extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
+extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
+extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
+extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
+extern const struct xfs_buf_ops xfs_finobt_buf_ops;
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
+extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rtbuf_ops;
+extern const struct xfs_buf_ops xfs_sb_buf_ops;
+extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+
+/* log size calculation functions */
+int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
+int xfs_log_calc_minimum_size(struct xfs_mount *);
+
+struct xfs_trans_res;
+void xfs_log_get_max_trans_res(struct xfs_mount *mp,
+ struct xfs_trans_res *max_resp);
+
+/*
+ * Values for t_flags.
+ */
+/* Transaction needs to be logged */
+#define XFS_TRANS_DIRTY (1u << 0)
+/* Superblock is dirty and needs to be logged */
+#define XFS_TRANS_SB_DIRTY (1u << 1)
+/* Transaction took a permanent log reservation */
+#define XFS_TRANS_PERM_LOG_RES (1u << 2)
+/* Synchronous transaction commit needed */
+#define XFS_TRANS_SYNC (1u << 3)
+/* Transaction can use reserve block pool */
+#define XFS_TRANS_RESERVE (1u << 4)
+/* Transaction should avoid VFS level superblock write accounting */
+#define XFS_TRANS_NO_WRITECOUNT (1u << 5)
+/* Transaction has freed blocks returned to its reservation */
+#define XFS_TRANS_RES_FDBLKS (1u << 6)
+/* Transaction contains an intent done log item */
+#define XFS_TRANS_HAS_INTENT_DONE (1u << 7)
+
+/*
+ * LOWMODE is used by the allocator to activate the lowspace algorithm - when
+ * free space is running low the extent allocator may choose to allocate an
+ * extent from an AG without leaving sufficient space for a btree split when
+ * inserting the new extent. In this case the allocator will enable the
+ * lowspace algorithm which is supposed to allow further allocations (such as
+ * btree splits and newroots) to allocate from sequential AGs. In order to
+ * avoid locking AGs out of order the lowspace algorithm will start searching
+ * for free space from AG 0. If the correct transaction reservations have been
+ * made then this algorithm will eventually find all the space it needs.
+ */
+#define XFS_TRANS_LOWMODE 0x100 /* allocate in low space mode */
+
+/*
+ * Field values for xfs_trans_mod_sb.
+ */
+#define XFS_TRANS_SB_ICOUNT 0x00000001
+#define XFS_TRANS_SB_IFREE 0x00000002
+#define XFS_TRANS_SB_FDBLOCKS 0x00000004
+#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008
+#define XFS_TRANS_SB_FREXTENTS 0x00000010
+#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020
+#define XFS_TRANS_SB_DBLOCKS 0x00000040
+#define XFS_TRANS_SB_AGCOUNT 0x00000080
+#define XFS_TRANS_SB_IMAXPCT 0x00000100
+#define XFS_TRANS_SB_REXTSIZE 0x00000200
+#define XFS_TRANS_SB_RBMBLOCKS 0x00000400
+#define XFS_TRANS_SB_RBLOCKS 0x00000800
+#define XFS_TRANS_SB_REXTENTS 0x00001000
+#define XFS_TRANS_SB_REXTSLOG 0x00002000
+
+/*
+ * Here we centralize the specification of XFS meta-data buffer reference count
+ * values. This determines how hard the buffer cache tries to hold onto the
+ * buffer.
+ */
+#define XFS_AGF_REF 4
+#define XFS_AGI_REF 4
+#define XFS_AGFL_REF 3
+#define XFS_INO_BTREE_REF 3
+#define XFS_ALLOC_BTREE_REF 2
+#define XFS_BMAP_BTREE_REF 2
+#define XFS_RMAP_BTREE_REF 2
+#define XFS_DIR_BTREE_REF 2
+#define XFS_INO_REF 2
+#define XFS_ATTR_BTREE_REF 1
+#define XFS_DQUOT_REF 1
+#define XFS_REFC_BTREE_REF 1
+#define XFS_SSB_REF 0
+
+/*
+ * Flags for xfs_trans_ichgtime().
+ */
+#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
+#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
+#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
+
+
+/*
+ * Symlink decoding/encoding functions
+ */
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+ uint32_t size, struct xfs_buf *bp);
+bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
+ uint32_t size, struct xfs_buf *bp);
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_inode *ip, struct xfs_ifork *ifp);
+xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip);
+
+/* Computed inode geometry for the filesystem. */
+struct xfs_ino_geometry {
+ /* Maximum inode count in this filesystem. */
+ uint64_t maxicount;
+
+ /* Actual inode cluster buffer size, in bytes. */
+ unsigned int inode_cluster_size;
+
+ /*
+	 * Desired inode cluster buffer size, in bytes.  This value is kept
+	 * unrounded (not rounded up to at least one filesystem block) solely
+	 * so that sb_spino_align can be validated against it.  Runtime code
+	 * must only ever use inode_cluster_size.
+ */
+ unsigned int inode_cluster_size_raw;
+
+ /* Inode cluster sizes, adjusted to be at least 1 fsb. */
+ unsigned int inodes_per_cluster;
+ unsigned int blocks_per_cluster;
+
+ /* Inode cluster alignment. */
+ unsigned int cluster_align;
+ unsigned int cluster_align_inodes;
+ unsigned int inoalign_mask; /* mask sb_inoalignmt if used */
+
+ unsigned int inobt_mxr[2]; /* max inobt btree records */
+ unsigned int inobt_mnr[2]; /* min inobt btree records */
+ unsigned int inobt_maxlevels; /* max inobt btree levels. */
+
+ /* Size of inode allocations under normal operation. */
+ unsigned int ialloc_inos;
+ unsigned int ialloc_blks;
+
+ /* Minimum inode blocks for a sparse allocation. */
+ unsigned int ialloc_min_blks;
+
+ /* stripe unit inode alignment */
+ unsigned int ialloc_align;
+
+ unsigned int agino_log; /* #bits for agino in inum */
+
+ /* precomputed default inode attribute fork offset */
+ unsigned int attr_fork_offset;
+
+ /* precomputed value for di_flags2 */
+ uint64_t new_diflags2;
+
+};
+
+#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
new file mode 100644
index 000000000..bdc777b9e
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012-2013 Red Hat, Inc.
+ * All rights reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+
+
+/*
+ * Each contiguous block has a header, so it is not just a simple pathlen
+ * to FSB conversion.
+ */
+int
+xfs_symlink_blocks(
+ struct xfs_mount *mp,
+ int pathlen)
+{
+ int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+
+ return (pathlen + buflen - 1) / buflen;
+}
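+
+/*
+ * Worked example with illustrative numbers: XFS_SYMLINK_BUF_SPACE() is the
+ * block size minus any per-block symlink header.  If, say, the header takes
+ * 56 bytes, a 512 byte block leaves buflen = 456, so a 1024 byte target
+ * needs (1024 + 455) / 456 = 3 blocks, while a 4096 byte block holds any
+ * target up to XFS_SYMLINK_MAXLEN in a single block.
+ */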
+
+int
+xfs_symlink_hdr_set(
+ struct xfs_mount *mp,
+ xfs_ino_t ino,
+ uint32_t offset,
+ uint32_t size,
+ struct xfs_buf *bp)
+{
+ struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+ if (!xfs_has_crc(mp))
+ return 0;
+
+ memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr));
+ dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
+ dsl->sl_offset = cpu_to_be32(offset);
+ dsl->sl_bytes = cpu_to_be32(size);
+ uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid);
+ dsl->sl_owner = cpu_to_be64(ino);
+ dsl->sl_blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ return sizeof(struct xfs_dsymlink_hdr);
+}
+
+/*
+ * Checking of the symlink header is split into two parts: the verifier does
+ * CRC, location and bounds checking, while the unpacking function checks the
+ * path parameters and the owner.
+ */
+bool
+xfs_symlink_hdr_ok(
+ xfs_ino_t ino,
+ uint32_t offset,
+ uint32_t size,
+ struct xfs_buf *bp)
+{
+ struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+ if (offset != be32_to_cpu(dsl->sl_offset))
+ return false;
+ if (size != be32_to_cpu(dsl->sl_bytes))
+ return false;
+ if (ino != be64_to_cpu(dsl->sl_owner))
+ return false;
+
+ /* ok */
+ return true;
+}
+
+static xfs_failaddr_t
+xfs_symlink_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+ if (!xfs_has_crc(mp))
+ return __this_address;
+ if (!xfs_verify_magic(bp, dsl->sl_magic))
+ return __this_address;
+ if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (xfs_buf_daddr(bp) != be64_to_cpu(dsl->sl_blkno))
+ return __this_address;
+ if (be32_to_cpu(dsl->sl_offset) +
+ be32_to_cpu(dsl->sl_bytes) >= XFS_SYMLINK_MAXLEN)
+ return __this_address;
+ if (dsl->sl_owner == 0)
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(dsl->sl_lsn)))
+ return __this_address;
+
+ return NULL;
+}
+
+static void
+xfs_symlink_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ /* no verification of non-crc buffers */
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_symlink_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+static void
+xfs_symlink_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ xfs_failaddr_t fa;
+
+ /* no verification of non-crc buffers */
+ if (!xfs_has_crc(mp))
+ return;
+
+ fa = xfs_symlink_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (bip) {
+ struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+ dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ }
+ xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_symlink_buf_ops = {
+ .name = "xfs_symlink",
+ .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) },
+ .verify_read = xfs_symlink_read_verify,
+ .verify_write = xfs_symlink_write_verify,
+ .verify_struct = xfs_symlink_verify,
+};
+
+void
+xfs_symlink_local_to_remote(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ char *buf;
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
+
+ if (!xfs_has_crc(mp)) {
+ bp->b_ops = NULL;
+ memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+ return;
+ }
+
+ /*
+ * As this symlink fits in an inode literal area, it must also fit in
+ * the smallest buffer the filesystem supports.
+ */
+ ASSERT(BBTOB(bp->b_length) >=
+ ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
+
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ buf = bp->b_addr;
+ buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
+ memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) +
+ ifp->if_bytes - 1);
+}
+
+/*
+ * Verify the in-memory consistency of an inline symlink data fork. This
+ * does not do on-disk format checks.
+ */
+xfs_failaddr_t
+xfs_symlink_shortform_verify(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ char *sfp = (char *)ifp->if_u1.if_data;
+ int size = ifp->if_bytes;
+ char *endp = sfp + size;
+
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+
+ /*
+ * Zero length symlinks should never occur in memory as they are
+ * never allowed to exist on disk.
+ */
+ if (!size)
+ return __this_address;
+
+ /* No negative sizes or overly long symlink targets. */
+ if (size < 0 || size > XFS_SYMLINK_MAXLEN)
+ return __this_address;
+
+ /* No NULLs in the target either. */
+ if (memchr(sfp, 0, size - 1))
+ return __this_address;
+
+ /* We /did/ null-terminate the buffer, right? */
+ if (*endp != 0)
+ return __this_address;
+ return NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
new file mode 100644
index 000000000..8b5547073
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_inode_item.h"
+
+#include <linux/iversion.h>
+
+/*
+ * Add a locked inode to the transaction.
+ *
+ * The inode must be locked, and it cannot be associated with any transaction.
+ * If lock_flags is non-zero the inode will be unlocked on transaction commit.
+ */
+void
+xfs_trans_ijoin(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ uint lock_flags)
+{
+ struct xfs_inode_log_item *iip;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (ip->i_itemp == NULL)
+ xfs_inode_item_init(ip, ip->i_mount);
+ iip = ip->i_itemp;
+
+ ASSERT(iip->ili_lock_flags == 0);
+ iip->ili_lock_flags = lock_flags;
+ ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &iip->ili_item);
+}
+
+/*
+ * Transactional inode timestamp update. Requires the inode to be locked and
+ * joined to the transaction supplied. Relies on the transaction subsystem to
+ * track dirty state and update/writeback the inode accordingly.
+ */
+void
+xfs_trans_ichgtime(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int flags)
+{
+ struct inode *inode = VFS_I(ip);
+ struct timespec64 tv;
+
+ ASSERT(tp);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ tv = current_time(inode);
+
+ if (flags & XFS_ICHGTIME_MOD)
+ inode->i_mtime = tv;
+ if (flags & XFS_ICHGTIME_CHG)
+ inode->i_ctime = tv;
+ if (flags & XFS_ICHGTIME_CREATE)
+ ip->i_crtime = tv;
+}
+
+/*
+ * This is called to mark the fields indicated in fieldmask as needing to be
+ * logged when the transaction is committed. The inode must already be
+ * associated with the given transaction.
+ *
+ * The values for fieldmask are defined in xfs_inode_item.h. We always log all
+ * of the core inode if any of it has changed, and we always log all of the
+ * inline data/extents/b-tree root if any of them has changed.
+ *
+ * Grab and pin the cluster buffer associated with this inode to avoid RMW
+ * cycles at inode writeback time. Avoid the need to add error handling to every
+ * xfs_trans_log_inode() call by shutting down on read error. This will cause
+ * transactions to fail and everything to error out, just like if we return a
+ * read error in a dirty transaction and cancel it.
+ */
+void
+xfs_trans_log_inode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ uint flags)
+{
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct inode *inode = VFS_I(ip);
+ uint iversion_flags = 0;
+
+ ASSERT(iip);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+
+ /*
+ * Don't bother with i_lock for the I_DIRTY_TIME check here, as races
+ * don't matter - we either will need an extra transaction in 24 hours
+ * to log the timestamps, or will clear already cleared fields in the
+ * worst case.
+ */
+ if (inode->i_state & I_DIRTY_TIME) {
+ spin_lock(&inode->i_lock);
+ inode->i_state &= ~I_DIRTY_TIME;
+ spin_unlock(&inode->i_lock);
+ }
+
+ /*
+ * First time we log the inode in a transaction, bump the inode change
+ * counter if it is configured for this to occur. While we have the
+ * inode locked exclusively for metadata modification, we can usually
+ * avoid setting XFS_ILOG_CORE if no one has queried the value since
+ * the last time it was incremented. If we have XFS_ILOG_CORE already
+ * set however, then go ahead and bump the i_version counter
+ * unconditionally.
+ */
+ if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) {
+ if (IS_I_VERSION(inode) &&
+ inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE))
+ iversion_flags = XFS_ILOG_CORE;
+ }
+
+ /*
+ * If we're updating the inode core or the timestamps and it's possible
+ * to upgrade this inode to bigtime format, do so now.
+ */
+ if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
+ xfs_has_bigtime(ip->i_mount) &&
+ !xfs_inode_has_bigtime(ip)) {
+ ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME;
+ flags |= XFS_ILOG_CORE;
+ }
+
+ /*
+ * Inode verifiers do not check that the extent size hint is an integer
+ * multiple of the rt extent size on a directory with both rtinherit
+ * and extszinherit flags set. If we're logging a directory that is
+ * misconfigured in this way, clear the hint.
+ */
+ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
+ (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
+ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ ip->i_extsize = 0;
+ flags |= XFS_ILOG_CORE;
+ }
+
+ /*
+ * Record the specific change for fdatasync optimisation. This allows
+ * fdatasync to skip log forces for inodes that are only timestamp
+ * dirty.
+ */
+ spin_lock(&iip->ili_lock);
+ iip->ili_fsync_fields |= flags;
+
+ if (!iip->ili_item.li_buf) {
+ struct xfs_buf *bp;
+ int error;
+
+ /*
+ * We hold the ILOCK here, so this inode is not going to be
+ * flushed while we are here. Further, because there is no
+ * buffer attached to the item, we know that there is no IO in
+ * progress, so nothing will clear the ili_fields while we read
+ * in the buffer. Hence we can safely drop the spin lock and
+ * read the buffer knowing that the state will not change from
+ * here.
+ */
+ spin_unlock(&iip->ili_lock);
+ error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp);
+ if (error) {
+ xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR);
+ return;
+ }
+
+ /*
+ * We need an explicit buffer reference for the log item but
+ * don't want the buffer to remain attached to the transaction.
+ * Hold the buffer but release the transaction reference once
+ * we've attached the inode log item to the buffer log item
+ * list.
+ */
+ xfs_buf_hold(bp);
+ spin_lock(&iip->ili_lock);
+ iip->ili_item.li_buf = bp;
+ bp->b_flags |= _XBF_INODES;
+ list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
+ xfs_trans_brelse(tp, bp);
+ }
+
+ /*
+ * Always OR in the bits from the ili_last_fields field. This is to
+ * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
+ * in the eventual clearing of the ili_fields bits. See the big comment
+ * in xfs_iflush() for an explanation of this coordination mechanism.
+ */
+ iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags);
+ spin_unlock(&iip->ili_lock);
+}
+
+int
+xfs_trans_roll_inode(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip)
+{
+ int error;
+
+ xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
+ error = xfs_trans_roll(tpp);
+ if (!error)
+ xfs_trans_ijoin(*tpp, ip, 0);
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
new file mode 100644
index 000000000..5b2f27cbd
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -0,0 +1,1028 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_trans_space.h"
+
+#define _ALLOC true
+#define _FREE false
+
+/*
+ * A buffer has a format structure overhead in the log in addition
+ * to the data, so we need to take this into account when reserving
+ * space in a transaction for a buffer. Round the space required up
+ * to a multiple of 128 bytes so that we don't change the historical
+ * reservation that has been used for this overhead.
+ */
+STATIC uint
+xfs_buf_log_overhead(void)
+{
+ return round_up(sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_buf_log_format), 128);
+}
+
+/*
+ * Calculate the transaction log reservation per item, in bytes.
+ *
+ * The nbufs argument is the number of items that will be changed in the
+ * transaction; size is the number of bytes to reserve per item.
+ */
+STATIC uint
+xfs_calc_buf_res(
+ uint nbufs,
+ uint size)
+{
+ return nbufs * (size + xfs_buf_log_overhead());
+}
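+
+/*
+ * Worked example (sizes illustrative only): if the op header plus buffer log
+ * format overhead rounds up to 128 bytes, reserving two 4096 byte buffers
+ * costs 2 * (4096 + 128) = 8448 bytes of log space.
+ */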
+
+/*
+ * Per-extent log reservation for the btree changes involved in freeing or
+ * allocating an extent. In classic XFS two trees are modified (bnobt +
+ * cntbt); with rmap enabled, a third tree (the rmapbt) is modified as well.
+ * The number of blocks reserved is based on the formula:
+ *
+ * num trees * ((2 blocks/level * max depth) - 1)
+ *
+ * Keep in mind that max depth is calculated separately for each type of tree.
+ */
+uint
+xfs_allocfree_block_count(
+ struct xfs_mount *mp,
+ uint num_ops)
+{
+ uint blocks;
+
+ blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1);
+ if (xfs_has_rmapbt(mp))
+ blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
+
+ return blocks;
+}
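+
+/*
+ * Worked example with hypothetical tree heights: if m_alloc_maxlevels is 5,
+ * the two free space btrees contribute 2 * (2 * 5 - 1) = 18 blocks per
+ * operation; with rmap enabled and m_rmap_maxlevels of 9, the rmapbt adds
+ * another 2 * 9 - 1 = 17, giving 35 blocks per operation.
+ */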
+
+/*
+ * Per-extent log reservation for refcount btree changes. These are never done
+ * in the same transaction as an allocation or a free, so we compute them
+ * separately.
+ */
+static unsigned int
+xfs_refcountbt_block_count(
+ struct xfs_mount *mp,
+ unsigned int num_ops)
+{
+ return num_ops * (2 * mp->m_refc_maxlevels - 1);
+}
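+
+/*
+ * For example, with a hypothetical m_refc_maxlevels of 5, each refcount
+ * update reserves 2 * 5 - 1 = 9 blocks.
+ */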
+
+/*
+ * Logging inodes is really tricky. They are logged in memory format,
+ * which means that what we write into the log doesn't directly translate into
+ * the amount of space they use on disk.
+ *
+ * Case in point - btree format forks in memory format use more space than the
+ * on-disk format. In memory, the buffer contains a normal btree block header so
+ * the btree code can treat it as though it is just another generic buffer.
+ * However, when we write it to the inode fork, we don't write all of this
+ * header as it isn't needed. e.g. the root is only ever in the inode, so
+ * there's no need for sibling pointers which would waste 16 bytes of space.
+ *
+ * Hence when we have an inode with a maximally sized btree format fork, the
+ * amount of information we actually log is greater than the size of the inode
+ * on disk, so we need an inode reservation function that calculates all this
+ * correctly. So, we log:
+ *
+ * - 4 log op headers for object
+ * - for the ilf, the inode core and 2 forks
+ * - inode log format object
+ * - the inode core
+ * - two inode forks containing bmap btree root blocks.
+ * - the btree data contained by both forks will fit into the inode size,
+ * hence when combined with the inode core above, we have a total of the
+ * actual inode size.
+ * - the BMBT headers need to be accounted separately, as they are
+ * additional to the records and pointers that fit inside the inode
+ * forks.
+ */
+STATIC uint
+xfs_calc_inode_res(
+ struct xfs_mount *mp,
+ uint ninodes)
+{
+ return ninodes *
+ (4 * sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_inode_log_format) +
+ mp->m_sb.sb_inodesize +
+ 2 * XFS_BMBT_BLOCK_LEN(mp));
+}
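+
+/*
+ * Worked example (all structure sizes illustrative): for one 512 byte inode,
+ * assuming a 12 byte op header, a 56 byte inode log format structure and a
+ * 72 byte bmbt block header, the reservation is
+ * 4 * 12 + 56 + 512 + 2 * 72 = 760 bytes.
+ */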
+
+/*
+ * Inode btree record insertion/removal modifies the inode btree and free space
+ * btrees (since the inobt does not use the agfl). This requires the following
+ * reservation:
+ *
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ *
+ * The caller must account for SB and AG header modifications, etc.
+ */
+STATIC uint
+xfs_calc_inobt_res(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels,
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * The free inode btree is a conditional feature. The behavior differs slightly
+ * from that of the traditional inode btree in that the finobt tracks records
+ * for inode chunks with at least one free inode. A record can be removed from
+ * the tree during individual inode allocation. Therefore the finobt
+ * reservation is unconditional for both the inode chunk allocation and
+ * individual inode allocation (modify) cases.
+ *
+ * Behavior aside, the reservation for finobt modification is equivalent to the
+ * traditional inobt: cover a full finobt shape change plus block allocation.
+ */
+STATIC uint
+xfs_calc_finobt_res(
+ struct xfs_mount *mp)
+{
+ if (!xfs_has_finobt(mp))
+ return 0;
+
+ return xfs_calc_inobt_res(mp);
+}
+
+/*
+ * Calculate the reservation required to allocate or free an inode chunk. This
+ * includes:
+ *
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ * the inode chunk: m_ino_geo.ialloc_blks * N
+ *
+ * The size N of the inode chunk reservation depends on whether it is for
+ * allocation or free and which type of create transaction is in use. An inode
+ * chunk free always invalidates the buffers and only requires reservation for
+ * headers (N == 0). An inode chunk allocation requires a chunk sized
+ * reservation on v4 and older superblocks to initialize the chunk. No chunk
+ * reservation is required for allocation on v5 supers, which use ordered
+ * buffers to initialize.
+ */
+STATIC uint
+xfs_calc_inode_chunk_res(
+ struct xfs_mount *mp,
+ bool alloc)
+{
+ uint res, size = 0;
+
+ res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+ if (alloc) {
+ /* icreate tx uses ordered buffers */
+ if (xfs_has_v3inodes(mp))
+ return res;
+ size = XFS_FSB_TO_B(mp, 1);
+ }
+
+ res += xfs_calc_buf_res(M_IGEO(mp)->ialloc_blks, size);
+ return res;
+}
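+
+/*
+ * For example (hypothetical geometry, ialloc_blks = 16, 4096 byte blocks):
+ * freeing a chunk adds only 16 buffer header overheads (size == 0) on top of
+ * the allocfree reservation; allocating a chunk adds nothing extra on a V5
+ * filesystem (ordered buffers), but adds 16 * (4096 + overhead) bytes on an
+ * older filesystem that must log the chunk initialisation.
+ */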
+
+/*
+ * Per-extent log reservation for the btree changes involved in freeing or
+ * allocating a realtime extent. We have to be able to log as many rtbitmap
+ * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime
+ * extents, as well as the realtime summary block.
+ */
+static unsigned int
+xfs_rtalloc_block_count(
+ struct xfs_mount *mp,
+ unsigned int num_ops)
+{
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+ unsigned int rtbmp_bytes;
+
+ rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY;
+ return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
+}
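+
+/*
+ * Worked example (hypothetical geometry): with 4096 byte blocks, a realtime
+ * extent size of one block and XFS_MAX_BMBT_EXTLEN of roughly 2^21 blocks,
+ * rtbmp_bytes is about 2^21 / 8 bytes, i.e. 64 bitmap blocks, so each
+ * operation reserves 64 + 1 = 65 blocks, the final +1 covering the realtime
+ * summary block.
+ */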
+
+/*
+ * Various log reservation values.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate. Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note: Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_defer_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual. In order to fix this we need to change xfs_defer_finish() to free
+ * extents in only a single AG at a time. This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction. See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
+ */
+
+/*
+ * Compute the log reservation required to handle the refcount update
+ * transaction. Refcount updates are always done via deferred log items.
+ *
+ * This is calculated as:
+ * Data device refcount updates (t1):
+ * the agfs of the ags containing the blocks: nr_ops * sector size
+ * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+static unsigned int
+xfs_calc_refcountbt_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr_ops)
+{
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+
+ if (!xfs_has_reflink(mp))
+ return 0;
+
+ return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+}
+
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents. This gives (t1):
+ * the inode getting the new extents: inode size
+ * the inode's bmap btree: max depth * block size
+ * the agfs of the ags from which the extents are allocated: 2 * sector
+ * the superblock free block counter: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * Or, if we're writing to a realtime file (t2):
+ * the inode getting the new extents: inode size
+ * the inode's bmap btree: max depth * block size
+ * the agfs of the ags from which the extents are allocated: 2 * sector
+ * the superblock free block counter: sector size
+ * the realtime bitmap: ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
+ * the realtime summary: 1 block
+ * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join (t3):
+ * the agfs of the ags containing the blocks: 2 * sector size
+ * the agfls of the ags containing the blocks: 2 * sector size
+ * the super block free block counter: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
+ */
+STATIC uint
+xfs_calc_write_reservation(
+ struct xfs_mount *mp,
+ bool for_minlogsize)
+{
+ unsigned int t1, t2, t3, t4;
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+
+ t1 = xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) +
+ xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
+
+ if (xfs_has_realtime(mp)) {
+ t2 = xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+ blksz) +
+ xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 1), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), blksz);
+ } else {
+ t2 = 0;
+ }
+
+ t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
+
+ /*
+ * In the early days of reflink, we included enough reservation to log
+ * two refcountbt splits for each transaction. The codebase runs
+ * refcountbt updates in separate transactions now, so to compute the
+ * minimum log size, add the refcountbtree splits back to t1 and t3 and
+ * do not account them separately as t4. Reflink did not support
+ * realtime when the reservations were established, so no adjustment to
+ * t2 is needed.
+ */
+ if (for_minlogsize) {
+ unsigned int adj = 0;
+
+ if (xfs_has_reflink(mp))
+ adj = xfs_calc_buf_res(
+ xfs_refcountbt_block_count(mp, 2),
+ blksz);
+ t1 += adj;
+ t3 += adj;
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ }
+
+ t4 = xfs_calc_refcountbt_reservation(mp, 1);
+ return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
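+
+/*
+ * The legs above are alternatives or separately rolled transactions, so the
+ * reservation is the largest of them rather than their sum.  For example,
+ * with hypothetical values t1 = 180 KiB, t2 = 0 (no realtime), t3 = 150 KiB
+ * and t4 = 40 KiB, the write reservation works out to
+ * XFS_DQUOT_LOGRES(mp) + 180 KiB.
+ */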
+
+unsigned int
+xfs_calc_write_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_write_reservation(mp, true);
+}
+
+/*
+ * In truncating a file we free up to two extents at once. We can modify (t1):
+ * the inode being truncated: inode size
+ * the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks (t2):
+ * the agf for each of the ags: 4 * sector size
+ * the agfl for each of the ags: 4 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming 4 extents:
+ * 4 exts * 2 trees * (2 * max depth - 1) * block size
+ * Or, if it's a realtime file (t3):
+ * the agf for each of the ags: 2 * sector size
+ * the agfl for each of the ags: 2 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * the realtime bitmap:
+ * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
+ * the realtime summary: 2 exts * 1 block
+ * worst case split in allocation btrees per extent assuming 2 extents:
+ * 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
+ */
+STATIC uint
+xfs_calc_itruncate_reservation(
+ struct xfs_mount *mp,
+ bool for_minlogsize)
+{
+ unsigned int t1, t2, t3, t4;
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+
+ t1 = xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
+
+ t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz);
+
+ if (xfs_has_realtime(mp)) {
+ t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
+ } else {
+ t3 = 0;
+ }
+
+ /*
+ * In the early days of reflink, we included enough reservation to log
+ * four refcountbt splits in the same transaction as bnobt/cntbt
+ * updates. The codebase runs refcountbt updates in separate
+ * transactions now, so to compute the minimum log size, add the
+ * refcount btree splits back here and do not compute them separately
+ * as t4. Reflink did not support realtime when the reservations were
+ * established, so do not adjust t3.
+ */
+ if (for_minlogsize) {
+ if (xfs_has_reflink(mp))
+ t2 += xfs_calc_buf_res(
+ xfs_refcountbt_block_count(mp, 4),
+ blksz);
+
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ }
+
+ t4 = xfs_calc_refcountbt_reservation(mp, 2);
+ return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
+
+unsigned int
+xfs_calc_itruncate_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_itruncate_reservation(mp, true);
+}
+
+/*
+ * In renaming files we can modify:
+ * the five inodes involved: 5 * inode size
+ * the two directory btrees: 2 * (max depth + v2) * dir block size
+ * the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ * of bmap blocks) giving:
+ * the agf for the ags in which the blocks live: 3 * sector size
+ * the agfl for the ags in which the blocks live: 3 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_rename_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ max((xfs_calc_inode_res(mp, 5) +
+ xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For removing an inode from the unlinked list, we can modify:
+ * the agi hash list and counters: sector size
+ * the on disk inode before ours in the agi hash list: inode cluster size
+ * the on disk inode in the agi hash list: inode cluster size
+ */
+STATIC uint
+xfs_calc_iunlink_remove_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ 2 * M_IGEO(mp)->inode_cluster_size;
+}
+
+/*
+ * For creating a link to an inode:
+ * the parent directory inode: inode size
+ * the linked inode: inode size
+ * the directory btree could split: (max depth + v2) * dir block size
+ * the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ * the agf for the ag in which the blocks live: sector size
+ * the agfl for the ag in which the blocks live: sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_link_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_iunlink_remove_reservation(mp) +
+ max((xfs_calc_inode_res(mp, 2) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For adding an inode to the unlinked list we can modify:
+ * the agi hash list: sector size
+ * the on disk inode: inode cluster size
+ */
+STATIC uint
+xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ M_IGEO(mp)->inode_cluster_size;
+}
+
+/*
+ * For removing a directory entry we can modify:
+ * the parent directory inode: inode size
+ * the removed inode: inode size
+ * the directory btree could join: (max depth + v2) * dir block size
+ * the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ * the agf for the ag in which the blocks live: 2 * sector size
+ * the agfl for the ag in which the blocks live: 2 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_remove_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_iunlink_add_reservation(mp) +
+ max((xfs_calc_inode_res(mp, 2) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For create, break it into the two cases that the transaction covers: the
+ * modify case - allocation done by modifying the state of existing inodes -
+ * and the allocation case.
+ */
+
+/*
+ * For create we can modify:
+ * the parent directory inode: inode size
+ * the new inode: inode size
+ * the inode btree entry: block size
+ * the superblock for the nlink flag: sector size
+ * the directory btree: (max depth + v2) * dir block size
+ * the directory inode's bmap btree: (max depth + v2) * block size
+ * the finobt (record modification and allocation btrees)
+ */
+STATIC uint
+xfs_calc_create_resv_modify(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_inode_res(mp, 2) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ (uint)XFS_FSB_TO_B(mp, 1) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_finobt_res(mp);
+}
+
+/*
+ * For icreate we can allocate some inodes giving:
+ * the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ * the superblock for the nlink flag: sector size
+ * the inode chunk (allocation, optional init)
+ * the inobt (record insertion)
+ * the finobt (optional, record insertion)
+ */
+STATIC uint
+xfs_calc_icreate_resv_alloc(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ mp->m_sb.sb_sectsize +
+ xfs_calc_inode_chunk_res(mp, _ALLOC) +
+ xfs_calc_inobt_res(mp) +
+ xfs_calc_finobt_res(mp);
+}
+
+STATIC uint
+xfs_calc_icreate_reservation(xfs_mount_t *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ max(xfs_calc_icreate_resv_alloc(mp),
+ xfs_calc_create_resv_modify(mp));
+}
+
+STATIC uint
+xfs_calc_create_tmpfile_reservation(
+ struct xfs_mount *mp)
+{
+ uint res = XFS_DQUOT_LOGRES(mp);
+
+ res += xfs_calc_icreate_resv_alloc(mp);
+ return res + xfs_calc_iunlink_add_reservation(mp);
+}
+
+/*
+ * Making a new directory is the same as creating a new file.
+ */
+STATIC uint
+xfs_calc_mkdir_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_icreate_reservation(mp);
+}
+
+
+/*
+ * Making a new symlink is the same as creating a new file, but
+ * with the added blocks for remote symlink data which can be up to 1kB in
+ * length (XFS_SYMLINK_MAXLEN).
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_icreate_reservation(mp) +
+ xfs_calc_buf_res(1, XFS_SYMLINK_MAXLEN);
+}
+
+/*
+ * In freeing an inode we can modify:
+ * the inode being freed: inode size
+ * the super block free inode counter, AGF and AGFL: sector size
+ * the on disk inode (agi unlinked list removal)
+ * the inode chunk (invalidated, headers only)
+ * the inode btree
+ * the finobt (record insertion, removal or modification)
+ *
+ * Note that the inode chunk reservation includes an allocfree reservation for
+ * freeing the inode chunk. This is technically extraneous because the inode
+ * chunk free is deferred (it occurs after a transaction roll). Include the
+ * extra reservation anyway, since we have had reports of ifree transaction
+ * overruns due to too many agfl fixups during inode chunk frees.
+ */
+STATIC uint
+xfs_calc_ifree_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_iunlink_remove_reservation(mp) +
+ xfs_calc_inode_chunk_res(mp, _FREE) +
+ xfs_calc_inobt_res(mp) +
+ xfs_calc_finobt_res(mp);
+}
+
+/*
+ * When only changing the inode we log the inode and possibly the superblock.
+ * We also add a bit of slop for the transaction overhead.
+ */
+STATIC uint
+xfs_calc_ichange_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Growing the data section of the filesystem.
+ * superblock
+ * agi and agf
+ * allocation btrees
+ */
+STATIC uint
+xfs_calc_growdata_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ * superblock: sector size
+ * agf of the ag from which the extent is allocated: sector size
+ * bmap btree for bitmap/summary inode: max depth * blocksize
+ * bitmap/summary inode: inode size
+ * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
+STATIC uint
+xfs_calc_growrtalloc_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ * one bitmap/summary block: blocksize
+ */
+STATIC uint
+xfs_calc_growrtzero_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ * superblock: sector size
+ * bitmap inode: inode size
+ * summary inode: inode size
+ * one bitmap block: blocksize
+ * summary blocks: new summary size
+ */
+STATIC uint
+xfs_calc_growrtfree_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ xfs_calc_inode_res(mp, 2) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
+ xfs_calc_buf_res(1, mp->m_rsumsize);
+}
+
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ * inode
+ */
+STATIC uint
+xfs_calc_swrite_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ * inode
+ */
+STATIC uint
+xfs_calc_writeid_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Converting the inode from non-attributed to attributed.
+ * the inode being converted: inode size
+ * agf block and superblock (for block allocation)
+ * the new block (directory sized)
+ * bmap blocks for the new directory block
+ * allocation btrees
+ */
+STATIC uint
+xfs_calc_addafork_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
+ xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing the attribute fork of a file
+ * the inode being truncated: inode size
+ * the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ * the agf for each of the ags: 4 * sector size
+ * the agfl for each of the ags: 4 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming 4 extents:
+ * 4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrinval_reservation(
+ struct xfs_mount *mp)
+{
+ return max((xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Setting an attribute at mount time.
+ * the inode getting the attribute
+ * the superblock for allocations
+ * the agfs extents are allocated from
+ * the attribute btree * max depth
+ * the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime (see
+ * below).
+ */
+STATIC uint
+xfs_calc_attrsetm_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Setting an attribute at runtime, transaction space unit per block.
+ * the superblock for allocations: sector size
+ * the inode bmap btree could join or split: max depth * block size
+ * Since the runtime attribute transaction space depends on the total number of
+ * blocks needed for the first bmap, we calculate the space unit for one block
+ * here so that the caller can work out the total space from the attribute
+ * extent length in blocks as:
+ * ext * M_RES(mp)->tr_attrsetrt.tr_logres
+ */
+STATIC uint
+xfs_calc_attrsetrt_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing an attribute.
+ * the inode: inode size
+ * the attribute btree could join: max depth * block size
+ * the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks freed giving:
+ * the agf for the ag in which the blocks live: 2 * sector size
+ * the agfl for the ag in which the blocks live: 2 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrrm_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ max((xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
+ XFS_FSB_TO_B(mp, 1)) +
+ (uint)XFS_FSB_TO_B(mp,
+ XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
+ (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
+STATIC uint
+xfs_calc_clear_agi_bucket_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Adjusting quota limits.
+ * the disk quota buffer: sizeof(struct xfs_disk_dquot)
+ */
+STATIC uint
+xfs_calc_qm_setqlim_reservation(void)
+{
+ return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
+}
+
+/*
+ * Allocating quota on disk if needed.
+ * the write transaction log space for quota file extent allocation
+ * the unit of quota allocation: one system block size
+ */
+STATIC uint
+xfs_calc_qm_dqalloc_reservation(
+ struct xfs_mount *mp,
+ bool for_minlogsize)
+{
+ return xfs_calc_write_reservation(mp, for_minlogsize) +
+ xfs_calc_buf_res(1,
+ XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
+}
+
+unsigned int
+xfs_calc_qm_dqalloc_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_qm_dqalloc_reservation(mp, true);
+}
+
+/*
+ * Syncing the incore super block changes to disk.
+ * the super block to reflect the changes: sector size
+ */
+STATIC uint
+xfs_calc_sb_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+void
+xfs_trans_resv_calc(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ int logcount_adj = 0;
+
+ /*
+ * The following transactions are logged in physical format and
+ * require a permanent reservation on space.
+ */
+ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false);
+ resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false);
+ resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
+ resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT;
+ resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_link.tr_logres = xfs_calc_link_reservation(mp);
+ resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT;
+ resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp);
+ resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT;
+ resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp);
+ resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT;
+ resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_create.tr_logres = xfs_calc_icreate_reservation(mp);
+ resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
+ resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_create_tmpfile.tr_logres =
+ xfs_calc_create_tmpfile_reservation(mp);
+ resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
+ resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
+ resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
+ resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp);
+ resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT;
+ resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp);
+ resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT;
+ resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp);
+ resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT;
+ resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp);
+ resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp);
+ resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT;
+ resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp);
+ resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
+ resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp,
+ false);
+ resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ /*
+ * The following transactions are logged in logical format with
+ * a default log count.
+ */
+ resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation();
+ resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+ resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
+ resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+ /* growdata requires permanent res; it can free space to the last AG */
+ resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
+ resp->tr_growdata.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
+ resp->tr_growdata.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	/* The following transactions are logged in logical format */
+ resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
+ resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
+ resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
+ resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
+ resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
+ resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
+ resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
+
+ /*
+ * Add one logcount for BUI items that appear with rmap or reflink,
+ * one logcount for refcount intent items, and one logcount for rmap
+ * intent items.
+ */
+ if (xfs_has_reflink(mp) || xfs_has_rmapbt(mp))
+ logcount_adj++;
+ if (xfs_has_reflink(mp))
+ logcount_adj++;
+ if (xfs_has_rmapbt(mp))
+ logcount_adj++;
+
+ resp->tr_itruncate.tr_logcount += logcount_adj;
+ resp->tr_write.tr_logcount += logcount_adj;
+ resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
+}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
new file mode 100644
index 000000000..0554b9d77
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_TRANS_RESV_H__
+#define __XFS_TRANS_RESV_H__
+
+struct xfs_mount;
+
+/*
+ * structure for maintaining pre-calculated transaction reservations.
+ */
+struct xfs_trans_res {
+ uint tr_logres; /* log space unit in bytes per log ticket */
+ int tr_logcount; /* number of log operations per log ticket */
+ int tr_logflags; /* log flags, currently only used for indicating
+ * a reservation request is permanent or not */
+};
+
+struct xfs_trans_resv {
+ struct xfs_trans_res tr_write; /* extent alloc trans */
+ struct xfs_trans_res tr_itruncate; /* truncate trans */
+ struct xfs_trans_res tr_rename; /* rename trans */
+ struct xfs_trans_res tr_link; /* link trans */
+ struct xfs_trans_res tr_remove; /* unlink trans */
+ struct xfs_trans_res tr_symlink; /* symlink trans */
+ struct xfs_trans_res tr_create; /* create trans */
+ struct xfs_trans_res tr_create_tmpfile; /* create O_TMPFILE trans */
+ struct xfs_trans_res tr_mkdir; /* mkdir trans */
+ struct xfs_trans_res tr_ifree; /* inode free trans */
+ struct xfs_trans_res tr_ichange; /* inode update trans */
+ struct xfs_trans_res tr_growdata; /* fs data section grow trans */
+ struct xfs_trans_res tr_addafork; /* add inode attr fork trans */
+ struct xfs_trans_res tr_writeid; /* write setuid/setgid file */
+ struct xfs_trans_res tr_attrinval; /* attr fork buffer
+ * invalidation */
+ struct xfs_trans_res tr_attrsetm; /* set/create an attribute at
+ * mount time */
+ struct xfs_trans_res tr_attrsetrt; /* set/create an attribute at
+ * runtime */
+ struct xfs_trans_res tr_attrrm; /* remove an attribute */
+ struct xfs_trans_res tr_clearagi; /* clear agi unlinked bucket */
+ struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */
+ struct xfs_trans_res tr_growrtzero; /* grow realtime zeroing */
+ struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */
+ struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */
+ struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */
+ struct xfs_trans_res tr_sb; /* modify superblock */
+ struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */
+};
+
+/* shorthand way of accessing reservation structure */
+#define M_RES(mp) (&(mp)->m_resv)
+
+/*
+ * Per-directory log reservation for any directory change.
+ * dir blocks: (1 btree block per level + data block + free block) * dblock size
+ * bmap btree: (levels + 2) * max depth * block size
+ * v2 directory blocks can be fragmented below the dirblksize down to the fsb
+ * size, so account for that in the DAENTER macros.
+ */
+#define XFS_DIROP_LOG_RES(mp) \
+ (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
+ (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
+#define XFS_DIROP_LOG_COUNT(mp) \
+ (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
+ XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
+
+/*
+ * Various log count values.
+ */
+#define XFS_DEFAULT_LOG_COUNT 1
+#define XFS_DEFAULT_PERM_LOG_COUNT 2
+#define XFS_ITRUNCATE_LOG_COUNT 2
+#define XFS_INACTIVE_LOG_COUNT 2
+#define XFS_CREATE_LOG_COUNT 2
+#define XFS_CREATE_TMPFILE_LOG_COUNT 2
+#define XFS_MKDIR_LOG_COUNT 3
+#define XFS_SYMLINK_LOG_COUNT 3
+#define XFS_REMOVE_LOG_COUNT 2
+#define XFS_LINK_LOG_COUNT 2
+#define XFS_RENAME_LOG_COUNT 2
+#define XFS_WRITE_LOG_COUNT 2
+#define XFS_ADDAFORK_LOG_COUNT 2
+#define XFS_ATTRINVAL_LOG_COUNT 1
+#define XFS_ATTRSET_LOG_COUNT 3
+#define XFS_ATTRRM_LOG_COUNT 3
+
+/*
+ * Original log operation counts were overestimated in the early days of
+ * reflink. These are retained here purely for minimum log size calculations
+ * and must not be used for runtime reservations.
+ */
+#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
+#define XFS_WRITE_LOG_COUNT_REFLINK 8
+
+void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
+uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
+
+unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
+
+#endif /* __XFS_TRANS_RESV_H__ */
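
The per-directory reservation comment above gives the formula in prose; the following standalone sketch renders that prose literally with assumed example geometry (4k blocks, dirblksize equal to blocksize, a da-btree depth of 5, a bmap depth of 4), purely to show the size of the numbers involved. Nothing here is taken from the macros themselves.

/*
 * Worked example of the per-directory log reservation formula, using
 * assumed example geometry.  Numbers are illustrative only.
 */
#include <stdio.h>

int main(void)
{
	unsigned int blocksize = 4096;		/* assumed fs block size */
	unsigned int dblocksize = 4096;		/* assumed dir block size */
	unsigned int da_maxdepth = 5;		/* assumed da-btree depth */
	unsigned int bmap_levels = 4;		/* assumed bmap btree depth */

	/* dir blocks: (1 btree block per level + data block + free block) */
	unsigned int dir_bytes = (da_maxdepth + 2) * dblocksize;

	/* bmap btree: (levels + 2) * max depth * block size */
	unsigned int bmap_bytes = (bmap_levels + 2) * da_maxdepth * blocksize;

	printf("dir op log reservation ~= %u bytes\n", dir_bytes + bmap_bytes);
	return 0;
}
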
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
new file mode 100644
index 000000000..87b31c69a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_TRANS_SPACE_H__
+#define __XFS_TRANS_SPACE_H__
+
+/*
+ * Components of space reservations.
+ */
+
+/* Worst case number of rmaps that can be held in a block. */
+#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \
+ (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
+
+/* Adding one rmap could split every level up to the top of the tree. */
+#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels)
+
+/*
+ * Note that we historically set m_rmap_maxlevels to 9 when reflink is enabled,
+ * so we must preserve this behavior to avoid changing the transaction space
+ * reservations and minimum log size calculations for existing filesystems.
+ */
+#define XFS_OLD_REFLINK_RMAP_MAXLEVELS 9
+
+/* Blocks we might need to add "b" rmaps to a tree. */
+#define XFS_NRMAPADD_SPACE_RES(mp, b)\
+ (((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
+ XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
+ XFS_RMAPADD_SPACE_RES(mp))
+
+#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \
+ (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
+#define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1)
+#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\
+ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
+ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
+ XFS_EXTENTADD_SPACE_RES(mp,w))
+
+/* Blocks we might need to add "b" mappings & rmappings to a file. */
+#define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\
+ (XFS_NEXTENTADD_SPACE_RES((mp), (b), (w)) + \
+ XFS_NRMAPADD_SPACE_RES((mp), (b)))
+
+#define XFS_DAENTER_1B(mp,w) \
+ ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
+#define XFS_DAENTER_DBS(mp,w) \
+ (XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0))
+#define XFS_DAENTER_BLOCKS(mp,w) \
+ (XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w))
+#define XFS_DAENTER_BMAP1B(mp,w) \
+ XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w)
+#define XFS_DAENTER_BMAPS(mp,w) \
+ (XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w))
+#define XFS_DAENTER_SPACE_RES(mp,w) \
+ (XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w))
+#define XFS_DAREMOVE_SPACE_RES(mp,w) XFS_DAENTER_BMAPS(mp,w)
+#define XFS_DIRENTER_MAX_SPLIT(mp,nl) 1
+#define XFS_DIRENTER_SPACE_RES(mp,nl) \
+ (XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \
+ XFS_DIRENTER_MAX_SPLIT(mp,nl))
+#define XFS_DIRREMOVE_SPACE_RES(mp) \
+ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
+#define XFS_IALLOC_SPACE_RES(mp) \
+ (M_IGEO(mp)->ialloc_blks + \
+ ((xfs_has_finobt(mp) ? 2 : 1) * M_IGEO(mp)->inobt_maxlevels))
+
+/*
+ * Space reservation values for various transactions.
+ */
+#define XFS_ADDAFORK_SPACE_RES(mp) \
+ ((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
+#define XFS_ATTRRM_SPACE_RES(mp) \
+ XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
+/* This macro is not used - see inline code in xfs_attr_set */
+#define XFS_ATTRSET_SPACE_RES(mp, v) \
+ (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v))
+#define XFS_CREATE_SPACE_RES(mp,nl) \
+ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_DIOSTRAT_SPACE_RES(mp, v) \
+ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
+#define XFS_GROWFS_SPACE_RES(mp) \
+ (2 * (mp)->m_alloc_maxlevels)
+#define XFS_GROWFSRT_SPACE_RES(mp,b) \
+ ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
+#define XFS_LINK_SPACE_RES(mp,nl) \
+ XFS_DIRENTER_SPACE_RES(mp,nl)
+#define XFS_MKDIR_SPACE_RES(mp,nl) \
+ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_QM_DQALLOC_SPACE_RES(mp) \
+ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \
+ XFS_DQUOT_CLUSTER_SIZE_FSB)
+#define XFS_QM_QINOCREATE_SPACE_RES(mp) \
+ XFS_IALLOC_SPACE_RES(mp)
+#define XFS_REMOVE_SPACE_RES(mp) \
+ XFS_DIRREMOVE_SPACE_RES(mp)
+#define XFS_RENAME_SPACE_RES(mp,nl) \
+ (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \
+ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
+#define XFS_IFREE_SPACE_RES(mp) \
+ (xfs_has_finobt(mp) ? M_IGEO(mp)->inobt_maxlevels : 0)
+
+
+#endif /* __XFS_TRANS_SPACE_H__ */
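
XFS_NEXTENTADD_SPACE_RES and XFS_NRMAPADD_SPACE_RES above share the same round-up-then-multiply shape. The small userspace model below (record counts and split costs are made-up numbers) shows how the ceiling division charges one worst-case split for every leaf block's worth of new records.

/*
 * Minimal sketch of the ceiling-division pattern: reserve enough btree
 * blocks for "b" new records, one full split per leaf block of records.
 */
#include <stdio.h>

static unsigned int
nextentadd_space_res(unsigned int b, unsigned int recs_per_block,
		     unsigned int extentadd_res)
{
	/* round the record count up to whole leaf blocks, then charge
	 * one worst-case split (extentadd_res blocks) per leaf block */
	return ((b + recs_per_block - 1) / recs_per_block) * extentadd_res;
}

int main(void)
{
	/* e.g. new extents vs. 125 records per block, 4-block split cost */
	printf("%u blocks reserved\n", nextentadd_space_res(100, 125, 4));
	printf("%u blocks reserved\n", nextentadd_space_res(126, 125, 4));
	return 0;
}
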
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
new file mode 100644
index 000000000..5c2765934
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2017 Oracle.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
+
+
+/*
+ * Verify that an AG block number pointer neither points outside the AG
+ * nor points at static metadata.
+ */
+static inline bool
+xfs_verify_agno_agbno(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno)
+{
+ xfs_agblock_t eoag;
+
+ eoag = xfs_ag_block_count(mp, agno);
+ if (agbno >= eoag)
+ return false;
+ if (agbno <= XFS_AGFL_BLOCK(mp))
+ return false;
+ return true;
+}
+
+/*
+ * Verify that an FS block number pointer neither points outside the
+ * filesystem nor points at static AG metadata.
+ */
+inline bool
+xfs_verify_fsbno(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno)
+{
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
+
+ if (agno >= mp->m_sb.sb_agcount)
+ return false;
+ return xfs_verify_agno_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno));
+}
+
+/*
+ * Verify that a data device extent is fully contained inside the filesystem,
+ * does not cross an AG boundary, and does not point at static metadata.
+ */
+bool
+xfs_verify_fsbext(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno,
+ xfs_fsblock_t len)
+{
+ if (fsbno + len <= fsbno)
+ return false;
+
+ if (!xfs_verify_fsbno(mp, fsbno))
+ return false;
+
+ if (!xfs_verify_fsbno(mp, fsbno + len - 1))
+ return false;
+
+ return XFS_FSB_TO_AGNO(mp, fsbno) ==
+ XFS_FSB_TO_AGNO(mp, fsbno + len - 1);
+}
+
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata.
+ */
+static inline bool
+xfs_verify_agno_agino(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino)
+{
+ xfs_agino_t first;
+ xfs_agino_t last;
+
+ xfs_agino_range(mp, agno, &first, &last);
+ return agino >= first && agino <= last;
+}
+
+/*
+ * Verify that an FS inode number pointer neither points outside the
+ * filesystem nor points at static AG metadata.
+ */
+inline bool
+xfs_verify_ino(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
+
+ if (agno >= mp->m_sb.sb_agcount)
+ return false;
+ if (XFS_AGINO_TO_INO(mp, agno, agino) != ino)
+ return false;
+ return xfs_verify_agno_agino(mp, agno, agino);
+}
+
+/* Is this an internal inode number? */
+inline bool
+xfs_internal_inum(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ return ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
+ (xfs_has_quota(mp) &&
+ xfs_is_quota_inode(&mp->m_sb, ino));
+}
+
+/*
+ * Verify that a directory entry's inode number doesn't point at an internal
+ * inode, empty space, or static AG metadata.
+ */
+bool
+xfs_verify_dir_ino(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ if (xfs_internal_inum(mp, ino))
+ return false;
+ return xfs_verify_ino(mp, ino);
+}
+
+/*
+ * Verify that a realtime block number pointer doesn't point off the
+ * end of the realtime device.
+ */
+inline bool
+xfs_verify_rtbno(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return rtbno < mp->m_sb.sb_rblocks;
+}
+
+/* Verify that a realtime device extent is fully contained inside the volume. */
+bool
+xfs_verify_rtext(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno,
+ xfs_rtblock_t len)
+{
+ if (rtbno + len <= rtbno)
+ return false;
+
+ if (!xfs_verify_rtbno(mp, rtbno))
+ return false;
+
+ return xfs_verify_rtbno(mp, rtbno + len - 1);
+}
+
+/* Calculate the range of valid icount values. */
+inline void
+xfs_icount_range(
+ struct xfs_mount *mp,
+ unsigned long long *min,
+ unsigned long long *max)
+{
+ unsigned long long nr_inos = 0;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ /* root, rtbitmap, rtsum all live in the first chunk */
+ *min = XFS_INODES_PER_CHUNK;
+
+ for_each_perag(mp, agno, pag)
+ nr_inos += pag->agino_max - pag->agino_min + 1;
+ *max = nr_inos;
+}
+
+/* Sanity-checking of inode counts. */
+bool
+xfs_verify_icount(
+ struct xfs_mount *mp,
+ unsigned long long icount)
+{
+ unsigned long long min, max;
+
+ xfs_icount_range(mp, &min, &max);
+ return icount >= min && icount <= max;
+}
+
+/* Sanity-checking of dir/attr block offsets. */
+bool
+xfs_verify_dablk(
+ struct xfs_mount *mp,
+ xfs_fileoff_t dabno)
+{
+ xfs_dablk_t max_dablk = -1U;
+
+ return dabno <= max_dablk;
+}
+
+/* Check that a file block offset does not exceed the maximum. */
+bool
+xfs_verify_fileoff(
+ struct xfs_mount *mp,
+ xfs_fileoff_t off)
+{
+ return off <= XFS_MAX_FILEOFF;
+}
+
+/* Check that a range of file block offsets do not exceed the maximum. */
+bool
+xfs_verify_fileext(
+ struct xfs_mount *mp,
+ xfs_fileoff_t off,
+ xfs_fileoff_t len)
+{
+ if (off + len <= off)
+ return false;
+
+ if (!xfs_verify_fileoff(mp, off))
+ return false;
+
+ return xfs_verify_fileoff(mp, off + len - 1);
+}
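
The extent verifiers above all lead with an "off + len <= off" test. The standalone model below (the maximum offset is an assumed placeholder, not the real XFS_MAX_FILEOFF) shows how that single comparison rejects both zero-length and wrapping ranges before either endpoint is range-checked.

/*
 * Userspace model of the extent verifier pattern: reject zero length and
 * 64-bit wraparound first, then bounds-check both endpoints.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_MAX_FILEOFF	((uint64_t)1 << 54)	/* assumed limit */

static bool example_verify_fileoff(uint64_t off)
{
	return off <= EXAMPLE_MAX_FILEOFF;
}

static bool example_verify_fileext(uint64_t off, uint64_t len)
{
	if (off + len <= off)		/* zero length or wraparound */
		return false;
	if (!example_verify_fileoff(off))
		return false;
	return example_verify_fileoff(off + len - 1);
}

int main(void)
{
	printf("%d\n", example_verify_fileext(10, 16));		/* 1 */
	printf("%d\n", example_verify_fileext(10, 0));		/* 0 */
	printf("%d\n", example_verify_fileext(UINT64_MAX, 2));	/* 0 */
	return 0;
}
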
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
new file mode 100644
index 000000000..5ebdda7e1
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_TYPES_H__
+#define __XFS_TYPES_H__
+
+typedef uint32_t prid_t; /* project ID */
+
+typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
+typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
+typedef uint32_t xfs_extlen_t; /* extent length in blocks */
+typedef uint32_t xfs_agnumber_t; /* allocation group number */
+typedef uint64_t xfs_extnum_t; /* # of extents in a file */
+typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */
+typedef int64_t xfs_fsize_t; /* bytes in a file */
+typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
+
+typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */
+typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */
+
+typedef int64_t xfs_lsn_t; /* log sequence number */
+typedef int64_t xfs_csn_t; /* CIL sequence number */
+
+typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
+typedef uint32_t xfs_dahash_t; /* dir/attr hash value */
+
+typedef uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
+typedef uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
+typedef uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
+typedef uint64_t xfs_fileoff_t; /* block number in a file */
+typedef uint64_t xfs_filblks_t; /* number of blocks in a file */
+
+typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
+
+/*
+ * New verifiers will return the instruction address of the failing check.
+ * NULL means everything is ok.
+ */
+typedef void * xfs_failaddr_t;
+
+/*
+ * Null values for the types.
+ */
+#define NULLFSBLOCK ((xfs_fsblock_t)-1)
+#define NULLRFSBLOCK ((xfs_rfsblock_t)-1)
+#define NULLRTBLOCK ((xfs_rtblock_t)-1)
+#define NULLFILEOFF ((xfs_fileoff_t)-1)
+
+#define NULLAGBLOCK ((xfs_agblock_t)-1)
+#define NULLAGNUMBER ((xfs_agnumber_t)-1)
+
+#define NULLCOMMITLSN ((xfs_lsn_t)-1)
+
+#define NULLFSINO ((xfs_ino_t)-1)
+#define NULLAGINO ((xfs_agino_t)-1)
+
+/*
+ * Minimum and maximum blocksize and sectorsize.
+ * The blocksize upper limit is pretty much arbitrary.
+ * The sectorsize upper limit is due to sizeof(sb_sectsize).
+ * CRC enabled filesystems use 512 byte inodes, meaning 512 byte block sizes
+ * cannot be used.
+ */
+#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */
+#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */
+#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG)
+#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG)
+#define XFS_MIN_CRC_BLOCKSIZE (1 << (XFS_MIN_BLOCKSIZE_LOG + 1))
+#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */
+#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */
+#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG)
+#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG)
+
+/*
+ * Inode fork identifiers.
+ */
+#define XFS_DATA_FORK 0
+#define XFS_ATTR_FORK 1
+#define XFS_COW_FORK 2
+
+#define XFS_WHICHFORK_STRINGS \
+ { XFS_DATA_FORK, "data" }, \
+ { XFS_ATTR_FORK, "attr" }, \
+ { XFS_COW_FORK, "cow" }
+
+/*
+ * Min numbers of data/attr fork btree root pointers.
+ */
+#define MINDBTPTRS 3
+#define MINABTPTRS 2
+
+/*
+ * MAXNAMELEN is the length (including the terminating null) of
+ * the longest permissible file (component) name.
+ */
+#define MAXNAMELEN 256
+
+/*
+ * This enum is used in string mapping in xfs_trace.h; please keep the
+ * TRACE_DEFINE_ENUMs for it up to date.
+ */
+typedef enum {
+ XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi
+} xfs_lookup_t;
+
+#define XFS_AG_BTREE_CMP_FORMAT_STR \
+ { XFS_LOOKUP_EQi, "eq" }, \
+ { XFS_LOOKUP_LEi, "le" }, \
+ { XFS_LOOKUP_GEi, "ge" }
+
+/*
+ * This enum is used in string mapping in xfs_trace.h and scrub/trace.h;
+ * please keep the TRACE_DEFINE_ENUMs for it up to date.
+ */
+typedef enum {
+ XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
+ XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX
+} xfs_btnum_t;
+
+#define XFS_BTNUM_STRINGS \
+ { XFS_BTNUM_BNOi, "bnobt" }, \
+ { XFS_BTNUM_CNTi, "cntbt" }, \
+ { XFS_BTNUM_RMAPi, "rmapbt" }, \
+ { XFS_BTNUM_BMAPi, "bmbt" }, \
+ { XFS_BTNUM_INOi, "inobt" }, \
+ { XFS_BTNUM_FINOi, "finobt" }, \
+ { XFS_BTNUM_REFCi, "refcbt" }
+
+struct xfs_name {
+ const unsigned char *name;
+ int len;
+ int type;
+};
+
+/*
+ * uid_t and gid_t are hard-coded to 32 bits in the inode.
+ * Hence, an 'id' in a dquot is 32 bits.
+ */
+typedef uint32_t xfs_dqid_t;
+
+/*
+ * Constants for bit manipulations.
+ */
+#define XFS_NBBYLOG 3 /* log2(NBBY) */
+#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */
+#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG)
+#define XFS_NBWORD (1 << XFS_NBWORDLOG)
+#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
+
+struct xfs_iext_cursor {
+ struct xfs_iext_leaf *leaf;
+ int pos;
+};
+
+typedef enum {
+ XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
+} xfs_exntst_t;
+
+typedef struct xfs_bmbt_irec
+{
+ xfs_fileoff_t br_startoff; /* starting file offset */
+ xfs_fsblock_t br_startblock; /* starting block number */
+ xfs_filblks_t br_blockcount; /* number of blocks */
+ xfs_exntst_t br_state; /* extent state */
+} xfs_bmbt_irec_t;
+
+enum xfs_refc_domain {
+ XFS_REFC_DOMAIN_SHARED = 0,
+ XFS_REFC_DOMAIN_COW,
+};
+
+#define XFS_REFC_DOMAIN_STRINGS \
+ { XFS_REFC_DOMAIN_SHARED, "shared" }, \
+ { XFS_REFC_DOMAIN_COW, "cow" }
+
+struct xfs_refcount_irec {
+ xfs_agblock_t rc_startblock; /* starting block number */
+ xfs_extlen_t rc_blockcount; /* count of blocks in extent */
+ xfs_nlink_t rc_refcount; /* reference count of this extent */
+ enum xfs_refc_domain rc_domain; /* shared or cow staging extent? */
+};
+
+#define XFS_RMAP_ATTR_FORK (1 << 0)
+#define XFS_RMAP_BMBT_BLOCK (1 << 1)
+#define XFS_RMAP_UNWRITTEN (1 << 2)
+#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \
+ XFS_RMAP_BMBT_BLOCK)
+#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN)
+struct xfs_rmap_irec {
+ xfs_agblock_t rm_startblock; /* extent start block */
+ xfs_extlen_t rm_blockcount; /* extent length */
+ uint64_t rm_owner; /* extent owner */
+ uint64_t rm_offset; /* offset within the owner */
+ unsigned int rm_flags; /* state flags */
+};
+
+/* per-AG block reservation types */
+enum xfs_ag_resv_type {
+ XFS_AG_RESV_NONE = 0,
+ XFS_AG_RESV_AGFL,
+ XFS_AG_RESV_METADATA,
+ XFS_AG_RESV_RMAPBT,
+};
+
+/*
+ * Type verifier functions
+ */
+struct xfs_mount;
+
+bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
+bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno,
+ xfs_fsblock_t len);
+
+bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
+bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
+bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
+bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
+bool xfs_verify_rtext(struct xfs_mount *mp, xfs_rtblock_t rtbno,
+ xfs_rtblock_t len);
+bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
+bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
+void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min,
+ unsigned long long *max);
+bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off);
+bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off,
+ xfs_fileoff_t len);
+
+#endif /* __XFS_TYPES_H__ */
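
The *_STRINGS macros above exist to feed symbolic decoding in the trace headers, but the { value, "name" } pair layout also works as an ordinary lookup table. The sketch below (enum, macro, and helper names invented here) shows that shape in a standalone program.

/*
 * Reuse a { value, "name" } pair macro as a plain lookup table.
 */
#include <stdio.h>

enum example_btnum { BT_BNO, BT_CNT, BT_RMAP, BT_MAX };

#define EXAMPLE_BTNUM_STRINGS \
	{ BT_BNO, "bnobt" }, \
	{ BT_CNT, "cntbt" }, \
	{ BT_RMAP, "rmapbt" }

static const struct { int value; const char *name; } btnum_names[] = {
	EXAMPLE_BTNUM_STRINGS
};

static const char *btnum_name(int value)
{
	for (size_t i = 0; i < sizeof(btnum_names) / sizeof(btnum_names[0]); i++)
		if (btnum_names[i].value == value)
			return btnum_names[i].name;
	return "?";
}

int main(void)
{
	printf("%s\n", btnum_name(BT_CNT));	/* prints "cntbt" */
	return 0;
}
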
diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h
new file mode 100644
index 000000000..79155eec3
--- /dev/null
+++ b/fs/xfs/mrlock.h
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_SUPPORT_MRLOCK_H__
+#define __XFS_SUPPORT_MRLOCK_H__
+
+#include <linux/rwsem.h>
+
+typedef struct {
+ struct rw_semaphore mr_lock;
+#if defined(DEBUG) || defined(XFS_WARN)
+ int mr_writer;
+#endif
+} mrlock_t;
+
+#if defined(DEBUG) || defined(XFS_WARN)
+#define mrinit(mrp, name) \
+ do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
+#else
+#define mrinit(mrp, name) \
+ do { init_rwsem(&(mrp)->mr_lock); } while (0)
+#endif
+
+#define mrlock_init(mrp, t,n,s) mrinit(mrp, n)
+#define mrfree(mrp) do { } while (0)
+
+static inline void mraccess_nested(mrlock_t *mrp, int subclass)
+{
+ down_read_nested(&mrp->mr_lock, subclass);
+}
+
+static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
+{
+ down_write_nested(&mrp->mr_lock, subclass);
+#if defined(DEBUG) || defined(XFS_WARN)
+ mrp->mr_writer = 1;
+#endif
+}
+
+static inline int mrtryaccess(mrlock_t *mrp)
+{
+ return down_read_trylock(&mrp->mr_lock);
+}
+
+static inline int mrtryupdate(mrlock_t *mrp)
+{
+ if (!down_write_trylock(&mrp->mr_lock))
+ return 0;
+#if defined(DEBUG) || defined(XFS_WARN)
+ mrp->mr_writer = 1;
+#endif
+ return 1;
+}
+
+static inline void mrunlock_excl(mrlock_t *mrp)
+{
+#if defined(DEBUG) || defined(XFS_WARN)
+ mrp->mr_writer = 0;
+#endif
+ up_write(&mrp->mr_lock);
+}
+
+static inline void mrunlock_shared(mrlock_t *mrp)
+{
+ up_read(&mrp->mr_lock);
+}
+
+static inline void mrdemote(mrlock_t *mrp)
+{
+#if defined(DEBUG) || defined(XFS_WARN)
+ mrp->mr_writer = 0;
+#endif
+ downgrade_write(&mrp->mr_lock);
+}
+
+#endif /* __XFS_SUPPORT_MRLOCK_H__ */
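
mrlock_t above is a thin wrapper that pairs an rw_semaphore with a debug-only "am I the writer?" flag. As a rough userspace analogue, the sketch below builds the same pattern on a POSIX rwlock; the names are invented, and since pthreads has no downgrade operation only the acquire/release pairing is mirrored, not mrdemote().

/*
 * Userspace analogue of the mrlock pattern: track exclusive ownership
 * alongside the lock so releases can be asserted.
 */
#include <pthread.h>
#include <assert.h>

typedef struct {
	pthread_rwlock_t	mr_lock;
	int			mr_writer;	/* debug-only in the kernel version */
} example_mrlock_t;

static void example_mrinit(example_mrlock_t *mrp)
{
	mrp->mr_writer = 0;
	pthread_rwlock_init(&mrp->mr_lock, NULL);
}

static void example_mrupdate(example_mrlock_t *mrp)
{
	pthread_rwlock_wrlock(&mrp->mr_lock);
	mrp->mr_writer = 1;
}

static void example_mrunlock_excl(example_mrlock_t *mrp)
{
	assert(mrp->mr_writer);
	mrp->mr_writer = 0;
	pthread_rwlock_unlock(&mrp->mr_lock);
}

int main(void)
{
	example_mrlock_t lock;

	example_mrinit(&lock);
	example_mrupdate(&lock);	/* exclusive, like mrupdate_nested() */
	example_mrunlock_excl(&lock);
	pthread_rwlock_destroy(&lock.mr_lock);
	return 0;
}
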
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
new file mode 100644
index 000000000..b7b838bd4
--- /dev/null
+++ b/fs/xfs/scrub/agheader.c
@@ -0,0 +1,930 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+
+/* Superblock */
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xchk_superblock_xref(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = sc->mp;
+ xfs_agnumber_t agno = sc->sm->sm_agno;
+ xfs_agblock_t agbno;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ agbno = XFS_SB_BLOCK(mp);
+
+ error = xchk_ag_init_existing(sc, agno, &sc->sa);
+ if (!xchk_xref_process_error(sc, agno, agbno, &error))
+ return;
+
+ xchk_xref_is_used_space(sc, agbno, 1);
+ xchk_xref_is_not_inode_chunk(sc, agbno, 1);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
+ xchk_xref_is_not_shared(sc, agbno, 1);
+
+ /* scrub teardown will take care of sc->sa for us */
+}
+
+/*
+ * Scrub the filesystem superblock.
+ *
+ * Note: We do /not/ attempt to check AG 0's superblock. Mount is
+ * responsible for validating all the geometry information in sb 0, so
+ * if the filesystem is capable of initiating online scrub, then clearly
+ * sb 0 is ok and we can use its information to check everything else.
+ */
+int
+xchk_superblock(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ struct xfs_dsb *sb;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ uint32_t v2_ok;
+ __be32 features_mask;
+ int error;
+ __be16 vernum_mask;
+
+ agno = sc->sm->sm_agno;
+ if (agno == 0)
+ return 0;
+
+ /*
+ * Grab an active reference to the perag structure. If we can't get
+ * it, we're racing with something that's tearing down the AG, so
+ * signal that the AG no longer exists.
+ */
+ pag = xfs_perag_get(mp, agno);
+ if (!pag)
+ return -ENOENT;
+
+ error = xfs_sb_read_secondary(mp, sc->tp, agno, &bp);
+ /*
+ * The superblock verifier can return several different error codes
+ * if it thinks the superblock doesn't look right. For a mount these
+ * would all get bounced back to userspace, but if we're here then the
+ * fs mounted successfully, which means that this secondary superblock
+ * is simply incorrect. Treat all these codes the same way we treat
+ * any corruption.
+ */
+ switch (error) {
+ case -EINVAL: /* also -EWRONGFS */
+ case -ENOSYS:
+ case -EFBIG:
+ error = -EFSCORRUPTED;
+ fallthrough;
+ default:
+ break;
+ }
+ if (!xchk_process_error(sc, agno, XFS_SB_BLOCK(mp), &error))
+ goto out_pag;
+
+ sb = bp->b_addr;
+
+ /*
+ * Verify the geometries match. Fields that are permanently
+ * set by mkfs are checked; fields that can be updated later
+ * (and are not propagated to backup superblocks) are preen
+ * checked.
+ */
+ if (sb->sb_blocksize != cpu_to_be32(mp->m_sb.sb_blocksize))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_dblocks != cpu_to_be64(mp->m_sb.sb_dblocks))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rblocks != cpu_to_be64(mp->m_sb.sb_rblocks))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rextents != cpu_to_be64(mp->m_sb.sb_rextents))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (!uuid_equal(&sb->sb_uuid, &mp->m_sb.sb_uuid))
+ xchk_block_set_preen(sc, bp);
+
+ if (sb->sb_logstart != cpu_to_be64(mp->m_sb.sb_logstart))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rootino != cpu_to_be64(mp->m_sb.sb_rootino))
+ xchk_block_set_preen(sc, bp);
+
+ if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino))
+ xchk_block_set_preen(sc, bp);
+
+ if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino))
+ xchk_block_set_preen(sc, bp);
+
+ if (sb->sb_rextsize != cpu_to_be32(mp->m_sb.sb_rextsize))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_agblocks != cpu_to_be32(mp->m_sb.sb_agblocks))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_agcount != cpu_to_be32(mp->m_sb.sb_agcount))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rbmblocks != cpu_to_be32(mp->m_sb.sb_rbmblocks))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logblocks != cpu_to_be32(mp->m_sb.sb_logblocks))
+ xchk_block_set_corrupt(sc, bp);
+
+ /* Check sb_versionnum bits that are set at mkfs time. */
+ vernum_mask = cpu_to_be16(~XFS_SB_VERSION_OKBITS |
+ XFS_SB_VERSION_NUMBITS |
+ XFS_SB_VERSION_ALIGNBIT |
+ XFS_SB_VERSION_DALIGNBIT |
+ XFS_SB_VERSION_SHAREDBIT |
+ XFS_SB_VERSION_LOGV2BIT |
+ XFS_SB_VERSION_SECTORBIT |
+ XFS_SB_VERSION_EXTFLGBIT |
+ XFS_SB_VERSION_DIRV2BIT);
+ if ((sb->sb_versionnum & vernum_mask) !=
+ (cpu_to_be16(mp->m_sb.sb_versionnum) & vernum_mask))
+ xchk_block_set_corrupt(sc, bp);
+
+ /* Check sb_versionnum bits that can be set after mkfs time. */
+ vernum_mask = cpu_to_be16(XFS_SB_VERSION_ATTRBIT |
+ XFS_SB_VERSION_NLINKBIT |
+ XFS_SB_VERSION_QUOTABIT);
+ if ((sb->sb_versionnum & vernum_mask) !=
+ (cpu_to_be16(mp->m_sb.sb_versionnum) & vernum_mask))
+ xchk_block_set_preen(sc, bp);
+
+ if (sb->sb_sectsize != cpu_to_be16(mp->m_sb.sb_sectsize))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inodesize != cpu_to_be16(mp->m_sb.sb_inodesize))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inopblock != cpu_to_be16(mp->m_sb.sb_inopblock))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (memcmp(sb->sb_fname, mp->m_sb.sb_fname, sizeof(sb->sb_fname)))
+ xchk_block_set_preen(sc, bp);
+
+ if (sb->sb_blocklog != mp->m_sb.sb_blocklog)
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_sectlog != mp->m_sb.sb_sectlog)
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inodelog != mp->m_sb.sb_inodelog)
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inopblog != mp->m_sb.sb_inopblog)
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_agblklog != mp->m_sb.sb_agblklog)
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rextslog != mp->m_sb.sb_rextslog)
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_imax_pct != mp->m_sb.sb_imax_pct)
+ xchk_block_set_preen(sc, bp);
+
+ /*
+ * Skip the summary counters since we track them in memory anyway.
+ * sb_icount, sb_ifree, sb_fdblocks, sb_frextents
+ */
+
+ if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino))
+ xchk_block_set_preen(sc, bp);
+
+ if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino))
+ xchk_block_set_preen(sc, bp);
+
+ /*
+ * Skip the quota flags since repair will force quotacheck.
+ * sb_qflags
+ */
+
+ if (sb->sb_flags != mp->m_sb.sb_flags)
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_shared_vn != mp->m_sb.sb_shared_vn)
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inoalignmt != cpu_to_be32(mp->m_sb.sb_inoalignmt))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_unit != cpu_to_be32(mp->m_sb.sb_unit))
+ xchk_block_set_preen(sc, bp);
+
+ if (sb->sb_width != cpu_to_be32(mp->m_sb.sb_width))
+ xchk_block_set_preen(sc, bp);
+
+ if (sb->sb_dirblklog != mp->m_sb.sb_dirblklog)
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logsectlog != mp->m_sb.sb_logsectlog)
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logsectsize != cpu_to_be16(mp->m_sb.sb_logsectsize))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logsunit != cpu_to_be32(mp->m_sb.sb_logsunit))
+ xchk_block_set_corrupt(sc, bp);
+
+ /* Do we see any invalid bits in sb_features2? */
+ if (!xfs_sb_version_hasmorebits(&mp->m_sb)) {
+ if (sb->sb_features2 != 0)
+ xchk_block_set_corrupt(sc, bp);
+ } else {
+ v2_ok = XFS_SB_VERSION2_OKBITS;
+ if (xfs_sb_is_v5(&mp->m_sb))
+ v2_ok |= XFS_SB_VERSION2_CRCBIT;
+
+ if (!!(sb->sb_features2 & cpu_to_be32(~v2_ok)))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_features2 != sb->sb_bad_features2)
+ xchk_block_set_preen(sc, bp);
+ }
+
+ /* Check sb_features2 flags that are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_VERSION2_LAZYSBCOUNTBIT |
+ XFS_SB_VERSION2_PROJID32BIT |
+ XFS_SB_VERSION2_CRCBIT |
+ XFS_SB_VERSION2_FTYPE);
+ if ((sb->sb_features2 & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features2) & features_mask))
+ xchk_block_set_corrupt(sc, bp);
+
+ /* Check sb_features2 flags that can be set after mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_VERSION2_ATTR2BIT);
+ if ((sb->sb_features2 & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features2) & features_mask))
+ xchk_block_set_preen(sc, bp);
+
+ if (!xfs_has_crc(mp)) {
+ /* all v5 fields must be zero */
+ if (memchr_inv(&sb->sb_features_compat, 0,
+ sizeof(struct xfs_dsb) -
+ offsetof(struct xfs_dsb, sb_features_compat)))
+ xchk_block_set_corrupt(sc, bp);
+ } else {
+ /* compat features must match */
+ if (sb->sb_features_compat !=
+ cpu_to_be32(mp->m_sb.sb_features_compat))
+ xchk_block_set_corrupt(sc, bp);
+
+ /* ro compat features must match */
+ if (sb->sb_features_ro_compat !=
+ cpu_to_be32(mp->m_sb.sb_features_ro_compat))
+ xchk_block_set_corrupt(sc, bp);
+
+ /*
+ * NEEDSREPAIR is ignored on a secondary super, so we should
+ * clear it when we find it, though it's not a corruption.
+ */
+ features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR);
+ if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^
+ sb->sb_features_incompat) & features_mask)
+ xchk_block_set_preen(sc, bp);
+
+ /* all other incompat features must match */
+ if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^
+ sb->sb_features_incompat) & ~features_mask)
+ xchk_block_set_corrupt(sc, bp);
+
+ /*
+ * log incompat features protect newer log record types from
+ * older log recovery code. Log recovery doesn't check the
+ * secondary supers, so we can clear these if needed.
+ */
+ if (sb->sb_features_log_incompat)
+ xchk_block_set_preen(sc, bp);
+
+ /* Don't care about sb_crc */
+
+ if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino))
+ xchk_block_set_preen(sc, bp);
+
+ /* Don't care about sb_lsn */
+ }
+
+ if (xfs_has_metauuid(mp)) {
+ /* The metadata UUID must be the same for all supers */
+ if (!uuid_equal(&sb->sb_meta_uuid, &mp->m_sb.sb_meta_uuid))
+ xchk_block_set_corrupt(sc, bp);
+ }
+
+ /* Everything else must be zero. */
+ if (memchr_inv(sb + 1, 0,
+ BBTOB(bp->b_length) - sizeof(struct xfs_dsb)))
+ xchk_block_set_corrupt(sc, bp);
+
+ xchk_superblock_xref(sc, bp);
+out_pag:
+ xfs_perag_put(pag);
+ return error;
+}
+
+/* AGF */
+
+/* Tally freespace record lengths. */
+STATIC int
+xchk_agf_record_bno_lengths(
+ struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *rec,
+ void *priv)
+{
+ xfs_extlen_t *blocks = priv;
+
+ (*blocks) += rec->ar_blockcount;
+ return 0;
+}
+
+/* Check agf_freeblks */
+static inline void
+xchk_agf_xref_freeblks(
+ struct xfs_scrub *sc)
+{
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ xfs_extlen_t blocks = 0;
+ int error;
+
+ if (!sc->sa.bno_cur)
+ return;
+
+ error = xfs_alloc_query_all(sc->sa.bno_cur,
+ xchk_agf_record_bno_lengths, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.bno_cur))
+ return;
+ if (blocks != be32_to_cpu(agf->agf_freeblks))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
+}
+
+/* Cross reference the AGF with the cntbt (freespace by length btree) */
+static inline void
+xchk_agf_xref_cntbt(
+ struct xfs_scrub *sc)
+{
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ xfs_agblock_t agbno;
+ xfs_extlen_t blocks;
+ int have;
+ int error;
+
+ if (!sc->sa.cnt_cur)
+ return;
+
+ /* Any freespace at all? */
+ error = xfs_alloc_lookup_le(sc->sa.cnt_cur, 0, -1U, &have);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur))
+ return;
+ if (!have) {
+ if (agf->agf_freeblks != cpu_to_be32(0))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
+ return;
+ }
+
+ /* Check agf_longest */
+ error = xfs_alloc_get_rec(sc->sa.cnt_cur, &agbno, &blocks, &have);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur))
+ return;
+ if (!have || blocks != be32_to_cpu(agf->agf_longest))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
+}
+
+/* Check the btree block counts in the AGF against the btrees. */
+STATIC void
+xchk_agf_xref_btreeblks(
+ struct xfs_scrub *sc)
+{
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agblock_t blocks;
+ xfs_agblock_t btreeblks;
+ int error;
+
+ /* agf_btreeblks didn't exist before lazysbcount */
+ if (!xfs_has_lazysbcount(sc->mp))
+ return;
+
+ /* Check agf_rmap_blocks; set up for agf_btreeblks check */
+ if (sc->sa.rmap_cur) {
+ error = xfs_btree_count_blocks(sc->sa.rmap_cur, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ btreeblks = blocks - 1;
+ if (blocks != be32_to_cpu(agf->agf_rmap_blocks))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
+ } else {
+ btreeblks = 0;
+ }
+
+ /*
+ * No rmap cursor; we can't xref if we have the rmapbt feature.
+ * We also can't do it if we're missing the free space btree cursors.
+ */
+ if ((xfs_has_rmapbt(mp) && !sc->sa.rmap_cur) ||
+ !sc->sa.bno_cur || !sc->sa.cnt_cur)
+ return;
+
+ /* Check agf_btreeblks */
+ error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.bno_cur))
+ return;
+ btreeblks += blocks - 1;
+
+ error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur))
+ return;
+ btreeblks += blocks - 1;
+
+ if (btreeblks != be32_to_cpu(agf->agf_btreeblks))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
+}
+
+/* Check agf_refcount_blocks against tree size */
+static inline void
+xchk_agf_xref_refcblks(
+ struct xfs_scrub *sc)
+{
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ xfs_agblock_t blocks;
+ int error;
+
+ if (!sc->sa.refc_cur)
+ return;
+
+ error = xfs_btree_count_blocks(sc->sa.refc_cur, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
+ return;
+ if (blocks != be32_to_cpu(agf->agf_refcount_blocks))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xchk_agf_xref(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ xfs_agblock_t agbno;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ agbno = XFS_AGF_BLOCK(mp);
+
+ xchk_ag_btcur_init(sc, &sc->sa);
+
+ xchk_xref_is_used_space(sc, agbno, 1);
+ xchk_agf_xref_freeblks(sc);
+ xchk_agf_xref_cntbt(sc);
+ xchk_xref_is_not_inode_chunk(sc, agbno, 1);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
+ xchk_agf_xref_btreeblks(sc);
+ xchk_xref_is_not_shared(sc, agbno, 1);
+ xchk_agf_xref_refcblks(sc);
+
+ /* scrub teardown will take care of sc->sa for us */
+}
+
+/* Scrub the AGF. */
+int
+xchk_agf(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_agf *agf;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno = sc->sm->sm_agno;
+ xfs_agblock_t agbno;
+ xfs_agblock_t eoag;
+ xfs_agblock_t agfl_first;
+ xfs_agblock_t agfl_last;
+ xfs_agblock_t agfl_count;
+ xfs_agblock_t fl_count;
+ int level;
+ int error = 0;
+
+ error = xchk_ag_read_headers(sc, agno, &sc->sa);
+ if (!xchk_process_error(sc, agno, XFS_AGF_BLOCK(sc->mp), &error))
+ goto out;
+ xchk_buffer_recheck(sc, sc->sa.agf_bp);
+
+ agf = sc->sa.agf_bp->b_addr;
+ pag = sc->sa.pag;
+
+ /* Check the AG length */
+ eoag = be32_to_cpu(agf->agf_length);
+ if (eoag != pag->block_count)
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ /* Check the AGF btree roots and levels */
+ agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
+ if (!xfs_verify_agbno(pag, agbno))
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
+ if (!xfs_verify_agbno(pag, agbno))
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+ if (level <= 0 || level > mp->m_alloc_maxlevels)
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+ if (level <= 0 || level > mp->m_alloc_maxlevels)
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ if (xfs_has_rmapbt(mp)) {
+ agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]);
+ if (!xfs_verify_agbno(pag, agbno))
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+ if (level <= 0 || level > mp->m_rmap_maxlevels)
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+ }
+
+ if (xfs_has_reflink(mp)) {
+ agbno = be32_to_cpu(agf->agf_refcount_root);
+ if (!xfs_verify_agbno(pag, agbno))
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ level = be32_to_cpu(agf->agf_refcount_level);
+ if (level <= 0 || level > mp->m_refc_maxlevels)
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+ }
+
+ /* Check the AGFL counters */
+ agfl_first = be32_to_cpu(agf->agf_flfirst);
+ agfl_last = be32_to_cpu(agf->agf_fllast);
+ agfl_count = be32_to_cpu(agf->agf_flcount);
+ if (agfl_last > agfl_first)
+ fl_count = agfl_last - agfl_first + 1;
+ else
+ fl_count = xfs_agfl_size(mp) - agfl_first + agfl_last + 1;
+ if (agfl_count != 0 && fl_count != agfl_count)
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ /* Do the incore counters match? */
+ if (pag->pagf_freeblks != be32_to_cpu(agf->agf_freeblks))
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+ if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount))
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+ if (xfs_has_lazysbcount(sc->mp) &&
+ pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks))
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ xchk_agf_xref(sc);
+out:
+ return error;
+}
+
+/* AGFL */
+
+struct xchk_agfl_info {
+ unsigned int sz_entries;
+ unsigned int nr_entries;
+ xfs_agblock_t *entries;
+ struct xfs_scrub *sc;
+};
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xchk_agfl_block_xref(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xchk_xref_is_used_space(sc, agbno, 1);
+ xchk_xref_is_not_inode_chunk(sc, agbno, 1);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG);
+ xchk_xref_is_not_shared(sc, agbno, 1);
+}
+
+/* Scrub an AGFL block. */
+STATIC int
+xchk_agfl_block(
+ struct xfs_mount *mp,
+ xfs_agblock_t agbno,
+ void *priv)
+{
+ struct xchk_agfl_info *sai = priv;
+ struct xfs_scrub *sc = sai->sc;
+
+ if (xfs_verify_agbno(sc->sa.pag, agbno) &&
+ sai->nr_entries < sai->sz_entries)
+ sai->entries[sai->nr_entries++] = agbno;
+ else
+ xchk_block_set_corrupt(sc, sc->sa.agfl_bp);
+
+ xchk_agfl_block_xref(sc, agbno);
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return -ECANCELED;
+
+ return 0;
+}
+
+static int
+xchk_agblock_cmp(
+ const void *pa,
+ const void *pb)
+{
+ const xfs_agblock_t *a = pa;
+ const xfs_agblock_t *b = pb;
+
+ return (int)*a - (int)*b;
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xchk_agfl_xref(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ xfs_agblock_t agbno;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ agbno = XFS_AGFL_BLOCK(mp);
+
+ xchk_ag_btcur_init(sc, &sc->sa);
+
+ xchk_xref_is_used_space(sc, agbno, 1);
+ xchk_xref_is_not_inode_chunk(sc, agbno, 1);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
+ xchk_xref_is_not_shared(sc, agbno, 1);
+
+ /*
+ * Scrub teardown will take care of sc->sa for us. Leave sc->sa
+ * active so that the agfl block xref can use it too.
+ */
+}
+
+/* Scrub the AGFL. */
+int
+xchk_agfl(
+ struct xfs_scrub *sc)
+{
+ struct xchk_agfl_info sai;
+ struct xfs_agf *agf;
+ xfs_agnumber_t agno = sc->sm->sm_agno;
+ unsigned int agflcount;
+ unsigned int i;
+ int error;
+
+ error = xchk_ag_read_headers(sc, agno, &sc->sa);
+ if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error))
+ goto out;
+ if (!sc->sa.agf_bp)
+ return -EFSCORRUPTED;
+ xchk_buffer_recheck(sc, sc->sa.agfl_bp);
+
+ xchk_agfl_xref(sc);
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Allocate buffer to ensure uniqueness of AGFL entries. */
+ agf = sc->sa.agf_bp->b_addr;
+ agflcount = be32_to_cpu(agf->agf_flcount);
+ if (agflcount > xfs_agfl_size(sc->mp)) {
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+ goto out;
+ }
+ memset(&sai, 0, sizeof(sai));
+ sai.sc = sc;
+ sai.sz_entries = agflcount;
+ sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount,
+ KM_MAYFAIL);
+ if (!sai.entries) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ /* Check the blocks in the AGFL. */
+ error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr,
+ sc->sa.agfl_bp, xchk_agfl_block, &sai);
+ if (error == -ECANCELED) {
+ error = 0;
+ goto out_free;
+ }
+ if (error)
+ goto out_free;
+
+ if (agflcount != sai.nr_entries) {
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+ goto out_free;
+ }
+
+ /* Sort entries, check for duplicates. */
+ sort(sai.entries, sai.nr_entries, sizeof(sai.entries[0]),
+ xchk_agblock_cmp, NULL);
+ for (i = 1; i < sai.nr_entries; i++) {
+ if (sai.entries[i] == sai.entries[i - 1]) {
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+ break;
+ }
+ }
+
+out_free:
+ kmem_free(sai.entries);
+out:
+ return error;
+}
+
+/* AGI */
+
+/* Check agi_count/agi_freecount */
+static inline void
+xchk_agi_xref_icounts(
+ struct xfs_scrub *sc)
+{
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+ xfs_agino_t icount;
+ xfs_agino_t freecount;
+ int error;
+
+ if (!sc->sa.ino_cur)
+ return;
+
+ error = xfs_ialloc_count_inodes(sc->sa.ino_cur, &icount, &freecount);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur))
+ return;
+ if (be32_to_cpu(agi->agi_count) != icount ||
+ be32_to_cpu(agi->agi_freecount) != freecount)
+ xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
+}
+
+/* Check agi_[fi]blocks against tree size */
+static inline void
+xchk_agi_xref_fiblocks(
+ struct xfs_scrub *sc)
+{
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+ xfs_agblock_t blocks;
+ int error = 0;
+
+ if (!xfs_has_inobtcounts(sc->mp))
+ return;
+
+ if (sc->sa.ino_cur) {
+ error = xfs_btree_count_blocks(sc->sa.ino_cur, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur))
+ return;
+ if (blocks != be32_to_cpu(agi->agi_iblocks))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
+ }
+
+ if (sc->sa.fino_cur) {
+ error = xfs_btree_count_blocks(sc->sa.fino_cur, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur))
+ return;
+ if (blocks != be32_to_cpu(agi->agi_fblocks))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
+ }
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xchk_agi_xref(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ xfs_agblock_t agbno;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ agbno = XFS_AGI_BLOCK(mp);
+
+ xchk_ag_btcur_init(sc, &sc->sa);
+
+ xchk_xref_is_used_space(sc, agbno, 1);
+ xchk_xref_is_not_inode_chunk(sc, agbno, 1);
+ xchk_agi_xref_icounts(sc);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
+ xchk_xref_is_not_shared(sc, agbno, 1);
+ xchk_agi_xref_fiblocks(sc);
+
+ /* scrub teardown will take care of sc->sa for us */
+}
+
+/* Scrub the AGI. */
+int
+xchk_agi(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_agi *agi;
+ struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(sc->mp);
+ xfs_agnumber_t agno = sc->sm->sm_agno;
+ xfs_agblock_t agbno;
+ xfs_agblock_t eoag;
+ xfs_agino_t agino;
+ xfs_agino_t first_agino;
+ xfs_agino_t last_agino;
+ xfs_agino_t icount;
+ int i;
+ int level;
+ int error = 0;
+
+ error = xchk_ag_read_headers(sc, agno, &sc->sa);
+ if (!xchk_process_error(sc, agno, XFS_AGI_BLOCK(sc->mp), &error))
+ goto out;
+ xchk_buffer_recheck(sc, sc->sa.agi_bp);
+
+ agi = sc->sa.agi_bp->b_addr;
+ pag = sc->sa.pag;
+
+ /* Check the AG length */
+ eoag = be32_to_cpu(agi->agi_length);
+ if (eoag != pag->block_count)
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ /* Check btree roots and levels */
+ agbno = be32_to_cpu(agi->agi_root);
+ if (!xfs_verify_agbno(pag, agbno))
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ level = be32_to_cpu(agi->agi_level);
+ if (level <= 0 || level > igeo->inobt_maxlevels)
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ if (xfs_has_finobt(mp)) {
+ agbno = be32_to_cpu(agi->agi_free_root);
+ if (!xfs_verify_agbno(pag, agbno))
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ level = be32_to_cpu(agi->agi_free_level);
+ if (level <= 0 || level > igeo->inobt_maxlevels)
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ }
+
+ /* Check inode counters */
+ xfs_agino_range(mp, agno, &first_agino, &last_agino);
+ icount = be32_to_cpu(agi->agi_count);
+ if (icount > last_agino - first_agino + 1 ||
+ icount < be32_to_cpu(agi->agi_freecount))
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ /* Check inode pointers */
+ agino = be32_to_cpu(agi->agi_newino);
+ if (!xfs_verify_agino_or_null(pag, agino))
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ agino = be32_to_cpu(agi->agi_dirino);
+ if (!xfs_verify_agino_or_null(pag, agino))
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ /* Check unlinked inode buckets */
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ agino = be32_to_cpu(agi->agi_unlinked[i]);
+ if (!xfs_verify_agino_or_null(pag, agino))
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ }
+
+ if (agi->agi_pad32 != cpu_to_be32(0))
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ /* Do the incore counters match? */
+ if (pag->pagi_count != be32_to_cpu(agi->agi_count))
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount))
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ xchk_agi_xref(sc);
+out:
+ return error;
+}
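
The AGFL consistency check in xchk_agf() above treats agf_flfirst/agf_fllast as indices into a circular buffer of free-list slots. The standalone rendering below replays that wraparound arithmetic with an assumed AGFL size, so both branches of the computation can be seen producing a slot count.

/*
 * Replay of the AGFL circular-buffer slot count: first/last index into a
 * ring of "size" slots, so the occupied count must handle wraparound.
 */
#include <stdio.h>

static unsigned int
agfl_slots_used(unsigned int first, unsigned int last, unsigned int size)
{
	if (last > first)
		return last - first + 1;
	return size - first + last + 1;	/* list wrapped past the end */
}

int main(void)
{
	unsigned int agfl_size = 118;	/* assumed; depends on sector size */

	/* no wrap: slots 3..7 hold 5 entries */
	printf("%u\n", agfl_slots_used(3, 7, agfl_size));
	/* wrapped: slots 116, 117, 0, 1 hold 4 entries */
	printf("%u\n", agfl_slots_used(116, 1, agfl_size));
	return 0;
}
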
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
new file mode 100644
index 000000000..1b0b4e243
--- /dev/null
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -0,0 +1,947 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+
+/* Superblock */
+
+/* Repair the superblock. */
+int
+xrep_superblock(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ xfs_agnumber_t agno;
+ int error;
+
+ /* Don't try to repair AG 0's sb; let xfs_repair deal with it. */
+ agno = sc->sm->sm_agno;
+ if (agno == 0)
+ return -EOPNOTSUPP;
+
+ error = xfs_sb_get_secondary(mp, sc->tp, agno, &bp);
+ if (error)
+ return error;
+
+ /* Copy AG 0's superblock to this one. */
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+
+ /*
+ * Don't write out a secondary super with NEEDSREPAIR or log incompat
+ * features set, since both are ignored when set on a secondary.
+ */
+ if (xfs_has_crc(mp)) {
+ struct xfs_dsb *sb = bp->b_addr;
+
+ sb->sb_features_incompat &=
+ ~cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR);
+ sb->sb_features_log_incompat = 0;
+ }
+
+ /* Write this to disk. */
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
+ xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
+ return error;
+}
+
+/* AGF */
+
+struct xrep_agf_allocbt {
+ struct xfs_scrub *sc;
+ xfs_agblock_t freeblks;
+ xfs_agblock_t longest;
+};
+
+/* Record free space shape information. */
+STATIC int
+xrep_agf_walk_allocbt(
+ struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *rec,
+ void *priv)
+{
+ struct xrep_agf_allocbt *raa = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(raa->sc, &error))
+ return error;
+
+ raa->freeblks += rec->ar_blockcount;
+ if (rec->ar_blockcount > raa->longest)
+ raa->longest = rec->ar_blockcount;
+ return error;
+}
+
+/* Does this AGFL block look sane? */
+STATIC int
+xrep_agf_check_agfl_block(
+ struct xfs_mount *mp,
+ xfs_agblock_t agbno,
+ void *priv)
+{
+ struct xfs_scrub *sc = priv;
+
+ if (!xfs_verify_agbno(sc->sa.pag, agbno))
+ return -EFSCORRUPTED;
+ return 0;
+}
+
+/*
+ * Offset within the xrep_find_ag_btree array for each btree type. Avoid the
+ * XFS_BTNUM_ names here to avoid creating a sparse array.
+ */
+enum {
+ XREP_AGF_BNOBT = 0,
+ XREP_AGF_CNTBT,
+ XREP_AGF_RMAPBT,
+ XREP_AGF_REFCOUNTBT,
+ XREP_AGF_END,
+ XREP_AGF_MAX
+};
+
+/* Check a btree root candidate. */
+static inline bool
+xrep_check_btree_root(
+ struct xfs_scrub *sc,
+ struct xrep_find_ag_btree *fab)
+{
+ return xfs_verify_agbno(sc->sa.pag, fab->root) &&
+ fab->height <= fab->maxlevels;
+}
+
+/*
+ * Given the btree roots described by *fab, find the roots, check them for
+ * sanity, and pass the root data back out via *fab.
+ *
+ * This is /also/ a chicken and egg problem because we have to use the rmapbt
+ * (rooted in the AGF) to find the btrees rooted in the AGF. We also have no
+ * idea if the btrees make any sense. If we hit obvious corruptions in those
+ * btrees we'll bail out.
+ */
+STATIC int
+xrep_agf_find_btrees(
+ struct xfs_scrub *sc,
+ struct xfs_buf *agf_bp,
+ struct xrep_find_ag_btree *fab,
+ struct xfs_buf *agfl_bp)
+{
+ struct xfs_agf *old_agf = agf_bp->b_addr;
+ int error;
+
+ /* Go find the root data. */
+ error = xrep_find_ag_btree_roots(sc, agf_bp, fab, agfl_bp);
+ if (error)
+ return error;
+
+ /* We must find the bnobt, cntbt, and rmapbt roots. */
+ if (!xrep_check_btree_root(sc, &fab[XREP_AGF_BNOBT]) ||
+ !xrep_check_btree_root(sc, &fab[XREP_AGF_CNTBT]) ||
+ !xrep_check_btree_root(sc, &fab[XREP_AGF_RMAPBT]))
+ return -EFSCORRUPTED;
+
+ /*
+ * We relied on the rmapbt to reconstruct the AGF. If we get a
+ * different root then something's seriously wrong.
+ */
+ if (fab[XREP_AGF_RMAPBT].root !=
+ be32_to_cpu(old_agf->agf_roots[XFS_BTNUM_RMAPi]))
+ return -EFSCORRUPTED;
+
+ /* We must find the refcountbt root if that feature is enabled. */
+ if (xfs_has_reflink(sc->mp) &&
+ !xrep_check_btree_root(sc, &fab[XREP_AGF_REFCOUNTBT]))
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/*
+ * Reinitialize the AGF header, making an in-core copy of the old contents so
+ * that we know which in-core state needs to be reinitialized.
+ */
+STATIC void
+xrep_agf_init_header(
+ struct xfs_scrub *sc,
+ struct xfs_buf *agf_bp,
+ struct xfs_agf *old_agf)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_agf *agf = agf_bp->b_addr;
+
+ memcpy(old_agf, agf, sizeof(*old_agf));
+ memset(agf, 0, BBTOB(agf_bp->b_length));
+ agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
+ agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
+ agf->agf_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
+ agf->agf_length = cpu_to_be32(sc->sa.pag->block_count);
+ agf->agf_flfirst = old_agf->agf_flfirst;
+ agf->agf_fllast = old_agf->agf_fllast;
+ agf->agf_flcount = old_agf->agf_flcount;
+ if (xfs_has_crc(mp))
+ uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
+
+ /* Mark the incore AGF data stale until we're done fixing things. */
+ ASSERT(sc->sa.pag->pagf_init);
+ sc->sa.pag->pagf_init = 0;
+}
+
+/* Set btree root information in an AGF. */
+STATIC void
+xrep_agf_set_roots(
+ struct xfs_scrub *sc,
+ struct xfs_agf *agf,
+ struct xrep_find_ag_btree *fab)
+{
+ agf->agf_roots[XFS_BTNUM_BNOi] =
+ cpu_to_be32(fab[XREP_AGF_BNOBT].root);
+ agf->agf_levels[XFS_BTNUM_BNOi] =
+ cpu_to_be32(fab[XREP_AGF_BNOBT].height);
+
+ agf->agf_roots[XFS_BTNUM_CNTi] =
+ cpu_to_be32(fab[XREP_AGF_CNTBT].root);
+ agf->agf_levels[XFS_BTNUM_CNTi] =
+ cpu_to_be32(fab[XREP_AGF_CNTBT].height);
+
+ agf->agf_roots[XFS_BTNUM_RMAPi] =
+ cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
+ agf->agf_levels[XFS_BTNUM_RMAPi] =
+ cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
+
+ if (xfs_has_reflink(sc->mp)) {
+ agf->agf_refcount_root =
+ cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].root);
+ agf->agf_refcount_level =
+ cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].height);
+ }
+}
+
+/* Update all AGF fields which derive from btree contents. */
+STATIC int
+xrep_agf_calc_from_btrees(
+ struct xfs_scrub *sc,
+ struct xfs_buf *agf_bp)
+{
+ struct xrep_agf_allocbt raa = { .sc = sc };
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_agf *agf = agf_bp->b_addr;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agblock_t btreeblks;
+ xfs_agblock_t blocks;
+ int error;
+
+ /* Update the AGF counters from the bnobt. */
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
+ sc->sa.pag, XFS_BTNUM_BNO);
+ error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
+ if (error)
+ goto err;
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, error);
+ btreeblks = blocks - 1;
+ agf->agf_freeblks = cpu_to_be32(raa.freeblks);
+ agf->agf_longest = cpu_to_be32(raa.longest);
+
+ /* Update the AGF counters from the cntbt. */
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
+ sc->sa.pag, XFS_BTNUM_CNT);
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, error);
+ btreeblks += blocks - 1;
+
+ /* Update the AGF counters from the rmapbt. */
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, error);
+ agf->agf_rmap_blocks = cpu_to_be32(blocks);
+ btreeblks += blocks - 1;
+
+ agf->agf_btreeblks = cpu_to_be32(btreeblks);
+
+ /* Update the AGF counters from the refcountbt. */
+ if (xfs_has_reflink(mp)) {
+ cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
+ sc->sa.pag);
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, error);
+ agf->agf_refcount_blocks = cpu_to_be32(blocks);
+ }
+
+ return 0;
+err:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/* Commit the new AGF and reinitialize the incore state. */
+STATIC int
+xrep_agf_commit_new(
+ struct xfs_scrub *sc,
+ struct xfs_buf *agf_bp)
+{
+ struct xfs_perag *pag;
+ struct xfs_agf *agf = agf_bp->b_addr;
+
+ /* Trigger fdblocks recalculation */
+ xfs_force_summary_recalc(sc->mp);
+
+ /* Write this to disk. */
+ xfs_trans_buf_set_type(sc->tp, agf_bp, XFS_BLFT_AGF_BUF);
+ xfs_trans_log_buf(sc->tp, agf_bp, 0, BBTOB(agf_bp->b_length) - 1);
+
+ /* Now reinitialize the in-core counters we changed. */
+ pag = sc->sa.pag;
+ pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
+ pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
+ pag->pagf_longest = be32_to_cpu(agf->agf_longest);
+ pag->pagf_levels[XFS_BTNUM_BNOi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
+ pag->pagf_levels[XFS_BTNUM_CNTi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
+ pag->pagf_levels[XFS_BTNUM_RMAPi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+ pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
+ pag->pagf_init = 1;
+
+ return 0;
+}
+
+/* Repair the AGF. v5 filesystems only. */
+int
+xrep_agf(
+ struct xfs_scrub *sc)
+{
+ struct xrep_find_ag_btree fab[XREP_AGF_MAX] = {
+ [XREP_AGF_BNOBT] = {
+ .rmap_owner = XFS_RMAP_OWN_AG,
+ .buf_ops = &xfs_bnobt_buf_ops,
+ .maxlevels = sc->mp->m_alloc_maxlevels,
+ },
+ [XREP_AGF_CNTBT] = {
+ .rmap_owner = XFS_RMAP_OWN_AG,
+ .buf_ops = &xfs_cntbt_buf_ops,
+ .maxlevels = sc->mp->m_alloc_maxlevels,
+ },
+ [XREP_AGF_RMAPBT] = {
+ .rmap_owner = XFS_RMAP_OWN_AG,
+ .buf_ops = &xfs_rmapbt_buf_ops,
+ .maxlevels = sc->mp->m_rmap_maxlevels,
+ },
+ [XREP_AGF_REFCOUNTBT] = {
+ .rmap_owner = XFS_RMAP_OWN_REFC,
+ .buf_ops = &xfs_refcountbt_buf_ops,
+ .maxlevels = sc->mp->m_refc_maxlevels,
+ },
+ [XREP_AGF_END] = {
+ .buf_ops = NULL,
+ },
+ };
+ struct xfs_agf old_agf;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agf_bp;
+ struct xfs_buf *agfl_bp;
+ struct xfs_agf *agf;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+
+ /*
+ * Make sure we have the AGF buffer, as scrub might have decided it
+ * was corrupt after xfs_alloc_read_agf failed with -EFSCORRUPTED.
+ */
+ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
+ XFS_AGF_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
+ if (error)
+ return error;
+ agf_bp->b_ops = &xfs_agf_buf_ops;
+ agf = agf_bp->b_addr;
+
+ /*
+ * Load the AGFL so that we can screen out OWN_AG blocks that are on
+ * the AGFL now; these blocks might have once been part of the
+ * bno/cnt/rmap btrees but are not now. This is a chicken and egg
+ * problem: the AGF is corrupt, so we have to trust the AGFL contents
+ * because we can't do any serious cross-referencing with any of the
+ * btrees rooted in the AGF. If the AGFL contents are obviously bad
+ * then we'll bail out.
+ */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+ if (error)
+ return error;
+
+ /*
+ * Spot-check the AGFL blocks; if they're obviously corrupt then
+ * there's nothing we can do but bail out.
+ */
+ error = xfs_agfl_walk(sc->mp, agf_bp->b_addr, agfl_bp,
+ xrep_agf_check_agfl_block, sc);
+ if (error)
+ return error;
+
+ /*
+ * Find the AGF btree roots. This is also a chicken-and-egg situation;
+ * see the function for more details.
+ */
+ error = xrep_agf_find_btrees(sc, agf_bp, fab, agfl_bp);
+ if (error)
+ return error;
+
+ /* Start rewriting the header and implant the btrees we found. */
+ xrep_agf_init_header(sc, agf_bp, &old_agf);
+ xrep_agf_set_roots(sc, agf, fab);
+ error = xrep_agf_calc_from_btrees(sc, agf_bp);
+ if (error)
+ goto out_revert;
+
+ /* Commit the changes and reinitialize incore state. */
+ return xrep_agf_commit_new(sc, agf_bp);
+
+out_revert:
+ /* Mark the incore AGF state stale and revert the AGF. */
+ sc->sa.pag->pagf_init = 0;
+ memcpy(agf, &old_agf, sizeof(old_agf));
+ return error;
+}
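
The AGF repair above follows a stash/rewrite/recompute/commit-or-revert pattern: the old header is copied aside before the rewrite so that a failure during recomputation can restore it. The following userspace sketch illustrates that pattern only; the toy_agf structure and helper names are invented, not kernel interfaces.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_agf {
	uint32_t freeblks;
	uint32_t longest;
};

/* Pretend to recompute counters from authoritative metadata; may fail. */
static int recompute_counters(struct toy_agf *hdr)
{
	hdr->freeblks = 128;	/* would come from walking the by-block btree */
	hdr->longest = 64;	/* would come from the by-size btree */
	return 0;		/* return nonzero to simulate a failure */
}

static int rebuild_header(struct toy_agf *hdr)
{
	struct toy_agf old;
	int error;

	memcpy(&old, hdr, sizeof(old));	/* stash the old contents */
	memset(hdr, 0, sizeof(*hdr));	/* start from a clean header */

	error = recompute_counters(hdr);
	if (error) {
		/* Recomputation failed: put the old header back. */
		memcpy(hdr, &old, sizeof(old));
		return error;
	}
	return 0;			/* the caller would log/commit here */
}

int main(void)
{
	struct toy_agf agf = { .freeblks = 999, .longest = 999 };

	if (rebuild_header(&agf) == 0)
		printf("rebuilt: freeblks=%u longest=%u\n",
		       agf.freeblks, agf.longest);
	return 0;
}
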
+
+/* AGFL */
+
+struct xrep_agfl {
+ /* Bitmap of other OWN_AG metadata blocks. */
+ struct xbitmap agmetablocks;
+
+ /* Bitmap of free space. */
+ struct xbitmap *freesp;
+
+ struct xfs_scrub *sc;
+};
+
+/* Record all OWN_AG (free space btree) information from the rmap data. */
+STATIC int
+xrep_agfl_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_agfl *ra = priv;
+ xfs_fsblock_t fsb;
+ int error = 0;
+
+ if (xchk_should_terminate(ra->sc, &error))
+ return error;
+
+ /* Record all the OWN_AG blocks. */
+ if (rec->rm_owner == XFS_RMAP_OWN_AG) {
+ fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ rec->rm_startblock);
+ error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount);
+ if (error)
+ return error;
+ }
+
+ return xbitmap_set_btcur_path(&ra->agmetablocks, cur);
+}
+
+/*
+ * Map out all the non-AGFL OWN_AG space in this AG so that we can deduce
+ * which blocks belong to the AGFL.
+ *
+ * Compute the set of old AGFL blocks by subtracting from the list of OWN_AG
+ * blocks the list of blocks owned by all other OWN_AG metadata (bnobt, cntbt,
+ * rmapbt). These are the old AGFL blocks, so return that list and the number
+ * of blocks we're actually going to put back on the AGFL.
+ */
+STATIC int
+xrep_agfl_collect_blocks(
+ struct xfs_scrub *sc,
+ struct xfs_buf *agf_bp,
+ struct xbitmap *agfl_extents,
+ xfs_agblock_t *flcount)
+{
+ struct xrep_agfl ra;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ra.sc = sc;
+ ra.freesp = agfl_extents;
+ xbitmap_init(&ra.agmetablocks);
+
+ /* Find all space used by the free space btrees & rmapbt. */
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
+ error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, error);
+
+ /* Find all blocks currently being used by the bnobt. */
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
+ sc->sa.pag, XFS_BTNUM_BNO);
+ error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, error);
+
+ /* Find all blocks currently being used by the cntbt. */
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
+ sc->sa.pag, XFS_BTNUM_CNT);
+ error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
+ if (error)
+ goto err;
+
+ xfs_btree_del_cursor(cur, error);
+
+ /*
+ * Drop the freesp meta blocks that are in use by btrees.
+ * The remaining blocks /should/ be AGFL blocks.
+ */
+ error = xbitmap_disunion(agfl_extents, &ra.agmetablocks);
+ xbitmap_destroy(&ra.agmetablocks);
+ if (error)
+ return error;
+
+ /*
+ * Calculate the new AGFL size. If we found more blocks than fit in
+ * the AGFL we'll free them later.
+ */
+ *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents),
+ xfs_agfl_size(mp));
+ return 0;
+
+err:
+ xbitmap_destroy(&ra.agmetablocks);
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
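
Conceptually, the collection step above subtracts the blocks used by the still-trusted btrees from the set of blocks that the reverse mappings attribute to the shared AG owner, then clamps the survivors to the size of the free list. A small userspace sketch of that idea, using plain arrays and invented names in place of the rmapbt walk and the bitmaps:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static bool in_set(const unsigned int *set, size_t n, unsigned int blk)
{
	for (size_t i = 0; i < n; i++)
		if (set[i] == blk)
			return true;
	return false;
}

int main(void)
{
	/* Blocks the reverse mappings attribute to the shared AG owner. */
	unsigned int own_ag[] = { 3, 4, 5, 9, 10, 11, 12, 15 };
	/* Blocks observed while walking the (still trusted) btrees. */
	unsigned int btree_blocks[] = { 3, 9, 10 };
	unsigned int agfl[4];			/* pretend free list size */
	size_t flcount = 0, leftover = 0;

	for (size_t i = 0; i < sizeof(own_ag) / sizeof(own_ag[0]); i++) {
		if (in_set(btree_blocks, 3, own_ag[i]))
			continue;	/* owned by a btree, not the AGFL */
		if (flcount < sizeof(agfl) / sizeof(agfl[0]))
			agfl[flcount++] = own_ag[i];
		else
			leftover++;	/* would be freed in a later step */
	}

	for (size_t i = 0; i < flcount; i++)
		printf("agfl[%zu] = %u\n", i, agfl[i]);
	printf("flcount=%zu leftover=%zu\n", flcount, leftover);
	return 0;
}
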
+
+/* Update the AGF and reset the in-core state. */
+STATIC void
+xrep_agfl_update_agf(
+ struct xfs_scrub *sc,
+ struct xfs_buf *agf_bp,
+ xfs_agblock_t flcount)
+{
+ struct xfs_agf *agf = agf_bp->b_addr;
+
+ ASSERT(flcount <= xfs_agfl_size(sc->mp));
+
+ /* Trigger fdblocks recalculation */
+ xfs_force_summary_recalc(sc->mp);
+
+ /* Update the AGF counters. */
+ if (sc->sa.pag->pagf_init)
+ sc->sa.pag->pagf_flcount = flcount;
+ agf->agf_flfirst = cpu_to_be32(0);
+ agf->agf_flcount = cpu_to_be32(flcount);
+ agf->agf_fllast = cpu_to_be32(flcount - 1);
+
+ xfs_alloc_log_agf(sc->tp, agf_bp,
+ XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
+}
+
+/* Write out a totally new AGFL. */
+STATIC void
+xrep_agfl_init_header(
+ struct xfs_scrub *sc,
+ struct xfs_buf *agfl_bp,
+ struct xbitmap *agfl_extents,
+ xfs_agblock_t flcount)
+{
+ struct xfs_mount *mp = sc->mp;
+ __be32 *agfl_bno;
+ struct xbitmap_range *br;
+ struct xbitmap_range *n;
+ struct xfs_agfl *agfl;
+ xfs_agblock_t agbno;
+ unsigned int fl_off;
+
+ ASSERT(flcount <= xfs_agfl_size(mp));
+
+ /*
+ * Start rewriting the header by setting the bno[] array to
+ * NULLAGBLOCK, then setting AGFL header fields.
+ */
+ agfl = XFS_BUF_TO_AGFL(agfl_bp);
+ memset(agfl, 0xFF, BBTOB(agfl_bp->b_length));
+ agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
+ agfl->agfl_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
+ uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
+
+ /*
+ * Fill the AGFL with the remaining blocks. If agfl_extents has more
+ * blocks than fit in the AGFL, they will be freed in a subsequent
+ * step.
+ */
+ fl_off = 0;
+ agfl_bno = xfs_buf_to_agfl_bno(agfl_bp);
+ for_each_xbitmap_extent(br, n, agfl_extents) {
+ agbno = XFS_FSB_TO_AGBNO(mp, br->start);
+
+ trace_xrep_agfl_insert(mp, sc->sa.pag->pag_agno, agbno,
+ br->len);
+
+ while (br->len > 0 && fl_off < flcount) {
+ agfl_bno[fl_off] = cpu_to_be32(agbno);
+ fl_off++;
+ agbno++;
+
+ /*
+ * We've now used br->start by putting it in the AGFL,
+ * so bump br so that we don't reap the block later.
+ */
+ br->start++;
+ br->len--;
+ }
+
+ if (br->len)
+ break;
+ list_del(&br->list);
+ kmem_free(br);
+ }
+
+ /* Write new AGFL to disk. */
+ xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF);
+ xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1);
+}
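
The fill loop above consumes blocks from the front of each extent as it writes them into the free list, so whatever remains in the extent list afterwards is exactly what still needs to be returned to the free space btrees. A standalone sketch of that consume-as-you-fill idea, with made-up types:

#include <stdint.h>
#include <stdio.h>

struct extent {
	uint64_t start;
	uint64_t len;
};

/* Fill up to max_slots entries of fl[]; returns the number filled. */
static unsigned int fill_free_list(uint32_t *fl, unsigned int max_slots,
				   struct extent *exts, unsigned int nexts)
{
	unsigned int slot = 0;

	for (unsigned int i = 0; i < nexts && slot < max_slots; i++) {
		while (exts[i].len > 0 && slot < max_slots) {
			fl[slot++] = (uint32_t)exts[i].start;
			exts[i].start++;	/* consumed; don't free it later */
			exts[i].len--;
		}
	}
	return slot;
}

int main(void)
{
	struct extent exts[] = { { 10, 3 }, { 50, 4 } };
	uint32_t fl[5];
	unsigned int n = fill_free_list(fl, 5, exts, 2);

	for (unsigned int i = 0; i < n; i++)
		printf("slot %u -> block %u\n", i, fl[i]);
	/* exts[1] still covers blocks 52-53; those would be freed later. */
	return 0;
}
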
+
+/* Repair the AGFL. */
+int
+xrep_agfl(
+ struct xfs_scrub *sc)
+{
+ struct xbitmap agfl_extents;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agf_bp;
+ struct xfs_buf *agfl_bp;
+ xfs_agblock_t flcount;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+
+ xbitmap_init(&agfl_extents);
+
+ /*
+ * Read the AGF so that we can query the rmapbt. We hope that there's
+ * nothing wrong with the AGF, but all the AG header repair functions
+ * have this chicken-and-egg problem.
+ */
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
+ if (error)
+ return error;
+
+ /*
+ * Make sure we have the AGFL buffer, as scrub might have decided it
+ * was corrupt after xfs_alloc_read_agfl failed with -EFSCORRUPTED.
+ */
+ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
+ XFS_AGFL_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL);
+ if (error)
+ return error;
+ agfl_bp->b_ops = &xfs_agfl_buf_ops;
+
+ /* Gather all the extents we're going to put on the new AGFL. */
+ error = xrep_agfl_collect_blocks(sc, agf_bp, &agfl_extents, &flcount);
+ if (error)
+ goto err;
+
+ /*
+ * Update AGF and AGFL. We reset the global free block counter when
+ * we adjust the AGF flcount (which can fail) so avoid updating any
+ * buffers until we know that part works.
+ */
+ xrep_agfl_update_agf(sc, agf_bp, flcount);
+ xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount);
+
+ /*
+ * Ok, the AGFL should be ready to go now. Roll the transaction to
+ * make the new AGFL permanent before we start using it to return
+ * freespace overflow to the freespace btrees.
+ */
+ sc->sa.agf_bp = agf_bp;
+ sc->sa.agfl_bp = agfl_bp;
+ error = xrep_roll_ag_trans(sc);
+ if (error)
+ goto err;
+
+ /* Dump any AGFL overflow. */
+ error = xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
+ XFS_AG_RESV_AGFL);
+err:
+ xbitmap_destroy(&agfl_extents);
+ return error;
+}
+
+/* AGI */
+
+/*
+ * Offset within the xrep_find_ag_btree array for each btree type. Skip the
+ * XFS_BTNUM_ names here so that the array stays dense.
+ */
+enum {
+ XREP_AGI_INOBT = 0,
+ XREP_AGI_FINOBT,
+ XREP_AGI_END,
+ XREP_AGI_MAX
+};
+
+/*
+ * Find the inode btree roots described by *fab, check them for sanity, and
+ * pass the root data back out via *fab.
+ */
+STATIC int
+xrep_agi_find_btrees(
+ struct xfs_scrub *sc,
+ struct xrep_find_ag_btree *fab)
+{
+ struct xfs_buf *agf_bp;
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ /* Read the AGF. */
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
+ if (error)
+ return error;
+
+ /* Find the btree roots. */
+ error = xrep_find_ag_btree_roots(sc, agf_bp, fab, NULL);
+ if (error)
+ return error;
+
+ /* We must find the inobt root. */
+ if (!xrep_check_btree_root(sc, &fab[XREP_AGI_INOBT]))
+ return -EFSCORRUPTED;
+
+ /* We must find the finobt root if that feature is enabled. */
+ if (xfs_has_finobt(mp) &&
+ !xrep_check_btree_root(sc, &fab[XREP_AGI_FINOBT]))
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/*
+ * Reinitialize the AGI header, making an in-core copy of the old contents so
+ * that we know which in-core state needs to be reinitialized.
+ */
+STATIC void
+xrep_agi_init_header(
+ struct xfs_scrub *sc,
+ struct xfs_buf *agi_bp,
+ struct xfs_agi *old_agi)
+{
+ struct xfs_agi *agi = agi_bp->b_addr;
+ struct xfs_mount *mp = sc->mp;
+
+ memcpy(old_agi, agi, sizeof(*old_agi));
+ memset(agi, 0, BBTOB(agi_bp->b_length));
+ agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
+ agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
+ agi->agi_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
+ agi->agi_length = cpu_to_be32(sc->sa.pag->block_count);
+ agi->agi_newino = cpu_to_be32(NULLAGINO);
+ agi->agi_dirino = cpu_to_be32(NULLAGINO);
+ if (xfs_has_crc(mp))
+ uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
+
+ /* We don't know how to fix the unlinked list yet. */
+ memcpy(&agi->agi_unlinked, &old_agi->agi_unlinked,
+ sizeof(agi->agi_unlinked));
+
+	/* Mark the incore AGI data stale until we're done fixing things. */
+ ASSERT(sc->sa.pag->pagi_init);
+ sc->sa.pag->pagi_init = 0;
+}
+
+/* Set btree root information in an AGI. */
+STATIC void
+xrep_agi_set_roots(
+ struct xfs_scrub *sc,
+ struct xfs_agi *agi,
+ struct xrep_find_ag_btree *fab)
+{
+ agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root);
+ agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height);
+
+ if (xfs_has_finobt(sc->mp)) {
+ agi->agi_free_root = cpu_to_be32(fab[XREP_AGI_FINOBT].root);
+ agi->agi_free_level = cpu_to_be32(fab[XREP_AGI_FINOBT].height);
+ }
+}
+
+/* Update the AGI counters. */
+STATIC int
+xrep_agi_calc_from_btrees(
+ struct xfs_scrub *sc,
+ struct xfs_buf *agi_bp)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = agi_bp->b_addr;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agino_t count;
+ xfs_agino_t freecount;
+ int error;
+
+ cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp,
+ sc->sa.pag, XFS_BTNUM_INO);
+ error = xfs_ialloc_count_inodes(cur, &count, &freecount);
+ if (error)
+ goto err;
+ if (xfs_has_inobtcounts(mp)) {
+ xfs_agblock_t blocks;
+
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ agi->agi_iblocks = cpu_to_be32(blocks);
+ }
+ xfs_btree_del_cursor(cur, error);
+
+ agi->agi_count = cpu_to_be32(count);
+ agi->agi_freecount = cpu_to_be32(freecount);
+
+ if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) {
+ xfs_agblock_t blocks;
+
+ cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp,
+ sc->sa.pag, XFS_BTNUM_FINO);
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, error);
+ agi->agi_fblocks = cpu_to_be32(blocks);
+ }
+
+ return 0;
+err:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/* Commit the new AGI and reinitialize the in-core state. */
+STATIC int
+xrep_agi_commit_new(
+ struct xfs_scrub *sc,
+ struct xfs_buf *agi_bp)
+{
+ struct xfs_perag *pag;
+ struct xfs_agi *agi = agi_bp->b_addr;
+
+ /* Trigger inode count recalculation */
+ xfs_force_summary_recalc(sc->mp);
+
+ /* Write this to disk. */
+ xfs_trans_buf_set_type(sc->tp, agi_bp, XFS_BLFT_AGI_BUF);
+ xfs_trans_log_buf(sc->tp, agi_bp, 0, BBTOB(agi_bp->b_length) - 1);
+
+ /* Now reinitialize the in-core counters if necessary. */
+ pag = sc->sa.pag;
+ pag->pagi_count = be32_to_cpu(agi->agi_count);
+ pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
+ pag->pagi_init = 1;
+
+ return 0;
+}
+
+/* Repair the AGI. */
+int
+xrep_agi(
+ struct xfs_scrub *sc)
+{
+ struct xrep_find_ag_btree fab[XREP_AGI_MAX] = {
+ [XREP_AGI_INOBT] = {
+ .rmap_owner = XFS_RMAP_OWN_INOBT,
+ .buf_ops = &xfs_inobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
+ },
+ [XREP_AGI_FINOBT] = {
+ .rmap_owner = XFS_RMAP_OWN_INOBT,
+ .buf_ops = &xfs_finobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
+ },
+ [XREP_AGI_END] = {
+ .buf_ops = NULL
+ },
+ };
+ struct xfs_agi old_agi;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agi_bp;
+ struct xfs_agi *agi;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+
+ /*
+ * Make sure we have the AGI buffer, as scrub might have decided it
+ * was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED.
+ */
+ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
+ XFS_AGI_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &agi_bp, NULL);
+ if (error)
+ return error;
+ agi_bp->b_ops = &xfs_agi_buf_ops;
+ agi = agi_bp->b_addr;
+
+ /* Find the AGI btree roots. */
+ error = xrep_agi_find_btrees(sc, fab);
+ if (error)
+ return error;
+
+ /* Start rewriting the header and implant the btrees we found. */
+ xrep_agi_init_header(sc, agi_bp, &old_agi);
+ xrep_agi_set_roots(sc, agi, fab);
+ error = xrep_agi_calc_from_btrees(sc, agi_bp);
+ if (error)
+ goto out_revert;
+
+ /* Reinitialize in-core state. */
+ return xrep_agi_commit_new(sc, agi_bp);
+
+out_revert:
+ /* Mark the incore AGI state stale and revert the AGI. */
+ sc->sa.pag->pagi_init = 0;
+ memcpy(agi, &old_agi, sizeof(old_agi));
+ return error;
+}
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
new file mode 100644
index 000000000..3b38f4e2a
--- /dev/null
+++ b/fs/xfs/scrub/alloc.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "xfs_ag.h"
+
+/*
+ * Set us up to scrub free space btrees.
+ */
+int
+xchk_setup_ag_allocbt(
+ struct xfs_scrub *sc)
+{
+ return xchk_setup_ag_btree(sc, false);
+}
+
+/* Free space btree scrubber. */
+/*
+ * Ensure there's a corresponding cntbt/bnobt record matching this
+ * bnobt/cntbt record, respectively.
+ */
+STATIC void
+xchk_allocbt_xref_other(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ struct xfs_btree_cur **pcur;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int has_otherrec;
+ int error;
+
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT)
+ pcur = &sc->sa.cnt_cur;
+ else
+ pcur = &sc->sa.bno_cur;
+ if (!*pcur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_alloc_lookup_le(*pcur, agbno, len, &has_otherrec);
+ if (!xchk_should_check_xref(sc, &error, pcur))
+ return;
+ if (!has_otherrec) {
+ xchk_btree_xref_set_corrupt(sc, *pcur, 0);
+ return;
+ }
+
+ error = xfs_alloc_get_rec(*pcur, &fbno, &flen, &has_otherrec);
+ if (!xchk_should_check_xref(sc, &error, pcur))
+ return;
+ if (!has_otherrec) {
+ xchk_btree_xref_set_corrupt(sc, *pcur, 0);
+ return;
+ }
+
+ if (fbno != agbno || flen != len)
+ xchk_btree_xref_set_corrupt(sc, *pcur, 0);
+}
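
The cross-reference above boils down to checking that two indexes over the same free extents, one keyed by starting block and one by extent size, agree record for record. A toy userspace version of that check, with arrays standing in for the two btrees and all names invented:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct freerec { unsigned int bno, len; };

static bool other_view_has(const struct freerec *other, size_t n,
			   struct freerec rec)
{
	for (size_t i = 0; i < n; i++)
		if (other[i].bno == rec.bno && other[i].len == rec.len)
			return true;
	return false;
}

int main(void)
{
	struct freerec by_bno[] = { { 10, 4 }, { 20, 1 }, { 40, 8 } };
	struct freerec by_len[] = { { 20, 1 }, { 10, 4 }, { 40, 6 } };

	for (size_t i = 0; i < 3; i++)
		if (!other_view_has(by_len, 3, by_bno[i]))
			printf("mismatch at bno %u\n", by_bno[i].bno);
	return 0;
}
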
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xchk_allocbt_xref(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xchk_allocbt_xref_other(sc, agbno, len);
+ xchk_xref_is_not_inode_chunk(sc, agbno, len);
+ xchk_xref_has_no_owner(sc, agbno, len);
+ xchk_xref_is_not_shared(sc, agbno, len);
+}
+
+/* Scrub a bnobt/cntbt record. */
+STATIC int
+xchk_allocbt_rec(
+ struct xchk_btree *bs,
+ const union xfs_btree_rec *rec)
+{
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
+ xfs_agblock_t bno;
+ xfs_extlen_t len;
+
+ bno = be32_to_cpu(rec->alloc.ar_startblock);
+ len = be32_to_cpu(rec->alloc.ar_blockcount);
+
+ if (!xfs_verify_agbext(pag, bno, len))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ xchk_allocbt_xref(bs->sc, bno, len);
+
+ return 0;
+}
+
+/* Scrub the freespace btrees for some AG. */
+STATIC int
+xchk_allocbt(
+ struct xfs_scrub *sc,
+ xfs_btnum_t which)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur;
+ return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, NULL);
+}
+
+int
+xchk_bnobt(
+ struct xfs_scrub *sc)
+{
+ return xchk_allocbt(sc, XFS_BTNUM_BNO);
+}
+
+int
+xchk_cntbt(
+ struct xfs_scrub *sc)
+{
+ return xchk_allocbt(sc, XFS_BTNUM_CNT);
+}
+
+/* xref check that the extent is not free */
+void
+xchk_xref_is_used_space(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ bool is_freesp;
+ int error;
+
+ if (!sc->sa.bno_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_alloc_has_record(sc->sa.bno_cur, agbno, len, &is_freesp);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.bno_cur))
+ return;
+ if (is_freesp)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.bno_cur, 0);
+}
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
new file mode 100644
index 000000000..b6f0c9f3f
--- /dev/null
+++ b/fs/xfs/scrub/attr.c
@@ -0,0 +1,527 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/dabtree.h"
+#include "scrub/attr.h"
+
+/*
+ * Allocate enough memory to hold an attr value and attr block bitmaps,
+ * reallocating the buffer if necessary. Buffer contents are not preserved
+ * across a reallocation.
+ */
+static int
+xchk_setup_xattr_buf(
+ struct xfs_scrub *sc,
+ size_t value_size,
+ gfp_t flags)
+{
+ size_t sz;
+ struct xchk_xattr_buf *ab = sc->buf;
+
+ /*
+ * We need enough space to read an xattr value from the file or enough
+	 * space to hold three attr-block-sized bitmaps. We don't
+ * need the buffer space for both purposes at the same time.
+ */
+ sz = 3 * sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
+ sz = max_t(size_t, sz, value_size);
+
+ /*
+ * If there's already a buffer, figure out if we need to reallocate it
+ * to accommodate a larger size.
+ */
+ if (ab) {
+ if (sz <= ab->sz)
+ return 0;
+ kmem_free(ab);
+ sc->buf = NULL;
+ }
+
+ /*
+ * Don't zero the buffer upon allocation to avoid runtime overhead.
+ * All users must be careful never to read uninitialized contents.
+ */
+ ab = kvmalloc(sizeof(*ab) + sz, flags);
+ if (!ab)
+ return -ENOMEM;
+
+ ab->sz = sz;
+ sc->buf = ab;
+ return 0;
+}
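
A minimal userspace sketch of the lazy sizing logic above, assuming a plain malloc in place of kvmalloc and invented names throughout: the buffer is sized for whichever is larger, one xattr value or three block-sized bitmaps, and is only reallocated when it needs to grow.

#include <stdlib.h>

#define BITS_PER_LONG	(8 * sizeof(long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct toy_xattr_buf {
	size_t sz;
	unsigned char buf[];
};

static int toy_setup_buf(struct toy_xattr_buf **abp, size_t blksize,
			 size_t value_size)
{
	size_t sz = 3 * sizeof(long) * BITS_TO_LONGS(blksize);
	struct toy_xattr_buf *ab = *abp;

	if (value_size > sz)
		sz = value_size;

	if (ab) {
		if (sz <= ab->sz)
			return 0;	/* existing buffer is big enough */
		free(ab);
		*abp = NULL;
	}

	ab = malloc(sizeof(*ab) + sz);	/* contents left uninitialized */
	if (!ab)
		return -1;
	ab->sz = sz;
	*abp = ab;
	return 0;
}

int main(void)
{
	struct toy_xattr_buf *ab = NULL;

	toy_setup_buf(&ab, 4096, 0);	 /* bitmap-sized buffer */
	toy_setup_buf(&ab, 4096, 65536); /* grows to hold a large value */
	free(ab);
	return 0;
}
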
+
+/* Set us up to scrub an inode's extended attributes. */
+int
+xchk_setup_xattr(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ /*
+ * We failed to get memory while checking attrs, so this time try to
+ * get all the memory we're ever going to need. Allocate the buffer
+ * without the inode lock held, which means we can sleep.
+ */
+ if (sc->flags & XCHK_TRY_HARDER) {
+ error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, GFP_KERNEL);
+ if (error)
+ return error;
+ }
+
+ return xchk_setup_inode_contents(sc, 0);
+}
+
+/* Extended Attributes */
+
+struct xchk_xattr {
+ struct xfs_attr_list_context context;
+ struct xfs_scrub *sc;
+};
+
+/*
+ * Check that an extended attribute key can be looked up by hash.
+ *
+ * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked)
+ * to call this function for every attribute key in an inode. Once
+ * we're here, we load the attribute value to see if any errors happen,
+ * or if we get more or less data than we expected.
+ */
+static void
+xchk_xattr_listent(
+ struct xfs_attr_list_context *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ int valuelen)
+{
+ struct xchk_xattr *sx;
+ struct xfs_da_args args = { NULL };
+ int error = 0;
+
+ sx = container_of(context, struct xchk_xattr, context);
+
+ if (xchk_should_terminate(sx->sc, &error)) {
+ context->seen_enough = error;
+ return;
+ }
+
+ if (flags & XFS_ATTR_INCOMPLETE) {
+ /* Incomplete attr key, just mark the inode for preening. */
+ xchk_ino_set_preen(sx->sc, context->dp->i_ino);
+ return;
+ }
+
+ /* Does this name make sense? */
+ if (!xfs_attr_namecheck(name, namelen)) {
+ xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
+ return;
+ }
+
+ /*
+	 * Try to allocate enough memory to extract the attr value. If that
+	 * doesn't work, we overload the seen_enough variable to convey the
+	 * error code back to the main scrub function.
+ */
+ error = xchk_setup_xattr_buf(sx->sc, valuelen,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (error == -ENOMEM)
+ error = -EDEADLOCK;
+ if (error) {
+ context->seen_enough = error;
+ return;
+ }
+
+ args.op_flags = XFS_DA_OP_NOTIME;
+ args.attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK;
+ args.geo = context->dp->i_mount->m_attr_geo;
+ args.whichfork = XFS_ATTR_FORK;
+ args.dp = context->dp;
+ args.name = name;
+ args.namelen = namelen;
+ args.hashval = xfs_da_hashname(args.name, args.namelen);
+ args.trans = context->tp;
+ args.value = xchk_xattr_valuebuf(sx->sc);
+ args.valuelen = valuelen;
+
+ error = xfs_attr_get_ilocked(&args);
+ /* ENODATA means the hash lookup failed and the attr is bad */
+ if (error == -ENODATA)
+ error = -EFSCORRUPTED;
+ if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
+ &error))
+ goto fail_xref;
+ if (args.valuelen != valuelen)
+ xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK,
+ args.blkno);
+fail_xref:
+ if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ context->seen_enough = 1;
+ return;
+}
+
+/*
+ * Mark a range [start, start+len) in this map. Returns true if the
+ * region was free, and false if there's a conflict or a problem.
+ *
+ * Within a char, the lowest bit of the char represents the byte with
+ * the smallest address.
+ */
+STATIC bool
+xchk_xattr_set_map(
+ struct xfs_scrub *sc,
+ unsigned long *map,
+ unsigned int start,
+ unsigned int len)
+{
+ unsigned int mapsize = sc->mp->m_attr_geo->blksize;
+ bool ret = true;
+
+ if (start >= mapsize)
+ return false;
+ if (start + len > mapsize) {
+ len = mapsize - start;
+ ret = false;
+ }
+
+ if (find_next_bit(map, mapsize, start) < start + len)
+ ret = false;
+ bitmap_set(map, start, len);
+
+ return ret;
+}
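
The same mark-and-detect-overlap check can be shown without the kernel bitmap helpers; the sketch below uses a plain bool array and hypothetical names, returning false whenever a range overlaps something already marked or runs off the end of the map.

#include <stdbool.h>
#include <stdio.h>

#define MAPSIZE 64

/* Returns true if [start, start+len) was previously unmarked. */
static bool set_map(bool *map, unsigned int start, unsigned int len)
{
	bool ret = true;

	if (start >= MAPSIZE)
		return false;
	if (start + len > MAPSIZE) {
		len = MAPSIZE - start;
		ret = false;
	}

	for (unsigned int i = start; i < start + len; i++) {
		if (map[i])
			ret = false;	/* conflict with an earlier range */
		map[i] = true;
	}
	return ret;
}

int main(void)
{
	bool map[MAPSIZE] = { false };

	printf("%d\n", set_map(map, 0, 16));	/* 1: header region */
	printf("%d\n", set_map(map, 16, 8));	/* 1: disjoint entry */
	printf("%d\n", set_map(map, 20, 8));	/* 0: overlaps previous */
	return 0;
}
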
+
+/*
+ * Check the leaf freemap against the usage bitmap. Returns false if the
+ * attr freemap has problems or points to used space.
+ */
+STATIC bool
+xchk_xattr_check_freemap(
+ struct xfs_scrub *sc,
+ unsigned long *map,
+ struct xfs_attr3_icleaf_hdr *leafhdr)
+{
+ unsigned long *freemap = xchk_xattr_freemap(sc);
+ unsigned long *dstmap = xchk_xattr_dstmap(sc);
+ unsigned int mapsize = sc->mp->m_attr_geo->blksize;
+ int i;
+
+ /* Construct bitmap of freemap contents. */
+ bitmap_zero(freemap, mapsize);
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ if (!xchk_xattr_set_map(sc, freemap,
+ leafhdr->freemap[i].base,
+ leafhdr->freemap[i].size))
+ return false;
+ }
+
+ /* Look for bits that are set in freemap and are marked in use. */
+ return bitmap_and(dstmap, freemap, map, mapsize) == 0;
+}
+
+/*
+ * Check this leaf entry's relations to everything else.
+ * Adds the number of bytes used for the name/value data to *usedbytes.
+ */
+STATIC void
+xchk_xattr_entry(
+ struct xchk_da_btree *ds,
+ int level,
+ char *buf_end,
+ struct xfs_attr_leafblock *leaf,
+ struct xfs_attr3_icleaf_hdr *leafhdr,
+ struct xfs_attr_leaf_entry *ent,
+ int idx,
+ unsigned int *usedbytes,
+ __u32 *last_hashval)
+{
+ struct xfs_mount *mp = ds->state->mp;
+ unsigned long *usedmap = xchk_xattr_usedmap(ds->sc);
+ char *name_end;
+ struct xfs_attr_leaf_name_local *lentry;
+ struct xfs_attr_leaf_name_remote *rentry;
+ unsigned int nameidx;
+ unsigned int namesize;
+
+ if (ent->pad2 != 0)
+ xchk_da_set_corrupt(ds, level);
+
+ /* Hash values in order? */
+ if (be32_to_cpu(ent->hashval) < *last_hashval)
+ xchk_da_set_corrupt(ds, level);
+ *last_hashval = be32_to_cpu(ent->hashval);
+
+ nameidx = be16_to_cpu(ent->nameidx);
+ if (nameidx < leafhdr->firstused ||
+ nameidx >= mp->m_attr_geo->blksize) {
+ xchk_da_set_corrupt(ds, level);
+ return;
+ }
+
+ /* Check the name information. */
+ if (ent->flags & XFS_ATTR_LOCAL) {
+ lentry = xfs_attr3_leaf_name_local(leaf, idx);
+ namesize = xfs_attr_leaf_entsize_local(lentry->namelen,
+ be16_to_cpu(lentry->valuelen));
+ name_end = (char *)lentry + namesize;
+ if (lentry->namelen == 0)
+ xchk_da_set_corrupt(ds, level);
+ } else {
+ rentry = xfs_attr3_leaf_name_remote(leaf, idx);
+ namesize = xfs_attr_leaf_entsize_remote(rentry->namelen);
+ name_end = (char *)rentry + namesize;
+ if (rentry->namelen == 0 || rentry->valueblk == 0)
+ xchk_da_set_corrupt(ds, level);
+ }
+ if (name_end > buf_end)
+ xchk_da_set_corrupt(ds, level);
+
+ if (!xchk_xattr_set_map(ds->sc, usedmap, nameidx, namesize))
+ xchk_da_set_corrupt(ds, level);
+ if (!(ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ *usedbytes += namesize;
+}
+
+/* Scrub an attribute leaf. */
+STATIC int
+xchk_xattr_block(
+ struct xchk_da_btree *ds,
+ int level)
+{
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct xfs_mount *mp = ds->state->mp;
+ struct xfs_da_state_blk *blk = &ds->state->path.blk[level];
+ struct xfs_buf *bp = blk->bp;
+ xfs_dablk_t *last_checked = ds->private;
+ struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_attr_leaf_entry *ent;
+ struct xfs_attr_leaf_entry *entries;
+ unsigned long *usedmap;
+ char *buf_end;
+ size_t off;
+ __u32 last_hashval = 0;
+ unsigned int usedbytes = 0;
+ unsigned int hdrsize;
+ int i;
+ int error;
+
+ if (*last_checked == blk->blkno)
+ return 0;
+
+ /* Allocate memory for block usage checking. */
+ error = xchk_setup_xattr_buf(ds->sc, 0,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (error == -ENOMEM)
+ return -EDEADLOCK;
+ if (error)
+ return error;
+ usedmap = xchk_xattr_usedmap(ds->sc);
+
+ *last_checked = blk->blkno;
+ bitmap_zero(usedmap, mp->m_attr_geo->blksize);
+
+ /* Check all the padding. */
+ if (xfs_has_crc(ds->sc->mp)) {
+ struct xfs_attr3_leafblock *leaf = bp->b_addr;
+
+ if (leaf->hdr.pad1 != 0 || leaf->hdr.pad2 != 0 ||
+ leaf->hdr.info.hdr.pad != 0)
+ xchk_da_set_corrupt(ds, level);
+ } else {
+ if (leaf->hdr.pad1 != 0 || leaf->hdr.info.pad != 0)
+ xchk_da_set_corrupt(ds, level);
+ }
+
+ /* Check the leaf header */
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+ hdrsize = xfs_attr3_leaf_hdr_size(leaf);
+
+ if (leafhdr.usedbytes > mp->m_attr_geo->blksize)
+ xchk_da_set_corrupt(ds, level);
+ if (leafhdr.firstused > mp->m_attr_geo->blksize)
+ xchk_da_set_corrupt(ds, level);
+ if (leafhdr.firstused < hdrsize)
+ xchk_da_set_corrupt(ds, level);
+ if (!xchk_xattr_set_map(ds->sc, usedmap, 0, hdrsize))
+ xchk_da_set_corrupt(ds, level);
+
+ if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if ((char *)&entries[leafhdr.count] > (char *)leaf + leafhdr.firstused)
+ xchk_da_set_corrupt(ds, level);
+
+ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize;
+ for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) {
+ /* Mark the leaf entry itself. */
+ off = (char *)ent - (char *)leaf;
+ if (!xchk_xattr_set_map(ds->sc, usedmap, off,
+ sizeof(xfs_attr_leaf_entry_t))) {
+ xchk_da_set_corrupt(ds, level);
+ goto out;
+ }
+
+ /* Check the entry and nameval. */
+ xchk_xattr_entry(ds, level, buf_end, leaf, &leafhdr,
+ ent, i, &usedbytes, &last_hashval);
+
+ if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+ }
+
+ if (!xchk_xattr_check_freemap(ds->sc, usedmap, &leafhdr))
+ xchk_da_set_corrupt(ds, level);
+
+ if (leafhdr.usedbytes != usedbytes)
+ xchk_da_set_corrupt(ds, level);
+
+out:
+ return 0;
+}
+
+/* Scrub an attribute btree record. */
+STATIC int
+xchk_xattr_rec(
+ struct xchk_da_btree *ds,
+ int level)
+{
+ struct xfs_mount *mp = ds->state->mp;
+ struct xfs_da_state_blk *blk = &ds->state->path.blk[level];
+ struct xfs_attr_leaf_name_local *lentry;
+ struct xfs_attr_leaf_name_remote *rentry;
+ struct xfs_buf *bp;
+ struct xfs_attr_leaf_entry *ent;
+ xfs_dahash_t calc_hash;
+ xfs_dahash_t hash;
+ int nameidx;
+ int hdrsize;
+ unsigned int badflags;
+ int error;
+
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+ ent = xfs_attr3_leaf_entryp(blk->bp->b_addr) + blk->index;
+
+ /* Check the whole block, if necessary. */
+ error = xchk_xattr_block(ds, level);
+ if (error)
+ goto out;
+ if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Check the hash of the entry. */
+ error = xchk_da_btree_hash(ds, level, &ent->hashval);
+ if (error)
+ goto out;
+
+ /* Find the attr entry's location. */
+ bp = blk->bp;
+ hdrsize = xfs_attr3_leaf_hdr_size(bp->b_addr);
+ nameidx = be16_to_cpu(ent->nameidx);
+ if (nameidx < hdrsize || nameidx >= mp->m_attr_geo->blksize) {
+ xchk_da_set_corrupt(ds, level);
+ goto out;
+ }
+
+ /* Retrieve the entry and check it. */
+ hash = be32_to_cpu(ent->hashval);
+ badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE |
+ XFS_ATTR_INCOMPLETE);
+ if ((ent->flags & badflags) != 0)
+ xchk_da_set_corrupt(ds, level);
+ if (ent->flags & XFS_ATTR_LOCAL) {
+ lentry = (struct xfs_attr_leaf_name_local *)
+ (((char *)bp->b_addr) + nameidx);
+ if (lentry->namelen <= 0) {
+ xchk_da_set_corrupt(ds, level);
+ goto out;
+ }
+ calc_hash = xfs_da_hashname(lentry->nameval, lentry->namelen);
+ } else {
+ rentry = (struct xfs_attr_leaf_name_remote *)
+ (((char *)bp->b_addr) + nameidx);
+ if (rentry->namelen <= 0) {
+ xchk_da_set_corrupt(ds, level);
+ goto out;
+ }
+ calc_hash = xfs_da_hashname(rentry->name, rentry->namelen);
+ }
+ if (calc_hash != hash)
+ xchk_da_set_corrupt(ds, level);
+
+out:
+ return error;
+}
+
+/* Scrub the extended attribute metadata. */
+int
+xchk_xattr(
+ struct xfs_scrub *sc)
+{
+ struct xchk_xattr sx;
+ xfs_dablk_t last_checked = -1U;
+ int error = 0;
+
+ if (!xfs_inode_hasattr(sc->ip))
+ return -ENOENT;
+
+ memset(&sx, 0, sizeof(sx));
+ /* Check attribute tree structure */
+ error = xchk_da_btree(sc, XFS_ATTR_FORK, xchk_xattr_rec,
+ &last_checked);
+ if (error)
+ goto out;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Check that every attr key can also be looked up by hash. */
+ sx.context.dp = sc->ip;
+ sx.context.resynch = 1;
+ sx.context.put_listent = xchk_xattr_listent;
+ sx.context.tp = sc->tp;
+ sx.context.allow_incomplete = true;
+ sx.sc = sc;
+
+ /*
+ * Look up every xattr in this file by name.
+ *
+ * Use the backend implementation of xfs_attr_list to call
+ * xchk_xattr_listent on every attribute key in this inode.
+ * In other words, we use the same iterator/callback mechanism
+ * that listattr uses to scrub extended attributes, though in our
+ * _listent function, we check the value of the attribute.
+ *
+ * The VFS only locks i_rwsem when modifying attrs, so keep all
+ * three locks held because that's the only way to ensure we're
+ * the only thread poking into the da btree. We traverse the da
+ * btree while holding a leaf buffer locked for the xattr name
+ * iteration, which doesn't really follow the usual buffer
+ * locking order.
+ */
+ error = xfs_attr_list_ilocked(&sx.context);
+ if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error))
+ goto out;
+
+ /* Did our listent function try to return any errors? */
+ if (sx.context.seen_enough < 0)
+ error = sx.context.seen_enough;
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h
new file mode 100644
index 000000000..3590e10e3
--- /dev/null
+++ b/fs/xfs/scrub/attr.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_ATTR_H__
+#define __XFS_SCRUB_ATTR_H__
+
+/*
+ * Temporary storage for online scrub and repair of extended attributes.
+ */
+struct xchk_xattr_buf {
+ /* Size of @buf, in bytes. */
+ size_t sz;
+
+ /*
+ * Memory buffer -- either used for extracting attr values while
+ * walking the attributes; or for computing attr block bitmaps when
+ * checking the attribute tree.
+ *
+ * Each bitmap contains enough bits to track every byte in an attr
+ * block (rounded up to the size of an unsigned long). The attr block
+ * used space bitmap starts at the beginning of the buffer; the free
+ * space bitmap follows immediately after; and we have a third buffer
+ * for storing intermediate bitmap results.
+ */
+ uint8_t buf[];
+};
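
A userspace sketch of the layout described above, carving one flat allocation into the used, free, and scratch bitmaps. The names and block size here are illustrative only, not the kernel accessors defined below.

#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_LONG	(8 * sizeof(long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	size_t blksize = 4096;			/* attr block size, in bytes */
	size_t words = BITS_TO_LONGS(blksize);	/* longs per bitmap */
	unsigned long *buf = calloc(3 * words, sizeof(unsigned long));

	if (!buf)
		return 1;

	unsigned long *usedmap = buf;		  /* bytes marked in use */
	unsigned long *freemap = buf + words;	  /* bytes the freemap claims */
	unsigned long *dstmap  = buf + 2 * words; /* scratch for the AND */

	printf("used=%p free=%p dst=%p\n",
	       (void *)usedmap, (void *)freemap, (void *)dstmap);
	free(buf);
	return 0;
}
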
+
+/* A place to store attribute values. */
+static inline uint8_t *
+xchk_xattr_valuebuf(
+ struct xfs_scrub *sc)
+{
+ struct xchk_xattr_buf *ab = sc->buf;
+
+ return ab->buf;
+}
+
+/* A bitmap of space usage computed by walking an attr leaf block. */
+static inline unsigned long *
+xchk_xattr_usedmap(
+ struct xfs_scrub *sc)
+{
+ struct xchk_xattr_buf *ab = sc->buf;
+
+ return (unsigned long *)ab->buf;
+}
+
+/* A bitmap of free space computed by walking attr leaf block free info. */
+static inline unsigned long *
+xchk_xattr_freemap(
+ struct xfs_scrub *sc)
+{
+ return xchk_xattr_usedmap(sc) +
+ BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
+}
+
+/* A bitmap used to hold temporary results. */
+static inline unsigned long *
+xchk_xattr_dstmap(
+ struct xfs_scrub *sc)
+{
+ return xchk_xattr_freemap(sc) +
+ BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
+}
+
+#endif /* __XFS_SCRUB_ATTR_H__ */
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
new file mode 100644
index 000000000..b89bf9de9
--- /dev/null
+++ b/fs/xfs/scrub/bitmap.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "scrub/bitmap.h"
+
+/*
+ * Set a range of this bitmap. Caller must ensure the range is not set.
+ *
+ * This is the logical equivalent of bitmap |= mask(start, len).
+ */
+int
+xbitmap_set(
+ struct xbitmap *bitmap,
+ uint64_t start,
+ uint64_t len)
+{
+ struct xbitmap_range *bmr;
+
+ bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL);
+ if (!bmr)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&bmr->list);
+ bmr->start = start;
+ bmr->len = len;
+ list_add_tail(&bmr->list, &bitmap->list);
+
+ return 0;
+}
+
+/* Free everything related to this bitmap. */
+void
+xbitmap_destroy(
+ struct xbitmap *bitmap)
+{
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
+
+ for_each_xbitmap_extent(bmr, n, bitmap) {
+ list_del(&bmr->list);
+ kmem_free(bmr);
+ }
+}
+
+/* Initialize an empty extent bitmap. */
+void
+xbitmap_init(
+ struct xbitmap *bitmap)
+{
+ INIT_LIST_HEAD(&bitmap->list);
+}
+
+/* Compare two bitmap ranges by starting block. */
+static int
+xbitmap_range_cmp(
+ void *priv,
+ const struct list_head *a,
+ const struct list_head *b)
+{
+ struct xbitmap_range *ap;
+ struct xbitmap_range *bp;
+
+ ap = container_of(a, struct xbitmap_range, list);
+ bp = container_of(b, struct xbitmap_range, list);
+
+ if (ap->start > bp->start)
+ return 1;
+ if (ap->start < bp->start)
+ return -1;
+ return 0;
+}
+
+/*
+ * Remove all the blocks mentioned in @sub from the extents in @bitmap.
+ *
+ * The intent is that callers will iterate the rmapbt for all of its records
+ * for a given owner to generate @bitmap; and iterate all the blocks of the
+ * metadata structures that are not being rebuilt and have the same rmapbt
+ * owner to generate @sub. This routine subtracts all the extents
+ * mentioned in sub from all the extents linked in @bitmap, which leaves
+ * @bitmap as the list of blocks that are not accounted for, which we assume
+ * are the dead blocks of the old metadata structure. The blocks mentioned in
+ * @bitmap can be reaped.
+ *
+ * This is the logical equivalent of bitmap &= ~sub.
+ */
+#define LEFT_ALIGNED (1 << 0)
+#define RIGHT_ALIGNED (1 << 1)
+int
+xbitmap_disunion(
+ struct xbitmap *bitmap,
+ struct xbitmap *sub)
+{
+ struct list_head *lp;
+ struct xbitmap_range *br;
+ struct xbitmap_range *new_br;
+ struct xbitmap_range *sub_br;
+ uint64_t sub_start;
+ uint64_t sub_len;
+ int state;
+ int error = 0;
+
+ if (list_empty(&bitmap->list) || list_empty(&sub->list))
+ return 0;
+ ASSERT(!list_empty(&sub->list));
+
+ list_sort(NULL, &bitmap->list, xbitmap_range_cmp);
+ list_sort(NULL, &sub->list, xbitmap_range_cmp);
+
+ /*
+ * Now that we've sorted both lists, we iterate bitmap once, rolling
+ * forward through sub and/or bitmap as necessary until we find an
+ * overlap or reach the end of either list. We do not reset lp to the
+ * head of bitmap nor do we reset sub_br to the head of sub. The
+ * list traversal is similar to merge sort, but we're deleting
+ * instead. In this manner we avoid O(n^2) operations.
+ */
+ sub_br = list_first_entry(&sub->list, struct xbitmap_range,
+ list);
+ lp = bitmap->list.next;
+ while (lp != &bitmap->list) {
+ br = list_entry(lp, struct xbitmap_range, list);
+
+ /*
+ * Advance sub_br and/or br until we find a pair that
+ * intersect or we run out of extents.
+ */
+ while (sub_br->start + sub_br->len <= br->start) {
+ if (list_is_last(&sub_br->list, &sub->list))
+ goto out;
+ sub_br = list_next_entry(sub_br, list);
+ }
+ if (sub_br->start >= br->start + br->len) {
+ lp = lp->next;
+ continue;
+ }
+
+ /* trim sub_br to fit the extent we have */
+ sub_start = sub_br->start;
+ sub_len = sub_br->len;
+ if (sub_br->start < br->start) {
+ sub_len -= br->start - sub_br->start;
+ sub_start = br->start;
+ }
+ if (sub_len > br->len)
+ sub_len = br->len;
+
+ state = 0;
+ if (sub_start == br->start)
+ state |= LEFT_ALIGNED;
+ if (sub_start + sub_len == br->start + br->len)
+ state |= RIGHT_ALIGNED;
+ switch (state) {
+ case LEFT_ALIGNED:
+ /* Coincides with only the left. */
+ br->start += sub_len;
+ br->len -= sub_len;
+ break;
+ case RIGHT_ALIGNED:
+ /* Coincides with only the right. */
+ br->len -= sub_len;
+ lp = lp->next;
+ break;
+ case LEFT_ALIGNED | RIGHT_ALIGNED:
+			/* Total overlap, just delete br. */
+ lp = lp->next;
+ list_del(&br->list);
+ kmem_free(br);
+ break;
+ case 0:
+ /*
+ * Deleting from the middle: add the new right extent
+ * and then shrink the left extent.
+ */
+ new_br = kmem_alloc(sizeof(struct xbitmap_range),
+ KM_MAYFAIL);
+ if (!new_br) {
+ error = -ENOMEM;
+ goto out;
+ }
+ INIT_LIST_HEAD(&new_br->list);
+ new_br->start = sub_start + sub_len;
+ new_br->len = br->start + br->len - new_br->start;
+ list_add(&new_br->list, &br->list);
+ br->len = sub_start - br->start;
+ lp = lp->next;
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+ }
+
+out:
+ return error;
+}
+#undef LEFT_ALIGNED
+#undef RIGHT_ALIGNED
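
The four alignment cases above reduce to a simple interval subtraction. A self-contained sketch of that subtraction, using invented types and operating on a single extent rather than the linked list:

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, len; };

/*
 * Subtract [sub_start, sub_start+sub_len) from r; may leave 0, 1 or 2
 * surviving pieces, written to out[].  Returns the number of pieces.
 * The subtrahend is assumed to be already trimmed to lie inside r.
 */
static int subtract(struct range r, uint64_t sub_start, uint64_t sub_len,
		    struct range out[2])
{
	uint64_t r_end = r.start + r.len;
	uint64_t s_end = sub_start + sub_len;
	int n = 0;

	if (sub_start > r.start)	/* keep the left remainder */
		out[n++] = (struct range){ r.start, sub_start - r.start };
	if (s_end < r_end)		/* keep the right remainder */
		out[n++] = (struct range){ s_end, r_end - s_end };
	return n;			/* 0 == total overlap */
}

int main(void)
{
	struct range out[2];
	int n = subtract((struct range){ 10, 10 }, 13, 4, out);

	for (int i = 0; i < n; i++)
		printf("[%llu, +%llu)\n",
		       (unsigned long long)out[i].start,
		       (unsigned long long)out[i].len);
	return 0;
}
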
+
+/*
+ * Record all btree blocks seen while iterating all records of a btree.
+ *
+ * We know that the btree query_all function starts at the left edge and walks
+ * towards the right edge of the tree. Therefore, we know that we can walk up
+ * the btree cursor towards the root; if the pointer for a given level points
+ * to the first record/key in that block, we haven't seen this block before;
+ * and therefore we need to remember that we saw this block in the btree.
+ *
+ * So if our btree is:
+ *
+ * 4
+ * / | \
+ * 1 2 3
+ *
+ * Pretend for this example that each leaf block has 100 btree records. For
+ * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we
+ * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so
+ * we record block 4. The list is [1, 4].
+ *
+ * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit
+ * the loop. The list remains [1, 4].
+ *
+ * For the 101st btree record, we've moved onto leaf block 2. Now
+ * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that
+ * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2].
+ *
+ * For the 102nd record, bc_levels[0].ptr == 2, so we continue.
+ *
+ * For the 201st record, we've moved on to leaf block 3.
+ * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3].
+ *
+ * For the 300th record we just exit, with the list being [1, 4, 2, 3].
+ */
+
+/*
+ * Record all the buffers pointed to by the btree cursor. Callers already
+ * engaged in a btree walk should call this function to capture the list of
+ * blocks going from the leaf towards the root.
+ */
+int
+xbitmap_set_btcur_path(
+ struct xbitmap *bitmap,
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsb;
+ int i;
+ int error;
+
+ for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
+ xfs_btree_get_block(cur, i, &bp);
+ if (!bp)
+ continue;
+ fsb = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+ error = xbitmap_set(bitmap, fsb, 1);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
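
The walk above relies on the observation in the preceding comment: a level's block is recorded only while that level and every level below it sit at their first record. A toy illustration of just that rule, with hypothetical per-level pointers and block numbers:

#include <stdio.h>

#define NLEVELS 3

static void record_path(const int ptrs[NLEVELS], const int blocks[NLEVELS])
{
	for (int level = 0; level < NLEVELS && ptrs[level] == 1; level++)
		printf("first visit to block %d at level %d\n",
		       blocks[level], level);
}

int main(void)
{
	int blocks[NLEVELS] = { 2, 4, 7 };	/* leaf, node, root */

	record_path((int[]){ 1, 1, 1 }, blocks); /* records 2, 4 and 7 */
	record_path((int[]){ 2, 1, 1 }, blocks); /* records nothing */
	record_path((int[]){ 1, 2, 1 }, blocks); /* records only block 2 */
	return 0;
}
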
+
+/* Collect a btree's block in the bitmap. */
+STATIC int
+xbitmap_collect_btblock(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *priv)
+{
+ struct xbitmap *bitmap = priv;
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsbno;
+
+ xfs_btree_get_block(cur, level, &bp);
+ if (!bp)
+ return 0;
+
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+ return xbitmap_set(bitmap, fsbno, 1);
+}
+
+/* Walk the btree and mark the bitmap wherever a btree block is found. */
+int
+xbitmap_set_btblocks(
+ struct xbitmap *bitmap,
+ struct xfs_btree_cur *cur)
+{
+ return xfs_btree_visit_blocks(cur, xbitmap_collect_btblock,
+ XFS_BTREE_VISIT_ALL, bitmap);
+}
+
+/* How many bits are set in this bitmap? */
+uint64_t
+xbitmap_hweight(
+ struct xbitmap *bitmap)
+{
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
+ uint64_t ret = 0;
+
+ for_each_xbitmap_extent(bmr, n, bitmap)
+ ret += bmr->len;
+
+ return ret;
+}
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
new file mode 100644
index 000000000..900646b72
--- /dev/null
+++ b/fs/xfs/scrub/bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_BITMAP_H__
+#define __XFS_SCRUB_BITMAP_H__
+
+struct xbitmap_range {
+ struct list_head list;
+ uint64_t start;
+ uint64_t len;
+};
+
+struct xbitmap {
+ struct list_head list;
+};
+
+void xbitmap_init(struct xbitmap *bitmap);
+void xbitmap_destroy(struct xbitmap *bitmap);
+
+#define for_each_xbitmap_extent(bex, n, bitmap) \
+ list_for_each_entry_safe((bex), (n), &(bitmap)->list, list)
+
+#define for_each_xbitmap_block(b, bex, n, bitmap) \
+ list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) \
+ for ((b) = (bex)->start; (b) < (bex)->start + (bex)->len; (b)++)
+
+int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len);
+int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub);
+int xbitmap_set_btcur_path(struct xbitmap *bitmap,
+ struct xfs_btree_cur *cur);
+int xbitmap_set_btblocks(struct xbitmap *bitmap,
+ struct xfs_btree_cur *cur);
+uint64_t xbitmap_hweight(struct xbitmap *bitmap);
+
+#endif /* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
new file mode 100644
index 000000000..f0b9cb650
--- /dev/null
+++ b/fs/xfs/scrub/bmap.c
@@ -0,0 +1,741 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "xfs_ag.h"
+
+/* Set us up with an inode's bmap. */
+int
+xchk_setup_inode_bmap(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ error = xchk_get_inode(sc);
+ if (error)
+ goto out;
+
+ sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+
+ /*
+ * We don't want any ephemeral data fork updates sitting around
+ * while we inspect block mappings, so wait for directio to finish
+ * and flush dirty data if we have delalloc reservations.
+ */
+ if (S_ISREG(VFS_I(sc->ip)->i_mode) &&
+ sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) {
+ struct address_space *mapping = VFS_I(sc->ip)->i_mapping;
+
+ inode_dio_wait(VFS_I(sc->ip));
+
+ /*
+ * Try to flush all incore state to disk before we examine the
+ * space mappings for the data fork. Leave accumulated errors
+ * in the mapping for the writer threads to consume.
+ *
+ * On ENOSPC or EIO writeback errors, we continue into the
+ * extent mapping checks because write failures do not
+ * necessarily imply anything about the correctness of the file
+ * metadata. The metadata and the file data could be on
+ * completely separate devices; a media failure might only
+ * affect a subset of the disk, etc. We can handle delalloc
+ * extents in the scrubber, so leaving them in memory is fine.
+ */
+ error = filemap_fdatawrite(mapping);
+ if (!error)
+ error = filemap_fdatawait_keep_errors(mapping);
+ if (error && (error != -ENOSPC && error != -EIO))
+ goto out;
+ }
+
+ /* Got the inode, lock it and we're ready to go. */
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ goto out;
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+
+out:
+ /* scrub teardown will unlock and release the inode */
+ return error;
+}
+
+/*
+ * Inode fork block mapping (BMBT) scrubber.
+ * More complex than the others because we have to scrub
+ * all the extents regardless of whether or not the fork
+ * is in btree format.
+ */
+
+struct xchk_bmap_info {
+ struct xfs_scrub *sc;
+ xfs_fileoff_t lastoff;
+ bool is_rt;
+ bool is_shared;
+ bool was_loaded;
+ int whichfork;
+};
+
+/* Look for a corresponding rmap for this irec. */
+static inline bool
+xchk_bmap_get_rmap(
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec,
+ xfs_agblock_t agbno,
+ uint64_t owner,
+ struct xfs_rmap_irec *rmap)
+{
+ xfs_fileoff_t offset;
+ unsigned int rflags = 0;
+ int has_rmap;
+ int error;
+
+ if (info->whichfork == XFS_ATTR_FORK)
+ rflags |= XFS_RMAP_ATTR_FORK;
+ if (irec->br_state == XFS_EXT_UNWRITTEN)
+ rflags |= XFS_RMAP_UNWRITTEN;
+
+ /*
+ * CoW staging extents are owned (on disk) by the refcountbt, so
+ * their rmaps do not have offsets.
+ */
+ if (info->whichfork == XFS_COW_FORK)
+ offset = 0;
+ else
+ offset = irec->br_startoff;
+
+ /*
+ * If the caller thinks this could be a shared bmbt extent (IOWs,
+ * any data fork extent of a reflink inode) then we have to use the
+ * range rmap lookup to make sure we get the correct owner/offset.
+ */
+ if (info->is_shared) {
+ error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno,
+ owner, offset, rflags, rmap, &has_rmap);
+ } else {
+ error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno,
+ owner, offset, rflags, rmap, &has_rmap);
+ }
+ if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur))
+ return false;
+
+ if (!has_rmap)
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ return has_rmap;
+}
+
+/* Make sure that we have rmapbt records for this extent. */
+STATIC void
+xchk_bmap_xref_rmap(
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec,
+ xfs_agblock_t agbno)
+{
+ struct xfs_rmap_irec rmap;
+ unsigned long long rmap_end;
+ uint64_t owner;
+
+ if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm))
+ return;
+
+ if (info->whichfork == XFS_COW_FORK)
+ owner = XFS_RMAP_OWN_COW;
+ else
+ owner = info->sc->ip->i_ino;
+
+ /* Find the rmap record for this irec. */
+ if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap))
+ return;
+
+ /* Check the rmap. */
+ rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
+ if (rmap.rm_startblock > agbno ||
+ agbno + irec->br_blockcount > rmap_end)
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /*
+ * Check the logical offsets if applicable. CoW staging extents
+ * don't track logical offsets since the mappings only exist in
+ * memory.
+ */
+ if (info->whichfork != XFS_COW_FORK) {
+ rmap_end = (unsigned long long)rmap.rm_offset +
+ rmap.rm_blockcount;
+ if (rmap.rm_offset > irec->br_startoff ||
+ irec->br_startoff + irec->br_blockcount > rmap_end)
+ xchk_fblock_xref_set_corrupt(info->sc,
+ info->whichfork, irec->br_startoff);
+ }
+
+ if (rmap.rm_owner != owner)
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /*
+ * Check for discrepancies between the unwritten flag in the irec and
+ * the rmap. Note that the (in-memory) CoW fork distinguishes between
+ * unwritten and written extents, but we don't track that in the rmap
+ * records because the blocks are owned (on-disk) by the refcountbt,
+ * which doesn't track unwritten state.
+ */
+ if (owner != XFS_RMAP_OWN_COW &&
+ !!(irec->br_state == XFS_EXT_UNWRITTEN) !=
+ !!(rmap.rm_flags & XFS_RMAP_UNWRITTEN))
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ if (!!(info->whichfork == XFS_ATTR_FORK) !=
+ !!(rmap.rm_flags & XFS_RMAP_ATTR_FORK))
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK)
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+}
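
The core of the rmap cross-check is a containment test: the block range claimed by the file mapping must fall entirely inside the reverse-mapping record found for it (and likewise for the logical offsets). A minimal sketch of that test, with made-up names:

#include <stdbool.h>
#include <stdio.h>

static bool rmap_covers(unsigned long long rm_start,
			unsigned long long rm_count,
			unsigned long long agbno,
			unsigned long long count)
{
	unsigned long long rm_end = rm_start + rm_count;

	return rm_start <= agbno && agbno + count <= rm_end;
}

int main(void)
{
	printf("%d\n", rmap_covers(100, 50, 110, 20));	/* 1: contained */
	printf("%d\n", rmap_covers(100, 50, 140, 20));	/* 0: runs past end */
	return 0;
}
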
+
+/* Cross-reference a single rtdev extent record. */
+STATIC void
+xchk_bmap_rt_iextent_xref(
+ struct xfs_inode *ip,
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
+ irec->br_blockcount);
+}
+
+/* Cross-reference a single datadev extent record. */
+STATIC void
+xchk_bmap_iextent_xref(
+ struct xfs_inode *ip,
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = info->sc->mp;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_extlen_t len;
+ int error;
+
+ agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock);
+ agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
+ len = irec->br_blockcount;
+
+ error = xchk_ag_init_existing(info->sc, agno, &info->sc->sa);
+ if (!xchk_fblock_process_error(info->sc, info->whichfork,
+ irec->br_startoff, &error))
+ goto out_free;
+
+ xchk_xref_is_used_space(info->sc, agbno, len);
+ xchk_xref_is_not_inode_chunk(info->sc, agbno, len);
+ xchk_bmap_xref_rmap(info, irec, agbno);
+ switch (info->whichfork) {
+ case XFS_DATA_FORK:
+ if (xfs_is_reflink_inode(info->sc->ip))
+ break;
+ fallthrough;
+ case XFS_ATTR_FORK:
+ xchk_xref_is_not_shared(info->sc, agbno,
+ irec->br_blockcount);
+ break;
+ case XFS_COW_FORK:
+ xchk_xref_is_cow_staging(info->sc, agbno,
+ irec->br_blockcount);
+ break;
+ }
+
+out_free:
+ xchk_ag_free(info->sc, &info->sc->sa);
+}
+
+/*
+ * Directories and attr forks should never have blocks that can't be addressed
+ * by an xfs_dablk_t.
+ */
+STATIC void
+xchk_bmap_dirattr_extent(
+ struct xfs_inode *ip,
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t off;
+
+ if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK)
+ return;
+
+ if (!xfs_verify_dablk(mp, irec->br_startoff))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ off = irec->br_startoff + irec->br_blockcount - 1;
+ if (!xfs_verify_dablk(mp, off))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork, off);
+}
+
+/* Scrub a single extent record. */
+STATIC int
+xchk_bmap_iextent(
+ struct xfs_inode *ip,
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = info->sc->mp;
+ int error = 0;
+
+ /*
+ * Check for out-of-order extents. This record could have come
+ * from the incore list, for which there is no ordering check.
+ */
+ if (irec->br_startoff < info->lastoff)
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ xchk_bmap_dirattr_extent(ip, info, irec);
+
+ /* There should never be a "hole" extent in either extent list. */
+ if (irec->br_startblock == HOLESTARTBLOCK)
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /*
+ * Check for delalloc extents. We never iterate the ones in the
+ * in-core extent scan, and we should never see these in the bmbt.
+ */
+ if (isnullstartblock(irec->br_startblock))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /* Make sure the extent points to a valid place. */
+ if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ if (info->is_rt &&
+ !xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ if (!info->is_rt &&
+ !xfs_verify_fsbext(mp, irec->br_startblock, irec->br_blockcount))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /* We don't allow unwritten extents on attr forks. */
+ if (irec->br_state == XFS_EXT_UNWRITTEN &&
+ info->whichfork == XFS_ATTR_FORK)
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ if (info->is_rt)
+ xchk_bmap_rt_iextent_xref(ip, info, irec);
+ else
+ xchk_bmap_iextent_xref(ip, info, irec);
+
+ info->lastoff = irec->br_startoff + irec->br_blockcount;
+ return error;
+}
+
+/* Scrub a bmbt record. */
+STATIC int
+xchk_bmapbt_rec(
+ struct xchk_btree *bs,
+ const union xfs_btree_rec *rec)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_bmbt_irec iext_irec;
+ struct xfs_iext_cursor icur;
+ struct xchk_bmap_info *info = bs->private;
+ struct xfs_inode *ip = bs->cur->bc_ino.ip;
+ struct xfs_buf *bp = NULL;
+ struct xfs_btree_block *block;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, info->whichfork);
+ uint64_t owner;
+ int i;
+
+ /*
+ * Check the owners of the btree blocks up to the level below
+ * the root since the verifiers don't do that.
+ */
+ if (xfs_has_crc(bs->cur->bc_mp) &&
+ bs->cur->bc_levels[0].ptr == 1) {
+ for (i = 0; i < bs->cur->bc_nlevels - 1; i++) {
+ block = xfs_btree_get_block(bs->cur, i, &bp);
+ owner = be64_to_cpu(block->bb_u.l.bb_owner);
+ if (owner != ip->i_ino)
+ xchk_fblock_set_corrupt(bs->sc,
+ info->whichfork, 0);
+ }
+ }
+
+ /*
+ * Check that the incore extent tree contains an extent that matches
+ * this one exactly. We validate those cached bmaps later, so we don't
+ * need to check them here. If the incore extent tree was just loaded
+ * from disk by the scrubber, we assume that its contents match what's
+ * on disk (we still hold the ILOCK) and skip the equivalence check.
+ */
+ if (!info->was_loaded)
+ return 0;
+
+ xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
+ if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur,
+ &iext_irec) ||
+ irec.br_startoff != iext_irec.br_startoff ||
+ irec.br_startblock != iext_irec.br_startblock ||
+ irec.br_blockcount != iext_irec.br_blockcount ||
+ irec.br_state != iext_irec.br_state)
+ xchk_fblock_set_corrupt(bs->sc, info->whichfork,
+ irec.br_startoff);
+ return 0;
+}
+
+/* Scan the btree records. */
+STATIC int
+xchk_bmap_btree(
+ struct xfs_scrub *sc,
+ int whichfork,
+ struct xchk_bmap_info *info)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork);
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ /* Load the incore bmap cache if it's not loaded. */
+ info->was_loaded = !xfs_need_iread_extents(ifp);
+
+ error = xfs_iread_extents(sc->tp, ip, whichfork);
+ if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
+ goto out;
+
+ /* Check the btree structure. */
+ cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork);
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
+ error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info);
+ xfs_btree_del_cursor(cur, error);
+out:
+ return error;
+}
+
+struct xchk_bmap_check_rmap_info {
+ struct xfs_scrub *sc;
+ int whichfork;
+ struct xfs_iext_cursor icur;
+};
+
+/* Can we find bmaps that fit this rmap? */
+STATIC int
+xchk_bmap_check_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_rmap_irec check_rec;
+ struct xchk_bmap_check_rmap_info *sbcri = priv;
+ struct xfs_ifork *ifp;
+ struct xfs_scrub *sc = sbcri->sc;
+ bool have_map;
+
+ /* Is this even the right fork? */
+ if (rec->rm_owner != sc->ip->i_ino)
+ return 0;
+ if ((sbcri->whichfork == XFS_ATTR_FORK) ^
+ !!(rec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return 0;
+ if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ return 0;
+
+ /* Now look up the bmbt record. */
+ ifp = xfs_ifork_ptr(sc->ip, sbcri->whichfork);
+ if (!ifp) {
+ xchk_fblock_set_corrupt(sc, sbcri->whichfork,
+ rec->rm_offset);
+ goto out;
+ }
+ have_map = xfs_iext_lookup_extent(sc->ip, ifp, rec->rm_offset,
+ &sbcri->icur, &irec);
+ if (!have_map)
+ xchk_fblock_set_corrupt(sc, sbcri->whichfork,
+ rec->rm_offset);
+ /*
+	 * bmap extent records are constrained to 2^21 blocks in length
+ * because of space constraints in the on-disk metadata structure.
+ * However, rmap extent record lengths are constrained only by AG
+ * length, so we have to loop through the bmbt to make sure that the
+ * entire rmap is covered by bmbt records.
+ */
+ check_rec = *rec;
+ while (have_map) {
+ if (irec.br_startoff != check_rec.rm_offset)
+ xchk_fblock_set_corrupt(sc, sbcri->whichfork,
+ check_rec.rm_offset);
+ if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp,
+ cur->bc_ag.pag->pag_agno,
+ check_rec.rm_startblock))
+ xchk_fblock_set_corrupt(sc, sbcri->whichfork,
+ check_rec.rm_offset);
+ if (irec.br_blockcount > check_rec.rm_blockcount)
+ xchk_fblock_set_corrupt(sc, sbcri->whichfork,
+ check_rec.rm_offset);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ break;
+ check_rec.rm_startblock += irec.br_blockcount;
+ check_rec.rm_offset += irec.br_blockcount;
+ check_rec.rm_blockcount -= irec.br_blockcount;
+ if (check_rec.rm_blockcount == 0)
+ break;
+ have_map = xfs_iext_next_extent(ifp, &sbcri->icur, &irec);
+ if (!have_map)
+ xchk_fblock_set_corrupt(sc, sbcri->whichfork,
+ check_rec.rm_offset);
+ }
+
+out:
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return -ECANCELED;
+ return 0;
+}
+
+/* Make sure each rmap has a corresponding bmbt entry. */
+STATIC int
+xchk_bmap_check_ag_rmaps(
+ struct xfs_scrub *sc,
+ int whichfork,
+ struct xfs_perag *pag)
+{
+ struct xchk_bmap_check_rmap_info sbcri;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agf;
+ int error;
+
+ error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf);
+ if (error)
+ return error;
+
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, pag);
+
+ sbcri.sc = sc;
+ sbcri.whichfork = whichfork;
+ error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri);
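+	/* -ECANCELED means the callback stopped early after flagging corruption. */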
+ if (error == -ECANCELED)
+ error = 0;
+
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(sc->tp, agf);
+ return error;
+}
+
+/* Make sure each rmap has a corresponding bmbt entry. */
+STATIC int
+xchk_bmap_check_rmaps(
+ struct xfs_scrub *sc,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork);
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ bool zero_size;
+ int error;
+
+ if (!xfs_has_rmapbt(sc->mp) ||
+ whichfork == XFS_COW_FORK ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return 0;
+
+ /* Don't support realtime rmap checks yet. */
+ if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK)
+ return 0;
+
+ ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
+
+ /*
+ * Only do this for complex maps that are in btree format, or for
+ * situations where we would seem to have a size but zero extents.
+ * The inode repair code can zap broken iforks, which means we have
+ * to flag this bmap as corrupt if there are rmaps that need to be
+ * reattached.
+ */
+
+ if (whichfork == XFS_DATA_FORK)
+ zero_size = i_size_read(VFS_I(sc->ip)) == 0;
+ else
+ zero_size = false;
+
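+	/*
+	 * For non-btree forks, skip the walk when the fork already maps
+	 * extents or the data fork has zero size.
+	 */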
+ if (ifp->if_format != XFS_DINODE_FMT_BTREE &&
+ (zero_size || ifp->if_nextents > 0))
+ return 0;
+
+ for_each_perag(sc->mp, agno, pag) {
+ error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag);
+ if (error)
+ break;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ break;
+ }
+ if (pag)
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Scrub an inode fork's block mappings.
+ *
+ * First we scan every record in every btree block, if applicable.
+ * Then we unconditionally scan the incore extent cache.
+ */
+STATIC int
+xchk_bmap(
+ struct xfs_scrub *sc,
+ int whichfork)
+{
+ struct xfs_bmbt_irec irec;
+ struct xchk_bmap_info info = { NULL };
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ xfs_fileoff_t endoff;
+ struct xfs_iext_cursor icur;
+ int error = 0;
+
+ /* Non-existent forks can be ignored. */
+ if (!ifp)
+ goto out;
+
+ info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip);
+ info.whichfork = whichfork;
+ info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip);
+ info.sc = sc;
+
+ switch (whichfork) {
+ case XFS_COW_FORK:
+ /* No CoW forks on non-reflink inodes/filesystems. */
+ if (!xfs_is_reflink_inode(ip)) {
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ goto out;
+ }
+ break;
+ case XFS_ATTR_FORK:
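+		/* An attr fork on a filesystem without attr support is corrupt. */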
+ if (!xfs_has_attr(mp) && !xfs_has_attr2(mp))
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ break;
+ default:
+ ASSERT(whichfork == XFS_DATA_FORK);
+ break;
+ }
+
+ /* Check the fork values */
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_UUID:
+ case XFS_DINODE_FMT_DEV:
+ case XFS_DINODE_FMT_LOCAL:
+ /* No mappings to check. */
+ goto out;
+ case XFS_DINODE_FMT_EXTENTS:
+ break;
+ case XFS_DINODE_FMT_BTREE:
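+		/* The CoW fork is never stored in btree format. */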
+ if (whichfork == XFS_COW_FORK) {
+ xchk_fblock_set_corrupt(sc, whichfork, 0);
+ goto out;
+ }
+
+ error = xchk_bmap_btree(sc, whichfork, &info);
+ if (error)
+ goto out;
+ break;
+ default:
+ xchk_fblock_set_corrupt(sc, whichfork, 0);
+ goto out;
+ }
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Find the offset of the last extent in the mapping. */
+ error = xfs_bmap_last_offset(ip, &endoff, whichfork);
+ if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
+ goto out;
+
+ /* Scrub extent records. */
+ info.lastoff = 0;
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ for_each_xfs_iext(ifp, &icur, &irec) {
+ if (xchk_should_terminate(sc, &error) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ goto out;
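+		/* Delalloc extents have no ondisk mapping to check yet; skip them. */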
+ if (isnullstartblock(irec.br_startblock))
+ continue;
+ if (irec.br_startoff >= endoff) {
+ xchk_fblock_set_corrupt(sc, whichfork,
+ irec.br_startoff);
+ goto out;
+ }
+ error = xchk_bmap_iextent(ip, &info, &irec);
+ if (error)
+ goto out;
+ }
+
+ error = xchk_bmap_check_rmaps(sc, whichfork);
+ if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error))
+ goto out;
+out:
+ return error;
+}
+
+/* Scrub an inode's data fork. */
+int
+xchk_bmap_data(
+ struct xfs_scrub *sc)
+{
+ return xchk_bmap(sc, XFS_DATA_FORK);
+}
+
+/* Scrub an inode's attr fork. */
+int
+xchk_bmap_attr(
+ struct xfs_scrub *sc)
+{
+ return xchk_bmap(sc, XFS_ATTR_FORK);
+}
+
+/* Scrub an inode's CoW fork. */
+int
+xchk_bmap_cow(
+ struct xfs_scrub *sc)
+{
+ if (!xfs_is_reflink_inode(sc->ip))
+ return -ENOENT;
+
+ return xchk_bmap(sc, XFS_COW_FORK);
+}
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
new file mode 100644
index 000000000..2f4519590
--- /dev/null
+++ b/fs/xfs/scrub/btree.c
@@ -0,0 +1,748 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/* btree scrubbing */
+
+/*
+ * Check for btree operation errors. See the section about handling
+ * operational errors in common.c.
+ */
+static bool
+__xchk_btree_process_error(
+ struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur,
+ int level,
+ int *error,
+ __u32 errflag,
+ void *ret_ip)
+{
+ if (*error == 0)
+ return true;
+
+ switch (*error) {
+ case -EDEADLOCK:
+ /* Used to restart an op with deadlock avoidance. */
+ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
+ break;
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ /* Note the badness but don't abort. */
+ sc->sm->sm_flags |= errflag;
+ *error = 0;
+ fallthrough;
+ default:
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ trace_xchk_ifork_btree_op_error(sc, cur, level,
+ *error, ret_ip);
+ else
+ trace_xchk_btree_op_error(sc, cur, level,
+ *error, ret_ip);
+ break;
+ }
+ return false;
+}
+
+bool
+xchk_btree_process_error(
+ struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur,
+ int level,
+ int *error)
+{
+ return __xchk_btree_process_error(sc, cur, level, error,
+ XFS_SCRUB_OFLAG_CORRUPT, __return_address);
+}
+
+bool
+xchk_btree_xref_process_error(
+ struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur,
+ int level,
+ int *error)
+{
+ return __xchk_btree_process_error(sc, cur, level, error,
+ XFS_SCRUB_OFLAG_XFAIL, __return_address);
+}
+
+/* Record btree block corruption. */
+static void
+__xchk_btree_set_corrupt(
+ struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur,
+ int level,
+ __u32 errflag,
+ void *ret_ip)
+{
+ sc->sm->sm_flags |= errflag;
+
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ trace_xchk_ifork_btree_error(sc, cur, level,
+ ret_ip);
+ else
+ trace_xchk_btree_error(sc, cur, level,
+ ret_ip);
+}
+
+void
+xchk_btree_set_corrupt(
+ struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ __xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_CORRUPT,
+ __return_address);
+}
+
+void
+xchk_btree_xref_set_corrupt(
+ struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ __xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_XCORRUPT,
+ __return_address);
+}
+
+/*
+ * Make sure this record is in order and doesn't stray outside of the parent
+ * keys.
+ */
+STATIC void
+xchk_btree_rec(
+ struct xchk_btree *bs)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ union xfs_btree_rec *rec;
+ union xfs_btree_key key;
+ union xfs_btree_key hkey;
+ union xfs_btree_key *keyp;
+ struct xfs_btree_block *block;
+ struct xfs_btree_block *keyblock;
+ struct xfs_buf *bp;
+
+ block = xfs_btree_get_block(cur, 0, &bp);
+ rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block);
+
+ trace_xchk_btree_rec(bs->sc, cur, 0);
+
+ /* If this isn't the first record, are they in order? */
+ if (cur->bc_levels[0].ptr > 1 &&
+ !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
+ xchk_btree_set_corrupt(bs->sc, cur, 0);
+ memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
+
+ if (cur->bc_nlevels == 1)
+ return;
+
+ /* Is this at least as large as the parent low key? */
+ cur->bc_ops->init_key_from_rec(&key, rec);
+ keyblock = xfs_btree_get_block(cur, 1, &bp);
+ keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0)
+ xchk_btree_set_corrupt(bs->sc, cur, 1);
+
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return;
+
+ /* Is this no larger than the parent high key? */
+ cur->bc_ops->init_high_key_from_rec(&hkey, rec);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0)
+ xchk_btree_set_corrupt(bs->sc, cur, 1);
+}
+
+/*
+ * Make sure this key is in order and doesn't stray outside of the parent
+ * keys.
+ */
+STATIC void
+xchk_btree_key(
+ struct xchk_btree *bs,
+ int level)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ union xfs_btree_key *key;
+ union xfs_btree_key *keyp;
+ struct xfs_btree_block *block;
+ struct xfs_btree_block *keyblock;
+ struct xfs_buf *bp;
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
+
+ trace_xchk_btree_key(bs->sc, cur, level);
+
+ /* If this isn't the first key, are they in order? */
+ if (cur->bc_levels[level].ptr > 1 &&
+ !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key))
+ xchk_btree_set_corrupt(bs->sc, cur, level);
+ memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len);
+
+ if (level + 1 >= cur->bc_nlevels)
+ return;
+
+ /* Is this at least as large as the parent low key? */
+ keyblock = xfs_btree_get_block(cur, level + 1, &bp);
+ keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0)
+ xchk_btree_set_corrupt(bs->sc, cur, level);
+
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return;
+
+ /* Is this no larger than the parent high key? */
+ key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
+ keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0)
+ xchk_btree_set_corrupt(bs->sc, cur, level);
+}
+
+/*
+ * Check a btree pointer. Returns true if it's ok to use this pointer.
+ * Callers do not need to set the corrupt flag.
+ */
+static bool
+xchk_btree_ptr_ok(
+ struct xchk_btree *bs,
+ int level,
+ union xfs_btree_ptr *ptr)
+{
+ bool res;
+
+ /* A btree rooted in an inode has no block pointer to the root. */
+ if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == bs->cur->bc_nlevels)
+ return true;
+
+ /* Otherwise, check the pointers. */
+ if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level);
+ else
+ res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level);
+ if (!res)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, level);
+
+ return res;
+}
+
+/* Check that a btree block's sibling matches what we expect. */
+STATIC int
+xchk_btree_block_check_sibling(
+ struct xchk_btree *bs,
+ int level,
+ int direction,
+ union xfs_btree_ptr *sibling)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ struct xfs_btree_block *pblock;
+ struct xfs_buf *pbp;
+ struct xfs_btree_cur *ncur = NULL;
+ union xfs_btree_ptr *pp;
+ int success;
+ int error;
+
+ error = xfs_btree_dup_cursor(cur, &ncur);
+ if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error) ||
+ !ncur)
+ return error;
+
+ /*
+ * If the pointer is null, we shouldn't be able to move the upper
+ * level pointer anywhere.
+ */
+ if (xfs_btree_ptr_is_null(cur, sibling)) {
+ if (direction > 0)
+ error = xfs_btree_increment(ncur, level + 1, &success);
+ else
+ error = xfs_btree_decrement(ncur, level + 1, &success);
+ if (error == 0 && success)
+ xchk_btree_set_corrupt(bs->sc, cur, level);
+ error = 0;
+ goto out;
+ }
+
+ /* Increment upper level pointer. */
+ if (direction > 0)
+ error = xfs_btree_increment(ncur, level + 1, &success);
+ else
+ error = xfs_btree_decrement(ncur, level + 1, &success);
+ if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error))
+ goto out;
+ if (!success) {
+ xchk_btree_set_corrupt(bs->sc, cur, level + 1);
+ goto out;
+ }
+
+ /* Compare upper level pointer to sibling pointer. */
+ pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
+ pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock);
+ if (!xchk_btree_ptr_ok(bs, level + 1, pp))
+ goto out;
+ if (pbp)
+ xchk_buffer_recheck(bs->sc, pbp);
+
+ if (xfs_btree_diff_two_ptrs(cur, pp, sibling))
+ xchk_btree_set_corrupt(bs->sc, cur, level);
+out:
+ xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/* Check the siblings of a btree block. */
+STATIC int
+xchk_btree_block_check_siblings(
+ struct xchk_btree *bs,
+ struct xfs_btree_block *block)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ union xfs_btree_ptr leftsib;
+ union xfs_btree_ptr rightsib;
+ int level;
+ int error = 0;
+
+ xfs_btree_get_sibling(cur, block, &leftsib, XFS_BB_LEFTSIB);
+ xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB);
+ level = xfs_btree_get_level(block);
+
+ /* Root block should never have siblings. */
+ if (level == cur->bc_nlevels - 1) {
+ if (!xfs_btree_ptr_is_null(cur, &leftsib) ||
+ !xfs_btree_ptr_is_null(cur, &rightsib))
+ xchk_btree_set_corrupt(bs->sc, cur, level);
+ goto out;
+ }
+
+ /*
+	 * Do the left and right sibling pointers match the adjacent
+	 * parent level pointers?
+	 * (These functions absorb error codes for us.)
+ */
+ error = xchk_btree_block_check_sibling(bs, level, -1, &leftsib);
+ if (error)
+ return error;
+ error = xchk_btree_block_check_sibling(bs, level, 1, &rightsib);
+ if (error)
+ return error;
+out:
+ return error;
+}
+
+struct check_owner {
+ struct list_head list;
+ xfs_daddr_t daddr;
+ int level;
+};
+
+/*
+ * Make sure this btree block isn't in the free list and that there's
+ * an rmap record for it.
+ */
+STATIC int
+xchk_btree_check_block_owner(
+ struct xchk_btree *bs,
+ int level,
+ xfs_daddr_t daddr)
+{
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_btnum_t btnum;
+ bool init_sa;
+ int error = 0;
+
+ if (!bs->cur)
+ return 0;
+
+ btnum = bs->cur->bc_btnum;
+ agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
+ agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);
+
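+	/*
+	 * Blocks of an inode-rooted (long pointer) btree can live in any AG,
+	 * so the scrub context won't already have this AG's headers attached;
+	 * set them up temporarily for the cross-reference checks below.
+	 */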
+ init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
+ if (init_sa) {
+ error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
+ if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
+ level, &error))
+ goto out_free;
+ }
+
+ xchk_xref_is_used_space(bs->sc, agbno, 1);
+ /*
+ * The bnobt scrubber aliases bs->cur to bs->sc->sa.bno_cur, so we
+ * have to nullify it (to shut down further block owner checks) if
+ * self-xref encounters problems.
+ */
+ if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
+ bs->cur = NULL;
+
+ xchk_xref_is_owned_by(bs->sc, agbno, 1, bs->oinfo);
+ if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
+ bs->cur = NULL;
+
+out_free:
+ if (init_sa)
+ xchk_ag_free(bs->sc, &bs->sc->sa);
+
+ return error;
+}
+
+/* Check the owner of a btree block. */
+STATIC int
+xchk_btree_check_owner(
+ struct xchk_btree *bs,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ struct check_owner *co;
+
+ /*
+ * In theory, xfs_btree_get_block should only give us a null buffer
+ * pointer for the root of a root-in-inode btree type, but we need
+ * to check defensively here in case the cursor state is also screwed
+ * up.
+ */
+ if (bp == NULL) {
+ if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, level);
+ return 0;
+ }
+
+ /*
+ * We want to cross-reference each btree block with the bnobt
+ * and the rmapbt. We cannot cross-reference the bnobt or
+ * rmapbt while scanning the bnobt or rmapbt, respectively,
+ * because we cannot alter the cursor and we'd prefer not to
+ * duplicate cursors. Therefore, save the buffer daddr for
+ * later scanning.
+ */
+ if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
+ co = kmem_alloc(sizeof(struct check_owner),
+ KM_MAYFAIL);
+ if (!co)
+ return -ENOMEM;
+ co->level = level;
+ co->daddr = xfs_buf_daddr(bp);
+ list_add_tail(&co->list, &bs->to_check);
+ return 0;
+ }
+
+ return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp));
+}
+
+/* Decide if we want to check minrecs of a btree block in the inode root. */
+static inline bool
+xchk_btree_check_iroot_minrecs(
+ struct xchk_btree *bs)
+{
+ /*
+ * xfs_bmap_add_attrfork_btree had an implementation bug wherein it
+ * would miscalculate the space required for the data fork bmbt root
+ * when adding an attr fork, and promote the iroot contents to an
+ * external block unnecessarily. This went unnoticed for many years
+ * until scrub found filesystems in this state. Inode rooted btrees are
+ * not supposed to have immediate child blocks that are small enough
+ * that the contents could fit in the inode root, but we can't fail
+ * existing filesystems, so instead we disable the check for data fork
+ * bmap btrees when there's an attr fork.
+ */
+ if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
+ bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
+ xfs_inode_has_attr_fork(bs->sc->ip))
+ return false;
+
+ return true;
+}
+
+/*
+ * Check that this btree block has at least minrecs records or is one of the
+ * special blocks that don't require that.
+ */
+STATIC void
+xchk_btree_check_minrecs(
+ struct xchk_btree *bs,
+ int level,
+ struct xfs_btree_block *block)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ unsigned int root_level = cur->bc_nlevels - 1;
+ unsigned int numrecs = be16_to_cpu(block->bb_numrecs);
+
+ /* More records than minrecs means the block is ok. */
+ if (numrecs >= cur->bc_ops->get_minrecs(cur, level))
+ return;
+
+ /*
+ * For btrees rooted in the inode, it's possible that the root block
+ * contents spilled into a regular ondisk block because there wasn't
+ * enough space in the inode root. The number of records in that
+ * child block might be less than the standard minrecs, but that's ok
+ * provided that there's only one direct child of the root.
+ */
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 2) {
+ struct xfs_btree_block *root_block;
+ struct xfs_buf *root_bp;
+ int root_maxrecs;
+
+ root_block = xfs_btree_get_block(cur, root_level, &root_bp);
+ root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level);
+ if (xchk_btree_check_iroot_minrecs(bs) &&
+ (be16_to_cpu(root_block->bb_numrecs) != 1 ||
+ numrecs <= root_maxrecs))
+ xchk_btree_set_corrupt(bs->sc, cur, level);
+ return;
+ }
+
+ /*
+ * Otherwise, only the root level is allowed to have fewer than minrecs
+ * records or keyptrs.
+ */
+ if (level < root_level)
+ xchk_btree_set_corrupt(bs->sc, cur, level);
+}
+
+/*
+ * Grab and scrub a btree block given a btree pointer. Returns block
+ * and buffer pointers (if applicable) if they're ok to use.
+ */
+STATIC int
+xchk_btree_get_block(
+ struct xchk_btree *bs,
+ int level,
+ union xfs_btree_ptr *pp,
+ struct xfs_btree_block **pblock,
+ struct xfs_buf **pbp)
+{
+ xfs_failaddr_t failed_at;
+ int error;
+
+ *pblock = NULL;
+ *pbp = NULL;
+
+ error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock);
+ if (!xchk_btree_process_error(bs->sc, bs->cur, level, &error) ||
+ !*pblock)
+ return error;
+
+ xfs_btree_get_block(bs->cur, level, pbp);
+ if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
+ level, *pbp);
+ else
+ failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
+ level, *pbp);
+ if (failed_at) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, level);
+ return 0;
+ }
+ if (*pbp)
+ xchk_buffer_recheck(bs->sc, *pbp);
+
+ xchk_btree_check_minrecs(bs, level, *pblock);
+
+ /*
+ * Check the block's owner; this function absorbs error codes
+ * for us.
+ */
+ error = xchk_btree_check_owner(bs, level, *pbp);
+ if (error)
+ return error;
+
+ /*
+ * Check the block's siblings; this function absorbs error codes
+ * for us.
+ */
+ return xchk_btree_block_check_siblings(bs, *pblock);
+}
+
+/*
+ * Check that the low and high keys of this block match the keys stored
+ * in the parent block.
+ */
+STATIC void
+xchk_btree_block_keys(
+ struct xchk_btree *bs,
+ int level,
+ struct xfs_btree_block *block)
+{
+ union xfs_btree_key block_keys;
+ struct xfs_btree_cur *cur = bs->cur;
+ union xfs_btree_key *high_bk;
+ union xfs_btree_key *parent_keys;
+ union xfs_btree_key *high_pk;
+ struct xfs_btree_block *parent_block;
+ struct xfs_buf *bp;
+
+ if (level >= cur->bc_nlevels - 1)
+ return;
+
+ /* Calculate the keys for this block. */
+ xfs_btree_get_keys(cur, block, &block_keys);
+
+ /* Obtain the parent's copy of the keys for this block. */
+ parent_block = xfs_btree_get_block(cur, level + 1, &bp);
+ parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
+ parent_block);
+
+ if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0)
+ xchk_btree_set_corrupt(bs->sc, cur, 1);
+
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return;
+
+ /* Get high keys */
+ high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
+ high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
+ parent_block);
+
+ if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0)
+ xchk_btree_set_corrupt(bs->sc, cur, 1);
+}
+
+/*
+ * Visit all nodes and leaves of a btree. Check that all pointers and
+ * records are in order, that the keys reflect the records, and use a callback
+ * so that the caller can verify individual records.
+ */
+int
+xchk_btree(
+ struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur,
+ xchk_btree_rec_fn scrub_fn,
+ const struct xfs_owner_info *oinfo,
+ void *private)
+{
+ union xfs_btree_ptr ptr;
+ struct xchk_btree *bs;
+ union xfs_btree_ptr *pp;
+ union xfs_btree_rec *recp;
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ struct check_owner *co;
+ struct check_owner *n;
+ size_t cur_sz;
+ int level;
+ int error = 0;
+
+ /*
+ * Allocate the btree scrub context from the heap, because this
+ * structure can get rather large. Don't let a caller feed us a
+ * totally absurd size.
+ */
+ cur_sz = xchk_btree_sizeof(cur->bc_nlevels);
+ if (cur_sz > PAGE_SIZE) {
+ xchk_btree_set_corrupt(sc, cur, 0);
+ return 0;
+ }
+ bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL);
+ if (!bs)
+ return -ENOMEM;
+ bs->cur = cur;
+ bs->scrub_rec = scrub_fn;
+ bs->oinfo = oinfo;
+ bs->private = private;
+ bs->sc = sc;
+
+ /* Initialize scrub state */
+ INIT_LIST_HEAD(&bs->to_check);
+
+ /*
+ * Load the root of the btree. The helper function absorbs
+ * error codes for us.
+ */
+ level = cur->bc_nlevels - 1;
+ cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
+ goto out;
+ error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
+ if (error || !block)
+ goto out;
+
+ cur->bc_levels[level].ptr = 1;
+
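+	/*
+	 * Iterative depth-first walk: descend through keyptrs to the leaves,
+	 * scrub each record, then climb back towards the root and advance to
+	 * the next pointer.
+	 */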
+ while (level < cur->bc_nlevels) {
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ if (level == 0) {
+ /* End of leaf, pop back towards the root. */
+ if (cur->bc_levels[level].ptr >
+ be16_to_cpu(block->bb_numrecs)) {
+ xchk_btree_block_keys(bs, level, block);
+ if (level < cur->bc_nlevels - 1)
+ cur->bc_levels[level + 1].ptr++;
+ level++;
+ continue;
+ }
+
+ /* Records in order for scrub? */
+ xchk_btree_rec(bs);
+
+ /* Call out to the record checker. */
+ recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
+ block);
+ error = bs->scrub_rec(bs, recp);
+ if (error)
+ break;
+ if (xchk_should_terminate(sc, &error) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ break;
+
+ cur->bc_levels[level].ptr++;
+ continue;
+ }
+
+ /* End of node, pop back towards the root. */
+ if (cur->bc_levels[level].ptr >
+ be16_to_cpu(block->bb_numrecs)) {
+ xchk_btree_block_keys(bs, level, block);
+ if (level < cur->bc_nlevels - 1)
+ cur->bc_levels[level + 1].ptr++;
+ level++;
+ continue;
+ }
+
+ /* Keys in order for scrub? */
+ xchk_btree_key(bs, level);
+
+ /* Drill another level deeper. */
+ pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
+ if (!xchk_btree_ptr_ok(bs, level, pp)) {
+ cur->bc_levels[level].ptr++;
+ continue;
+ }
+ level--;
+ error = xchk_btree_get_block(bs, level, pp, &block, &bp);
+ if (error || !block)
+ goto out;
+
+ cur->bc_levels[level].ptr = 1;
+ }
+
+out:
+ /* Process deferred owner checks on btree blocks. */
+ list_for_each_entry_safe(co, n, &bs->to_check, list) {
+ if (!error && bs->cur)
+ error = xchk_btree_check_block_owner(bs, co->level,
+ co->daddr);
+ list_del(&co->list);
+ kmem_free(co);
+ }
+ kmem_free(bs);
+
+ return error;
+}
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h
new file mode 100644
index 000000000..da61a53a0
--- /dev/null
+++ b/fs/xfs/scrub/btree.h
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_BTREE_H__
+#define __XFS_SCRUB_BTREE_H__
+
+/* btree scrub */
+
+/* Check for btree operation errors. */
+bool xchk_btree_process_error(struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur, int level, int *error);
+
+/* Check for btree xref operation errors. */
+bool xchk_btree_xref_process_error(struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur, int level, int *error);
+
+/* Check for btree corruption. */
+void xchk_btree_set_corrupt(struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur, int level);
+
+/* Check for btree xref discrepancies. */
+void xchk_btree_xref_set_corrupt(struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur, int level);
+
+struct xchk_btree;
+typedef int (*xchk_btree_rec_fn)(
+ struct xchk_btree *bs,
+ const union xfs_btree_rec *rec);
+
+struct xchk_btree {
+ /* caller-provided scrub state */
+ struct xfs_scrub *sc;
+ struct xfs_btree_cur *cur;
+ xchk_btree_rec_fn scrub_rec;
+ const struct xfs_owner_info *oinfo;
+ void *private;
+
+ /* internal scrub state */
+ union xfs_btree_rec lastrec;
+ struct list_head to_check;
+
+ /* this element must come last! */
+ union xfs_btree_key lastkey[];
+};
+
+/*
+ * Calculate the size of an xchk_btree structure. There are nlevels-1 slots for
+ * keys because we track leaf records separately in lastrec.
+ */
+static inline size_t
+xchk_btree_sizeof(unsigned int nlevels)
+{
+ return struct_size((struct xchk_btree *)NULL, lastkey, nlevels - 1);
+}
+
+int xchk_btree(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
+ xchk_btree_rec_fn scrub_fn, const struct xfs_owner_info *oinfo,
+ void *private);
+
+#endif /* __XFS_SCRUB_BTREE_H__ */
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
new file mode 100644
index 000000000..e71449658
--- /dev/null
+++ b/fs/xfs/scrub/common.c
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_log.h"
+#include "xfs_trans_priv.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_reflink.h"
+#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/health.h"
+
+/* Common code for the metadata scrubbers. */
+
+/*
+ * Handling operational errors.
+ *
+ * The *_process_error() family of functions are used to process error return
+ * codes from functions called as part of a scrub operation.
+ *
+ * If there's no error, we return true to tell the caller that it's ok
+ * to move on to the next check in its list.
+ *
+ * For non-verifier errors (e.g. ENOMEM) we return false to tell the
+ * caller that something bad happened, and we preserve *error so that
+ * the caller can return the *error up the stack to userspace.
+ *
+ * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
+ * OFLAG_CORRUPT in sm_flags and the *error is cleared. In other words,
+ * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
+ * not via return codes. We return false to tell the caller that
+ * something bad happened. Since the error has been cleared, the caller
+ * will (presumably) return that zero and scrubbing will move on to
+ * whatever's next.
+ *
+ * ftrace can be used to record the precise metadata location and the
+ * approximate code location of the failed operation.
+ */
+
+/* Check for operational errors. */
+static bool
+__xchk_process_error(
+ struct xfs_scrub *sc,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ int *error,
+ __u32 errflag,
+ void *ret_ip)
+{
+ switch (*error) {
+ case 0:
+ return true;
+ case -EDEADLOCK:
+ /* Used to restart an op with deadlock avoidance. */
+ trace_xchk_deadlock_retry(
+ sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
+ sc->sm, *error);
+ break;
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ /* Note the badness but don't abort. */
+ sc->sm->sm_flags |= errflag;
+ *error = 0;
+ fallthrough;
+ default:
+ trace_xchk_op_error(sc, agno, bno, *error,
+ ret_ip);
+ break;
+ }
+ return false;
+}
+
+bool
+xchk_process_error(
+ struct xfs_scrub *sc,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ int *error)
+{
+ return __xchk_process_error(sc, agno, bno, error,
+ XFS_SCRUB_OFLAG_CORRUPT, __return_address);
+}
+
+bool
+xchk_xref_process_error(
+ struct xfs_scrub *sc,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ int *error)
+{
+ return __xchk_process_error(sc, agno, bno, error,
+ XFS_SCRUB_OFLAG_XFAIL, __return_address);
+}
+
+/* Check for operational errors for a file offset. */
+static bool
+__xchk_fblock_process_error(
+ struct xfs_scrub *sc,
+ int whichfork,
+ xfs_fileoff_t offset,
+ int *error,
+ __u32 errflag,
+ void *ret_ip)
+{
+ switch (*error) {
+ case 0:
+ return true;
+ case -EDEADLOCK:
+ /* Used to restart an op with deadlock avoidance. */
+ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
+ break;
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ /* Note the badness but don't abort. */
+ sc->sm->sm_flags |= errflag;
+ *error = 0;
+ fallthrough;
+ default:
+ trace_xchk_file_op_error(sc, whichfork, offset, *error,
+ ret_ip);
+ break;
+ }
+ return false;
+}
+
+bool
+xchk_fblock_process_error(
+ struct xfs_scrub *sc,
+ int whichfork,
+ xfs_fileoff_t offset,
+ int *error)
+{
+ return __xchk_fblock_process_error(sc, whichfork, offset, error,
+ XFS_SCRUB_OFLAG_CORRUPT, __return_address);
+}
+
+bool
+xchk_fblock_xref_process_error(
+ struct xfs_scrub *sc,
+ int whichfork,
+ xfs_fileoff_t offset,
+ int *error)
+{
+ return __xchk_fblock_process_error(sc, whichfork, offset, error,
+ XFS_SCRUB_OFLAG_XFAIL, __return_address);
+}
+
+/*
+ * Handling scrub corruption/optimization/warning checks.
+ *
+ * The *_set_{corrupt,preen,warning}() family of functions are used to
+ * record the presence of metadata that is incorrect (corrupt), could be
+ * optimized somehow (preen), or should be flagged for administrative
+ * review but is not incorrect (warn).
+ *
+ * ftrace can be used to record the precise metadata location and
+ * approximate code location of the failed check.
+ */
+
+/* Record a block which could be optimized. */
+void
+xchk_block_set_preen(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
+ trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
+}
+
+/*
+ * Record an inode which could be optimized. The trace data will
+ * include the inode number given.
+ */
+void
+xchk_ino_set_preen(
+ struct xfs_scrub *sc,
+ xfs_ino_t ino)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
+ trace_xchk_ino_preen(sc, ino, __return_address);
+}
+
+/* Record something being wrong with the filesystem primary superblock. */
+void
+xchk_set_corrupt(
+ struct xfs_scrub *sc)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xchk_fs_error(sc, 0, __return_address);
+}
+
+/* Record a corrupt block. */
+void
+xchk_block_set_corrupt(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
+}
+
+/* Record a corruption while cross-referencing. */
+void
+xchk_block_xref_set_corrupt(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
+ trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
+}
+
+/*
+ * Record a corrupt inode. The trace data will include the inode
+ * number given.
+ */
+void
+xchk_ino_set_corrupt(
+ struct xfs_scrub *sc,
+ xfs_ino_t ino)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xchk_ino_error(sc, ino, __return_address);
+}
+
+/* Record a corruption while cross-referencing with an inode. */
+void
+xchk_ino_xref_set_corrupt(
+ struct xfs_scrub *sc,
+ xfs_ino_t ino)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
+ trace_xchk_ino_error(sc, ino, __return_address);
+}
+
+/* Record corruption in a block indexed by a file fork. */
+void
+xchk_fblock_set_corrupt(
+ struct xfs_scrub *sc,
+ int whichfork,
+ xfs_fileoff_t offset)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
+}
+
+/* Record a corruption while cross-referencing a fork block. */
+void
+xchk_fblock_xref_set_corrupt(
+ struct xfs_scrub *sc,
+ int whichfork,
+ xfs_fileoff_t offset)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
+ trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
+}
+
+/*
+ * Warn about inodes that need administrative review but are not
+ * incorrect.
+ */
+void
+xchk_ino_set_warning(
+ struct xfs_scrub *sc,
+ xfs_ino_t ino)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
+ trace_xchk_ino_warning(sc, ino, __return_address);
+}
+
+/* Warn about a block indexed by a file fork that needs review. */
+void
+xchk_fblock_set_warning(
+ struct xfs_scrub *sc,
+ int whichfork,
+ xfs_fileoff_t offset)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
+ trace_xchk_fblock_warning(sc, whichfork, offset, __return_address);
+}
+
+/* Signal an incomplete scrub. */
+void
+xchk_set_incomplete(
+ struct xfs_scrub *sc)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
+ trace_xchk_incomplete(sc, __return_address);
+}
+
+/*
+ * rmap scrubbing -- compute the number of blocks with a given owner,
+ * at least according to the reverse mapping data.
+ */
+
+struct xchk_rmap_ownedby_info {
+ const struct xfs_owner_info *oinfo;
+ xfs_filblks_t *blocks;
+};
+
+STATIC int
+xchk_count_rmap_ownedby_irec(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xchk_rmap_ownedby_info *sroi = priv;
+ bool irec_attr;
+ bool oinfo_attr;
+
+ irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
+ oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;
+
+ if (rec->rm_owner != sroi->oinfo->oi_owner)
+ return 0;
+
+ if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
+ (*sroi->blocks) += rec->rm_blockcount;
+
+ return 0;
+}
+
+/*
+ * Calculate the number of blocks the rmap thinks are owned by something.
+ * The caller should pass us an rmapbt cursor.
+ */
+int
+xchk_count_rmap_ownedby_ag(
+ struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur,
+ const struct xfs_owner_info *oinfo,
+ xfs_filblks_t *blocks)
+{
+ struct xchk_rmap_ownedby_info sroi = {
+ .oinfo = oinfo,
+ .blocks = blocks,
+ };
+
+ *blocks = 0;
+ return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
+ &sroi);
+}
+
+/*
+ * AG scrubbing
+ *
+ * These helpers facilitate locking an allocation group's header
+ * buffers, setting up cursors for all btrees that are present, and
+ * cleaning everything up once we're through.
+ */
+
+/* Decide if we want to return an AG header read failure. */
+static inline bool
+want_ag_read_header_failure(
+ struct xfs_scrub *sc,
+ unsigned int type)
+{
+ /* Return all AG header read failures when scanning btrees. */
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
+ sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
+ sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
+ return true;
+ /*
+ * If we're scanning a given type of AG header, we only want to
+ * see read failures from that specific header. We'd like the
+ * other headers to cross-check them, but this isn't required.
+ */
+ if (sc->sm->sm_type == type)
+ return true;
+ return false;
+}
+
+/*
+ * Grab the perag structure and all the headers for an AG.
+ *
+ * The headers should be released by xchk_ag_free, but as a fail safe we attach
+ * all the buffers we grab to the scrub transaction so they'll all be freed
+ * when we cancel it. Returns ENOENT if we can't grab the perag structure.
+ */
+int
+xchk_ag_read_headers(
+ struct xfs_scrub *sc,
+ xfs_agnumber_t agno,
+ struct xchk_ag *sa)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ ASSERT(!sa->pag);
+ sa->pag = xfs_perag_get(mp, agno);
+ if (!sa->pag)
+ return -ENOENT;
+
+ error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
+ if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
+ return error;
+
+ error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
+ if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
+ return error;
+
+ error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp);
+ if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
+ return error;
+
+ return 0;
+}
+
+/* Release all the AG btree cursors. */
+void
+xchk_ag_btcur_free(
+ struct xchk_ag *sa)
+{
+ if (sa->refc_cur)
+ xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
+ if (sa->rmap_cur)
+ xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
+ if (sa->fino_cur)
+ xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
+ if (sa->ino_cur)
+ xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
+ if (sa->cnt_cur)
+ xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
+ if (sa->bno_cur)
+ xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);
+
+ sa->refc_cur = NULL;
+ sa->rmap_cur = NULL;
+ sa->fino_cur = NULL;
+ sa->ino_cur = NULL;
+ sa->bno_cur = NULL;
+ sa->cnt_cur = NULL;
+}
+
+/* Initialize all the btree cursors for an AG. */
+void
+xchk_ag_btcur_init(
+ struct xfs_scrub *sc,
+ struct xchk_ag *sa)
+{
+ struct xfs_mount *mp = sc->mp;
+
+ if (sa->agf_bp &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) {
+ /* Set up a bnobt cursor for cross-referencing. */
+ sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sa->pag, XFS_BTNUM_BNO);
+ }
+
+ if (sa->agf_bp &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) {
+ /* Set up a cntbt cursor for cross-referencing. */
+ sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sa->pag, XFS_BTNUM_CNT);
+ }
+
+	/* Set up an inobt cursor for cross-referencing. */
+ if (sa->agi_bp &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) {
+ sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
+ sa->pag, XFS_BTNUM_INO);
+ }
+
+ /* Set up a finobt cursor for cross-referencing. */
+ if (sa->agi_bp && xfs_has_finobt(mp) &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
+ sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
+ sa->pag, XFS_BTNUM_FINO);
+ }
+
+ /* Set up a rmapbt cursor for cross-referencing. */
+ if (sa->agf_bp && xfs_has_rmapbt(mp) &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) {
+ sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sa->pag);
+ }
+
+ /* Set up a refcountbt cursor for cross-referencing. */
+ if (sa->agf_bp && xfs_has_reflink(mp) &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) {
+ sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
+ sa->agf_bp, sa->pag);
+ }
+}
+
+/* Release the AG header context and btree cursors. */
+void
+xchk_ag_free(
+ struct xfs_scrub *sc,
+ struct xchk_ag *sa)
+{
+ xchk_ag_btcur_free(sa);
+ if (sa->agfl_bp) {
+ xfs_trans_brelse(sc->tp, sa->agfl_bp);
+ sa->agfl_bp = NULL;
+ }
+ if (sa->agf_bp) {
+ xfs_trans_brelse(sc->tp, sa->agf_bp);
+ sa->agf_bp = NULL;
+ }
+ if (sa->agi_bp) {
+ xfs_trans_brelse(sc->tp, sa->agi_bp);
+ sa->agi_bp = NULL;
+ }
+ if (sa->pag) {
+ xfs_perag_put(sa->pag);
+ sa->pag = NULL;
+ }
+}
+
+/*
+ * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
+ * order. Locking order requires us to get the AGI before the AGF. We use the
+ * transaction to avoid deadlocking on crosslinked metadata buffers; either the
+ * caller passes one in (bmap scrub) or we have to create a transaction
+ * ourselves. Returns ENOENT if the perag struct cannot be grabbed.
+ */
+int
+xchk_ag_init(
+ struct xfs_scrub *sc,
+ xfs_agnumber_t agno,
+ struct xchk_ag *sa)
+{
+ int error;
+
+ error = xchk_ag_read_headers(sc, agno, sa);
+ if (error)
+ return error;
+
+ xchk_ag_btcur_init(sc, sa);
+ return 0;
+}
+
+/* Per-scrubber setup functions */
+
+/*
+ * Grab an empty transaction so that we can re-grab locked buffers if
+ * one of our btrees turns out to be cyclic.
+ *
+ * If we're going to repair something, we need to ask for the largest possible
+ * log reservation so that we can handle the worst case scenario for metadata
+ * updates while rebuilding a metadata item. We also need to reserve as many
+ * blocks in the head transaction as we think we're going to need to rebuild
+ * the metadata object.
+ */
+int
+xchk_trans_alloc(
+ struct xfs_scrub *sc,
+ uint resblks)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
+ resblks, 0, 0, &sc->tp);
+
+ return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+}
+
+/* Set us up with a transaction and an empty context. */
+int
+xchk_setup_fs(
+ struct xfs_scrub *sc)
+{
+ uint resblks;
+
+ resblks = xrep_calc_ag_resblks(sc);
+ return xchk_trans_alloc(sc, resblks);
+}
+
+/* Set us up with AG headers and btree cursors. */
+int
+xchk_setup_ag_btree(
+ struct xfs_scrub *sc,
+ bool force_log)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ /*
+	 * If the caller asks us to checkpoint the log, do so. This
+ * expensive operation should be performed infrequently and only
+ * as a last resort. Any caller that sets force_log should
+ * document why they need to do so.
+ */
+ if (force_log) {
+ error = xchk_checkpoint_log(mp);
+ if (error)
+ return error;
+ }
+
+ error = xchk_setup_fs(sc);
+ if (error)
+ return error;
+
+ return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
+}
+
+/* Push everything out of the log onto disk. */
+int
+xchk_checkpoint_log(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ error = xfs_log_force(mp, XFS_LOG_SYNC);
+ if (error)
+ return error;
+ xfs_ail_push_all_sync(mp->m_ail);
+ return 0;
+}
+
+/*
+ * Given an inode and the scrub control structure, grab either the
+ * inode referenced in the control structure or the inode passed in.
+ * The inode is not locked.
+ */
+int
+xchk_get_inode(
+ struct xfs_scrub *sc)
+{
+ struct xfs_imap imap;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
+ struct xfs_inode *ip = NULL;
+ int error;
+
+ /* We want to scan the inode we already had opened. */
+ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
+ sc->ip = ip_in;
+ return 0;
+ }
+
+ /* Look up the inode, see if the generation number matches. */
+ if (xfs_internal_inum(mp, sc->sm->sm_ino))
+ return -ENOENT;
+ error = xfs_iget(mp, NULL, sc->sm->sm_ino,
+ XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &ip);
+ switch (error) {
+ case -ENOENT:
+ /* Inode doesn't exist, just bail out. */
+ return error;
+ case 0:
+ /* Got an inode, continue. */
+ break;
+ case -EINVAL:
+ /*
+ * -EINVAL with IGET_UNTRUSTED could mean one of several
+ * things: userspace gave us an inode number that doesn't
+ * correspond to fs space, or doesn't have an inobt entry;
+ * or it could simply mean that the inode buffer failed the
+ * read verifiers.
+ *
+ * Try just the inode mapping lookup -- if it succeeds, then
+ * the inode buffer verifier failed and something needs fixing.
+ * Otherwise, we really couldn't find it so tell userspace
+ * that it no longer exists.
+ */
+ error = xfs_imap(sc->mp, sc->tp, sc->sm->sm_ino, &imap,
+ XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE);
+ if (error)
+ return -ENOENT;
+ error = -EFSCORRUPTED;
+ fallthrough;
+ default:
+ trace_xchk_op_error(sc,
+ XFS_INO_TO_AGNO(mp, sc->sm->sm_ino),
+ XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
+ error, __return_address);
+ return error;
+ }
+ if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
+ xfs_irele(ip);
+ return -ENOENT;
+ }
+
+ sc->ip = ip;
+ return 0;
+}
+
+/* Set us up to scrub a file's contents. */
+int
+xchk_setup_inode_contents(
+ struct xfs_scrub *sc,
+ unsigned int resblks)
+{
+ int error;
+
+ error = xchk_get_inode(sc);
+ if (error)
+ return error;
+
+ /* Got the inode, lock it and we're ready to go. */
+ sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+ error = xchk_trans_alloc(sc, resblks);
+ if (error)
+ goto out;
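+	/*
+	 * Per the usual lock ordering, take the ILOCK only after the
+	 * transaction (and its log reservation) has been allocated.
+	 */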
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+
+out:
+ /* scrub teardown will unlock and release the inode for us */
+ return error;
+}
+
+/*
+ * Predicate that decides if we need to evaluate the cross-reference check.
+ * If there was an error accessing the cross-reference btree, just delete
+ * the cursor and skip the check.
+ */
+bool
+xchk_should_check_xref(
+ struct xfs_scrub *sc,
+ int *error,
+ struct xfs_btree_cur **curpp)
+{
+ /* No point in xref if we already know we're corrupt. */
+ if (xchk_skip_xref(sc->sm))
+ return false;
+
+ if (*error == 0)
+ return true;
+
+ if (curpp) {
+ /* If we've already given up on xref, just bail out. */
+ if (!*curpp)
+ return false;
+
+ /* xref error, delete cursor and bail out. */
+ xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
+ *curpp = NULL;
+ }
+
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
+ trace_xchk_xref_error(sc, *error, __return_address);
+
+ /*
+ * Errors encountered during cross-referencing with another
+ * data structure should not cause this scrubber to abort.
+ */
+ *error = 0;
+ return false;
+}
+
+/* Run the structure verifiers on in-memory buffers to detect bad memory. */
+void
+xchk_buffer_recheck(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (bp->b_ops == NULL) {
+ xchk_block_set_corrupt(sc, bp);
+ return;
+ }
+ if (bp->b_ops->verify_struct == NULL) {
+ xchk_set_incomplete(sc);
+ return;
+ }
+ fa = bp->b_ops->verify_struct(bp);
+ if (!fa)
+ return;
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
+}
+
+/*
+ * Scrub the attr/data forks of a metadata inode. The metadata inode must be
+ * pointed to by sc->ip and the ILOCK must be held.
+ */
+int
+xchk_metadata_inode_forks(
+ struct xfs_scrub *sc)
+{
+ __u32 smtype;
+ bool shared;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /* Metadata inodes don't live on the rt device. */
+ if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ /* They should never participate in reflink. */
+ if (xfs_is_reflink_inode(sc->ip)) {
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ /* They also should never have extended attributes. */
+ if (xfs_inode_hasattr(sc->ip)) {
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ /* Invoke the data fork scrubber. */
+ smtype = sc->sm->sm_type;
+ sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD;
+ error = xchk_bmap_data(sc);
+ sc->sm->sm_type = smtype;
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
+ /* Look for incorrect shared blocks. */
+ if (xfs_has_reflink(sc->mp)) {
+ error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
+ &shared);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ return error;
+ if (shared)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ }
+
+ return error;
+}
+
+/*
+ * Try to lock an inode in violation of the usual locking order rules. For
+ * example, trying to get the IOLOCK while in transaction context, or just
+ * plain breaking AG-order or inode-order inode locking rules. Either way,
+ * the only way to avoid an ABBA deadlock is to use trylock and back off if
+ * we can't.
+ */
+int
+xchk_ilock_inverted(
+ struct xfs_inode *ip,
+ uint lock_mode)
+{
+ int i;
+
+ for (i = 0; i < 20; i++) {
+ if (xfs_ilock_nowait(ip, lock_mode))
+ return 0;
+ delay(1);
+ }
+ return -EDEADLOCK;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
new file mode 100644
index 000000000..2ca80102e
--- /dev/null
+++ b/fs/xfs/scrub/common.h
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_COMMON_H__
+#define __XFS_SCRUB_COMMON_H__
+
+/*
+ * We /could/ terminate a scrub/repair operation early. If we're not
+ * in a good place to continue (fatal signal, etc.) then bail out.
+ * Note that we're careful not to make any judgements about *error.
+ */
+static inline bool
+xchk_should_terminate(
+ struct xfs_scrub *sc,
+ int *error)
+{
+ /*
+ * If preemption is disabled, we need to yield to the scheduler every
+ * few seconds so that we don't run afoul of the soft lockup watchdog
+ * or RCU stall detector.
+ */
+ cond_resched();
+
+ if (fatal_signal_pending(current)) {
+ if (*error == 0)
+ *error = -EAGAIN;
+ return true;
+ }
+ return false;
+}
+
+int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
+bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno,
+ xfs_agblock_t bno, int *error);
+bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork,
+ xfs_fileoff_t offset, int *error);
+
+bool xchk_xref_process_error(struct xfs_scrub *sc,
+ xfs_agnumber_t agno, xfs_agblock_t bno, int *error);
+bool xchk_fblock_xref_process_error(struct xfs_scrub *sc,
+ int whichfork, xfs_fileoff_t offset, int *error);
+
+void xchk_block_set_preen(struct xfs_scrub *sc,
+ struct xfs_buf *bp);
+void xchk_ino_set_preen(struct xfs_scrub *sc, xfs_ino_t ino);
+
+void xchk_set_corrupt(struct xfs_scrub *sc);
+void xchk_block_set_corrupt(struct xfs_scrub *sc,
+ struct xfs_buf *bp);
+void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino);
+void xchk_fblock_set_corrupt(struct xfs_scrub *sc, int whichfork,
+ xfs_fileoff_t offset);
+
+void xchk_block_xref_set_corrupt(struct xfs_scrub *sc,
+ struct xfs_buf *bp);
+void xchk_ino_xref_set_corrupt(struct xfs_scrub *sc,
+ xfs_ino_t ino);
+void xchk_fblock_xref_set_corrupt(struct xfs_scrub *sc,
+ int whichfork, xfs_fileoff_t offset);
+
+void xchk_ino_set_warning(struct xfs_scrub *sc, xfs_ino_t ino);
+void xchk_fblock_set_warning(struct xfs_scrub *sc, int whichfork,
+ xfs_fileoff_t offset);
+
+void xchk_set_incomplete(struct xfs_scrub *sc);
+int xchk_checkpoint_log(struct xfs_mount *mp);
+
+/* Are we set up for a cross-referencing check? */
+bool xchk_should_check_xref(struct xfs_scrub *sc, int *error,
+ struct xfs_btree_cur **curpp);
+
+/* Setup functions */
+int xchk_setup_fs(struct xfs_scrub *sc);
+int xchk_setup_ag_allocbt(struct xfs_scrub *sc);
+int xchk_setup_ag_iallocbt(struct xfs_scrub *sc);
+int xchk_setup_ag_rmapbt(struct xfs_scrub *sc);
+int xchk_setup_ag_refcountbt(struct xfs_scrub *sc);
+int xchk_setup_inode(struct xfs_scrub *sc);
+int xchk_setup_inode_bmap(struct xfs_scrub *sc);
+int xchk_setup_inode_bmap_data(struct xfs_scrub *sc);
+int xchk_setup_directory(struct xfs_scrub *sc);
+int xchk_setup_xattr(struct xfs_scrub *sc);
+int xchk_setup_symlink(struct xfs_scrub *sc);
+int xchk_setup_parent(struct xfs_scrub *sc);
+#ifdef CONFIG_XFS_RT
+int xchk_setup_rt(struct xfs_scrub *sc);
+#else
+static inline int
+xchk_setup_rt(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
+#endif
+#ifdef CONFIG_XFS_QUOTA
+int xchk_setup_quota(struct xfs_scrub *sc);
+#else
+static inline int
+xchk_setup_quota(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
+#endif
+int xchk_setup_fscounters(struct xfs_scrub *sc);
+
+void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa);
+int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno,
+ struct xchk_ag *sa);
+
+/*
+ * Grab all AG resources, treating the inability to grab the perag structure as
+ * a fs corruption. This is intended for callers checking an ondisk reference
+ * to a given AG, which means that the AG must still exist.
+ */
+static inline int
+xchk_ag_init_existing(
+ struct xfs_scrub *sc,
+ xfs_agnumber_t agno,
+ struct xchk_ag *sa)
+{
+ int error = xchk_ag_init(sc, agno, sa);
+
+ return error == -ENOENT ? -EFSCORRUPTED : error;
+}
+
+int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno,
+ struct xchk_ag *sa);
+void xchk_ag_btcur_free(struct xchk_ag *sa);
+void xchk_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa);
+int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
+ const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks);
+
+int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log);
+int xchk_get_inode(struct xfs_scrub *sc);
+int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks);
+void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp);
+
+/*
+ * Don't bother cross-referencing if we already found corruption or cross
+ * referencing discrepancies.
+ */
+static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
+{
+ return sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT);
+}
+
+int xchk_metadata_inode_forks(struct xfs_scrub *sc);
+int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode);
+
+#endif /* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
new file mode 100644
index 000000000..84fe3d33d
--- /dev/null
+++ b/fs/xfs/scrub/dabtree.c
@@ -0,0 +1,596 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr_leaf.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/dabtree.h"
+
+/* Directory/Attribute Btree */
+
+/*
+ * Check for da btree operation errors. See the section about handling
+ * operational errors in common.c.
+ */
+bool
+xchk_da_process_error(
+ struct xchk_da_btree *ds,
+ int level,
+ int *error)
+{
+ struct xfs_scrub *sc = ds->sc;
+
+ if (*error == 0)
+ return true;
+
+ switch (*error) {
+ case -EDEADLOCK:
+ /* Used to restart an op with deadlock avoidance. */
+ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
+ break;
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ /* Note the badness but don't abort. */
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ *error = 0;
+ fallthrough;
+ default:
+ trace_xchk_file_op_error(sc, ds->dargs.whichfork,
+ xfs_dir2_da_to_db(ds->dargs.geo,
+ ds->state->path.blk[level].blkno),
+ *error, __return_address);
+ break;
+ }
+ return false;
+}
+
+/*
+ * Check for da btree corruption. See the section about handling
+ * operational errors in common.c.
+ */
+void
+xchk_da_set_corrupt(
+ struct xchk_da_btree *ds,
+ int level)
+{
+ struct xfs_scrub *sc = ds->sc;
+
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+
+ trace_xchk_fblock_error(sc, ds->dargs.whichfork,
+ xfs_dir2_da_to_db(ds->dargs.geo,
+ ds->state->path.blk[level].blkno),
+ __return_address);
+}
+
+static struct xfs_da_node_entry *
+xchk_da_btree_node_entry(
+ struct xchk_da_btree *ds,
+ int level)
+{
+ struct xfs_da_state_blk *blk = &ds->state->path.blk[level];
+ struct xfs_da3_icnode_hdr hdr;
+
+ ASSERT(blk->magic == XFS_DA_NODE_MAGIC);
+
+ xfs_da3_node_hdr_from_disk(ds->sc->mp, &hdr, blk->bp->b_addr);
+ return hdr.btree + blk->index;
+}
+
+/* Scrub a da btree hash (key). */
+int
+xchk_da_btree_hash(
+ struct xchk_da_btree *ds,
+ int level,
+ __be32 *hashp)
+{
+ struct xfs_da_node_entry *entry;
+ xfs_dahash_t hash;
+ xfs_dahash_t parent_hash;
+
+ /* Is this hash in order? */
+ hash = be32_to_cpu(*hashp);
+ if (hash < ds->hashes[level])
+ xchk_da_set_corrupt(ds, level);
+ ds->hashes[level] = hash;
+
+ if (level == 0)
+ return 0;
+
+ /* Is this hash no larger than the parent hash? */
+ entry = xchk_da_btree_node_entry(ds, level - 1);
+ parent_hash = be32_to_cpu(entry->hashval);
+ if (parent_hash < hash)
+ xchk_da_set_corrupt(ds, level);
+
+ return 0;
+}
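
The two ordering rules above (hashes at a given level never decrease, and no entry hashes above the key its parent stores for the block) fit in a few lines. A standalone sketch with made-up hash values; last_hash plays the role of ds->hashes[level]:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Returns false if either ordering rule is violated, like the checks above. */
static bool hash_ok(uint32_t *last_hash, uint32_t hash, uint32_t parent_hash)
{
	bool ok = hash >= *last_hash && hash <= parent_hash;

	*last_hash = hash;
	return ok;
}

int main(void)
{
	uint32_t hashes[] = { 10, 20, 20, 15, 600 };
	uint32_t parent_hash = 500;
	uint32_t last = 0;
	size_t i;

	for (i = 0; i < sizeof(hashes) / sizeof(hashes[0]); i++)
		printf("hash %u: %s\n", (unsigned)hashes[i],
		       hash_ok(&last, hashes[i], parent_hash) ?
				"ok" : "corrupt");
	return 0;
}

Note that equal adjacent hashes are fine (directory hashes collide routinely); only a strict decrease, or exceeding the parent's key, gets flagged.
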
+
+/*
+ * Check a da btree pointer. Returns true if it's ok to use this
+ * pointer.
+ */
+STATIC bool
+xchk_da_btree_ptr_ok(
+ struct xchk_da_btree *ds,
+ int level,
+ xfs_dablk_t blkno)
+{
+ if (blkno < ds->lowest || (ds->highest != 0 && blkno >= ds->highest)) {
+ xchk_da_set_corrupt(ds, level);
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * The da btree scrubber can handle leaf1 blocks as a degenerate
+ * form of leafn blocks. Since the regular da code doesn't handle
+ * leaf1, we must multiplex the verifiers.
+ */
+static void
+xchk_da_btree_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DIR2_LEAF1_MAGIC:
+ case XFS_DIR3_LEAF1_MAGIC:
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ default:
+ /*
+ * xfs_da3_node_buf_ops already know how to handle
+ * DA*_NODE, ATTR*_LEAF, and DIR*_LEAFN blocks.
+ */
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ }
+}
+
+static void
+xchk_da_btree_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DIR2_LEAF1_MAGIC:
+ case XFS_DIR3_LEAF1_MAGIC:
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ bp->b_ops->verify_write(bp);
+ return;
+ default:
+ /*
+ * xfs_da3_node_buf_ops already know how to handle
+ * DA*_NODE, ATTR*_LEAF, and DIR*_LEAFN blocks.
+ */
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ bp->b_ops->verify_write(bp);
+ return;
+ }
+}
+
+static void *
+xchk_da_btree_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DIR2_LEAF1_MAGIC:
+ case XFS_DIR3_LEAF1_MAGIC:
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ return bp->b_ops->verify_struct(bp);
+ default:
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ return bp->b_ops->verify_struct(bp);
+ }
+}
+
+static const struct xfs_buf_ops xchk_da_btree_buf_ops = {
+ .name = "xchk_da_btree",
+ .verify_read = xchk_da_btree_read_verify,
+ .verify_write = xchk_da_btree_write_verify,
+ .verify_struct = xchk_da_btree_verify,
+};
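
The three thunks above exist only to choose a verifier from the block's on-disk magic and then delegate to it; the ops table at the end stitches them together. The dispatch shape is easier to see in isolation. A sketch with invented magic values and a toy ops structure, not the real xfs_buf_ops interface:

#include <stdint.h>
#include <stdio.h>

struct blk_ops {
	const char	*name;
	void		(*verify)(const void *blk);
};

static void verify_leaf1(const void *blk)
{
	(void)blk;
	puts("ran leaf1 verifier");
}

static void verify_node(const void *blk)
{
	(void)blk;
	puts("ran node/leafn verifier");
}

static const struct blk_ops leaf1_ops = { "leaf1", verify_leaf1 };
static const struct blk_ops node_ops  = { "node",  verify_node };

#define MAGIC_LEAF1	0x3df1		/* illustrative value only */

/* Pick the ops table from the magic, then run its verifier. */
static void verify_block(uint16_t magic, const void *blk)
{
	const struct blk_ops *ops =
		magic == MAGIC_LEAF1 ? &leaf1_ops : &node_ops;

	ops->verify(blk);
}

int main(void)
{
	verify_block(MAGIC_LEAF1, NULL);
	verify_block(0, NULL);
	return 0;
}

The same fall-through-to-node behavior is what lets the scrubber treat leaf1 blocks as a special case while everything else rides the existing node verifier.
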
+
+/* Check a block's sibling. */
+STATIC int
+xchk_da_btree_block_check_sibling(
+ struct xchk_da_btree *ds,
+ int level,
+ int direction,
+ xfs_dablk_t sibling)
+{
+ struct xfs_da_state_path *path = &ds->state->path;
+ struct xfs_da_state_path *altpath = &ds->state->altpath;
+ int retval;
+ int plevel;
+ int error;
+
+ memcpy(altpath, path, sizeof(ds->state->altpath));
+
+ /*
+ * If the pointer is null, we shouldn't be able to move the upper
+ * level pointer anywhere.
+ */
+ if (sibling == 0) {
+ error = xfs_da3_path_shift(ds->state, altpath, direction,
+ false, &retval);
+ if (error == 0 && retval == 0)
+ xchk_da_set_corrupt(ds, level);
+ error = 0;
+ goto out;
+ }
+
+ /* Move the alternate cursor one block in the direction given. */
+ error = xfs_da3_path_shift(ds->state, altpath, direction, false,
+ &retval);
+ if (!xchk_da_process_error(ds, level, &error))
+ goto out;
+ if (retval) {
+ xchk_da_set_corrupt(ds, level);
+ goto out;
+ }
+ if (altpath->blk[level].bp)
+ xchk_buffer_recheck(ds->sc, altpath->blk[level].bp);
+
+ /* Compare upper level pointer to sibling pointer. */
+ if (altpath->blk[level].blkno != sibling)
+ xchk_da_set_corrupt(ds, level);
+
+out:
+ /* Free all buffers in the altpath that aren't referenced from path. */
+ for (plevel = 0; plevel < altpath->active; plevel++) {
+ if (altpath->blk[plevel].bp == NULL ||
+ (plevel < path->active &&
+ altpath->blk[plevel].bp == path->blk[plevel].bp))
+ continue;
+
+ xfs_trans_brelse(ds->dargs.trans, altpath->blk[plevel].bp);
+ altpath->blk[plevel].bp = NULL;
+ }
+
+ return error;
+}
+
+/* Check a block's sibling pointers. */
+STATIC int
+xchk_da_btree_block_check_siblings(
+ struct xchk_da_btree *ds,
+ int level,
+ struct xfs_da_blkinfo *hdr)
+{
+ xfs_dablk_t forw;
+ xfs_dablk_t back;
+ int error = 0;
+
+ forw = be32_to_cpu(hdr->forw);
+ back = be32_to_cpu(hdr->back);
+
+ /* Top level blocks should not have sibling pointers. */
+ if (level == 0) {
+ if (forw != 0 || back != 0)
+ xchk_da_set_corrupt(ds, level);
+ return 0;
+ }
+
+ /*
+ * Check back (left) and forw (right) pointers. These functions
+ * absorb error codes for us.
+ */
+ error = xchk_da_btree_block_check_sibling(ds, level, 0, back);
+ if (error)
+ goto out;
+ error = xchk_da_btree_block_check_sibling(ds, level, 1, forw);
+
+out:
+ memset(&ds->state->altpath, 0, sizeof(ds->state->altpath));
+ return error;
+}
+
+/* Load a dir/attribute block from a btree. */
+STATIC int
+xchk_da_btree_block(
+ struct xchk_da_btree *ds,
+ int level,
+ xfs_dablk_t blkno)
+{
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_blkinfo *hdr3;
+ struct xfs_da_args *dargs = &ds->dargs;
+ struct xfs_inode *ip = ds->dargs.dp;
+ xfs_ino_t owner;
+ int *pmaxrecs;
+ struct xfs_da3_icnode_hdr nodehdr;
+ int error = 0;
+
+ blk = &ds->state->path.blk[level];
+ ds->state->path.active = level + 1;
+
+ /* Release old block. */
+ if (blk->bp) {
+ xfs_trans_brelse(dargs->trans, blk->bp);
+ blk->bp = NULL;
+ }
+
+ /* Check the pointer. */
+ blk->blkno = blkno;
+ if (!xchk_da_btree_ptr_ok(ds, level, blkno))
+ goto out_nobuf;
+
+ /* Read the buffer. */
+ error = xfs_da_read_buf(dargs->trans, dargs->dp, blk->blkno,
+ XFS_DABUF_MAP_HOLE_OK, &blk->bp, dargs->whichfork,
+ &xchk_da_btree_buf_ops);
+ if (!xchk_da_process_error(ds, level, &error))
+ goto out_nobuf;
+ if (blk->bp)
+ xchk_buffer_recheck(ds->sc, blk->bp);
+
+ /*
+ * We didn't find a dir btree root block, which means that
+ * there's no LEAF1/LEAFN tree (at least not where it's supposed
+ * to be), so jump out now.
+ */
+ if (ds->dargs.whichfork == XFS_DATA_FORK && level == 0 &&
+ blk->bp == NULL)
+ goto out_nobuf;
+
+ /* It's /not/ ok for attr trees not to have a da btree. */
+ if (blk->bp == NULL) {
+ xchk_da_set_corrupt(ds, level);
+ goto out_nobuf;
+ }
+
+ hdr3 = blk->bp->b_addr;
+ blk->magic = be16_to_cpu(hdr3->hdr.magic);
+ pmaxrecs = &ds->maxrecs[level];
+
+ /* We only started zeroing the header on v5 filesystems. */
+ if (xfs_has_crc(ds->sc->mp) && hdr3->hdr.pad)
+ xchk_da_set_corrupt(ds, level);
+
+ /* Check the owner. */
+ if (xfs_has_crc(ip->i_mount)) {
+ owner = be64_to_cpu(hdr3->owner);
+ if (owner != ip->i_ino)
+ xchk_da_set_corrupt(ds, level);
+ }
+
+ /* Check the siblings. */
+ error = xchk_da_btree_block_check_siblings(ds, level, &hdr3->hdr);
+ if (error)
+ goto out;
+
+ /* Interpret the buffer. */
+ switch (blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_ATTR_LEAF_BUF);
+ blk->magic = XFS_ATTR_LEAF_MAGIC;
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp, pmaxrecs);
+ if (ds->tree_level != 0)
+ xchk_da_set_corrupt(ds, level);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_DIR_LEAFN_BUF);
+ blk->magic = XFS_DIR2_LEAFN_MAGIC;
+ blk->hashval = xfs_dir2_leaf_lasthash(ip, blk->bp, pmaxrecs);
+ if (ds->tree_level != 0)
+ xchk_da_set_corrupt(ds, level);
+ break;
+ case XFS_DIR2_LEAF1_MAGIC:
+ case XFS_DIR3_LEAF1_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_DIR_LEAF1_BUF);
+ blk->magic = XFS_DIR2_LEAF1_MAGIC;
+ blk->hashval = xfs_dir2_leaf_lasthash(ip, blk->bp, pmaxrecs);
+ if (ds->tree_level != 0)
+ xchk_da_set_corrupt(ds, level);
+ break;
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_DA_NODE_BUF);
+ blk->magic = XFS_DA_NODE_MAGIC;
+ node = blk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(ip->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
+ *pmaxrecs = nodehdr.count;
+ blk->hashval = be32_to_cpu(btree[*pmaxrecs - 1].hashval);
+ if (level == 0) {
+ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) {
+ xchk_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+ ds->tree_level = nodehdr.level;
+ } else {
+ if (ds->tree_level != nodehdr.level) {
+ xchk_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+ }
+
+ /* XXX: Check hdr3.pad32 once we know how to fix it. */
+ break;
+ default:
+ xchk_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+
+ /*
+ * If we've been handed a block that is below the dabtree root, does
+ * its hashval match what the parent block expected to see?
+ */
+ if (level > 0) {
+ struct xfs_da_node_entry *key;
+
+ key = xchk_da_btree_node_entry(ds, level - 1);
+ if (be32_to_cpu(key->hashval) != blk->hashval) {
+ xchk_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+ }
+
+out:
+ return error;
+out_freebp:
+ xfs_trans_brelse(dargs->trans, blk->bp);
+ blk->bp = NULL;
+out_nobuf:
+ blk->blkno = 0;
+ return error;
+}
+
+/* Visit all nodes and leaves of a da btree. */
+int
+xchk_da_btree(
+ struct xfs_scrub *sc,
+ int whichfork,
+ xchk_da_btree_rec_fn scrub_fn,
+ void *private)
+{
+ struct xchk_da_btree *ds;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_da_state_blk *blks;
+ struct xfs_da_node_entry *key;
+ xfs_dablk_t blkno;
+ int level;
+ int error;
+
+ /* Skip short format data structures; no btree to scan. */
+ if (!xfs_ifork_has_extents(xfs_ifork_ptr(sc->ip, whichfork)))
+ return 0;
+
+ /* Set up initial da state. */
+ ds = kmem_zalloc(sizeof(struct xchk_da_btree), KM_NOFS | KM_MAYFAIL);
+ if (!ds)
+ return -ENOMEM;
+ ds->dargs.dp = sc->ip;
+ ds->dargs.whichfork = whichfork;
+ ds->dargs.trans = sc->tp;
+ ds->dargs.op_flags = XFS_DA_OP_OKNOENT;
+ ds->state = xfs_da_state_alloc(&ds->dargs);
+ ds->sc = sc;
+ ds->private = private;
+ if (whichfork == XFS_ATTR_FORK) {
+ ds->dargs.geo = mp->m_attr_geo;
+ ds->lowest = 0;
+ ds->highest = 0;
+ } else {
+ ds->dargs.geo = mp->m_dir_geo;
+ ds->lowest = ds->dargs.geo->leafblk;
+ ds->highest = ds->dargs.geo->freeblk;
+ }
+ blkno = ds->lowest;
+ level = 0;
+
+ /* Find the root of the da tree, if present. */
+ blks = ds->state->path.blk;
+ error = xchk_da_btree_block(ds, level, blkno);
+ if (error)
+ goto out_state;
+ /*
+ * We didn't find a block at ds->lowest, which means that there's
+ * no LEAF1/LEAFN tree (at least not where it's supposed to be),
+ * so jump out now.
+ */
+ if (blks[level].bp == NULL)
+ goto out_state;
+
+ blks[level].index = 0;
+ while (level >= 0 && level < XFS_DA_NODE_MAXDEPTH) {
+ /* Handle leaf block. */
+ if (blks[level].magic != XFS_DA_NODE_MAGIC) {
+ /* End of leaf, pop back towards the root. */
+ if (blks[level].index >= ds->maxrecs[level]) {
+ if (level > 0)
+ blks[level - 1].index++;
+ ds->tree_level++;
+ level--;
+ continue;
+ }
+
+ /* Dispatch record scrubbing. */
+ error = scrub_fn(ds, level);
+ if (error)
+ break;
+ if (xchk_should_terminate(sc, &error) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ break;
+
+ blks[level].index++;
+ continue;
+ }
+
+ /* End of node, pop back towards the root. */
+ if (blks[level].index >= ds->maxrecs[level]) {
+ if (level > 0)
+ blks[level - 1].index++;
+ ds->tree_level++;
+ level--;
+ continue;
+ }
+
+ /* Hashes in order for scrub? */
+ key = xchk_da_btree_node_entry(ds, level);
+ error = xchk_da_btree_hash(ds, level, &key->hashval);
+ if (error)
+ goto out;
+
+ /* Drill another level deeper. */
+ blkno = be32_to_cpu(key->before);
+ level++;
+ if (level >= XFS_DA_NODE_MAXDEPTH) {
+ /* Too deep! */
+ xchk_da_set_corrupt(ds, level - 1);
+ break;
+ }
+ ds->tree_level--;
+ error = xchk_da_btree_block(ds, level, blkno);
+ if (error)
+ goto out;
+ if (blks[level].bp == NULL)
+ goto out;
+
+ blks[level].index = 0;
+ }
+
+out:
+ /* Release all the buffers we're tracking. */
+ for (level = 0; level < XFS_DA_NODE_MAXDEPTH; level++) {
+ if (blks[level].bp == NULL)
+ continue;
+ xfs_trans_brelse(sc->tp, blks[level].bp);
+ blks[level].bp = NULL;
+ }
+
+out_state:
+ xfs_da_state_free(ds->state);
+ kmem_free(ds);
+ return error;
+}
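
xchk_da_btree() above avoids recursion by keeping an explicit path: visit the entry at blks[level].index, descend through its child pointer, and pop a level once the index reaches maxrecs for that block. The same traversal shape over a toy in-memory n-ary tree; struct node and struct frame are illustrative stand-ins, not xfs_da_state:

#include <stdio.h>

#define MAX_DEPTH	8
#define MAX_KIDS	4

struct node {
	int		nkids;			/* 0 means this is a leaf */
	struct node	*kids[MAX_KIDS];
	int		value;			/* leaf payload */
};

struct frame {
	struct node	*node;
	int		index;			/* next child to visit */
};

/* Iterative walk with an explicit path, mirroring blks[level]/maxrecs. */
static void walk(struct node *root)
{
	struct frame path[MAX_DEPTH];
	int level = 0;

	path[0].node = root;
	path[0].index = 0;

	while (level >= 0) {
		struct frame *f = &path[level];
		struct node *child;

		/* End of this node: pop back towards the root. */
		if (f->index >= f->node->nkids) {
			level--;
			if (level >= 0)
				path[level].index++;
			continue;
		}

		child = f->node->kids[f->index];
		if (child->nkids == 0) {
			/* Leaf record: "scrub" it, then move on. */
			printf("leaf value %d under level %d\n",
			       child->value, level);
			f->index++;
			continue;
		}

		/* Interior child: drill one level deeper. */
		if (level + 1 >= MAX_DEPTH)
			break;			/* too deep, bail out */
		level++;
		path[level].node = child;
		path[level].index = 0;
	}
}

int main(void)
{
	struct node l1 = { .nkids = 0, .value = 1 };
	struct node l2 = { .nkids = 0, .value = 2 };
	struct node l3 = { .nkids = 0, .value = 3 };
	struct node mid = { .nkids = 2, .kids = { &l1, &l2 } };
	struct node root = { .nkids = 2, .kids = { &mid, &l3 } };

	walk(&root);
	return 0;
}

Keeping the path explicit is also what makes it cheap to release buffers level by level on the way out, which the kernel loop does in its cleanup pass.
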
diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h
new file mode 100644
index 000000000..1f3515c6d
--- /dev/null
+++ b/fs/xfs/scrub/dabtree.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_DABTREE_H__
+#define __XFS_SCRUB_DABTREE_H__
+
+/* dir/attr btree */
+
+struct xchk_da_btree {
+ struct xfs_da_args dargs;
+ xfs_dahash_t hashes[XFS_DA_NODE_MAXDEPTH];
+ int maxrecs[XFS_DA_NODE_MAXDEPTH];
+ struct xfs_da_state *state;
+ struct xfs_scrub *sc;
+ void *private;
+
+ /*
+ * Lowest and highest directory block address in which we expect
+ * to find dir/attr btree node blocks. For a directory this
+ * (presumably) means between LEAF_OFFSET and FREE_OFFSET; for
+ * attributes there is no limit.
+ */
+ xfs_dablk_t lowest;
+ xfs_dablk_t highest;
+
+ int tree_level;
+};
+
+typedef int (*xchk_da_btree_rec_fn)(struct xchk_da_btree *ds, int level);
+
+/* Check for da btree operation errors. */
+bool xchk_da_process_error(struct xchk_da_btree *ds, int level, int *error);
+
+/* Check for da btree corruption. */
+void xchk_da_set_corrupt(struct xchk_da_btree *ds, int level);
+
+int xchk_da_btree_hash(struct xchk_da_btree *ds, int level, __be32 *hashp);
+int xchk_da_btree(struct xfs_scrub *sc, int whichfork,
+ xchk_da_btree_rec_fn scrub_fn, void *private);
+
+#endif /* __XFS_SCRUB_DABTREE_H__ */
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
new file mode 100644
index 000000000..5c87800ab
--- /dev/null
+++ b/fs/xfs/scrub/dir.c
@@ -0,0 +1,876 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/dabtree.h"
+
+/* Set us up to scrub directories. */
+int
+xchk_setup_directory(
+ struct xfs_scrub *sc)
+{
+ return xchk_setup_inode_contents(sc, 0);
+}
+
+/* Directories */
+
+/* Scrub a directory entry. */
+
+struct xchk_dir_ctx {
+ /* VFS fill-directory iterator */
+ struct dir_context dir_iter;
+
+ struct xfs_scrub *sc;
+};
+
+/* Check that an inode's mode matches a given DT_ type. */
+STATIC int
+xchk_dir_check_ftype(
+ struct xchk_dir_ctx *sdc,
+ xfs_fileoff_t offset,
+ xfs_ino_t inum,
+ int dtype)
+{
+ struct xfs_mount *mp = sdc->sc->mp;
+ struct xfs_inode *ip;
+ int ino_dtype;
+ int error = 0;
+
+ if (!xfs_has_ftype(mp)) {
+ if (dtype != DT_UNKNOWN && dtype != DT_DIR)
+ xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
+ offset);
+ goto out;
+ }
+
+ /*
+ * Grab the inode pointed to by the dirent. We release the
+ * inode before we cancel the scrub transaction. Since we
+ * don't know a priori that releasing the inode won't trigger
+ * eofblocks cleanup (which allocates what would be a nested
+ * transaction), we can't use DONTCACHE here because DONTCACHE
+ * inodes can trigger immediate inactive cleanup of the inode.
+ *
+ * If _iget returns -EINVAL or -ENOENT then the child inode number is
+ * garbage and the directory is corrupt. If the _iget returns
+ * -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a
+ * cross referencing error. Any other error is an operational error.
+ */
+ error = xfs_iget(mp, sdc->sc->tp, inum, 0, 0, &ip);
+ if (error == -EINVAL || error == -ENOENT) {
+ error = -EFSCORRUPTED;
+ xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, 0, &error);
+ goto out;
+ }
+ if (!xchk_fblock_xref_process_error(sdc->sc, XFS_DATA_FORK, offset,
+ &error))
+ goto out;
+
+ /* Convert mode to the DT_* values that dir_emit uses. */
+ ino_dtype = xfs_dir3_get_dtype(mp,
+ xfs_mode_to_ftype(VFS_I(ip)->i_mode));
+ if (ino_dtype != dtype)
+ xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ xfs_irele(ip);
+out:
+ return error;
+}
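
xchk_dir_check_ftype() compares the ftype recorded in the dirent against the type derived from the child inode's i_mode. The same cross-check can be expressed in user space with readdir()'s d_type and lstat()'s st_mode; the sketch below uses only glibc/BSD interfaces and treats DT_UNKNOWN as "no ftype recorded", roughly the !xfs_has_ftype() situation above:

#define _DEFAULT_SOURCE		/* for d_type and the DT_* constants on glibc */
#include <dirent.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

/* Map a stat() mode to the DT_* namespace, like xfs_dir3_get_dtype() does. */
static unsigned char mode_to_dtype(mode_t mode)
{
	if (S_ISREG(mode))	return DT_REG;
	if (S_ISDIR(mode))	return DT_DIR;
	if (S_ISLNK(mode))	return DT_LNK;
	if (S_ISCHR(mode))	return DT_CHR;
	if (S_ISBLK(mode))	return DT_BLK;
	if (S_ISFIFO(mode))	return DT_FIFO;
	if (S_ISSOCK(mode))	return DT_SOCK;
	return DT_UNKNOWN;
}

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	DIR *dir = opendir(path);
	struct dirent *d;

	if (!dir) {
		perror("opendir");
		return 1;
	}
	while ((d = readdir(dir)) != NULL) {
		struct stat st;
		char full[4096];

		snprintf(full, sizeof(full), "%s/%s", path, d->d_name);
		if (lstat(full, &st) != 0)
			continue;
		/* DT_UNKNOWN means the filesystem recorded no ftype at all. */
		if (d->d_type != DT_UNKNOWN &&
		    d->d_type != mode_to_dtype(st.st_mode))
			printf("ftype mismatch: %s\n", d->d_name);
	}
	closedir(dir);
	return 0;
}

On a healthy filesystem this prints nothing; a mismatch is the user-visible symptom of exactly the corruption the scrubber flags here.
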
+
+/*
+ * Scrub a single directory entry.
+ *
+ * We use the VFS directory iterator (i.e. readdir) to call this
+ * function for every directory entry in a directory. Once we're here,
+ * we check the inode number to make sure it's sane, then we check that
+ * we can look up this filename. Finally, we check the ftype.
+ */
+STATIC bool
+xchk_dir_actor(
+ struct dir_context *dir_iter,
+ const char *name,
+ int namelen,
+ loff_t pos,
+ u64 ino,
+ unsigned type)
+{
+ struct xfs_mount *mp;
+ struct xfs_inode *ip;
+ struct xchk_dir_ctx *sdc;
+ struct xfs_name xname;
+ xfs_ino_t lookup_ino;
+ xfs_dablk_t offset;
+ bool checked_ftype = false;
+ int error = 0;
+
+ sdc = container_of(dir_iter, struct xchk_dir_ctx, dir_iter);
+ ip = sdc->sc->ip;
+ mp = ip->i_mount;
+ offset = xfs_dir2_db_to_da(mp->m_dir_geo,
+ xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos));
+
+ if (xchk_should_terminate(sdc->sc, &error))
+ return !error;
+
+ /* Does this inode number make sense? */
+ if (!xfs_verify_dir_ino(mp, ino)) {
+ xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ goto out;
+ }
+
+ /* Does this name make sense? */
+ if (!xfs_dir2_namecheck(name, namelen)) {
+ xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ goto out;
+ }
+
+ if (!strncmp(".", name, namelen)) {
+ /* If this is "." then check that the inum matches the dir. */
+ if (xfs_has_ftype(mp) && type != DT_DIR)
+ xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
+ offset);
+ checked_ftype = true;
+ if (ino != ip->i_ino)
+ xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
+ offset);
+ } else if (!strncmp("..", name, namelen)) {
+ /*
+ * If this is ".." in the root inode, check that the inum
+ * matches this dir.
+ */
+ if (xfs_has_ftype(mp) && type != DT_DIR)
+ xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
+ offset);
+ checked_ftype = true;
+ if (ip->i_ino == mp->m_sb.sb_rootino && ino != ip->i_ino)
+ xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
+ offset);
+ }
+
+ /* Verify that we can look up this name by hash. */
+ xname.name = name;
+ xname.len = namelen;
+ xname.type = XFS_DIR3_FT_UNKNOWN;
+
+ error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL);
+ /* ENOENT means the hash lookup failed and the dir is corrupt */
+ if (error == -ENOENT)
+ error = -EFSCORRUPTED;
+ if (!xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset,
+ &error))
+ goto out;
+ if (lookup_ino != ino) {
+ xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ goto out;
+ }
+
+ /* Verify the file type. This function absorbs error codes. */
+ if (!checked_ftype) {
+ error = xchk_dir_check_ftype(sdc, offset, lookup_ino, type);
+ if (error)
+ goto out;
+ }
+out:
+ /*
+ * Returning false here causes the dir_emit caller (xfs_readdir) to
+ * abort the directory iteration and return zero to xchk_directory.
+ */
+ if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return false;
+ return !error;
+}
+
+/* Scrub a directory btree record. */
+STATIC int
+xchk_dir_rec(
+ struct xchk_da_btree *ds,
+ int level)
+{
+ struct xfs_da_state_blk *blk = &ds->state->path.blk[level];
+ struct xfs_mount *mp = ds->state->mp;
+ struct xfs_inode *dp = ds->dargs.dp;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ struct xfs_dir2_data_entry *dent;
+ struct xfs_buf *bp;
+ struct xfs_dir2_leaf_entry *ent;
+ unsigned int end;
+ unsigned int iter_off;
+ xfs_ino_t ino;
+ xfs_dablk_t rec_bno;
+ xfs_dir2_db_t db;
+ xfs_dir2_data_aoff_t off;
+ xfs_dir2_dataptr_t ptr;
+ xfs_dahash_t calc_hash;
+ xfs_dahash_t hash;
+ struct xfs_dir3_icleaf_hdr hdr;
+ unsigned int tag;
+ int error;
+
+ ASSERT(blk->magic == XFS_DIR2_LEAF1_MAGIC ||
+ blk->magic == XFS_DIR2_LEAFN_MAGIC);
+
+ xfs_dir2_leaf_hdr_from_disk(mp, &hdr, blk->bp->b_addr);
+ ent = hdr.ents + blk->index;
+
+ /* Check the hash of the entry. */
+ error = xchk_da_btree_hash(ds, level, &ent->hashval);
+ if (error)
+ goto out;
+
+ /* Valid hash pointer? */
+ ptr = be32_to_cpu(ent->address);
+ if (ptr == 0)
+ return 0;
+
+ /* Find the directory entry's location. */
+ db = xfs_dir2_dataptr_to_db(geo, ptr);
+ off = xfs_dir2_dataptr_to_off(geo, ptr);
+ rec_bno = xfs_dir2_db_to_da(geo, db);
+
+ if (rec_bno >= geo->leafblk) {
+ xchk_da_set_corrupt(ds, level);
+ goto out;
+ }
+ error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno,
+ XFS_DABUF_MAP_HOLE_OK, &bp);
+ if (!xchk_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno,
+ &error))
+ goto out;
+ if (!bp) {
+ xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+ goto out;
+ }
+ xchk_buffer_recheck(ds->sc, bp);
+
+ if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out_relse;
+
+ dent = bp->b_addr + off;
+
+ /* Make sure we got a real directory entry. */
+ iter_off = geo->data_entry_offset;
+ end = xfs_dir3_data_end_offset(geo, bp->b_addr);
+ if (!end) {
+ xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+ goto out_relse;
+ }
+ for (;;) {
+ struct xfs_dir2_data_entry *dep = bp->b_addr + iter_off;
+ struct xfs_dir2_data_unused *dup = bp->b_addr + iter_off;
+
+ if (iter_off >= end) {
+ xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+ goto out_relse;
+ }
+
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ iter_off += be16_to_cpu(dup->length);
+ continue;
+ }
+ if (dep == dent)
+ break;
+ iter_off += xfs_dir2_data_entsize(mp, dep->namelen);
+ }
+
+ /* Retrieve the entry, sanity check it, and compare hashes. */
+ ino = be64_to_cpu(dent->inumber);
+ hash = be32_to_cpu(ent->hashval);
+ tag = be16_to_cpup(xfs_dir2_data_entry_tag_p(mp, dent));
+ if (!xfs_verify_dir_ino(mp, ino) || tag != off)
+ xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+ if (dent->namelen == 0) {
+ xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+ goto out_relse;
+ }
+ calc_hash = xfs_da_hashname(dent->name, dent->namelen);
+ if (calc_hash != hash)
+ xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+
+out_relse:
+ xfs_trans_brelse(ds->dargs.trans, bp);
+out:
+ return error;
+}
+
+/*
+ * Is this unused entry either in the bestfree or smaller than all of
+ * them? We've already checked that the bestfrees are sorted longest to
+ * shortest, and that there aren't any bogus entries.
+ */
+STATIC void
+xchk_directory_check_free_entry(
+ struct xfs_scrub *sc,
+ xfs_dablk_t lblk,
+ struct xfs_dir2_data_free *bf,
+ struct xfs_dir2_data_unused *dup)
+{
+ struct xfs_dir2_data_free *dfp;
+ unsigned int dup_length;
+
+ dup_length = be16_to_cpu(dup->length);
+
+ /* Unused entry is shorter than any of the bestfrees */
+ if (dup_length < be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
+ return;
+
+ for (dfp = &bf[XFS_DIR2_DATA_FD_COUNT - 1]; dfp >= bf; dfp--)
+ if (dup_length == be16_to_cpu(dfp->length))
+ return;
+
+ /* Unused entry should be in the bestfrees but wasn't found. */
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+}
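
The invariant enforced above: with the bestfree array kept sorted longest to shortest, an unused entry must either be shorter than the smallest tracked bestfree or match one of the tracked lengths exactly. A self-contained restatement over plain arrays, using an illustrative three-entry table:

#include <stdbool.h>
#include <stdio.h>

#define NBESTFREE	3		/* XFS tracks three bestfree records */

/*
 * bestfree[] is sorted longest to shortest.  An unused directory entry of
 * dup_len bytes is consistent if it is shorter than the smallest tracked
 * bestfree, or exactly matches one of them.
 */
static bool free_entry_ok(const unsigned short bestfree[NBESTFREE],
			  unsigned short dup_len)
{
	int i;

	if (dup_len < bestfree[NBESTFREE - 1])
		return true;
	for (i = 0; i < NBESTFREE; i++)
		if (dup_len == bestfree[i])
			return true;
	return false;	/* long enough to be tracked, but isn't */
}

int main(void)
{
	const unsigned short bf[NBESTFREE] = { 64, 32, 16 };

	printf("%d %d %d\n",
	       free_entry_ok(bf, 8),	/* 1: smaller than every bestfree */
	       free_entry_ok(bf, 32),	/* 1: matches a tracked length */
	       free_entry_ok(bf, 48));	/* 0: should have been tracked */
	return 0;
}
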
+
+/* Check free space info in a directory data block. */
+STATIC int
+xchk_directory_data_bestfree(
+ struct xfs_scrub *sc,
+ xfs_dablk_t lblk,
+ bool is_block)
+{
+ struct xfs_dir2_data_unused *dup;
+ struct xfs_dir2_data_free *dfp;
+ struct xfs_buf *bp;
+ struct xfs_dir2_data_free *bf;
+ struct xfs_mount *mp = sc->mp;
+ u16 tag;
+ unsigned int nr_bestfrees = 0;
+ unsigned int nr_frees = 0;
+ unsigned int smallest_bestfree;
+ int newlen;
+ unsigned int offset;
+ unsigned int end;
+ int error;
+
+ if (is_block) {
+ /* dir block format */
+ if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET))
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ error = xfs_dir3_block_read(sc->tp, sc->ip, &bp);
+ } else {
+ /* dir data format */
+ error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, 0, &bp);
+ }
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+ goto out;
+ xchk_buffer_recheck(sc, bp);
+
+ /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out_buf;
+
+ /* Do the bestfrees correspond to actual free space? */
+ bf = xfs_dir2_data_bestfree_p(mp, bp->b_addr);
+ smallest_bestfree = UINT_MAX;
+ for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
+ offset = be16_to_cpu(dfp->offset);
+ if (offset == 0)
+ continue;
+ if (offset >= mp->m_dir_geo->blksize) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+ dup = bp->b_addr + offset;
+ tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
+
+ /* bestfree doesn't match the entry it points at? */
+ if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG) ||
+ be16_to_cpu(dup->length) != be16_to_cpu(dfp->length) ||
+ tag != offset) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+
+ /* bestfree records should be ordered largest to smallest */
+ if (smallest_bestfree < be16_to_cpu(dfp->length)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+
+ smallest_bestfree = be16_to_cpu(dfp->length);
+ nr_bestfrees++;
+ }
+
+ /* Make sure the bestfrees are actually the best free spaces. */
+ offset = mp->m_dir_geo->data_entry_offset;
+ end = xfs_dir3_data_end_offset(mp->m_dir_geo, bp->b_addr);
+
+ /* Iterate the entries, stopping when we hit or go past the end. */
+ while (offset < end) {
+ dup = bp->b_addr + offset;
+
+ /* Skip real entries */
+ if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG)) {
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
+
+ newlen = xfs_dir2_data_entsize(mp, dep->namelen);
+ if (newlen <= 0) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ lblk);
+ goto out_buf;
+ }
+ offset += newlen;
+ continue;
+ }
+
+ /* Spot check this free entry */
+ tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
+ if (tag != offset) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+
+ /*
+ * Either this entry is a bestfree or it's smaller than
+ * any of the bestfrees.
+ */
+ xchk_directory_check_free_entry(sc, lblk, bf, dup);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out_buf;
+
+ /* Move on. */
+ newlen = be16_to_cpu(dup->length);
+ if (newlen <= 0) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+ offset += newlen;
+ if (offset <= end)
+ nr_frees++;
+ }
+
+ /* We're required to fill all the space. */
+ if (offset != end)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+
+ /* Did we see at least as many free slots as there are bestfrees? */
+ if (nr_frees < nr_bestfrees)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+out_buf:
+ xfs_trans_brelse(sc->tp, bp);
+out:
+ return error;
+}
+
+/*
+ * Does the free space length in the free space index block ($len) match
+ * the longest length in the directory data block's bestfree array?
+ * Assume that we've already checked that the data block's bestfree
+ * array is in order.
+ */
+STATIC void
+xchk_directory_check_freesp(
+ struct xfs_scrub *sc,
+ xfs_dablk_t lblk,
+ struct xfs_buf *dbp,
+ unsigned int len)
+{
+ struct xfs_dir2_data_free *dfp;
+
+ dfp = xfs_dir2_data_bestfree_p(sc->mp, dbp->b_addr);
+
+ if (len != be16_to_cpu(dfp->length))
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+
+ if (len > 0 && be16_to_cpu(dfp->offset) == 0)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+}
+
+/* Check free space info in a directory leaf1 block. */
+STATIC int
+xchk_directory_leaf1_bestfree(
+ struct xfs_scrub *sc,
+ struct xfs_da_args *args,
+ xfs_dir2_db_t last_data_db,
+ xfs_dablk_t lblk)
+{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_tail *ltp;
+ struct xfs_dir2_leaf *leaf;
+ struct xfs_buf *dbp;
+ struct xfs_buf *bp;
+ struct xfs_da_geometry *geo = sc->mp->m_dir_geo;
+ __be16 *bestp;
+ __u16 best;
+ __u32 hash;
+ __u32 lasthash = 0;
+ __u32 bestcount;
+ unsigned int stale = 0;
+ int i;
+ int error;
+
+ /* Read the free space block. */
+ error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+ return error;
+ xchk_buffer_recheck(sc, bp);
+
+ leaf = bp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(sc->ip->i_mount, &leafhdr, leaf);
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+ bestcount = be32_to_cpu(ltp->bestcount);
+ bestp = xfs_dir2_leaf_bests_p(ltp);
+
+ if (xfs_has_crc(sc->mp)) {
+ struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
+
+ if (hdr3->pad != cpu_to_be32(0))
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ }
+
+ /*
+ * There must be enough bestfree slots to cover all the directory data
+ * blocks that we scanned. It is possible for there to be a hole
+ * between the last data block and i_disk_size. This seems like an
+ * oversight to the scrub author, but as we have been writing out
+ * directories like this (and xfs_repair doesn't mind them) for years,
+ * that's what we have to check.
+ */
+ if (bestcount != last_data_db + 1) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+
+ /* Is the leaf count even remotely sane? */
+ if (leafhdr.count > geo->leaf_max_ents) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+
+ /* Leaves and bests don't overlap in leaf format. */
+ if ((char *)&leafhdr.ents[leafhdr.count] > (char *)bestp) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+
+ /* Check hash value order, count stale entries. */
+ for (i = 0; i < leafhdr.count; i++) {
+ hash = be32_to_cpu(leafhdr.ents[i].hashval);
+ if (i > 0 && lasthash > hash)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ lasthash = hash;
+ if (leafhdr.ents[i].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ stale++;
+ }
+ if (leafhdr.stale != stale)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Check all the bestfree entries. */
+ for (i = 0; i < bestcount; i++, bestp++) {
+ best = be16_to_cpu(*bestp);
+ error = xfs_dir3_data_read(sc->tp, sc->ip,
+ xfs_dir2_db_to_da(args->geo, i),
+ XFS_DABUF_MAP_HOLE_OK,
+ &dbp);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
+ &error))
+ break;
+
+ if (!dbp) {
+ if (best != NULLDATAOFF) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ lblk);
+ break;
+ }
+ continue;
+ }
+
+ if (best == NULLDATAOFF)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ else
+ xchk_directory_check_freesp(sc, lblk, dbp, best);
+ xfs_trans_brelse(sc->tp, dbp);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ break;
+ }
+out:
+ xfs_trans_brelse(sc->tp, bp);
+ return error;
+}
+
+/* Check free space info in a directory freespace block. */
+STATIC int
+xchk_directory_free_bestfree(
+ struct xfs_scrub *sc,
+ struct xfs_da_args *args,
+ xfs_dablk_t lblk)
+{
+ struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_buf *dbp;
+ struct xfs_buf *bp;
+ __u16 best;
+ unsigned int stale = 0;
+ int i;
+ int error;
+
+ /* Read the free space block */
+ error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+ return error;
+ xchk_buffer_recheck(sc, bp);
+
+ if (xfs_has_crc(sc->mp)) {
+ struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+ if (hdr3->pad != cpu_to_be32(0))
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ }
+
+ /* Check all the entries. */
+ xfs_dir2_free_hdr_from_disk(sc->ip->i_mount, &freehdr, bp->b_addr);
+ for (i = 0; i < freehdr.nvalid; i++) {
+ best = be16_to_cpu(freehdr.bests[i]);
+ if (best == NULLDATAOFF) {
+ stale++;
+ continue;
+ }
+ error = xfs_dir3_data_read(sc->tp, sc->ip,
+ (freehdr.firstdb + i) * args->geo->fsbcount,
+ 0, &dbp);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
+ &error))
+ goto out;
+ xchk_directory_check_freesp(sc, lblk, dbp, best);
+ xfs_trans_brelse(sc->tp, dbp);
+ }
+
+ if (freehdr.nused + stale != freehdr.nvalid)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+out:
+ xfs_trans_brelse(sc->tp, bp);
+ return error;
+}
+
+/* Check free space information in directories. */
+STATIC int
+xchk_directory_blocks(
+ struct xfs_scrub *sc)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_da_args args;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ struct xfs_mount *mp = sc->mp;
+ xfs_fileoff_t leaf_lblk;
+ xfs_fileoff_t free_lblk;
+ xfs_fileoff_t lblk;
+ struct xfs_iext_cursor icur;
+ xfs_dablk_t dabno;
+ xfs_dir2_db_t last_data_db = 0;
+ bool found;
+ bool is_block = false;
+ int error;
+
+ /* Ignore local format directories. */
+ if (ifp->if_format != XFS_DINODE_FMT_EXTENTS &&
+ ifp->if_format != XFS_DINODE_FMT_BTREE)
+ return 0;
+
+ lblk = XFS_B_TO_FSB(mp, XFS_DIR2_DATA_OFFSET);
+ leaf_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_LEAF_OFFSET);
+ free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET);
+
+ /* Is this a block dir? */
+ args.dp = sc->ip;
+ args.geo = mp->m_dir_geo;
+ args.trans = sc->tp;
+ error = xfs_dir2_isblock(&args, &is_block);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+ goto out;
+
+ /* Iterate all the data extents in the directory... */
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+ /* No more data blocks... */
+ if (got.br_startoff >= leaf_lblk)
+ break;
+
+ /*
+ * Check each data block's bestfree data.
+ *
+ * Iterate all the fsbcount-aligned block offsets in
+ * this directory. The directory block reading code is
+ * smart enough to do its own bmap lookups to handle
+ * discontiguous directory blocks. When we're done
+ * with the extent record, re-query the bmap at the
+ * next fsbcount-aligned offset to avoid redundant
+ * block checks.
+ */
+ for (lblk = roundup((xfs_dablk_t)got.br_startoff,
+ args.geo->fsbcount);
+ lblk < got.br_startoff + got.br_blockcount;
+ lblk += args.geo->fsbcount) {
+ last_data_db = xfs_dir2_da_to_db(args.geo, lblk);
+ error = xchk_directory_data_bestfree(sc, lblk,
+ is_block);
+ if (error)
+ goto out;
+ }
+ dabno = got.br_startoff + got.br_blockcount;
+ lblk = roundup(dabno, args.geo->fsbcount);
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ }
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Look for a leaf1 block, which has free info. */
+ if (xfs_iext_lookup_extent(sc->ip, ifp, leaf_lblk, &icur, &got) &&
+ got.br_startoff == leaf_lblk &&
+ got.br_blockcount == args.geo->fsbcount &&
+ !xfs_iext_next_extent(ifp, &icur, &got)) {
+ if (is_block) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+ error = xchk_directory_leaf1_bestfree(sc, &args, last_data_db,
+ leaf_lblk);
+ if (error)
+ goto out;
+ }
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Scan for free blocks */
+ lblk = free_lblk;
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+ /*
+ * Dirs can't have blocks mapped above 2^32.
+ * Single-block dirs shouldn't even be here.
+ */
+ lblk = got.br_startoff;
+ if (lblk & ~0xFFFFFFFFULL) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+ if (is_block) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+
+ /*
+ * Check each dir free block's bestfree data.
+ *
+ * Iterate all the fsbcount-aligned block offsets in
+ * this directory. The directory block reading code is
+ * smart enough to do its own bmap lookups to handle
+ * discontiguous directory blocks. When we're done
+ * with the extent record, re-query the bmap at the
+ * next fsbcount-aligned offset to avoid redundant
+ * block checks.
+ */
+ for (lblk = roundup((xfs_dablk_t)got.br_startoff,
+ args.geo->fsbcount);
+ lblk < got.br_startoff + got.br_blockcount;
+ lblk += args.geo->fsbcount) {
+ error = xchk_directory_free_bestfree(sc, &args,
+ lblk);
+ if (error)
+ goto out;
+ }
+ dabno = got.br_startoff + got.br_blockcount;
+ lblk = roundup(dabno, args.geo->fsbcount);
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ }
+out:
+ return error;
+}
+
+/* Scrub a whole directory. */
+int
+xchk_directory(
+ struct xfs_scrub *sc)
+{
+ struct xchk_dir_ctx sdc = {
+ .dir_iter.actor = xchk_dir_actor,
+ .dir_iter.pos = 0,
+ .sc = sc,
+ };
+ size_t bufsize;
+ loff_t oldpos;
+ int error = 0;
+
+ if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
+ return -ENOENT;
+
+ /* Plausible size? */
+ if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) {
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ goto out;
+ }
+
+ /* Check directory tree structure */
+ error = xchk_da_btree(sc, XFS_DATA_FORK, xchk_dir_rec, NULL);
+ if (error)
+ return error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return error;
+
+ /* Check the freespace. */
+ error = xchk_directory_blocks(sc);
+ if (error)
+ return error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return error;
+
+ /*
+ * Check that every dirent we see can also be looked up by hash.
+ * Userspace usually asks for a 32k buffer, so we will too.
+ */
+ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE,
+ sc->ip->i_disk_size);
+
+ /*
+ * Look up every name in this directory by hash.
+ *
+ * Use the xfs_readdir function to call xchk_dir_actor on
+ * every directory entry in this directory. In _actor, we check
+ * the name, inode number, and ftype (if applicable) of the
+ * entry. xfs_readdir uses the VFS filldir functions to provide
+ * iteration context.
+ *
+ * The VFS grabs a read or write lock via i_rwsem before it reads
+ * or writes to a directory. If we've gotten this far we've
+ * already obtained IOLOCK_EXCL, which (since 4.10) is the same as
+ * getting a write lock on i_rwsem. Therefore, it is safe for us
+ * to drop the ILOCK here in order to reuse the _readdir and
+ * _dir_lookup routines, which do their own ILOCK locking.
+ */
+ oldpos = 0;
+ sc->ilock_flags &= ~XFS_ILOCK_EXCL;
+ xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
+ while (true) {
+ error = xfs_readdir(sc->tp, sc->ip, &sdc.dir_iter, bufsize);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ goto out;
+ if (oldpos == sdc.dir_iter.pos)
+ break;
+ oldpos = sdc.dir_iter.pos;
+ }
+
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
new file mode 100644
index 000000000..88d6961e3
--- /dev/null
+++ b/fs/xfs/scrub/fscounters.c
@@ -0,0 +1,381 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_health.h"
+#include "xfs_btree.h"
+#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * FS Summary Counters
+ * ===================
+ *
+ * The basics of filesystem summary counter checking are that we iterate the
+ * AGs counting the number of free blocks, free space btree blocks, per-AG
+ * reservations, inodes, delayed allocation reservations, and free inodes.
+ * Then we compare what we computed against the in-core counters.
+ *
+ * However, the reality is that summary counters are a tricky beast to check.
+ * While we /could/ freeze the filesystem and scramble around the AGs counting
+ * the free blocks, in practice we prefer not to do that for a scan because
+ * freezing is costly. To get around this, we added a per-cpu counter of the
+ * delalloc reservations so that we can rotor around the AGs relatively
+ * quickly, and we allow the counts to be slightly off because we're not taking
+ * any locks while we do this.
+ *
+ * So the first thing we do is warm up the buffer cache in the setup routine by
+ * walking all the AGs to make sure the incore per-AG structure has been
+ * initialized. The expected value calculation then iterates the incore per-AG
+ * structures as quickly as it can. We snapshot the percpu counters before and
+ * after this operation and use the difference in counter values to guess at
+ * our tolerance for mismatch between expected and actual counter values.
+ */
+
+/*
+ * Since the expected value computation is lockless but only browses incore
+ * values, the percpu counters should be fairly close to each other. However,
+ * we'll allow ourselves to be off by at least this (arbitrary) amount.
+ */
+#define XCHK_FSCOUNT_MIN_VARIANCE (512)
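
Concretely, the tolerance logic described above amounts to: accept the expected value if it lands anywhere between the counter summations taken before and after the lockless aggregation, and if that window itself is wider than the variance cap, report the scan as incomplete rather than calling the counter corrupt. A user-space sketch mirroring the acceptance test that xchk_fscount_within_range() implements later in this file; the values in main() are invented:

#include <stdint.h>
#include <stdio.h>

#define MIN_VARIANCE	512		/* mirrors XCHK_FSCOUNT_MIN_VARIANCE */

enum verdict { OK, CORRUPT, INCOMPLETE };

/* old_sum was sampled before the aggregation, curr_sum after it. */
static enum verdict check_counter(int64_t old_sum, int64_t curr_sum,
				  int64_t expected)
{
	int64_t lo = old_sum < curr_sum ? old_sum : curr_sum;
	int64_t hi = old_sum < curr_sum ? curr_sum : old_sum;

	if (curr_sum < 0)
		return CORRUPT;		/* negative is always wrong */
	if (expected == curr_sum)
		return OK;		/* exact match */
	if (expected >= lo && expected <= hi)
		return OK;		/* inside the before/after window */
	if (hi - lo >= MIN_VARIANCE)
		return INCOMPLETE;	/* fs was busy; don't judge */
	return CORRUPT;
}

int main(void)
{
	printf("%d\n", check_counter(1000, 1000, 1000));	/* 0: OK */
	printf("%d\n", check_counter(1000, 1040, 1025));	/* 0: OK, in window */
	printf("%d\n", check_counter(1000, 1010, 900));		/* 1: CORRUPT */
	printf("%d\n", check_counter(1000, 2000, 900));		/* 2: INCOMPLETE */
	return 0;
}
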
+
+/*
+ * Make sure the per-AG structure has been initialized from the on-disk header
+ * contents and trust that the incore counters match the ondisk counters. (The
+ * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the
+ * summary counters after checking all AG headers). Do this from the setup
+ * function so that the inner AG aggregation loop runs as quickly as possible.
+ *
+ * This function runs during the setup phase /before/ we start checking any
+ * metadata.
+ */
+STATIC int
+xchk_fscount_warmup(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agi_bp = NULL;
+ struct xfs_buf *agf_bp = NULL;
+ struct xfs_perag *pag = NULL;
+ xfs_agnumber_t agno;
+ int error = 0;
+
+ for_each_perag(mp, agno, pag) {
+ if (xchk_should_terminate(sc, &error))
+ break;
+ if (pag->pagi_init && pag->pagf_init)
+ continue;
+
+ /* Lock both AG headers. */
+ error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
+ if (error)
+ break;
+ error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
+ if (error)
+ break;
+
+ /*
+ * These are supposed to be initialized by the header read
+ * function.
+ */
+ if (!pag->pagi_init || !pag->pagf_init) {
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ xfs_buf_relse(agf_bp);
+ agf_bp = NULL;
+ xfs_buf_relse(agi_bp);
+ agi_bp = NULL;
+ }
+
+ if (agf_bp)
+ xfs_buf_relse(agf_bp);
+ if (agi_bp)
+ xfs_buf_relse(agi_bp);
+ if (pag)
+ xfs_perag_put(pag);
+ return error;
+}
+
+int
+xchk_setup_fscounters(
+ struct xfs_scrub *sc)
+{
+ struct xchk_fscounters *fsc;
+ int error;
+
+ sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
+ if (!sc->buf)
+ return -ENOMEM;
+ fsc = sc->buf;
+
+ xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);
+
+ /* We must get the incore counters set up before we can proceed. */
+ error = xchk_fscount_warmup(sc);
+ if (error)
+ return error;
+
+ return xchk_trans_alloc(sc, 0);
+}
+
+/* Count free space btree blocks manually for pre-lazysbcount filesystems. */
+static int
+xchk_fscount_btreeblks(
+ struct xfs_scrub *sc,
+ struct xchk_fscounters *fsc,
+ xfs_agnumber_t agno)
+{
+ xfs_extlen_t blocks;
+ int error;
+
+ error = xchk_ag_init_existing(sc, agno, &sc->sa);
+ if (error)
+ goto out_free;
+
+ error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
+ if (error)
+ goto out_free;
+ fsc->fdblocks += blocks - 1;
+
+ error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
+ if (error)
+ goto out_free;
+ fsc->fdblocks += blocks - 1;
+
+out_free:
+ xchk_ag_free(sc, &sc->sa);
+ return error;
+}
+
+/*
+ * Calculate what the global in-core counters ought to be from the incore
+ * per-AG structure. Callers can compare this to the actual in-core counters
+ * to estimate by how much both in-core and on-disk counters need to be
+ * adjusted.
+ */
+STATIC int
+xchk_fscount_aggregate_agcounts(
+ struct xfs_scrub *sc,
+ struct xchk_fscounters *fsc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_perag *pag;
+ uint64_t delayed;
+ xfs_agnumber_t agno;
+ int tries = 8;
+ int error = 0;
+
+retry:
+ fsc->icount = 0;
+ fsc->ifree = 0;
+ fsc->fdblocks = 0;
+
+ for_each_perag(mp, agno, pag) {
+ if (xchk_should_terminate(sc, &error))
+ break;
+
+ /* This somehow got unset since the warmup? */
+ if (!pag->pagi_init || !pag->pagf_init) {
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ /* Count all the inodes */
+ fsc->icount += pag->pagi_count;
+ fsc->ifree += pag->pagi_freecount;
+
+ /* Add up the free/freelist/bnobt/cntbt blocks */
+ fsc->fdblocks += pag->pagf_freeblks;
+ fsc->fdblocks += pag->pagf_flcount;
+ if (xfs_has_lazysbcount(sc->mp)) {
+ fsc->fdblocks += pag->pagf_btreeblks;
+ } else {
+ error = xchk_fscount_btreeblks(sc, fsc, agno);
+ if (error)
+ break;
+ }
+
+ /*
+ * Per-AG reservations are taken out of the incore counters,
+ * so they must be left out of the free blocks computation.
+ */
+ fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
+ fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
+ }
+ if (pag)
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ /*
+ * The global incore space reservation is taken from the incore
+ * counters, so leave that out of the computation.
+ */
+ fsc->fdblocks -= mp->m_resblks_avail;
+
+ /*
+ * Delayed allocation reservations are taken out of the incore counters
+ * but not recorded on disk, so leave them and their indlen blocks out
+ * of the computation.
+ */
+ delayed = percpu_counter_sum(&mp->m_delalloc_blks);
+ fsc->fdblocks -= delayed;
+
+ trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks,
+ delayed);
+
+ /* Bail out if the values we compute are totally nonsense. */
+ if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max ||
+ fsc->fdblocks > mp->m_sb.sb_dblocks ||
+ fsc->ifree > fsc->icount_max)
+ return -EFSCORRUPTED;
+
+ /*
+ * If ifree > icount then we probably had some perturbation in the
+ * counters while we were calculating things. We'll try a few times
+ * to maintain ifree <= icount before giving up.
+ */
+ if (fsc->ifree > fsc->icount) {
+ if (tries--)
+ goto retry;
+ xchk_set_incomplete(sc);
+ return 0;
+ }
+
+ return 0;
+}
+
+/*
+ * Is the @counter reasonably close to the @expected value?
+ *
+ * We neither locked nor froze anything in the filesystem while aggregating the
+ * per-AG data to compute the @expected value, which means that the counter
+ * could have changed. We know the @old_value of the summation of the counter
+ * before the aggregation, and we re-sum the counter now. If the expected
+ * value falls between the two summations, we're ok.
+ *
+ * Otherwise, we /might/ have a problem. If the change in the summations is
+ * more than we want to tolerate, the filesystem is probably busy and we should
+ * just send back INCOMPLETE and see if userspace will try again.
+ */
+static inline bool
+xchk_fscount_within_range(
+ struct xfs_scrub *sc,
+ const int64_t old_value,
+ struct percpu_counter *counter,
+ uint64_t expected)
+{
+ int64_t min_value, max_value;
+ int64_t curr_value = percpu_counter_sum(counter);
+
+ trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
+ old_value);
+
+ /* Negative values are always wrong. */
+ if (curr_value < 0)
+ return false;
+
+ /* Exact matches are always ok. */
+ if (curr_value == expected)
+ return true;
+
+ min_value = min(old_value, curr_value);
+ max_value = max(old_value, curr_value);
+
+ /* Within the before-and-after range is ok. */
+ if (expected >= min_value && expected <= max_value)
+ return true;
+
+ /*
+ * If the difference between the two summations is too large, the fs
+ * might just be busy and so we'll mark the scrub incomplete. Return
+ * true here so that we don't mark the counter corrupt.
+ *
+ * XXX: In the future when userspace can grant scrub permission to
+ * quiesce the filesystem to solve the outsized variance problem, this
+ * check should be moved up and the return code changed to signal to
+ * userspace that we need quiesce permission.
+ */
+ if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) {
+ xchk_set_incomplete(sc);
+ return true;
+ }
+
+ return false;
+}
+
+/* Check the superblock counters. */
+int
+xchk_fscounters(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xchk_fscounters *fsc = sc->buf;
+ int64_t icount, ifree, fdblocks;
+ int error;
+
+ /* Snapshot the percpu counters. */
+ icount = percpu_counter_sum(&mp->m_icount);
+ ifree = percpu_counter_sum(&mp->m_ifree);
+ fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+
+ /* No negative values, please! */
+ if (icount < 0 || ifree < 0 || fdblocks < 0)
+ xchk_set_corrupt(sc);
+
+ /* See if icount is obviously wrong. */
+ if (icount < fsc->icount_min || icount > fsc->icount_max)
+ xchk_set_corrupt(sc);
+
+ /* See if fdblocks is obviously wrong. */
+ if (fdblocks > mp->m_sb.sb_dblocks)
+ xchk_set_corrupt(sc);
+
+ /*
+ * XXX: We can't quiesce percpu counter updates, so exit early.
+ * This can be re-enabled when we gain exclusive freeze functionality.
+ */
+ return 0;
+
+ /*
+ * If ifree exceeds icount by more than the minimum variance then
+ * something's probably wrong with the counters.
+ */
+ if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
+ xchk_set_corrupt(sc);
+
+ /* Walk the incore AG headers to calculate the expected counters. */
+ error = xchk_fscount_aggregate_agcounts(sc, fsc);
+ if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+ return 0;
+
+ /* Compare the in-core counters with whatever we counted. */
+ if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
+ xchk_set_corrupt(sc);
+
+ if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree))
+ xchk_set_corrupt(sc);
+
+ if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
+ fsc->fdblocks))
+ xchk_set_corrupt(sc);
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
new file mode 100644
index 000000000..aa65ec88a
--- /dev/null
+++ b/fs/xfs/scrub/health.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_btree.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
+#include "xfs_health.h"
+#include "scrub/scrub.h"
+#include "scrub/health.h"
+
+/*
+ * Scrub and In-Core Filesystem Health Assessments
+ * ===============================================
+ *
+ * Online scrub and repair have the time and the ability to perform stronger
+ * checks than we can do from the metadata verifiers, because they can
+ * cross-reference records between data structures. Therefore, scrub is in a
+ * good position to update the online filesystem health assessments to reflect
+ * the good/bad state of the data structure.
+ *
+ * We therefore extend scrub in the following ways to achieve this:
+ *
+ * 1. Create a "sick_mask" field in the scrub context. When we're setting up a
+ * scrub call, set this to the default XFS_SICK_* flag(s) for the selected
+ * scrub type (call it A). Scrub and repair functions can override the default
+ * sick_mask value if they choose.
+ *
+ * 2. If the scrubber returns a runtime error code, we exit making no changes
+ * to the incore sick state.
+ *
+ * 3. If the scrubber finds that A is clean, use sick_mask to clear the incore
+ * sick flags before exiting.
+ *
+ * 4. If the scrubber finds that A is corrupt, use sick_mask to set the incore
+ * sick flags. If the user didn't want to repair then we exit, leaving the
+ * metadata structure unfixed and the sick flag set.
+ *
+ * 5. Now we know that A is corrupt and the user wants to repair, so run the
+ * repairer. If the repairer returns an error code, we exit with that error
+ * code, having made no further changes to the incore sick state.
+ *
+ * 6. If repair rebuilds A correctly and the subsequent re-scrub of A is clean,
+ * use sick_mask to clear the incore sick flags. This should have the effect
+ * that A is no longer marked sick.
+ *
+ * 7. If repair rebuilds A incorrectly, the re-scrub will find it corrupt and
+ * use sick_mask to set the incore sick flags. This should have no externally
+ * visible effect since we already set them in step (4).
+ *
+ * There are some complications to this story, however. For certain types of
+ * complementary metadata indices (e.g. inobt/finobt), it is easier to rebuild
+ * both structures at the same time. The following principles apply to this
+ * type of repair strategy:
+ *
+ * 8. Any repair function that rebuilds multiple structures should update
+ * sick_mask_visible to reflect whatever other structures are rebuilt, and
+ * verify that all the rebuilt structures can pass a scrub check. The outcomes
+ * of 5-7 still apply, but with a sick_mask that covers everything being
+ * rebuilt.
+ */
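
Steps 3, 4, 6 and 7 above all reduce to the same bitmask update keyed by the scrub outcome: set the sick_mask bits when corruption is found, clear them after a clean pass. A minimal sketch with invented flag names; the kernel uses the XFS_SICK_* families and the per-AG/inode/fs/rt mark helpers shown below:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative sick flags, not the kernel's XFS_SICK_* values. */
#define SICK_INOBT	(1u << 0)
#define SICK_FINOBT	(1u << 1)

struct health_state {
	unsigned int	sick_mask;	/* what this scrub call vouches for */
	unsigned int	sick;		/* accumulated in-core verdicts */
};

/* One update call, keyed by the outcome of the (re)scrub. */
static void update_health(struct health_state *hs, bool found_corruption)
{
	if (found_corruption)
		hs->sick |= hs->sick_mask;
	else
		hs->sick &= ~hs->sick_mask;
}

int main(void)
{
	struct health_state hs = { .sick_mask = SICK_INOBT | SICK_FINOBT };

	update_health(&hs, true);	/* first scan finds corruption */
	printf("after bad scan:   %#x\n", hs.sick);
	update_health(&hs, false);	/* re-scrub after repair is clean */
	printf("after clean scan: %#x\n", hs.sick);
	return 0;
}
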
+
+/* Map our scrub type to a sick mask and a set of health update functions. */
+
+enum xchk_health_group {
+ XHG_FS = 1,
+ XHG_RT,
+ XHG_AG,
+ XHG_INO,
+};
+
+struct xchk_health_map {
+ enum xchk_health_group group;
+ unsigned int sick_mask;
+};
+
+static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
+ [XFS_SCRUB_TYPE_SB] = { XHG_AG, XFS_SICK_AG_SB },
+ [XFS_SCRUB_TYPE_AGF] = { XHG_AG, XFS_SICK_AG_AGF },
+ [XFS_SCRUB_TYPE_AGFL] = { XHG_AG, XFS_SICK_AG_AGFL },
+ [XFS_SCRUB_TYPE_AGI] = { XHG_AG, XFS_SICK_AG_AGI },
+ [XFS_SCRUB_TYPE_BNOBT] = { XHG_AG, XFS_SICK_AG_BNOBT },
+ [XFS_SCRUB_TYPE_CNTBT] = { XHG_AG, XFS_SICK_AG_CNTBT },
+ [XFS_SCRUB_TYPE_INOBT] = { XHG_AG, XFS_SICK_AG_INOBT },
+ [XFS_SCRUB_TYPE_FINOBT] = { XHG_AG, XFS_SICK_AG_FINOBT },
+ [XFS_SCRUB_TYPE_RMAPBT] = { XHG_AG, XFS_SICK_AG_RMAPBT },
+ [XFS_SCRUB_TYPE_REFCNTBT] = { XHG_AG, XFS_SICK_AG_REFCNTBT },
+ [XFS_SCRUB_TYPE_INODE] = { XHG_INO, XFS_SICK_INO_CORE },
+ [XFS_SCRUB_TYPE_BMBTD] = { XHG_INO, XFS_SICK_INO_BMBTD },
+ [XFS_SCRUB_TYPE_BMBTA] = { XHG_INO, XFS_SICK_INO_BMBTA },
+ [XFS_SCRUB_TYPE_BMBTC] = { XHG_INO, XFS_SICK_INO_BMBTC },
+ [XFS_SCRUB_TYPE_DIR] = { XHG_INO, XFS_SICK_INO_DIR },
+ [XFS_SCRUB_TYPE_XATTR] = { XHG_INO, XFS_SICK_INO_XATTR },
+ [XFS_SCRUB_TYPE_SYMLINK] = { XHG_INO, XFS_SICK_INO_SYMLINK },
+ [XFS_SCRUB_TYPE_PARENT] = { XHG_INO, XFS_SICK_INO_PARENT },
+ [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RT, XFS_SICK_RT_BITMAP },
+ [XFS_SCRUB_TYPE_RTSUM] = { XHG_RT, XFS_SICK_RT_SUMMARY },
+ [XFS_SCRUB_TYPE_UQUOTA] = { XHG_FS, XFS_SICK_FS_UQUOTA },
+ [XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA },
+ [XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA },
+ [XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS },
+};
+
+/* Return the health status mask for this scrub type. */
+unsigned int
+xchk_health_mask_for_scrub_type(
+ __u32 scrub_type)
+{
+ return type_to_health_flag[scrub_type].sick_mask;
+}
+
+/*
+ * Update filesystem health assessments based on what we found and did.
+ *
+ * If the scrubber finds errors, we mark whatever's mentioned in
+ * sick_mask as sick, no matter whether this is a first scan or an
+ * evaluation of repair effectiveness.
+ *
+ * Otherwise, no direct corruption was found, so mark whatever's in
+ * sick_mask as healthy.
+ */
+void
+xchk_update_health(
+ struct xfs_scrub *sc)
+{
+ struct xfs_perag *pag;
+ bool bad;
+
+ if (!sc->sick_mask)
+ return;
+
+ bad = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT));
+ switch (type_to_health_flag[sc->sm->sm_type].group) {
+ case XHG_AG:
+ pag = xfs_perag_get(sc->mp, sc->sm->sm_agno);
+ if (bad)
+ xfs_ag_mark_sick(pag, sc->sick_mask);
+ else
+ xfs_ag_mark_healthy(pag, sc->sick_mask);
+ xfs_perag_put(pag);
+ break;
+ case XHG_INO:
+ if (!sc->ip)
+ return;
+ if (bad)
+ xfs_inode_mark_sick(sc->ip, sc->sick_mask);
+ else
+ xfs_inode_mark_healthy(sc->ip, sc->sick_mask);
+ break;
+ case XHG_FS:
+ if (bad)
+ xfs_fs_mark_sick(sc->mp, sc->sick_mask);
+ else
+ xfs_fs_mark_healthy(sc->mp, sc->sick_mask);
+ break;
+ case XHG_RT:
+ if (bad)
+ xfs_rt_mark_sick(sc->mp, sc->sick_mask);
+ else
+ xfs_rt_mark_healthy(sc->mp, sc->sick_mask);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+
+/* Is the given per-AG btree healthy enough for scanning? */
+bool
+xchk_ag_btree_healthy_enough(
+ struct xfs_scrub *sc,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum)
+{
+ unsigned int mask = 0;
+
+ /*
+ * We always want the cursor if it's the same type as whatever we're
+ * scrubbing, even if we already know the structure is corrupt.
+ *
+ * Otherwise, we're only interested in the btree for cross-referencing.
+ * If we know the btree is bad then don't bother, just set XFAIL.
+ */
+ switch (btnum) {
+ case XFS_BTNUM_BNO:
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT)
+ return true;
+ mask = XFS_SICK_AG_BNOBT;
+ break;
+ case XFS_BTNUM_CNT:
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT)
+ return true;
+ mask = XFS_SICK_AG_CNTBT;
+ break;
+ case XFS_BTNUM_INO:
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
+ return true;
+ mask = XFS_SICK_AG_INOBT;
+ break;
+ case XFS_BTNUM_FINO:
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT)
+ return true;
+ mask = XFS_SICK_AG_FINOBT;
+ break;
+ case XFS_BTNUM_RMAP:
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT)
+ return true;
+ mask = XFS_SICK_AG_RMAPBT;
+ break;
+ case XFS_BTNUM_REFC:
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT)
+ return true;
+ mask = XFS_SICK_AG_REFCNTBT;
+ break;
+ default:
+ ASSERT(0);
+ return true;
+ }
+
+ if (xfs_ag_has_sickness(pag, mask)) {
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
+ return false;
+ }
+
+ return true;
+}
diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h
new file mode 100644
index 000000000..d0b938d3d
--- /dev/null
+++ b/fs/xfs/scrub/health.h
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_HEALTH_H__
+#define __XFS_SCRUB_HEALTH_H__
+
+unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type);
+void xchk_update_health(struct xfs_scrub *sc);
+bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag,
+ xfs_btnum_t btnum);
+
+#endif /* __XFS_SCRUB_HEALTH_H__ */
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
new file mode 100644
index 000000000..e312be7cd
--- /dev/null
+++ b/fs/xfs/scrub/ialloc.c
@@ -0,0 +1,659 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "xfs_ag.h"
+
+/*
+ * Set us up to scrub inode btrees.
+ * If we detect a discrepancy between the inobt and the inode,
+ * try again after forcing logged inode cores out to disk.
+ */
+int
+xchk_setup_ag_iallocbt(
+ struct xfs_scrub *sc)
+{
+ return xchk_setup_ag_btree(sc, sc->flags & XCHK_TRY_HARDER);
+}
+
+/* Inode btree scrubber. */
+
+struct xchk_iallocbt {
+ /* Number of inodes we see while scanning inobt. */
+ unsigned long long inodes;
+
+ /* Expected next startino, for big block filesystems. */
+ xfs_agino_t next_startino;
+
+ /* Expected end of the current inode cluster. */
+ xfs_agino_t next_cluster_ino;
+};
+
+/*
+ * If we're checking the finobt, cross-reference with the inobt.
+ * Otherwise we're checking the inobt; if there is a finobt, make sure
+ * we have a record or not depending on freecount.
+ */
+static inline void
+xchk_iallocbt_chunk_xref_other(
+ struct xfs_scrub *sc,
+ struct xfs_inobt_rec_incore *irec,
+ xfs_agino_t agino)
+{
+ struct xfs_btree_cur **pcur;
+ bool has_irec;
+ int error;
+
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT)
+ pcur = &sc->sa.ino_cur;
+ else
+ pcur = &sc->sa.fino_cur;
+ if (!(*pcur))
+ return;
+ error = xfs_ialloc_has_inode_record(*pcur, agino, agino, &has_irec);
+ if (!xchk_should_check_xref(sc, &error, pcur))
+ return;
+ if (((irec->ir_freecount > 0 && !has_irec) ||
+ (irec->ir_freecount == 0 && has_irec)))
+ xchk_btree_xref_set_corrupt(sc, *pcur, 0);
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xchk_iallocbt_chunk_xref(
+ struct xfs_scrub *sc,
+ struct xfs_inobt_rec_incore *irec,
+ xfs_agino_t agino,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xchk_xref_is_used_space(sc, agbno, len);
+ xchk_iallocbt_chunk_xref_other(sc, irec, agino);
+ xchk_xref_is_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES);
+ xchk_xref_is_not_shared(sc, agbno, len);
+}
+
+/* Is this chunk worth checking? */
+STATIC bool
+xchk_iallocbt_chunk(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec,
+ xfs_agino_t agino,
+ xfs_extlen_t len)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
+ xfs_agblock_t bno;
+
+ bno = XFS_AGINO_TO_AGBNO(mp, agino);
+
+ if (!xfs_verify_agbext(pag, bno, len))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len);
+
+ return true;
+}
+
+/* Count the number of free inodes. */
+static unsigned int
+xchk_iallocbt_freecount(
+ xfs_inofree_t freemask)
+{
+ BUILD_BUG_ON(sizeof(freemask) != sizeof(__u64));
+ return hweight64(freemask);
+}
+
+/*
+ * Check that an inode's allocation status matches ir_free in the inobt
+ * record. First we try querying the in-core inode state, and if the inode
+ * isn't loaded we examine the on-disk inode directly.
+ *
+ * Since there can be 1:M and M:1 mappings between inobt records and inode
+ * clusters, we pass in the inode location information as an inobt record;
+ * the index of an inode cluster within the inobt record (as well as the
+ * cluster buffer itself); and the index of the inode within the cluster.
+ *
+ * @irec is the inobt record.
+ * @irec_ino is the inode offset from the start of the record.
+ * @dip is the on-disk inode.
+ */
+STATIC int
+xchk_iallocbt_check_cluster_ifree(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec,
+ unsigned int irec_ino,
+ struct xfs_dinode *dip)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ xfs_ino_t fsino;
+ xfs_agino_t agino;
+ bool irec_free;
+ bool ino_inuse;
+ bool freemask_ok;
+ int error = 0;
+
+ if (xchk_should_terminate(bs->sc, &error))
+ return error;
+
+ /*
+ * Given an inobt record and the offset of an inode from the start of
+ * the record, compute which fs inode we're talking about.
+ */
+ agino = irec->ir_startino + irec_ino;
+ fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.pag->pag_agno, agino);
+ irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
+
+ if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
+ (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ goto out;
+ }
+
+ error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino,
+ &ino_inuse);
+ if (error == -ENODATA) {
+ /* Not cached, just read the disk buffer */
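+ /*
+ * A free inode must have di_mode == 0 on disk and an in-use
+ * inode a nonzero mode, so the two views agree only when exactly
+ * one of irec_free and di_mode is nonzero. If they disagree and
+ * we haven't yet been told to try harder, return -EDEADLOCK so
+ * the scrub is retried after logged inode cores are flushed to
+ * disk.
+ */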
+ freemask_ok = irec_free ^ !!(dip->di_mode);
+ if (!(bs->sc->flags & XCHK_TRY_HARDER) && !freemask_ok)
+ return -EDEADLOCK;
+ } else if (error < 0) {
+ /*
+ * Inode is only half assembled, or there was an IO error,
+ * or the verifier failed, so don't bother trying to check.
+ * The inode scrubber can deal with this.
+ */
+ goto out;
+ } else {
+ /* Inode is all there. */
+ freemask_ok = irec_free ^ ino_inuse;
+ }
+ if (!freemask_ok)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+out:
+ return 0;
+}
+
+/*
+ * Check that the holemask and freemask of a hypothetical inode cluster match
+ * what's actually on disk. If sparse inodes are enabled, the cluster does
+ * not actually have to map to inodes if the corresponding holemask bit is set.
+ *
+ * @cluster_base is the first inode in the cluster within the @irec.
+ */
+STATIC int
+xchk_iallocbt_check_cluster(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec,
+ unsigned int cluster_base)
+{
+ struct xfs_imap imap;
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_buf *cluster_bp;
+ unsigned int nr_inodes;
+ xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
+ xfs_agblock_t agbno;
+ unsigned int cluster_index;
+ uint16_t cluster_mask = 0;
+ uint16_t ir_holemask;
+ int error = 0;
+
+ nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+ M_IGEO(mp)->inodes_per_cluster);
+
+ /* Map this inode cluster */
+ agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base);
+
+ /* Compute a bitmask for this cluster that can be used for holemask. */
+ for (cluster_index = 0;
+ cluster_index < nr_inodes;
+ cluster_index += XFS_INODES_PER_HOLEMASK_BIT)
+ cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) /
+ XFS_INODES_PER_HOLEMASK_BIT);
+
+ /*
+ * Map the first inode of this cluster to a buffer and offset.
+ * Be careful about inobt records that don't align with the start of
+ * the inode buffer when block sizes are large enough to hold multiple
+ * inode chunks. When this happens, cluster_base will be zero but
+ * ir_startino can be large enough to make im_boffset nonzero.
+ */
+ ir_holemask = (irec->ir_holemask & cluster_mask);
+ imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ imap.im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
+ imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino) <<
+ mp->m_sb.sb_inodelog;
+
+ if (imap.im_boffset != 0 && cluster_base != 0) {
+ ASSERT(imap.im_boffset == 0 || cluster_base == 0);
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino,
+ imap.im_blkno, imap.im_len, cluster_base, nr_inodes,
+ cluster_mask, ir_holemask,
+ XFS_INO_TO_OFFSET(mp, irec->ir_startino +
+ cluster_base));
+
+ /* The whole cluster must be a hole or not a hole. */
+ if (ir_holemask != cluster_mask && ir_holemask != 0) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ /* If any part of this is a hole, skip it. */
+ if (ir_holemask) {
+ xchk_xref_is_not_owned_by(bs->sc, agbno,
+ M_IGEO(mp)->blocks_per_cluster,
+ &XFS_RMAP_OINFO_INODES);
+ return 0;
+ }
+
+ xchk_xref_is_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster,
+ &XFS_RMAP_OINFO_INODES);
+
+ /* Grab the inode cluster buffer. */
+ error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &cluster_bp);
+ if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error))
+ return error;
+
+ /* Check free status of each inode within this cluster. */
+ for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
+ struct xfs_dinode *dip;
+
+ if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ break;
+ }
+
+ dip = xfs_buf_offset(cluster_bp, imap.im_boffset);
+ error = xchk_iallocbt_check_cluster_ifree(bs, irec,
+ cluster_base + cluster_index, dip);
+ if (error)
+ break;
+ imap.im_boffset += mp->m_sb.sb_inodesize;
+ }
+
+ xfs_trans_brelse(bs->cur->bc_tp, cluster_bp);
+ return error;
+}
+
+/*
+ * For all the inode clusters that could map to this inobt record, make sure
+ * that the holemask makes sense and that the allocation status of each inode
+ * matches the freemask.
+ */
+STATIC int
+xchk_iallocbt_check_clusters(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ unsigned int cluster_base;
+ int error = 0;
+
+ /*
+ * For the common case where this inobt record maps to multiple inode
+ * clusters this will call _check_cluster for each cluster.
+ *
+ * For the case that multiple inobt records map to a single cluster,
+ * this will call _check_cluster once.
+ */
+ for (cluster_base = 0;
+ cluster_base < XFS_INODES_PER_CHUNK;
+ cluster_base += M_IGEO(bs->sc->mp)->inodes_per_cluster) {
+ error = xchk_iallocbt_check_cluster(bs, irec, cluster_base);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * Make sure this inode btree record is aligned properly. Because a fs block
+ * contains multiple inodes, we check that the inobt record is aligned to the
+ * correct inode, not just the correct block on disk. This results in a finer
+ * grained corruption check.
+ */
+STATIC void
+xchk_iallocbt_rec_alignment(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ struct xfs_mount *mp = bs->sc->mp;
+ struct xchk_iallocbt *iabt = bs->private;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+
+ /*
+ * finobt records have different positioning requirements than inobt
+ * records: each finobt record must have a corresponding inobt record.
+ * That is checked in the xref function, so for now we only catch the
+ * obvious case where the record isn't at all aligned properly.
+ *
+ * Note that if a fs block contains more than a single chunk of inodes,
+ * we will have finobt records only for those chunks containing free
+ * inodes, and therefore expect chunk alignment of finobt records.
+ * Otherwise, we expect that the finobt record is aligned to the
+ * cluster alignment as told by the superblock.
+ */
+ if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+ unsigned int imask;
+
+ imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+ igeo->cluster_align_inodes) - 1;
+ if (irec->ir_startino & imask)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
+
+ if (iabt->next_startino != NULLAGINO) {
+ /*
+ * We're midway through a cluster of inodes that is mapped by
+ * multiple inobt records. Did we get the record for the next
+ * irec in the sequence?
+ */
+ if (irec->ir_startino != iabt->next_startino) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
+
+ iabt->next_startino += XFS_INODES_PER_CHUNK;
+
+ /* Are we done with the cluster? */
+ if (iabt->next_startino >= iabt->next_cluster_ino) {
+ iabt->next_startino = NULLAGINO;
+ iabt->next_cluster_ino = NULLAGINO;
+ }
+ return;
+ }
+
+ /* inobt records must be aligned to cluster and inode alignment size. */
+ if (irec->ir_startino & (igeo->cluster_align_inodes - 1)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
+
+ if (irec->ir_startino & (igeo->inodes_per_cluster - 1)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
+
+ if (igeo->inodes_per_cluster <= XFS_INODES_PER_CHUNK)
+ return;
+
+ /*
+ * If this is the start of an inode cluster that can be mapped by
+ * multiple inobt records, the next inobt record must follow exactly
+ * after this one.
+ */
+ iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK;
+ iabt->next_cluster_ino = irec->ir_startino + igeo->inodes_per_cluster;
+}
+
+/* Scrub an inobt/finobt record. */
+STATIC int
+xchk_iallocbt_rec(
+ struct xchk_btree *bs,
+ const union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
+ struct xchk_iallocbt *iabt = bs->private;
+ struct xfs_inobt_rec_incore irec;
+ uint64_t holes;
+ xfs_agino_t agino;
+ xfs_extlen_t len;
+ int holecount;
+ int i;
+ int error = 0;
+ unsigned int real_freecount;
+ uint16_t holemask;
+
+ xfs_inobt_btrec_to_irec(mp, rec, &irec);
+
+ if (irec.ir_count > XFS_INODES_PER_CHUNK ||
+ irec.ir_freecount > XFS_INODES_PER_CHUNK)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ real_freecount = irec.ir_freecount +
+ (XFS_INODES_PER_CHUNK - irec.ir_count);
+ if (real_freecount != xchk_iallocbt_freecount(irec.ir_free))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ agino = irec.ir_startino;
+ /* Record has to be properly aligned within the AG. */
+ if (!xfs_verify_agino(pag, agino) ||
+ !xfs_verify_agino(pag, agino + XFS_INODES_PER_CHUNK - 1)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ goto out;
+ }
+
+ xchk_iallocbt_rec_alignment(bs, &irec);
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ iabt->inodes += irec.ir_count;
+
+ /* Handle non-sparse inodes */
+ if (!xfs_inobt_issparse(irec.ir_holemask)) {
+ len = XFS_B_TO_FSB(mp,
+ XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize);
+ if (irec.ir_count != XFS_INODES_PER_CHUNK)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
+ goto out;
+ goto check_clusters;
+ }
+
+ /* Check each chunk of a sparse inode cluster. */
+ holemask = irec.ir_holemask;
+ holecount = 0;
+ len = XFS_B_TO_FSB(mp,
+ XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize);
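+ /*
+ * Inodes that aren't physically present in the chunk (the holes)
+ * must also be recorded as free, and the free count can never
+ * exceed the number of inodes actually present.
+ */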
+ holes = ~xfs_inobt_irec_to_allocmask(&irec);
+ if ((holes & irec.ir_free) != holes ||
+ irec.ir_freecount > irec.ir_count)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) {
+ if (holemask & 1)
+ holecount += XFS_INODES_PER_HOLEMASK_BIT;
+ else if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
+ break;
+ holemask >>= 1;
+ agino += XFS_INODES_PER_HOLEMASK_BIT;
+ }
+
+ if (holecount > XFS_INODES_PER_CHUNK ||
+ holecount + irec.ir_count != XFS_INODES_PER_CHUNK)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+check_clusters:
+ error = xchk_iallocbt_check_clusters(bs, &irec);
+ if (error)
+ goto out;
+
+out:
+ return error;
+}
+
+/*
+ * Make sure the inode btrees are as large as the rmap thinks they are.
+ * Don't bother if we're missing btree cursors, as we're already corrupt.
+ */
+STATIC void
+xchk_iallocbt_xref_rmap_btreeblks(
+ struct xfs_scrub *sc,
+ int which)
+{
+ xfs_filblks_t blocks;
+ xfs_extlen_t inobt_blocks = 0;
+ xfs_extlen_t finobt_blocks = 0;
+ int error;
+
+ if (!sc->sa.ino_cur || !sc->sa.rmap_cur ||
+ (xfs_has_finobt(sc->mp) && !sc->sa.fino_cur) ||
+ xchk_skip_xref(sc->sm))
+ return;
+
+ /* Check that we saw as many inobt blocks as the rmap says. */
+ error = xfs_btree_count_blocks(sc->sa.ino_cur, &inobt_blocks);
+ if (!xchk_process_error(sc, 0, 0, &error))
+ return;
+
+ if (sc->sa.fino_cur) {
+ error = xfs_btree_count_blocks(sc->sa.fino_cur, &finobt_blocks);
+ if (!xchk_process_error(sc, 0, 0, &error))
+ return;
+ }
+
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+ &XFS_RMAP_OINFO_INOBT, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ if (blocks != inobt_blocks + finobt_blocks)
+ xchk_btree_set_corrupt(sc, sc->sa.ino_cur, 0);
+}
+
+/*
+ * Make sure that the inobt records point to the same number of blocks as
+ * the rmap says are owned by inodes.
+ */
+STATIC void
+xchk_iallocbt_xref_rmap_inodes(
+ struct xfs_scrub *sc,
+ int which,
+ unsigned long long inodes)
+{
+ xfs_filblks_t blocks;
+ xfs_filblks_t inode_blocks;
+ int error;
+
+ if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ /* Check that we saw as many inode blocks as the rmap knows about. */
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+ &XFS_RMAP_OINFO_INODES, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ inode_blocks = XFS_B_TO_FSB(sc->mp, inodes * sc->mp->m_sb.sb_inodesize);
+ if (blocks != inode_blocks)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+}
+
+/* Scrub the inode btrees for some AG. */
+STATIC int
+xchk_iallocbt(
+ struct xfs_scrub *sc,
+ xfs_btnum_t which)
+{
+ struct xfs_btree_cur *cur;
+ struct xchk_iallocbt iabt = {
+ .inodes = 0,
+ .next_startino = NULLAGINO,
+ .next_cluster_ino = NULLAGINO,
+ };
+ int error;
+
+ cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur;
+ error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT,
+ &iabt);
+ if (error)
+ return error;
+
+ xchk_iallocbt_xref_rmap_btreeblks(sc, which);
+
+ /*
+ * If we're scrubbing the inode btree, inode_blocks is the number of
+ * blocks pointed to by all the inode chunk records. Therefore, we
+ * should compare to the number of inode chunk blocks that the rmap
+ * knows about. We can't do this for the finobt since it only points
+ * to inode chunks with free inodes.
+ */
+ if (which == XFS_BTNUM_INO)
+ xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes);
+
+ return error;
+}
+
+int
+xchk_inobt(
+ struct xfs_scrub *sc)
+{
+ return xchk_iallocbt(sc, XFS_BTNUM_INO);
+}
+
+int
+xchk_finobt(
+ struct xfs_scrub *sc)
+{
+ return xchk_iallocbt(sc, XFS_BTNUM_FINO);
+}
+
+/* See if an inode btree has (or doesn't have) an inode chunk record. */
+static inline void
+xchk_xref_inode_check(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len,
+ struct xfs_btree_cur **icur,
+ bool should_have_inodes)
+{
+ bool has_inodes;
+ int error;
+
+ if (!(*icur) || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes);
+ if (!xchk_should_check_xref(sc, &error, icur))
+ return;
+ if (has_inodes != should_have_inodes)
+ xchk_btree_xref_set_corrupt(sc, *icur, 0);
+}
+
+/* xref check that the extent is not covered by inodes */
+void
+xchk_xref_is_not_inode_chunk(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, false);
+ xchk_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur, false);
+}
+
+/* xref check that the extent is covered by inodes */
+void
+xchk_xref_is_inode_chunk(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, true);
+}
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
new file mode 100644
index 000000000..51820b40a
--- /dev/null
+++ b/fs/xfs/scrub/inode.c
@@ -0,0 +1,628 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_da_format.h"
+#include "xfs_reflink.h"
+#include "xfs_rmap.h"
+#include "xfs_bmap_util.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+
+/*
+ * Grab total control of the inode metadata. It doesn't matter here if
+ * the file data is still changing; exclusive access to the metadata is
+ * the goal.
+ */
+int
+xchk_setup_inode(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ /*
+ * Try to get the inode. If the verifiers fail, we try again
+ * in raw mode.
+ */
+ error = xchk_get_inode(sc);
+ switch (error) {
+ case 0:
+ break;
+ case -EFSCORRUPTED:
+ case -EFSBADCRC:
+ return xchk_trans_alloc(sc, 0);
+ default:
+ return error;
+ }
+
+ /* Got the inode, lock it and we're ready to go. */
+ sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ goto out;
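+ /*
+ * Follow the usual XFS locking order: iolock and mmaplock first,
+ * then transaction allocation, and only then the ilock.
+ */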
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+
+out:
+ /* scrub teardown will unlock and release the inode for us */
+ return error;
+}
+
+/* Inode core */
+
+/* Validate di_extsize hint. */
+STATIC void
+xchk_inode_extsize(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino,
+ uint16_t mode,
+ uint16_t flags)
+{
+ xfs_failaddr_t fa;
+ uint32_t value = be32_to_cpu(dip->di_extsize);
+
+ fa = xfs_inode_validate_extsize(sc->mp, value, mode, flags);
+ if (fa)
+ xchk_ino_set_corrupt(sc, ino);
+
+ /*
+ * XFS allows a sysadmin to change the rt extent size when adding a rt
+ * section to a filesystem after formatting. If there are any
+ * directories with extszinherit and rtinherit set, the hint could
+ * become misaligned with the new rextsize. The verifier doesn't check
+ * this, because we allow rtinherit directories even without an rt
+ * device. Flag this as an administrative warning since we will clean
+ * this up eventually.
+ */
+ if ((flags & XFS_DIFLAG_RTINHERIT) &&
+ (flags & XFS_DIFLAG_EXTSZINHERIT) &&
+ value % sc->mp->m_sb.sb_rextsize > 0)
+ xchk_ino_set_warning(sc, ino);
+}
+
+/*
+ * Validate di_cowextsize hint.
+ *
+ * The rules are documented at xfs_ioctl_setattr_check_cowextsize().
+ * These functions must be kept in sync with each other.
+ */
+STATIC void
+xchk_inode_cowextsize(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino,
+ uint16_t mode,
+ uint16_t flags,
+ uint64_t flags2)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_inode_validate_cowextsize(sc->mp,
+ be32_to_cpu(dip->di_cowextsize), mode, flags,
+ flags2);
+ if (fa)
+ xchk_ino_set_corrupt(sc, ino);
+}
+
+/* Make sure the di_flags make sense for the inode. */
+STATIC void
+xchk_inode_flags(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino,
+ uint16_t mode,
+ uint16_t flags)
+{
+ struct xfs_mount *mp = sc->mp;
+
+ /* di_flags are all taken, last bit cannot be used */
+ if (flags & ~XFS_DIFLAG_ANY)
+ goto bad;
+
+ /* rt flags require rt device */
+ if ((flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp)
+ goto bad;
+
+ /* new rt bitmap flag only valid for rbmino */
+ if ((flags & XFS_DIFLAG_NEWRTBM) && ino != mp->m_sb.sb_rbmino)
+ goto bad;
+
+ /* directory-only flags */
+ if ((flags & (XFS_DIFLAG_RTINHERIT |
+ XFS_DIFLAG_EXTSZINHERIT |
+ XFS_DIFLAG_PROJINHERIT |
+ XFS_DIFLAG_NOSYMLINKS)) &&
+ !S_ISDIR(mode))
+ goto bad;
+
+ /* file-only flags */
+ if ((flags & (XFS_DIFLAG_REALTIME | FS_XFLAG_EXTSIZE)) &&
+ !S_ISREG(mode))
+ goto bad;
+
+ /* filestreams and rt make no sense */
+ if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME))
+ goto bad;
+
+ return;
+bad:
+ xchk_ino_set_corrupt(sc, ino);
+}
+
+/* Make sure the di_flags2 make sense for the inode. */
+STATIC void
+xchk_inode_flags2(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino,
+ uint16_t mode,
+ uint16_t flags,
+ uint64_t flags2)
+{
+ struct xfs_mount *mp = sc->mp;
+
+ /* Unknown di_flags2 could be from a future kernel */
+ if (flags2 & ~XFS_DIFLAG2_ANY)
+ xchk_ino_set_warning(sc, ino);
+
+ /* reflink flag requires reflink feature */
+ if ((flags2 & XFS_DIFLAG2_REFLINK) &&
+ !xfs_has_reflink(mp))
+ goto bad;
+
+ /* cowextsize flag is checked w.r.t. mode separately */
+
+ /* file/dir-only flags */
+ if ((flags2 & XFS_DIFLAG2_DAX) && !(S_ISREG(mode) || S_ISDIR(mode)))
+ goto bad;
+
+ /* file-only flags */
+ if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode))
+ goto bad;
+
+ /* realtime and reflink make no sense, currently */
+ if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK))
+ goto bad;
+
+ /* no bigtime iflag without the bigtime feature */
+ if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp))
+ goto bad;
+
+ return;
+bad:
+ xchk_ino_set_corrupt(sc, ino);
+}
+
+static inline void
+xchk_dinode_nsec(
+ struct xfs_scrub *sc,
+ xfs_ino_t ino,
+ struct xfs_dinode *dip,
+ const xfs_timestamp_t ts)
+{
+ struct timespec64 tv;
+
+ tv = xfs_inode_from_disk_ts(dip, ts);
+ if (tv.tv_nsec < 0 || tv.tv_nsec >= NSEC_PER_SEC)
+ xchk_ino_set_corrupt(sc, ino);
+}
+
+/* Scrub all the ondisk inode fields. */
+STATIC void
+xchk_dinode(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino)
+{
+ struct xfs_mount *mp = sc->mp;
+ size_t fork_recs;
+ unsigned long long isize;
+ uint64_t flags2;
+ xfs_extnum_t nextents;
+ xfs_extnum_t naextents;
+ prid_t prid;
+ uint16_t flags;
+ uint16_t mode;
+
+ flags = be16_to_cpu(dip->di_flags);
+ if (dip->di_version >= 3)
+ flags2 = be64_to_cpu(dip->di_flags2);
+ else
+ flags2 = 0;
+
+ /* di_mode */
+ mode = be16_to_cpu(dip->di_mode);
+ switch (mode & S_IFMT) {
+ case S_IFLNK:
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+ /* mode is recognized */
+ break;
+ default:
+ xchk_ino_set_corrupt(sc, ino);
+ break;
+ }
+
+ /* v1/v2 fields */
+ switch (dip->di_version) {
+ case 1:
+ /*
+ * We autoconvert v1 inodes into v2 inodes on writeout,
+ * so just mark this inode for preening.
+ */
+ xchk_ino_set_preen(sc, ino);
+ prid = 0;
+ break;
+ case 2:
+ case 3:
+ if (dip->di_onlink != 0)
+ xchk_ino_set_corrupt(sc, ino);
+
+ if (dip->di_mode == 0 && sc->ip)
+ xchk_ino_set_corrupt(sc, ino);
+
+ if (dip->di_projid_hi != 0 &&
+ !xfs_has_projid32(mp))
+ xchk_ino_set_corrupt(sc, ino);
+
+ prid = be16_to_cpu(dip->di_projid_lo);
+ break;
+ default:
+ xchk_ino_set_corrupt(sc, ino);
+ return;
+ }
+
+ if (xfs_has_projid32(mp))
+ prid |= (prid_t)be16_to_cpu(dip->di_projid_hi) << 16;
+
+ /*
+ * di_uid/di_gid -- -1 isn't invalid, but there's no way that
+ * userspace could have created that.
+ */
+ if (dip->di_uid == cpu_to_be32(-1U) ||
+ dip->di_gid == cpu_to_be32(-1U))
+ xchk_ino_set_warning(sc, ino);
+
+ /*
+ * project id of -1 isn't supposed to be valid, but the kernel didn't
+ * always validate that.
+ */
+ if (prid == -1U)
+ xchk_ino_set_warning(sc, ino);
+
+ /* di_format */
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ if (!S_ISCHR(mode) && !S_ISBLK(mode) &&
+ !S_ISFIFO(mode) && !S_ISSOCK(mode))
+ xchk_ino_set_corrupt(sc, ino);
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ if (!S_ISDIR(mode) && !S_ISLNK(mode))
+ xchk_ino_set_corrupt(sc, ino);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode))
+ xchk_ino_set_corrupt(sc, ino);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (!S_ISREG(mode) && !S_ISDIR(mode))
+ xchk_ino_set_corrupt(sc, ino);
+ break;
+ case XFS_DINODE_FMT_UUID:
+ default:
+ xchk_ino_set_corrupt(sc, ino);
+ break;
+ }
+
+ /* di_[amc]time.nsec */
+ xchk_dinode_nsec(sc, ino, dip, dip->di_atime);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_mtime);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_ctime);
+
+ /*
+ * di_size. xfs_dinode_verify checks for things that screw up
+ * the VFS such as the upper bit being set and zero-length
+ * symlinks/directories, but we can do more here.
+ */
+ isize = be64_to_cpu(dip->di_size);
+ if (isize & (1ULL << 63))
+ xchk_ino_set_corrupt(sc, ino);
+
+ /* Devices, fifos, and sockets must have zero size */
+ if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0)
+ xchk_ino_set_corrupt(sc, ino);
+
+ /* Directories can't be larger than the data section size (32G) */
+ if (S_ISDIR(mode) && (isize == 0 || isize >= XFS_DIR2_SPACE_SIZE))
+ xchk_ino_set_corrupt(sc, ino);
+
+ /* Symlinks can't be larger than SYMLINK_MAXLEN */
+ if (S_ISLNK(mode) && (isize == 0 || isize >= XFS_SYMLINK_MAXLEN))
+ xchk_ino_set_corrupt(sc, ino);
+
+ /*
+ * Warn if the running kernel can't handle the kinds of offsets
+ * needed to deal with the file size. In other words, if the
+ * pagecache can't cache all the blocks in this file due to
+ * overly large offsets, flag the inode for admin review.
+ */
+ if (isize >= mp->m_super->s_maxbytes)
+ xchk_ino_set_warning(sc, ino);
+
+ /* di_nblocks */
+ if (flags2 & XFS_DIFLAG2_REFLINK) {
+ ; /* nblocks can exceed dblocks */
+ } else if (flags & XFS_DIFLAG_REALTIME) {
+ /*
+ * nblocks is the sum of data extents (in the rtdev),
+ * attr extents (in the datadev), and both forks' bmbt
+ * blocks (in the datadev). This clumsy check is the
+ * best we can do without cross-referencing with the
+ * inode forks.
+ */
+ if (be64_to_cpu(dip->di_nblocks) >=
+ mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks)
+ xchk_ino_set_corrupt(sc, ino);
+ } else {
+ if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks)
+ xchk_ino_set_corrupt(sc, ino);
+ }
+
+ xchk_inode_flags(sc, dip, ino, mode, flags);
+
+ xchk_inode_extsize(sc, dip, ino, mode, flags);
+
+ nextents = xfs_dfork_data_extents(dip);
+ naextents = xfs_dfork_attr_extents(dip);
+
+ /* di_nextents */
+ fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ if (nextents > fork_recs)
+ xchk_ino_set_corrupt(sc, ino);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (nextents <= fork_recs)
+ xchk_ino_set_corrupt(sc, ino);
+ break;
+ default:
+ if (nextents != 0)
+ xchk_ino_set_corrupt(sc, ino);
+ break;
+ }
+
+ /* di_forkoff */
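+ /*
+ * The attr fork must start within the on-disk inode; a nonzero
+ * attr extent count requires a nonzero fork offset; and an inode
+ * with no attr fork must use the default extents format.
+ */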
+ if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
+ xchk_ino_set_corrupt(sc, ino);
+ if (naextents != 0 && dip->di_forkoff == 0)
+ xchk_ino_set_corrupt(sc, ino);
+ if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS)
+ xchk_ino_set_corrupt(sc, ino);
+
+ /* di_aformat */
+ if (dip->di_aformat != XFS_DINODE_FMT_LOCAL &&
+ dip->di_aformat != XFS_DINODE_FMT_EXTENTS &&
+ dip->di_aformat != XFS_DINODE_FMT_BTREE)
+ xchk_ino_set_corrupt(sc, ino);
+
+ /* di_anextents */
+ fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
+ switch (dip->di_aformat) {
+ case XFS_DINODE_FMT_EXTENTS:
+ if (naextents > fork_recs)
+ xchk_ino_set_corrupt(sc, ino);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (naextents <= fork_recs)
+ xchk_ino_set_corrupt(sc, ino);
+ break;
+ default:
+ if (naextents != 0)
+ xchk_ino_set_corrupt(sc, ino);
+ }
+
+ if (dip->di_version >= 3) {
+ xchk_dinode_nsec(sc, ino, dip, dip->di_crtime);
+ xchk_inode_flags2(sc, dip, ino, mode, flags, flags2);
+ xchk_inode_cowextsize(sc, dip, ino, mode, flags,
+ flags2);
+ }
+}
+
+/*
+ * Make sure the finobt doesn't think this inode is free.
+ * We don't have to check the inobt ourselves because we got the inode via
+ * IGET_UNTRUSTED, which checks the inobt for us.
+ */
+static void
+xchk_inode_xref_finobt(
+ struct xfs_scrub *sc,
+ xfs_ino_t ino)
+{
+ struct xfs_inobt_rec_incore rec;
+ xfs_agino_t agino;
+ int has_record;
+ int error;
+
+ if (!sc->sa.fino_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ agino = XFS_INO_TO_AGINO(sc->mp, ino);
+
+ /*
+ * Try to get the finobt record. If we can't get it, then we're
+ * in good shape.
+ */
+ error = xfs_inobt_lookup(sc->sa.fino_cur, agino, XFS_LOOKUP_LE,
+ &has_record);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur) ||
+ !has_record)
+ return;
+
+ error = xfs_inobt_get_rec(sc->sa.fino_cur, &rec, &has_record);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur) ||
+ !has_record)
+ return;
+
+ /*
+ * Otherwise, make sure this record either doesn't cover this inode,
+ * or that it does but it's marked present.
+ */
+ if (rec.ir_startino > agino ||
+ rec.ir_startino + XFS_INODES_PER_CHUNK <= agino)
+ return;
+
+ if (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))
+ xchk_btree_xref_set_corrupt(sc, sc->sa.fino_cur, 0);
+}
+
+/* Cross reference the inode fields with the forks. */
+STATIC void
+xchk_inode_xref_bmap(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip)
+{
+ xfs_extnum_t nextents;
+ xfs_filblks_t count;
+ xfs_filblks_t acount;
+ int error;
+
+ if (xchk_skip_xref(sc->sm))
+ return;
+
+ /* Walk all the extents to check nextents/naextents/nblocks. */
+ error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
+ &nextents, &count);
+ if (!xchk_should_check_xref(sc, &error, NULL))
+ return;
+ if (nextents < xfs_dfork_data_extents(dip))
+ xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
+
+ error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
+ &nextents, &acount);
+ if (!xchk_should_check_xref(sc, &error, NULL))
+ return;
+ if (nextents != xfs_dfork_attr_extents(dip))
+ xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
+
+ /* Check nblocks against the inode. */
+ if (count + acount != be64_to_cpu(dip->di_nblocks))
+ xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xchk_inode_xref(
+ struct xfs_scrub *sc,
+ xfs_ino_t ino,
+ struct xfs_dinode *dip)
+{
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ agno = XFS_INO_TO_AGNO(sc->mp, ino);
+ agbno = XFS_INO_TO_AGBNO(sc->mp, ino);
+
+ error = xchk_ag_init_existing(sc, agno, &sc->sa);
+ if (!xchk_xref_process_error(sc, agno, agbno, &error))
+ goto out_free;
+
+ xchk_xref_is_used_space(sc, agbno, 1);
+ xchk_inode_xref_finobt(sc, ino);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES);
+ xchk_xref_is_not_shared(sc, agbno, 1);
+ xchk_inode_xref_bmap(sc, dip);
+
+out_free:
+ xchk_ag_free(sc, &sc->sa);
+}
+
+/*
+ * If the reflink iflag disagrees with a scan for shared data fork extents,
+ * either flag an error (shared extents w/ no flag) or a preen (flag set w/o
+ * any shared extents). We already checked for the reflink iflag set on a
+ * non-reflink filesystem.
+ */
+static void
+xchk_inode_check_reflink_iflag(
+ struct xfs_scrub *sc,
+ xfs_ino_t ino)
+{
+ struct xfs_mount *mp = sc->mp;
+ bool has_shared;
+ int error;
+
+ if (!xfs_has_reflink(mp))
+ return;
+
+ error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
+ &has_shared);
+ if (!xchk_xref_process_error(sc, XFS_INO_TO_AGNO(mp, ino),
+ XFS_INO_TO_AGBNO(mp, ino), &error))
+ return;
+ if (xfs_is_reflink_inode(sc->ip) && !has_shared)
+ xchk_ino_set_preen(sc, ino);
+ else if (!xfs_is_reflink_inode(sc->ip) && has_shared)
+ xchk_ino_set_corrupt(sc, ino);
+}
+
+/* Scrub an inode. */
+int
+xchk_inode(
+ struct xfs_scrub *sc)
+{
+ struct xfs_dinode di;
+ int error = 0;
+
+ /*
+ * If sc->ip is NULL, that means that the setup function called
+ * xfs_iget to look up the inode. xfs_iget returned -EFSCORRUPTED
+ * and a NULL inode, so flag the corruption error and return.
+ */
+ if (!sc->ip) {
+ xchk_ino_set_corrupt(sc, sc->sm->sm_ino);
+ return 0;
+ }
+
+ /* Scrub the inode core. */
+ xfs_inode_to_disk(sc->ip, &di, 0);
+ xchk_dinode(sc, &di, sc->ip->i_ino);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /*
+ * Look for discrepancies between file's data blocks and the reflink
+ * iflag. We already checked the iflag against the file mode when
+ * we scrubbed the dinode.
+ */
+ if (S_ISREG(VFS_I(sc->ip)->i_mode))
+ xchk_inode_check_reflink_iflag(sc, sc->ip->i_ino);
+
+ xchk_inode_xref(sc, sc->ip->i_ino, &di);
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
new file mode 100644
index 000000000..d8dff3fd8
--- /dev/null
+++ b/fs/xfs/scrub/parent.c
@@ -0,0 +1,334 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+
+/* Set us up to scrub parents. */
+int
+xchk_setup_parent(
+ struct xfs_scrub *sc)
+{
+ return xchk_setup_inode_contents(sc, 0);
+}
+
+/* Parent pointers */
+
+/* Look for an entry in a parent pointing to this inode. */
+
+struct xchk_parent_ctx {
+ struct dir_context dc;
+ struct xfs_scrub *sc;
+ xfs_ino_t ino;
+ xfs_nlink_t nlink;
+ bool cancelled;
+};
+
+/* Look for a single entry in a directory pointing to an inode. */
+STATIC bool
+xchk_parent_actor(
+ struct dir_context *dc,
+ const char *name,
+ int namelen,
+ loff_t pos,
+ u64 ino,
+ unsigned type)
+{
+ struct xchk_parent_ctx *spc;
+ int error = 0;
+
+ spc = container_of(dc, struct xchk_parent_ctx, dc);
+ if (spc->ino == ino)
+ spc->nlink++;
+
+ /*
+ * If we're facing a fatal signal, bail out. Store the cancellation
+ * status separately because the VFS readdir code squashes error codes
+ * into short directory reads.
+ */
+ if (xchk_should_terminate(spc->sc, &error))
+ spc->cancelled = true;
+
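+ /* A false return here ends the directory iteration early. */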
+ return !error;
+}
+
+/* Count the number of dentries in the parent dir that point to this inode. */
+STATIC int
+xchk_parent_count_parent_dentries(
+ struct xfs_scrub *sc,
+ struct xfs_inode *parent,
+ xfs_nlink_t *nlink)
+{
+ struct xchk_parent_ctx spc = {
+ .dc.actor = xchk_parent_actor,
+ .ino = sc->ip->i_ino,
+ .sc = sc,
+ };
+ size_t bufsize;
+ loff_t oldpos;
+ uint lock_mode;
+ int error = 0;
+
+ /*
+ * If there are any blocks, read-ahead block 0 as we're almost
+ * certain to have the next operation be a read there. This is
+ * how we guarantee that the parent's extent map has been loaded,
+ * if there is one.
+ */
+ lock_mode = xfs_ilock_data_map_shared(parent);
+ if (parent->i_df.if_nextents > 0)
+ error = xfs_dir3_data_readahead(parent, 0, 0);
+ xfs_iunlock(parent, lock_mode);
+ if (error)
+ return error;
+
+ /*
+ * Iterate the parent dir to confirm that there is
+ * exactly one entry pointing back to the inode being
+ * scanned.
+ */
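+ /* There's no point in a readdir buffer larger than the directory. */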
+ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE,
+ parent->i_disk_size);
+ oldpos = 0;
+ while (true) {
+ error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize);
+ if (error)
+ goto out;
+ if (spc.cancelled) {
+ error = -EAGAIN;
+ goto out;
+ }
+ if (oldpos == spc.dc.pos)
+ break;
+ oldpos = spc.dc.pos;
+ }
+ *nlink = spc.nlink;
+out:
+ return error;
+}
+
+/*
+ * Given the inode number of the alleged parent of the inode being
+ * scrubbed, try to validate that the parent has exactly one directory
+ * entry pointing back to the inode being scrubbed.
+ */
+STATIC int
+xchk_parent_validate(
+ struct xfs_scrub *sc,
+ xfs_ino_t dnum,
+ bool *try_again)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *dp = NULL;
+ xfs_nlink_t expected_nlink;
+ xfs_nlink_t nlink;
+ int error = 0;
+
+ *try_again = false;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* '..' must not point to ourselves. */
+ if (sc->ip->i_ino == dnum) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ /*
+ * If we're an unlinked directory, the parent /won't/ have a link
+ * to us. Otherwise, it should have one link.
+ */
+ expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1;
+
+ /*
+ * Grab this parent inode. We release the inode before we
+ * cancel the scrub transaction. Since we don't know a
+ * priori that releasing the inode won't trigger eofblocks
+ * cleanup (which allocates what would be a nested transaction)
+ * if the parent pointer erroneously points to a file, we
+ * can't use DONTCACHE here because DONTCACHE inodes can trigger
+ * immediate inactive cleanup of the inode.
+ *
+ * If _iget returns -EINVAL or -ENOENT then the parent inode number is
+ * garbage and the directory is corrupt. If the _iget returns
+ * -EFSCORRUPTED or -EFSBADCRC then the parent is corrupt which is a
+ * cross referencing error. Any other error is an operational error.
+ */
+ error = xfs_iget(mp, sc->tp, dnum, XFS_IGET_UNTRUSTED, 0, &dp);
+ if (error == -EINVAL || error == -ENOENT) {
+ error = -EFSCORRUPTED;
+ xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error);
+ goto out;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out;
+ if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out_rele;
+ }
+
+ /*
+ * We prefer to keep the inode locked while we lock and search
+ * its alleged parent for a forward reference. If we can grab
+ * the iolock, validate the pointers and we're done. We must
+ * use nowait here to avoid an ABBA deadlock on the parent and
+ * the child inodes.
+ */
+ if (xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) {
+ error = xchk_parent_count_parent_dentries(sc, dp, &nlink);
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ goto out_unlock;
+ if (nlink != expected_nlink)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out_unlock;
+ }
+
+ /*
+ * The game changes if we get here. We failed to lock the parent,
+ * so we're going to try to verify both pointers while only holding
+ * one lock so as to avoid deadlocking with something that's actually
+ * trying to traverse down the directory tree.
+ */
+ xfs_iunlock(sc->ip, sc->ilock_flags);
+ sc->ilock_flags = 0;
+ error = xchk_ilock_inverted(dp, XFS_IOLOCK_SHARED);
+ if (error)
+ goto out_rele;
+
+ /* Go looking for our dentry. */
+ error = xchk_parent_count_parent_dentries(sc, dp, &nlink);
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out_unlock;
+
+ /* Drop the parent lock, relock this inode. */
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ error = xchk_ilock_inverted(sc->ip, XFS_IOLOCK_EXCL);
+ if (error)
+ goto out_rele;
+ sc->ilock_flags = XFS_IOLOCK_EXCL;
+
+ /*
+ * If we're an unlinked directory, the parent /won't/ have a link
+ * to us. Otherwise, it should have one link. We have to re-set
+ * it here because we dropped the lock on sc->ip.
+ */
+ expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1;
+
+ /* Look up '..' to see if the inode changed. */
+ error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out_rele;
+
+ /* Drat, parent changed. Try again! */
+ if (dnum != dp->i_ino) {
+ xfs_irele(dp);
+ *try_again = true;
+ return 0;
+ }
+ xfs_irele(dp);
+
+ /*
+ * '..' didn't change, so check that there was only one entry
+ * for us in the parent.
+ */
+ if (nlink != expected_nlink)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return error;
+
+out_unlock:
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+out_rele:
+ xfs_irele(dp);
+out:
+ return error;
+}
+
+/* Scrub a parent pointer. */
+int
+xchk_parent(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t dnum;
+ bool try_again;
+ int tries = 0;
+ int error = 0;
+
+ /*
+ * If we're a directory, check that the '..' link points up to
+ * a directory that has one entry pointing to us.
+ */
+ if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
+ return -ENOENT;
+
+ /* We're not a special inode, are we? */
+ if (!xfs_verify_dir_ino(mp, sc->ip->i_ino)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ /*
+ * The VFS grabs a read or write lock via i_rwsem before it reads
+ * or writes to a directory. If we've gotten this far we've
+ * already obtained IOLOCK_EXCL, which (since 4.10) is the same as
+ * getting a write lock on i_rwsem. Therefore, it is safe for us
+ * to drop the ILOCK here in order to do directory lookups.
+ */
+ sc->ilock_flags &= ~(XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL);
+ xfs_iunlock(sc->ip, XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL);
+
+ /* Look up '..' */
+ error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out;
+ if (!xfs_verify_dir_ino(mp, dnum)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ /* Is this the root dir? Then '..' must point to itself. */
+ if (sc->ip == mp->m_rootip) {
+ if (sc->ip->i_ino != mp->m_sb.sb_rootino ||
+ sc->ip->i_ino != dnum)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ do {
+ error = xchk_parent_validate(sc, dnum, &try_again);
+ if (error)
+ goto out;
+ } while (try_again && ++tries < 20);
+
+ /*
+ * We gave it our best shot but failed, so mark this scrub
+ * incomplete. Userspace can decide if it wants to try again.
+ */
+ if (try_again && tries == 20)
+ xchk_set_incomplete(sc);
+out:
+ /*
+ * If we failed to lock the parent inode even after a retry, just mark
+ * this scrub incomplete and return.
+ */
+ if ((sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
+ error = 0;
+ xchk_set_incomplete(sc);
+ }
+ return error;
+}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
new file mode 100644
index 000000000..21b4c9006
--- /dev/null
+++ b/fs/xfs/scrub/quota.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+
+/* Convert a scrub type code to a dquot type, or return 0 if error. */
+static inline xfs_dqtype_t
+xchk_quota_to_dqtype(
+ struct xfs_scrub *sc)
+{
+ switch (sc->sm->sm_type) {
+ case XFS_SCRUB_TYPE_UQUOTA:
+ return XFS_DQTYPE_USER;
+ case XFS_SCRUB_TYPE_GQUOTA:
+ return XFS_DQTYPE_GROUP;
+ case XFS_SCRUB_TYPE_PQUOTA:
+ return XFS_DQTYPE_PROJ;
+ default:
+ return 0;
+ }
+}
+
+/* Set us up to scrub a quota. */
+int
+xchk_setup_quota(
+ struct xfs_scrub *sc)
+{
+ xfs_dqtype_t dqtype;
+ int error;
+
+ if (!XFS_IS_QUOTA_ON(sc->mp))
+ return -ENOENT;
+
+ dqtype = xchk_quota_to_dqtype(sc);
+ if (dqtype == 0)
+ return -EINVAL;
+
+ if (!xfs_this_quota_on(sc->mp, dqtype))
+ return -ENOENT;
+
+ error = xchk_setup_fs(sc);
+ if (error)
+ return error;
+ sc->ip = xfs_quota_inode(sc->mp, dqtype);
+ xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ sc->ilock_flags = XFS_ILOCK_EXCL;
+ return 0;
+}
+
+/* Quotas. */
+
+struct xchk_quota_info {
+ struct xfs_scrub *sc;
+ xfs_dqid_t last_id;
+};
+
+/* Scrub the fields in an individual quota item. */
+STATIC int
+xchk_quota_item(
+ struct xfs_dquot *dq,
+ xfs_dqtype_t dqtype,
+ void *priv)
+{
+ struct xchk_quota_info *sqi = priv;
+ struct xfs_scrub *sc = sqi->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ xfs_fileoff_t offset;
+ xfs_ino_t fs_icount;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return -ECANCELED;
+
+ /*
+ * Except for the root dquot, each dquot we see must have a higher id
+ * than the last one we saw.
+ */
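+ /*
+ * The quota file stores qi_dqperchunk dquots in each filesystem
+ * block, so this is the data fork offset we'd report for this item.
+ */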
+ offset = dq->q_id / qi->qi_dqperchunk;
+ if (dq->q_id && dq->q_id <= sqi->last_id)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ sqi->last_id = dq->q_id;
+
+ /*
+ * Warn if the hard limits are larger than the fs.
+ * Administrators can do this, though in production this seems
+ * suspect, which is why we flag it for review.
+ *
+ * Complain about corruption if the soft limit is greater than
+ * the hard limit.
+ */
+ if (dq->q_blk.hardlimit > mp->m_sb.sb_dblocks)
+ xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (dq->q_blk.softlimit > dq->q_blk.hardlimit)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ if (dq->q_ino.hardlimit > M_IGEO(mp)->maxicount)
+ xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (dq->q_ino.softlimit > dq->q_ino.hardlimit)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ if (dq->q_rtb.hardlimit > mp->m_sb.sb_rblocks)
+ xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /* Check the resource counts. */
+ fs_icount = percpu_counter_sum(&mp->m_icount);
+
+ /*
+ * Check that usage doesn't exceed physical limits. However, on
+ * a reflink filesystem we're allowed to exceed physical space
+ * if there are no quota limits.
+ */
+ if (xfs_has_reflink(mp)) {
+ if (mp->m_sb.sb_dblocks < dq->q_blk.count)
+ xchk_fblock_set_warning(sc, XFS_DATA_FORK,
+ offset);
+ } else {
+ if (mp->m_sb.sb_dblocks < dq->q_blk.count)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ offset);
+ }
+ if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /*
+ * We can violate the hard limits if the admin suddenly sets a
+ * lower limit than the actual usage. However, we flag it for
+ * admin review.
+ */
+ if (dq->q_id == 0)
+ goto out;
+
+ if (dq->q_blk.hardlimit != 0 &&
+ dq->q_blk.count > dq->q_blk.hardlimit)
+ xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+
+ if (dq->q_ino.hardlimit != 0 &&
+ dq->q_ino.count > dq->q_ino.hardlimit)
+ xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+
+ if (dq->q_rtb.hardlimit != 0 &&
+ dq->q_rtb.count > dq->q_rtb.hardlimit)
+ xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+
+out:
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return -ECANCELED;
+
+ return 0;
+}
+
+/* Check the quota's data fork. */
+STATIC int
+xchk_quota_data_fork(
+ struct xfs_scrub *sc)
+{
+ struct xfs_bmbt_irec irec = { 0 };
+ struct xfs_iext_cursor icur;
+ struct xfs_quotainfo *qi = sc->mp->m_quotainfo;
+ struct xfs_ifork *ifp;
+ xfs_fileoff_t max_dqid_off;
+ int error = 0;
+
+ /* Invoke the fork scrubber. */
+ error = xchk_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
+ /* Check for data fork problems that apply only to quota files. */
+ max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ for_each_xfs_iext(ifp, &icur, &irec) {
+ if (xchk_should_terminate(sc, &error))
+ break;
+ /*
+ * delalloc extents or blocks mapped above the highest
+ * quota id shouldn't happen.
+ */
+ if (isnullstartblock(irec.br_startblock) ||
+ irec.br_startoff > max_dqid_off ||
+ irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ irec.br_startoff);
+ break;
+ }
+ }
+
+ return error;
+}
+
+/* Scrub all of a quota type's items. */
+int
+xchk_quota(
+ struct xfs_scrub *sc)
+{
+ struct xchk_quota_info sqi;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ xfs_dqtype_t dqtype;
+ int error = 0;
+
+ dqtype = xchk_quota_to_dqtype(sc);
+
+ /* Look for problem extents. */
+ error = xchk_quota_data_fork(sc);
+ if (error)
+ goto out;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /*
+ * Check all the quota items. Now that we've checked the quota inode
+ * data fork we have to drop ILOCK_EXCL to use the regular dquot
+ * functions.
+ */
+ xfs_iunlock(sc->ip, sc->ilock_flags);
+ sc->ilock_flags = 0;
+ sqi.sc = sc;
+ sqi.last_id = 0;
+ error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi);
+ sc->ilock_flags = XFS_ILOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+ if (error == -ECANCELED)
+ error = 0;
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK,
+ sqi.last_id * qi->qi_dqperchunk, &error))
+ goto out;
+
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
new file mode 100644
index 000000000..a26ee0f24
--- /dev/null
+++ b/fs/xfs/scrub/refcount.c
@@ -0,0 +1,473 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
+
+/*
+ * Set us up to scrub reference count btrees.
+ */
+int
+xchk_setup_ag_refcountbt(
+ struct xfs_scrub *sc)
+{
+ return xchk_setup_ag_btree(sc, false);
+}
+
+/* Reference count btree scrubber. */
+
+/*
+ * Confirming Reference Counts via Reverse Mappings
+ *
+ * We want to count the reverse mappings overlapping a refcount record
+ * (bno, len, refcount), allowing for the possibility that some of the
+ * overlap may come from smaller adjoining reverse mappings, while some
+ * comes from single extents which overlap the range entirely. The
+ * outer loop is as follows:
+ *
+ * 1. For all reverse mappings overlapping the refcount extent,
+ * a. If a given rmap completely overlaps, mark it as seen.
+ * b. Otherwise, record the fragment (in agbno order) for later
+ * processing.
+ *
+ * Once we've seen all the rmaps, we know that for all blocks in the
+ * refcount record we want to find $refcount owners and we've already
+ * visited $seen extents that overlap all the blocks. Therefore, we
+ * need to find ($refcount - $seen) owners for every block in the
+ * extent; call that quantity $target_nr. Proceed as follows:
+ *
+ * 2. Pull the first $target_nr fragments from the list; all of them
+ * should start at or before the start of the extent.
+ * Call this subset of fragments the working set.
+ * 3. Until there are no more unprocessed fragments,
+ * a. Find the shortest fragments in the set and remove them.
+ * b. Note the block number of the end of these fragments.
+ * c. Pull the same number of fragments from the list. All of these
+ * fragments should start at the block number recorded in the
+ * previous step.
+ * d. Put those fragments in the set.
+ * 4. Check that there are $target_nr fragments remaining in the list,
+ * and that they all end at or beyond the end of the refcount extent.
+ *
+ * If the refcount is correct, all the check conditions in the algorithm
+ * should always hold true. If not, the refcount is incorrect.
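+ *
+ * For example, take a refcount record (bno 10, len 4, refcount 2) with
+ * one rmap covering all of blocks 10-13 and two fragments, (8-11) and
+ * (12-15). The full overlap gives $seen = 1, so $target_nr = 1: we pull
+ * (8-11), which starts before block 10; it ends where (12-15) begins;
+ * and (12-15) ends beyond block 13, so every block in the record indeed
+ * has two owners.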
+ */
+struct xchk_refcnt_frag {
+ struct list_head list;
+ struct xfs_rmap_irec rm;
+};
+
+struct xchk_refcnt_check {
+ struct xfs_scrub *sc;
+ struct list_head fragments;
+
+ /* refcount extent we're examining */
+ xfs_agblock_t bno;
+ xfs_extlen_t len;
+ xfs_nlink_t refcount;
+
+ /* number of owners seen */
+ xfs_nlink_t seen;
+};
+
+/*
+ * Decide if the given rmap is large enough that we can redeem it
+ * towards refcount verification now, or if it's a fragment, in
+ * which case we'll hang onto it in the hopes that we'll later
+ * discover that we've collected exactly the correct number of
+ * fragments as the refcountbt says we should have.
+ */
+STATIC int
+xchk_refcountbt_rmap_check(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xchk_refcnt_check *refchk = priv;
+ struct xchk_refcnt_frag *frag;
+ xfs_agblock_t rm_last;
+ xfs_agblock_t rc_last;
+ int error = 0;
+
+ if (xchk_should_terminate(refchk->sc, &error))
+ return error;
+
+ rm_last = rec->rm_startblock + rec->rm_blockcount - 1;
+ rc_last = refchk->bno + refchk->len - 1;
+
+ /* Confirm that a single-owner refc extent is a CoW stage. */
+ if (refchk->refcount == 1 && rec->rm_owner != XFS_RMAP_OWN_COW) {
+ xchk_btree_xref_set_corrupt(refchk->sc, cur, 0);
+ return 0;
+ }
+
+ if (rec->rm_startblock <= refchk->bno && rm_last >= rc_last) {
+ /*
+ * The rmap overlaps the refcount record, so we can confirm
+ * one refcount owner seen.
+ */
+ refchk->seen++;
+ } else {
+ /*
+ * This rmap covers only part of the refcount record, so
+ * save the fragment for later processing. If the rmapbt
+ * is healthy, each rmap_irec we see will be in agbno order,
+ * so we don't need insertion sort here.
+ */
+ frag = kmem_alloc(sizeof(struct xchk_refcnt_frag),
+ KM_MAYFAIL);
+ if (!frag)
+ return -ENOMEM;
+ memcpy(&frag->rm, rec, sizeof(frag->rm));
+ list_add_tail(&frag->list, &refchk->fragments);
+ }
+
+ return 0;
+}
+
+/*
+ * Given a bunch of rmap fragments, iterate through them, keeping
+ * a running tally of the refcount. If this ever deviates from
+ * what we expect (which is the refcountbt's refcount minus the
+ * number of extents that totally covered the refcountbt extent),
+ * we have a refcountbt error.
+ */
+STATIC void
+xchk_refcountbt_process_rmap_fragments(
+ struct xchk_refcnt_check *refchk)
+{
+ struct list_head worklist;
+ struct xchk_refcnt_frag *frag;
+ struct xchk_refcnt_frag *n;
+ xfs_agblock_t bno;
+ xfs_agblock_t rbno;
+ xfs_agblock_t next_rbno;
+ xfs_nlink_t nr;
+ xfs_nlink_t target_nr;
+
+ target_nr = refchk->refcount - refchk->seen;
+ if (target_nr == 0)
+ return;
+
+ /*
+ * There are (refchk->refcount - refchk->seen)
+ * references we haven't found yet. Pull that many off the
+ * fragment list and figure out where the smallest rmap ends
+ * (and therefore the next rmap should start). All the rmaps
+ * we pull off should start at or before the beginning of the
+ * refcount record's range.
+ */
+ INIT_LIST_HEAD(&worklist);
+ rbno = NULLAGBLOCK;
+
+ /* Make sure the fragments actually /are/ in agbno order. */
+ bno = 0;
+ list_for_each_entry(frag, &refchk->fragments, list) {
+ if (frag->rm.rm_startblock < bno)
+ goto done;
+ bno = frag->rm.rm_startblock;
+ }
+
+ /*
+ * Find all the rmaps that start at or before the refc extent,
+ * and put them on the worklist.
+ */
+ nr = 0;
+ list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+ if (frag->rm.rm_startblock > refchk->bno || nr > target_nr)
+ break;
+ bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+ if (bno < rbno)
+ rbno = bno;
+ list_move_tail(&frag->list, &worklist);
+ nr++;
+ }
+
+ /*
+ * We should have found exactly $target_nr rmap fragments starting
+ * at or before the refcount extent.
+ */
+ if (nr != target_nr)
+ goto done;
+
+ while (!list_empty(&refchk->fragments)) {
+ /* Discard any fragments ending at rbno from the worklist. */
+ nr = 0;
+ next_rbno = NULLAGBLOCK;
+ list_for_each_entry_safe(frag, n, &worklist, list) {
+ bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+ if (bno != rbno) {
+ if (bno < next_rbno)
+ next_rbno = bno;
+ continue;
+ }
+ list_del(&frag->list);
+ kmem_free(frag);
+ nr++;
+ }
+
+ /* Try to add nr rmaps starting at rbno to the worklist. */
+ list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+ bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+ if (frag->rm.rm_startblock != rbno)
+ goto done;
+ list_move_tail(&frag->list, &worklist);
+ if (next_rbno > bno)
+ next_rbno = bno;
+ nr--;
+ if (nr == 0)
+ break;
+ }
+
+ /*
+ * If we get here and nr > 0, this means that we added fewer
+ * items to the worklist than we discarded because the fragment
+ * list ran out of items. Therefore, we cannot maintain the
+ * required refcount. Something is wrong, so we're done.
+ */
+ if (nr)
+ goto done;
+
+ rbno = next_rbno;
+ }
+
+ /*
+ * Make sure the last extent we processed ends at or beyond
+ * the end of the refcount extent.
+ */
+ if (rbno < refchk->bno + refchk->len)
+ goto done;
+
+ /* Actually record us having seen the remaining refcount. */
+ refchk->seen = refchk->refcount;
+done:
+ /* Delete fragments and work list. */
+ list_for_each_entry_safe(frag, n, &worklist, list) {
+ list_del(&frag->list);
+ kmem_free(frag);
+ }
+ list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+ list_del(&frag->list);
+ kmem_free(frag);
+ }
+}
+
+/* Use the rmap entries covering this extent to verify the refcount. */
+STATIC void
+xchk_refcountbt_xref_rmap(
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *irec)
+{
+ struct xchk_refcnt_check refchk = {
+ .sc = sc,
+ .bno = irec->rc_startblock,
+ .len = irec->rc_blockcount,
+ .refcount = irec->rc_refcount,
+ .seen = 0,
+ };
+ struct xfs_rmap_irec low;
+ struct xfs_rmap_irec high;
+ struct xchk_refcnt_frag *frag;
+ struct xchk_refcnt_frag *n;
+ int error;
+
+ if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ /* Cross-reference with the rmapbt to confirm the refcount. */
+ memset(&low, 0, sizeof(low));
+ low.rm_startblock = irec->rc_startblock;
+ memset(&high, 0xFF, sizeof(high));
+ high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1;
+
+ INIT_LIST_HEAD(&refchk.fragments);
+ error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high,
+ &xchk_refcountbt_rmap_check, &refchk);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ goto out_free;
+
+ xchk_refcountbt_process_rmap_fragments(&refchk);
+ if (irec->rc_refcount != refchk.seen)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+
+out_free:
+ list_for_each_entry_safe(frag, n, &refchk.fragments, list) {
+ list_del(&frag->list);
+ kmem_free(frag);
+ }
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xchk_refcountbt_xref(
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *irec)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xchk_xref_is_used_space(sc, irec->rc_startblock, irec->rc_blockcount);
+ xchk_xref_is_not_inode_chunk(sc, irec->rc_startblock,
+ irec->rc_blockcount);
+ xchk_refcountbt_xref_rmap(sc, irec);
+}
+
+/* Scrub a refcountbt record. */
+STATIC int
+xchk_refcountbt_rec(
+ struct xchk_btree *bs,
+ const union xfs_btree_rec *rec)
+{
+ struct xfs_refcount_irec irec;
+ xfs_agblock_t *cow_blocks = bs->private;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
+
+ xfs_refcount_btrec_to_irec(rec, &irec);
+
+ /* Check the domain and refcount are not incompatible. */
+ if (!xfs_refcount_check_domain(&irec))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
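+ /*
+ * Tally CoW staging blocks; xchk_refcount_xref_rmap will compare the
+ * total against the blocks mapped by XFS_RMAP_OINFO_COW rmaps.
+ */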
+ if (irec.rc_domain == XFS_REFC_DOMAIN_COW)
+ (*cow_blocks) += irec.rc_blockcount;
+
+ /* Check the extent. */
+ if (!xfs_verify_agbext(pag, irec.rc_startblock, irec.rc_blockcount))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (irec.rc_refcount == 0)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ xchk_refcountbt_xref(bs->sc, &irec);
+
+ return 0;
+}
+
+/* Make sure we have as many refc blocks as the rmap says. */
+STATIC void
+xchk_refcount_xref_rmap(
+ struct xfs_scrub *sc,
+ xfs_filblks_t cow_blocks)
+{
+ xfs_extlen_t refcbt_blocks = 0;
+ xfs_filblks_t blocks;
+ int error;
+
+ if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ /* Check that we saw as many refcbt blocks as the rmap knows about. */
+ error = xfs_btree_count_blocks(sc->sa.refc_cur, &refcbt_blocks);
+ if (!xchk_btree_process_error(sc, sc->sa.refc_cur, 0, &error))
+ return;
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+ &XFS_RMAP_OINFO_REFC, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ if (blocks != refcbt_blocks)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+
+ /* Check that we saw as many cow blocks as the rmap knows about. */
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+ &XFS_RMAP_OINFO_COW, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ if (blocks != cow_blocks)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+}
+
+/* Scrub the refcount btree for some AG. */
+int
+xchk_refcountbt(
+ struct xfs_scrub *sc)
+{
+ xfs_agblock_t cow_blocks = 0;
+ int error;
+
+ error = xchk_btree(sc, sc->sa.refc_cur, xchk_refcountbt_rec,
+ &XFS_RMAP_OINFO_REFC, &cow_blocks);
+ if (error)
+ return error;
+
+ xchk_refcount_xref_rmap(sc, cow_blocks);
+
+ return 0;
+}
+
+/* xref check that a cow staging extent is marked in the refcountbt. */
+void
+xchk_xref_is_cow_staging(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ struct xfs_refcount_irec rc;
+ int has_refcount;
+ int error;
+
+ if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ /* Find the CoW staging extent. */
+ error = xfs_refcount_lookup_le(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW,
+ agbno, &has_refcount);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
+ return;
+ if (!has_refcount) {
+ xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+ return;
+ }
+
+ error = xfs_refcount_get_rec(sc->sa.refc_cur, &rc, &has_refcount);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
+ return;
+ if (!has_refcount) {
+ xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+ return;
+ }
+
+ /* CoW lookup returned a shared extent record? */
+ if (rc.rc_domain != XFS_REFC_DOMAIN_COW)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+
+ /* Must be at least as long as what was passed in */
+ if (rc.rc_blockcount < len)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+}
+
+/*
+ * xref check that the extent is not shared. Only file data blocks
+ * can have multiple owners.
+ */
+void
+xchk_xref_is_not_shared(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ bool shared;
+ int error;
+
+ if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_refcount_has_record(sc->sa.refc_cur, XFS_REFC_DOMAIN_SHARED,
+ agbno, len, &shared);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
+ return;
+ if (shared)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+}
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
new file mode 100644
index 000000000..c18bd039f
--- /dev/null
+++ b/fs/xfs/scrub/repair.c
@@ -0,0 +1,963 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_extent_busy.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+
+/*
+ * Attempt to repair some metadata, if the metadata is corrupt and userspace
+ * told us to fix it. This function returns -EAGAIN to mean "re-run scrub",
+ * and sets XREP_ALREADY_FIXED in sc->flags if it thinks it repaired anything.
+ */
+int
+xrep_attempt(
+ struct xfs_scrub *sc)
+{
+ int error = 0;
+
+ trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);
+
+ xchk_ag_btcur_free(&sc->sa);
+
+ /* Repair whatever's broken. */
+ ASSERT(sc->ops->repair);
+ error = sc->ops->repair(sc);
+ trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
+ switch (error) {
+ case 0:
+ /*
+ * Repair succeeded. Commit the fixes and perform a second
+ * scrub so that we can tell userspace if we fixed the problem.
+ */
+ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ sc->flags |= XREP_ALREADY_FIXED;
+ return -EAGAIN;
+ case -EDEADLOCK:
+ case -EAGAIN:
+ /* Tell the caller to try again having grabbed all the locks. */
+ if (!(sc->flags & XCHK_TRY_HARDER)) {
+ sc->flags |= XCHK_TRY_HARDER;
+ return -EAGAIN;
+ }
+ /*
+ * We tried harder but still couldn't grab all the resources
+ * we needed to fix it. The corruption has not been fixed,
+ * so report back to userspace.
+ */
+ return -EFSCORRUPTED;
+ default:
+ return error;
+ }
+}
+
+/*
+ * Complain about unfixable problems in the filesystem. We don't log
+ * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
+ * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
+ * administrator isn't running xfs_scrub in no-repairs mode.
+ *
+ * Use this helper function because _ratelimited silently declares a static
+ * structure to track rate limiting information.
+ */
+void
+xrep_failure(
+ struct xfs_mount *mp)
+{
+ xfs_alert_ratelimited(mp,
+"Corruption not fixed during online repair. Unmount and run xfs_repair.");
+}
+
+/*
+ * Repair probe -- userspace uses this to probe if we're willing to repair a
+ * given mountpoint.
+ */
+int
+xrep_probe(
+ struct xfs_scrub *sc)
+{
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ return 0;
+}
+
+/*
+ * Roll a transaction, keeping the AG headers locked and reinitializing
+ * the btree cursors.
+ */
+int
+xrep_roll_ag_trans(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ /* Keep the AG header buffers locked so we can keep going. */
+ if (sc->sa.agi_bp)
+ xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
+ if (sc->sa.agf_bp)
+ xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+ if (sc->sa.agfl_bp)
+ xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
+
+ /*
+ * Roll the transaction. We still own the buffer and the buffer lock
+ * regardless of whether or not the roll succeeds. If the roll fails,
+ * the buffers will be released during teardown on our way out of the
+ * kernel. If it succeeds, we join them to the new transaction and
+ * move on.
+ */
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+
+ /* Join AG headers to the new transaction. */
+ if (sc->sa.agi_bp)
+ xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
+ if (sc->sa.agf_bp)
+ xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
+ if (sc->sa.agfl_bp)
+ xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
+
+ return 0;
+}
+
+/*
+ * Does the given AG have enough space to rebuild a btree? Neither AG
+ * reservation can be critical, and we must have enough space (factoring
+ * in AG reservations) to construct a whole btree.
+ */
+bool
+xrep_ag_has_space(
+ struct xfs_perag *pag,
+ xfs_extlen_t nr_blocks,
+ enum xfs_ag_resv_type type)
+{
+ return !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
+ !xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
+ pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
+}
+
+/*
+ * Figure out how many blocks to reserve for an AG repair. We calculate the
+ * worst case estimate for the number of blocks we'd need to rebuild one of
+ * any type of per-AG btree.
+ */
+xfs_extlen_t
+xrep_calc_ag_resblks(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_scrub_metadata *sm = sc->sm;
+ struct xfs_perag *pag;
+ struct xfs_buf *bp;
+ xfs_agino_t icount = NULLAGINO;
+ xfs_extlen_t aglen = NULLAGBLOCK;
+ xfs_extlen_t usedlen;
+ xfs_extlen_t freelen;
+ xfs_extlen_t bnobt_sz;
+ xfs_extlen_t inobt_sz;
+ xfs_extlen_t rmapbt_sz;
+ xfs_extlen_t refcbt_sz;
+ int error;
+
+ if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+ return 0;
+
+ pag = xfs_perag_get(mp, sm->sm_agno);
+ if (pag->pagi_init) {
+ /* Use in-core icount if possible. */
+ icount = pag->pagi_count;
+ } else {
+ /* Try to get the actual counters from disk. */
+ error = xfs_ialloc_read_agi(pag, NULL, &bp);
+ if (!error) {
+ icount = pag->pagi_count;
+ xfs_buf_relse(bp);
+ }
+ }
+
+ /* Now grab the block counters from the AGF. */
+ error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
+ if (error) {
+ aglen = pag->block_count;
+ freelen = aglen;
+ usedlen = aglen;
+ } else {
+ struct xfs_agf *agf = bp->b_addr;
+
+ aglen = be32_to_cpu(agf->agf_length);
+ freelen = be32_to_cpu(agf->agf_freeblks);
+ usedlen = aglen - freelen;
+ xfs_buf_relse(bp);
+ }
+
+ /* If the icount is impossible, make some worst-case assumptions. */
+ if (icount == NULLAGINO ||
+ !xfs_verify_agino(pag, icount)) {
+ icount = pag->agino_max - pag->agino_min + 1;
+ }
+
+ /* If the block counts are impossible, make worst-case assumptions. */
+ if (aglen == NULLAGBLOCK ||
+ aglen != pag->block_count ||
+ freelen >= aglen) {
+ aglen = pag->block_count;
+ freelen = aglen;
+ usedlen = aglen;
+ }
+ xfs_perag_put(pag);
+
+ trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
+ freelen, usedlen);
+
+ /*
+ * Figure out how many blocks we'd need worst case to rebuild
+ * each type of btree. Note that we can only rebuild the
+ * bnobt/cntbt or inobt/finobt as pairs.
+ */
+ bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
+ if (xfs_has_sparseinodes(mp))
+ inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+ XFS_INODES_PER_HOLEMASK_BIT);
+ else
+ inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+ XFS_INODES_PER_CHUNK);
+ if (xfs_has_finobt(mp))
+ inobt_sz *= 2;
+ if (xfs_has_reflink(mp))
+ refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
+ else
+ refcbt_sz = 0;
+ if (xfs_has_rmapbt(mp)) {
+ /*
+ * Guess how many blocks we need to rebuild the rmapbt.
+ * For non-reflink filesystems we can't have more records than
+ * used blocks. However, with reflink it's possible to have
+ * more than one rmap record per AG block. We don't know how
+ * many rmaps there could be in the AG, so we start off with
+ * what we hope is a generous over-estimation.
+ */
+ if (xfs_has_reflink(mp))
+ rmapbt_sz = xfs_rmapbt_calc_size(mp,
+ (unsigned long long)aglen * 2);
+ else
+ rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
+ } else {
+ rmapbt_sz = 0;
+ }
+
+ trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
+ inobt_sz, rmapbt_sz, refcbt_sz);
+
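+ /* A single repair rebuilds only one of these btrees, so reserve for the largest. */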
+ return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
+}
+
+/* Allocate a block in an AG. */
+int
+xrep_alloc_ag_block(
+ struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo,
+ xfs_fsblock_t *fsbno,
+ enum xfs_ag_resv_type resv)
+{
+ struct xfs_alloc_arg args = {0};
+ xfs_agblock_t bno;
+ int error;
+
+ switch (resv) {
+ case XFS_AG_RESV_AGFL:
+ case XFS_AG_RESV_RMAPBT:
+ error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp,
+ sc->sa.agf_bp, &bno, 1);
+ if (error)
+ return error;
+ if (bno == NULLAGBLOCK)
+ return -ENOSPC;
+ xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false);
+ *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno);
+ if (resv == XFS_AG_RESV_RMAPBT)
+ xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno);
+ return 0;
+ default:
+ break;
+ }
+
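+ /*
+ * Otherwise allocate a single block anywhere in this AG, charged to the
+ * requested reservation type.
+ */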
+ args.tp = sc->tp;
+ args.mp = sc->mp;
+ args.oinfo = *oinfo;
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.pag->pag_agno, 0);
+ args.minlen = 1;
+ args.maxlen = 1;
+ args.prod = 1;
+ args.type = XFS_ALLOCTYPE_THIS_AG;
+ args.resv = resv;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK)
+ return -ENOSPC;
+ ASSERT(args.len == 1);
+ *fsbno = args.fsbno;
+
+ return 0;
+}
+
+/* Initialize a new AG btree root block with zero entries. */
+int
+xrep_init_btblock(
+ struct xfs_scrub *sc,
+ xfs_fsblock_t fsb,
+ struct xfs_buf **bpp,
+ xfs_btnum_t btnum,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_trans *tp = sc->tp;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ int error;
+
+ trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
+ XFS_FSB_TO_AGBNO(mp, fsb), btnum);
+
+ ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.pag->pag_agno);
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0,
+ &bp);
+ if (error)
+ return error;
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.pag->pag_agno);
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
+ xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
+ bp->b_ops = ops;
+ *bpp = bp;
+
+ return 0;
+}
+
+/*
+ * Reconstructing per-AG Btrees
+ *
+ * When a space btree is corrupt, we don't bother trying to fix it. Instead,
+ * we scan secondary space metadata to derive the records that should be in
+ * the damaged btree, initialize a fresh btree root, and insert the records.
+ * Note that for rebuilding the rmapbt we scan all the primary data to
+ * generate the new records.
+ *
+ * However, that leaves the matter of removing all the metadata describing the
+ * old broken structure. For primary metadata we use the rmap data to collect
+ * every extent with a matching rmap owner (bitmap); we then iterate all other
+ * metadata structures with the same rmap owner to collect the extents that
+ * cannot be removed (sublist). We then subtract sublist from bitmap to
+ * derive the blocks that were used by the old btree. These blocks can be
+ * reaped.
+ *
+ * For rmapbt reconstructions we must use different tactics for extent
+ * collection. First we iterate all primary metadata (this excludes the old
+ * rmapbt, obviously) to generate new rmap records. The gaps in the rmap
+ * records are collected as bitmap. The bnobt records are collected as
+ * sublist. As with the other btrees we subtract sublist from bitmap, and the
+ * result (since the rmapbt lives in the free space) are the blocks from the
+ * old rmapbt.
+ *
+ * Disposal of Blocks from Old per-AG Btrees
+ *
+ * Now that we've constructed a new btree to replace the damaged one, we want
+ * to dispose of the blocks that (we think) the old btree was using.
+ * Previously, we used the rmapbt to collect the extents (bitmap) with the
+ * rmap owner corresponding to the tree we rebuilt, collected extents for any
+ * blocks with the same rmap owner that are owned by another data structure
+ * (sublist), and subtracted sublist from bitmap. In theory the extents
+ * remaining in bitmap are the old btree's blocks.
+ *
+ * Unfortunately, it's possible that the btree was crosslinked with other
+ * blocks on disk. The rmap data can tell us if there are multiple owners, so
+ * if the rmapbt says there is an owner of this block other than @oinfo, then
+ * the block is crosslinked. Remove the reverse mapping and continue.
+ *
+ * If there is one rmap record, we can free the block, which removes the
+ * reverse mapping but doesn't add the block to the free space. Our repair
+ * strategy is to hope the other metadata objects crosslinked on this block
+ * will be rebuilt (atop different blocks), thereby removing all the cross
+ * links.
+ *
+ * If there are no rmap records at all, we also free the block. If the btree
+ * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
+ * supposed to be a rmap record and everything is ok. For other btrees there
+ * had to have been an rmap entry for the block to have ended up on @bitmap,
+ * so if it's gone now there's something wrong and the fs will shut down.
+ *
+ * Note: If there are multiple rmap records with only the same rmap owner as
+ * the btree we're trying to rebuild and the block is indeed owned by another
+ * data structure with the same rmap owner, then the block will be in sublist
+ * and therefore doesn't need disposal. If there are multiple rmap records
+ * with only the same rmap owner but the block is not owned by something with
+ * the same rmap owner, the block will be freed.
+ *
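+ * In short, for each block in @bitmap: if the rmapbt shows another
+ * owner, we only remove our reverse mapping; if we are the sole owner
+ * and the caller asked for XFS_AG_RESV_AGFL treatment, the block goes
+ * back on the AGFL; otherwise we free the extent outright.
+ *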
+ * The caller is responsible for locking the AG headers for the entire rebuild
+ * operation so that nothing else can sneak in and change the AG state while
+ * we're not looking. We also assume that the caller already invalidated any
+ * buffers associated with @bitmap.
+ */
+
+/*
+ * Invalidate buffers for per-AG btree blocks we're dumping. This function
+ * is not intended for use with file data repairs; we have bunmapi for that.
+ */
+int
+xrep_invalidate_blocks(
+ struct xfs_scrub *sc,
+ struct xbitmap *bitmap)
+{
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsbno;
+
+ /*
+ * For each block in each extent, see if there's an incore buffer for
+ * exactly that block; if so, invalidate it. The buffer cache only
+ * lets us look for one buffer at a time, so we have to look one block
+ * at a time. Avoid invalidating AG headers and post-EOFS blocks
+ * because we never own those. If we can't TRYLOCK the buffer, we
+ * assume it's owned by someone else.
+ */
+ for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
+ int error;
+
+ /* Skip AG headers and post-EOFS blocks */
+ if (!xfs_verify_fsbno(sc->mp, fsbno))
+ continue;
+ error = xfs_buf_incore(sc->mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(sc->mp, fsbno),
+ XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
+ if (error)
+ continue;
+
+ xfs_trans_bjoin(sc->tp, bp);
+ xfs_trans_binval(sc->tp, bp);
+ }
+
+ return 0;
+}
+
+/* Ensure the freelist is the correct size. */
+int
+xrep_fix_freelist(
+ struct xfs_scrub *sc,
+ bool can_shrink)
+{
+ struct xfs_alloc_arg args = {0};
+
+ args.mp = sc->mp;
+ args.tp = sc->tp;
+ args.agno = sc->sa.pag->pag_agno;
+ args.alignment = 1;
+ args.pag = sc->sa.pag;
+
+ return xfs_alloc_fix_freelist(&args,
+ can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
+}
+
+/*
+ * Put a block back on the AGFL.
+ */
+STATIC int
+xrep_put_freelist(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno)
+{
+ int error;
+
+ /* Make sure there's space on the freelist. */
+ error = xrep_fix_freelist(sc, true);
+ if (error)
+ return error;
+
+ /*
+ * Since we're "freeing" a lost block onto the AGFL, we have to
+ * create an rmap for the block prior to merging it or else other
+ * parts will break.
+ */
+ error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
+ &XFS_RMAP_OINFO_AG);
+ if (error)
+ return error;
+
+ /* Put the block on the AGFL. */
+ error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
+ sc->sa.agfl_bp, agbno, 0);
+ if (error)
+ return error;
+ xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
+
+ return 0;
+}
+
+/* Dispose of a single block. */
+STATIC int
+xrep_reap_block(
+ struct xfs_scrub *sc,
+ xfs_fsblock_t fsbno,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type resv)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agf_bp = NULL;
+ xfs_agblock_t agbno;
+ bool has_other_rmap;
+ int error;
+
+ agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+ ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
+
+ /*
+ * If we are repairing per-inode metadata, we need to read in the AGF
+ * buffer. Otherwise, we're repairing a per-AG structure, so reuse
+ * the AGF buffer that the setup functions already grabbed.
+ */
+ if (sc->ip) {
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
+ if (error)
+ return error;
+ } else {
+ agf_bp = sc->sa.agf_bp;
+ }
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag);
+
+ /* Can we find any other rmappings? */
+ error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out_free;
+
+ /*
+ * If there are other rmappings, this block is cross linked and must
+ * not be freed. Remove the reverse mapping and move on. Otherwise,
+ * we were the only owner of the block, so free the extent, which will
+ * also remove the rmap.
+ *
+ * XXX: XFS doesn't support detecting the case where a single block
+ * metadata structure is crosslinked with a multi-block structure
+ * because the buffer cache doesn't detect aliasing problems, so we
+ * can't fix 100% of crosslinking problems (yet). The verifiers will
+ * blow on writeout, the filesystem will shut down, and the admin gets
+ * to run xfs_repair.
+ */
+ if (has_other_rmap)
+ error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno,
+ 1, oinfo);
+ else if (resv == XFS_AG_RESV_AGFL)
+ error = xrep_put_freelist(sc, agbno);
+ else
+ error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
+ if (agf_bp != sc->sa.agf_bp)
+ xfs_trans_brelse(sc->tp, agf_bp);
+ if (error)
+ return error;
+
+ if (sc->ip)
+ return xfs_trans_roll_inode(&sc->tp, sc->ip);
+ return xrep_roll_ag_trans(sc);
+
+out_free:
+ if (agf_bp != sc->sa.agf_bp)
+ xfs_trans_brelse(sc->tp, agf_bp);
+ return error;
+}
+
+/* Dispose of every block of every extent in the bitmap. */
+int
+xrep_reap_extents(
+ struct xfs_scrub *sc,
+ struct xbitmap *bitmap,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
+{
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
+ xfs_fsblock_t fsbno;
+ int error = 0;
+
+ ASSERT(xfs_has_rmapbt(sc->mp));
+
+ for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
+ ASSERT(sc->ip != NULL ||
+ XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
+ trace_xrep_dispose_btree_extent(sc->mp,
+ XFS_FSB_TO_AGNO(sc->mp, fsbno),
+ XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1);
+
+ error = xrep_reap_block(sc, fsbno, oinfo, type);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * Finding per-AG Btree Roots for AGF/AGI Reconstruction
+ *
+ * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
+ * the AG headers by using the rmap data to rummage through the AG looking for
+ * btree roots. This is not guaranteed to work if the AG is heavily damaged
+ * or the rmap data are corrupt.
+ *
+ * Callers of xrep_find_ag_btree_roots must lock the AGF and AGFL
+ * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
+ * AGI is being rebuilt. It must maintain these locks until it's safe for
+ * other threads to change the btrees' shapes. The caller provides
+ * information about the btrees to look for by passing in an array of
+ * xrep_find_ag_btree with the (rmap owner, buf_ops) fields set.
+ * The (root, height) fields will be set on return if anything is found. The
+ * last element of the array should have a NULL buf_ops to mark the end of the
+ * array.
+ *
+ * For every rmapbt record matching any of the rmap owners in btree_info,
+ * read each block referenced by the rmap record. If the block is a btree
+ * block from this filesystem matching any of the magic numbers and has a
+ * level higher than what we've already seen, remember the block and the
+ * height of the tree required to have such a block. When the call completes,
+ * we return the highest block we've found for each btree description; those
+ * should be the roots.
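+ *
+ * For example, a caller rebuilding the AGF might pass one entry for each
+ * btree rooted in the AGF (bnobt, cntbt, rmapbt and, if enabled, the
+ * refcountbt), each with the appropriate rmap owner and buf_ops, plus a
+ * terminating entry whose buf_ops is NULL, and then read the discovered
+ * (root, height) pairs back out of the array.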
+ */
+
+struct xrep_findroot {
+ struct xfs_scrub *sc;
+ struct xfs_buf *agfl_bp;
+ struct xfs_agf *agf;
+ struct xrep_find_ag_btree *btree_info;
+};
+
+/* See if our block is in the AGFL. */
+STATIC int
+xrep_findroot_agfl_walk(
+ struct xfs_mount *mp,
+ xfs_agblock_t bno,
+ void *priv)
+{
+ xfs_agblock_t *agbno = priv;
+
+ return (*agbno == bno) ? -ECANCELED : 0;
+}
+
+/* Does this block match the btree information passed in? */
+STATIC int
+xrep_findroot_block(
+ struct xrep_findroot *ri,
+ struct xrep_find_ag_btree *fab,
+ uint64_t owner,
+ xfs_agblock_t agbno,
+ bool *done_with_block)
+{
+ struct xfs_mount *mp = ri->sc->mp;
+ struct xfs_buf *bp;
+ struct xfs_btree_block *btblock;
+ xfs_daddr_t daddr;
+ int block_level;
+ int error = 0;
+
+ daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno);
+
+ /*
+ * Blocks in the AGFL have stale contents that might just happen to
+ * have a matching magic and uuid. We don't want to pull these blocks
+ * in as part of a tree root, so we have to filter out the AGFL stuff
+ * here. If the AGFL looks insane we'll just refuse to repair.
+ */
+ if (owner == XFS_RMAP_OWN_AG) {
+ error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
+ xrep_findroot_agfl_walk, &agbno);
+ if (error == -ECANCELED)
+ return 0;
+ if (error)
+ return error;
+ }
+
+ /*
+ * Read the buffer into memory so that we can see if it's a match for
+ * our btree type. We have no clue if it is beforehand, and we want to
+ * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
+ * will cause needless disk reads in subsequent calls to this function)
+ * and logging metadata verifier failures.
+ *
+ * Therefore, pass in NULL buffer ops. If the buffer was already in
+ * memory from some other caller it will already have b_ops assigned.
+ * If it was in memory from a previous unsuccessful findroot_block
+ * call, the buffer won't have b_ops but it should be clean and ready
+ * for us to try to verify if the read call succeeds. The same applies
+ * if the buffer wasn't in memory at all.
+ *
+ * Note: If we never match a btree type with this buffer, it will be
+ * left in memory with NULL b_ops. This shouldn't be a problem unless
+ * the buffer gets written.
+ */
+ error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
+ mp->m_bsize, 0, &bp, NULL);
+ if (error)
+ return error;
+
+ /* Ensure the block magic matches the btree type we're looking for. */
+ btblock = XFS_BUF_TO_BLOCK(bp);
+ ASSERT(fab->buf_ops->magic[1] != 0);
+ if (btblock->bb_magic != fab->buf_ops->magic[1])
+ goto out;
+
+ /*
+ * If the buffer already has ops applied and they're not the ones for
+ * this btree type, we know this block doesn't match the btree and we
+ * can bail out.
+ *
+ * If the buffer ops match ours, someone else has already validated
+ * the block for us, so we can move on to checking if this is a root
+ * block candidate.
+ *
+ * If the buffer does not have ops, nobody has successfully validated
+ * the contents and the buffer cannot be dirty. If the magic, uuid,
+ * and structure match this btree type then we'll move on to checking
+ * if it's a root block candidate. If there is no match, bail out.
+ */
+ if (bp->b_ops) {
+ if (bp->b_ops != fab->buf_ops)
+ goto out;
+ } else {
+ ASSERT(!xfs_trans_buf_is_dirty(bp));
+ if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
+ &mp->m_sb.sb_meta_uuid))
+ goto out;
+ /*
+ * Read verifiers can reference b_ops, so we set the pointer
+ * here. If the verifier fails we'll reset the buffer state
+ * to what it was before we touched the buffer.
+ */
+ bp->b_ops = fab->buf_ops;
+ fab->buf_ops->verify_read(bp);
+ if (bp->b_error) {
+ bp->b_ops = NULL;
+ bp->b_error = 0;
+ goto out;
+ }
+
+ /*
+ * Some read verifiers will (re)set b_ops, so we must be
+ * careful not to change b_ops after running the verifier.
+ */
+ }
+
+ /*
+ * This block passes the magic/uuid and verifier tests for this btree
+ * type. We don't need the caller to try the other tree types.
+ */
+ *done_with_block = true;
+
+ /*
+ * Compare this btree block's level to the height of the current
+ * candidate root block.
+ *
+ * If the level matches the root we found previously, throw away both
+ * blocks because there can't be two candidate roots.
+ *
+ * If level is lower in the tree than the root we found previously,
+ * ignore this block.
+ */
+ block_level = xfs_btree_get_level(btblock);
+ if (block_level + 1 == fab->height) {
+ fab->root = NULLAGBLOCK;
+ goto out;
+ } else if (block_level < fab->height) {
+ goto out;
+ }
+
+ /*
+ * This is the highest block in the tree that we've found so far.
+ * Update the btree height to reflect what we've learned from this
+ * block.
+ */
+ fab->height = block_level + 1;
+
+ /*
+ * If this block doesn't have sibling pointers, then it's the new root
+ * block candidate. Otherwise, the root will be found farther up the
+ * tree.
+ */
+ if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
+ btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
+ fab->root = agbno;
+ else
+ fab->root = NULLAGBLOCK;
+
+ trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno,
+ be32_to_cpu(btblock->bb_magic), fab->height - 1);
+out:
+ xfs_trans_brelse(ri->sc->tp, bp);
+ return error;
+}
+
+/*
+ * Do any of the blocks in this rmap record match one of the btrees we're
+ * looking for?
+ */
+STATIC int
+xrep_findroot_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_findroot *ri = priv;
+ struct xrep_find_ag_btree *fab;
+ xfs_agblock_t b;
+ bool done;
+ int error = 0;
+
+ /* Ignore anything that isn't AG metadata. */
+ if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
+ return 0;
+
+ /* Otherwise scan each block + btree type. */
+ for (b = 0; b < rec->rm_blockcount; b++) {
+ done = false;
+ for (fab = ri->btree_info; fab->buf_ops; fab++) {
+ if (rec->rm_owner != fab->rmap_owner)
+ continue;
+ error = xrep_findroot_block(ri, fab,
+ rec->rm_owner, rec->rm_startblock + b,
+ &done);
+ if (error)
+ return error;
+ if (done)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/* Find the roots of the per-AG btrees described in btree_info. */
+int
+xrep_find_ag_btree_roots(
+ struct xfs_scrub *sc,
+ struct xfs_buf *agf_bp,
+ struct xrep_find_ag_btree *btree_info,
+ struct xfs_buf *agfl_bp)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xrep_findroot ri;
+ struct xrep_find_ag_btree *fab;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ASSERT(xfs_buf_islocked(agf_bp));
+ ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));
+
+ ri.sc = sc;
+ ri.btree_info = btree_info;
+ ri.agf = agf_bp->b_addr;
+ ri.agfl_bp = agfl_bp;
+ for (fab = btree_info; fab->buf_ops; fab++) {
+ ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
+ ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
+ fab->root = NULLAGBLOCK;
+ fab->height = 0;
+ }
+
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
+ error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri);
+ xfs_btree_del_cursor(cur, error);
+
+ return error;
+}
+
+/* Force a quotacheck the next time we mount. */
+void
+xrep_force_quotacheck(
+ struct xfs_scrub *sc,
+ xfs_dqtype_t type)
+{
+ uint flag;
+
+ flag = xfs_quota_chkd_flag(type);
+ if (!(flag & sc->mp->m_qflags))
+ return;
+
+ mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
+ sc->mp->m_qflags &= ~flag;
+ spin_lock(&sc->mp->m_sb_lock);
+ sc->mp->m_sb.sb_qflags &= ~flag;
+ spin_unlock(&sc->mp->m_sb_lock);
+ xfs_log_sb(sc->tp);
+ mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+}
+
+/*
+ * Attach dquots to this inode, or schedule quotacheck to fix them.
+ *
+ * This function ensures that the appropriate dquots are attached to an inode.
+ * We cannot allow the dquot code to allocate an on-disk dquot block here
+ * because we're already in transaction context with the inode locked. The
+ * on-disk dquot should already exist anyway. If the quota code signals
+ * corruption or missing quota information, schedule quotacheck, which will
+ * repair corruptions in the quota metadata.
+ */
+int
+xrep_ino_dqattach(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ error = xfs_qm_dqattach_locked(sc->ip, false);
+ switch (error) {
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ case -ENOENT:
+ xfs_err_ratelimited(sc->mp,
+"inode %llu repair encountered quota error %d, quotacheck forced.",
+ (unsigned long long)sc->ip->i_ino, error);
+ if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
+ xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
+ if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
+ xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
+ if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
+ xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
+ fallthrough;
+ case -ESRCH:
+ error = 0;
+ break;
+ default:
+ break;
+ }
+
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
new file mode 100644
index 000000000..840f74ec4
--- /dev/null
+++ b/fs/xfs/scrub/repair.h
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_REPAIR_H__
+#define __XFS_SCRUB_REPAIR_H__
+
+#include "xfs_quota_defs.h"
+
+static inline int xrep_notsupported(struct xfs_scrub *sc)
+{
+ return -EOPNOTSUPP;
+}
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+
+/* Repair helpers */
+
+int xrep_attempt(struct xfs_scrub *sc);
+void xrep_failure(struct xfs_mount *mp);
+int xrep_roll_ag_trans(struct xfs_scrub *sc);
+bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
+ enum xfs_ag_resv_type type);
+xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc);
+int xrep_alloc_ag_block(struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno,
+ enum xfs_ag_resv_type resv);
+int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb,
+ struct xfs_buf **bpp, xfs_btnum_t btnum,
+ const struct xfs_buf_ops *ops);
+
+struct xbitmap;
+
+int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
+int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist);
+int xrep_reap_extents(struct xfs_scrub *sc, struct xbitmap *exlist,
+ const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+
+struct xrep_find_ag_btree {
+ /* in: rmap owner of the btree we're looking for */
+ uint64_t rmap_owner;
+
+ /* in: buffer ops */
+ const struct xfs_buf_ops *buf_ops;
+
+ /* in: maximum btree height */
+ unsigned int maxlevels;
+
+ /* out: the highest btree block found and the tree height */
+ xfs_agblock_t root;
+ unsigned int height;
+};
+
+int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp,
+ struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp);
+void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type);
+int xrep_ino_dqattach(struct xfs_scrub *sc);
+
+/* Metadata repairers */
+
+int xrep_probe(struct xfs_scrub *sc);
+int xrep_superblock(struct xfs_scrub *sc);
+int xrep_agf(struct xfs_scrub *sc);
+int xrep_agfl(struct xfs_scrub *sc);
+int xrep_agi(struct xfs_scrub *sc);
+
+#else
+
+static inline int
+xrep_attempt(
+ struct xfs_scrub *sc)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void xrep_failure(struct xfs_mount *mp) {}
+
+static inline xfs_extlen_t
+xrep_calc_ag_resblks(
+ struct xfs_scrub *sc)
+{
+ return 0;
+}
+
+#define xrep_probe xrep_notsupported
+#define xrep_superblock xrep_notsupported
+#define xrep_agf xrep_notsupported
+#define xrep_agfl xrep_notsupported
+#define xrep_agi xrep_notsupported
+
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_REPAIR_H__ */
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
new file mode 100644
index 000000000..229826b2e
--- /dev/null
+++ b/fs/xfs/scrub/rmap.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "xfs_ag.h"
+
+/*
+ * Set us up to scrub reverse mapping btrees.
+ */
+int
+xchk_setup_ag_rmapbt(
+ struct xfs_scrub *sc)
+{
+ return xchk_setup_ag_btree(sc, false);
+}
+
+/* Reverse-mapping scrubber. */
+
+/* Cross-reference a rmap against the refcount btree. */
+STATIC void
+xchk_rmapbt_xref_refc(
+ struct xfs_scrub *sc,
+ struct xfs_rmap_irec *irec)
+{
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ bool non_inode;
+ bool is_bmbt;
+ bool is_attr;
+ bool is_unwritten;
+ int error;
+
+ if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ non_inode = XFS_RMAP_NON_INODE_OWNER(irec->rm_owner);
+ is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK;
+ is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK;
+ is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN;
+
+ /* If this is shared, must be a data fork extent. */
+ error = xfs_refcount_find_shared(sc->sa.refc_cur, irec->rm_startblock,
+ irec->rm_blockcount, &fbno, &flen, false);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
+ return;
+ if (flen != 0 && (non_inode || is_attr || is_bmbt || is_unwritten))
+ xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xchk_rmapbt_xref(
+ struct xfs_scrub *sc,
+ struct xfs_rmap_irec *irec)
+{
+ xfs_agblock_t agbno = irec->rm_startblock;
+ xfs_extlen_t len = irec->rm_blockcount;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xchk_xref_is_used_space(sc, agbno, len);
+ if (irec->rm_owner == XFS_RMAP_OWN_INODES)
+ xchk_xref_is_inode_chunk(sc, agbno, len);
+ else
+ xchk_xref_is_not_inode_chunk(sc, agbno, len);
+ if (irec->rm_owner == XFS_RMAP_OWN_COW)
+ xchk_xref_is_cow_staging(sc, irec->rm_startblock,
+ irec->rm_blockcount);
+ else
+ xchk_rmapbt_xref_refc(sc, irec);
+}
+
+/* Scrub an rmapbt record. */
+STATIC int
+xchk_rmapbt_rec(
+ struct xchk_btree *bs,
+ const union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_rmap_irec irec;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
+ bool non_inode;
+ bool is_unwritten;
+ bool is_bmbt;
+ bool is_attr;
+ int error;
+
+ error = xfs_rmap_btrec_to_irec(rec, &irec);
+ if (!xchk_btree_process_error(bs->sc, bs->cur, 0, &error))
+ goto out;
+
+ /* Check extent. */
+ if (irec.rm_startblock + irec.rm_blockcount <= irec.rm_startblock)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (irec.rm_owner == XFS_RMAP_OWN_FS) {
+ /*
+ * xfs_verify_agbno returns false for static fs metadata.
+ * Since that only exists at the start of the AG, validate
+ * that by hand.
+ */
+ if (irec.rm_startblock != 0 ||
+ irec.rm_blockcount != XFS_AGFL_BLOCK(mp) + 1)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ } else {
+ /*
+ * Otherwise we must point somewhere past the static metadata
+ * but before the end of the FS. Run the regular check.
+ */
+ if (!xfs_verify_agbno(pag, irec.rm_startblock) ||
+ !xfs_verify_agbno(pag, irec.rm_startblock +
+ irec.rm_blockcount - 1))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ }
+
+ /* Check flags. */
+ non_inode = XFS_RMAP_NON_INODE_OWNER(irec.rm_owner);
+ is_bmbt = irec.rm_flags & XFS_RMAP_BMBT_BLOCK;
+ is_attr = irec.rm_flags & XFS_RMAP_ATTR_FORK;
+ is_unwritten = irec.rm_flags & XFS_RMAP_UNWRITTEN;
+
+ if (is_bmbt && irec.rm_offset != 0)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (non_inode && irec.rm_offset != 0)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (is_unwritten && (is_bmbt || non_inode || is_attr))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (non_inode && (is_bmbt || is_unwritten || is_attr))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (!non_inode) {
+ if (!xfs_verify_ino(mp, irec.rm_owner))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ } else {
+ /* Non-inode owner within the magic values? */
+ if (irec.rm_owner <= XFS_RMAP_OWN_MIN ||
+ irec.rm_owner > XFS_RMAP_OWN_FS)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ }
+
+ xchk_rmapbt_xref(bs->sc, &irec);
+out:
+ return error;
+}
+
+/* Scrub the rmap btree for some AG. */
+int
+xchk_rmapbt(
+ struct xfs_scrub *sc)
+{
+ return xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec,
+ &XFS_RMAP_OINFO_AG, NULL);
+}
+
+/* xref check that the extent is owned by a given owner */
+static inline void
+xchk_xref_check_owner(
+ struct xfs_scrub *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ bool should_have_rmap)
+{
+ bool has_rmap;
+ int error;
+
+ if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_rmap_record_exists(sc->sa.rmap_cur, bno, len, oinfo,
+ &has_rmap);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ if (has_rmap != should_have_rmap)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+}
+
+/* xref check that the extent is owned by a given owner */
+void
+xchk_xref_is_owned_by(
+ struct xfs_scrub *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
+{
+ xchk_xref_check_owner(sc, bno, len, oinfo, true);
+}
+
+/* xref check that the extent is not owned by a given owner */
+void
+xchk_xref_is_not_owned_by(
+ struct xfs_scrub *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
+{
+ xchk_xref_check_owner(sc, bno, len, oinfo, false);
+}
+
+/* xref check that the extent has no reverse mapping at all */
+void
+xchk_xref_has_no_owner(
+ struct xfs_scrub *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len)
+{
+ bool has_rmap;
+ int error;
+
+ if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_rmap_has_record(sc->sa.rmap_cur, bno, len, &has_rmap);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ if (has_rmap)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
new file mode 100644
index 000000000..0a3bde64c
--- /dev/null
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_rtalloc.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+
+/* Set us up with the realtime metadata locked. */
+int
+xchk_setup_rt(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ error = xchk_setup_fs(sc);
+ if (error)
+ return error;
+
+ sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP;
+ sc->ip = sc->mp->m_rbmip;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+
+ return 0;
+}
+
+/* Realtime bitmap. */
+
+/* Scrub a free extent record from the realtime bitmap. */
+STATIC int
+xchk_rtbitmap_rec(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xfs_scrub *sc = priv;
+ xfs_rtblock_t startblock;
+ xfs_rtblock_t blockcount;
+
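+ /* Convert the free extent record from rt extent units to rt blocks. */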
+ startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
+ blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
+
+ if (!xfs_verify_rtext(mp, startblock, blockcount))
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+}
+
+/* Make sure the entire rtbitmap file is mapped with written extents. */
+STATIC int
+xchk_rtbitmap_check_extents(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_bmbt_irec map;
+ xfs_rtblock_t off;
+ int nmap;
+ int error = 0;
+
+ for (off = 0; off < mp->m_sb.sb_rbmblocks;) {
+ if (xchk_should_terminate(sc, &error) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ break;
+
+ /* Make sure we have a written extent. */
+ nmap = 1;
+ error = xfs_bmapi_read(mp->m_rbmip, off,
+ mp->m_sb.sb_rbmblocks - off, &map, &nmap,
+ XFS_DATA_FORK);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
+ break;
+
+ if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+ break;
+ }
+
+ off += map.br_blockcount;
+ }
+
+ return error;
+}
+
+/* Scrub the realtime bitmap. */
+int
+xchk_rtbitmap(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ /* Is the size of the rtbitmap correct? */
+ if (sc->mp->m_rbmip->i_disk_size !=
+ XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) {
+ xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino);
+ return 0;
+ }
+
+ /* Invoke the fork scrubber. */
+ error = xchk_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
+ error = xchk_rtbitmap_check_extents(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
+ error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out;
+
+out:
+ return error;
+}
+
+/* Scrub the realtime summary. */
+int
+xchk_rtsummary(
+ struct xfs_scrub *sc)
+{
+ struct xfs_inode *rsumip = sc->mp->m_rsumip;
+ struct xfs_inode *old_ip = sc->ip;
+ uint old_ilock_flags = sc->ilock_flags;
+ int error = 0;
+
+ /*
+ * We ILOCK'd the rt bitmap ip in the setup routine; now lock the
+ * rt summary ip in compliance with the rt inode locking rules.
+ *
+ * Since we switch sc->ip to rsumip we have to save the old ilock
+ * flags so that we don't mix up the inode state that @sc tracks.
+ */
+ sc->ip = rsumip;
+ sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+
+ /* Invoke the fork scrubber. */
+ error = xchk_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ goto out;
+
+ /* XXX: implement this some day */
+ xchk_set_incomplete(sc);
+out:
+ /* Switch back to the rtbitmap inode and lock flags. */
+ xfs_iunlock(sc->ip, sc->ilock_flags);
+ sc->ilock_flags = old_ilock_flags;
+ sc->ip = old_ip;
+ return error;
+}
+
+
+/* xref check that the extent is not free in the rtbitmap */
+void
+xchk_xref_is_used_rt_space(
+ struct xfs_scrub *sc,
+ xfs_rtblock_t fsbno,
+ xfs_extlen_t len)
+{
+ xfs_rtblock_t startext;
+ xfs_rtblock_t endext;
+ xfs_rtblock_t extcount;
+ bool is_free;
+ int error;
+
+ if (xchk_skip_xref(sc->sm))
+ return;
+
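+ /*
+ * Convert the rt block range into a range of rt extents, then ask
+ * the rtbitmap whether any of those extents are still marked free.
+ */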
+ startext = fsbno;
+ endext = fsbno + len - 1;
+ do_div(startext, sc->mp->m_sb.sb_rextsize);
+ do_div(endext, sc->mp->m_sb.sb_rextsize);
+ extcount = endext - startext + 1;
+ xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
+ &is_free);
+ if (!xchk_should_check_xref(sc, &error, NULL))
+ goto out_unlock;
+ if (is_free)
+ xchk_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino);
+out_unlock:
+ xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
new file mode 100644
index 000000000..95132490f
--- /dev/null
+++ b/fs/xfs/scrub/scrub.c
@@ -0,0 +1,566 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/health.h"
+
+/*
+ * Online Scrub and Repair
+ *
+ * Traditionally, XFS (the kernel driver) did not know how to check or
+ * repair on-disk data structures. That task was left to the xfs_check
+ * and xfs_repair tools, both of which require taking the filesystem
+ * offline for a thorough but time-consuming examination. Online
+ * scrub & repair, on the other hand, enables us to check the metadata
+ * for obvious errors while carefully stepping around the filesystem's
+ * ongoing operations, locking rules, etc.
+ *
+ * Given that most XFS metadata consist of records stored in a btree,
+ * most of the checking functions iterate the btree blocks themselves
+ * looking for irregularities. When a record block is encountered, each
+ * record can be checked for obviously bad values. Record values can
+ * also be cross-referenced against other btrees to look for potential
+ * misunderstandings between pieces of metadata.
+ *
+ * It is expected that the checkers responsible for per-AG metadata
+ * structures will lock the AG headers (AGI, AGF, AGFL), iterate the
+ * metadata structure, and perform any relevant cross-referencing before
+ * unlocking the AG and returning the results to userspace. These
+ * scrubbers must not keep an AG locked for too long to avoid tying up
+ * the block and inode allocators.
+ *
+ * Block maps and b-trees rooted in an inode present a special challenge
+ * because they can involve extents from any AG. The general scrubber
+ * structure of lock -> check -> xref -> unlock still holds, but AG
+ * locking order rules /must/ be obeyed to avoid deadlocks. The
+ * ordering rule, of course, is that we must lock in increasing AG
+ * order. Helper functions are provided to track which AG headers we've
+ * already locked. If we detect an imminent locking order violation, we
+ * can signal a potential deadlock, in which case the scrubber can jump
+ * out to the top level, lock all the AGs in order, and retry the scrub.
+ *
+ * For file data (directories, extended attributes, symlinks) scrub, we
+ * can simply lock the inode and walk the data. For btree data
+ * (directories and attributes) we follow the same btree-scrubbing
+ * strategy outlined previously to check the records.
+ *
+ * We use a bit of trickery with transactions to avoid buffer deadlocks
+ * if there is a cycle in the metadata. The basic problem is that
+ * travelling down a btree involves locking the current buffer at each
+ * tree level. If a pointer should somehow point back to a buffer that
+ * we've already examined, we will deadlock due to the second buffer
+ * locking attempt. Note however that grabbing a buffer in transaction
+ * context links the locked buffer to the transaction. If we try to
+ * re-grab the buffer in the context of the same transaction, we avoid
+ * the second lock attempt and continue. Between the verifier and the
+ * scrubber, something will notice that something is amiss and report
+ * the corruption. Therefore, each scrubber will allocate an empty
+ * transaction, attach buffers to it, and cancel the transaction at the
+ * end of the scrub run. Cancelling a non-dirty transaction simply
+ * unlocks the buffers.
+ *
+ * Scrub communicates several pieces of data back to userspace. The
+ * first is the error code (errno), which can be used to communicate
+ * operational errors in performing the scrub. In addition, output
+ * flags can be set in the scrub context. If the data structure itself
+ * is corrupt, the CORRUPT flag will be set. If the metadata is correct
+ * but otherwise suboptimal, the PREEN flag will be set.
+ *
+ * We perform secondary validation of filesystem metadata by
+ * cross-referencing every record with all other available metadata.
+ * For example, for block mapping extents, we verify that there are no
+ * records in the free space and inode btrees corresponding to that
+ * space extent and that there is a corresponding entry in the reverse
+ * mapping btree. Inconsistent metadata is noted by setting the
+ * XCORRUPT flag; btree query function errors are noted by setting the
+ * XFAIL flag and deleting the cursor to prevent further attempts to
+ * cross-reference with a defective btree.
+ *
+ * If a piece of metadata proves corrupt or suboptimal, the userspace
+ * program can ask the kernel to apply some tender loving care (TLC) to
+ * the metadata object by setting the REPAIR flag and re-calling the
+ * scrub ioctl. "Corruption" is defined by metadata violating the
+ * on-disk specification; operations cannot continue if the violation is
+ * left untreated. It is possible for XFS to continue if an object is
+ * "suboptimal", however performance may be degraded. Repairs are
+ * usually performed by rebuilding the metadata entirely out of
+ * redundant metadata. Optimizing, on the other hand, can sometimes be
+ * done without rebuilding entire structures.
+ *
+ * Generally speaking, the repair code has the following code structure:
+ * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock.
+ * The first check helps us figure out if we need to rebuild or simply
+ * optimize the structure so that the rebuild knows what to do. The
+ * second check evaluates the completeness of the repair; that is what
+ * is reported to userspace.
+ *
+ * A quick note on symbol prefixes:
+ * - "xfs_" are general XFS symbols.
+ * - "xchk_" are symbols related to metadata checking.
+ * - "xrep_" are symbols related to metadata repair.
+ * - "xfs_scrub_" are symbols that tie online fsck to the rest of XFS.
+ */
+
+/*
+ * Scrub probe -- userspace uses this to probe if we're willing to scrub
+ * or repair a given mountpoint. This will be used by xfs_scrub to
+ * probe the kernel's abilities to scrub (and repair) the metadata. We
+ * do this by validating the ioctl inputs from userspace, preparing the
+ * filesystem for a scrub (or a repair) operation, and immediately
+ * returning to userspace. Userspace can use the returned errno and
+ * structure state to decide (in broad terms) if scrub/repair are
+ * supported by the running kernel.
+ */
+static int
+xchk_probe(
+ struct xfs_scrub *sc)
+{
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ return 0;
+}
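
As a concrete illustration of the probe described above, userspace would issue the scrub ioctl with a zeroed control structure and the PROBE type, then judge support from the returned errno. This is a hedged sketch rather than part of the patch: it assumes the xfsprogs-installed headers (<xfs/xfs.h>) expose struct xfs_scrub_metadata, XFS_SCRUB_TYPE_PROBE and XFS_IOC_SCRUB_METADATA to userspace, and error handling is kept minimal.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xfs/xfs.h>		/* assumed: xfsprogs userspace headers */

/* Ask the kernel whether it is willing to scrub this filesystem at all. */
static int scrub_probe(const char *mntpoint)
{
	struct xfs_scrub_metadata sm;
	int fd, ret;

	fd = open(mntpoint, O_RDONLY | O_DIRECTORY);
	if (fd < 0)
		return -errno;

	memset(&sm, 0, sizeof(sm));
	sm.sm_type = XFS_SCRUB_TYPE_PROBE;	/* no agno/ino/gen for a probe */

	ret = ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm);
	ret = ret ? -errno : 0;
	close(fd);
	return ret;
}

int main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s mountpoint\n", argv[0]);
		return 1;
	}
	return scrub_probe(argv[1]) ? 1 : 0;
}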
+
+/* Scrub setup and teardown */
+
+/* Free all the resources and finish the transactions. */
+STATIC int
+xchk_teardown(
+ struct xfs_scrub *sc,
+ int error)
+{
+ struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
+
+ xchk_ag_free(sc, &sc->sa);
+ if (sc->tp) {
+ if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+ error = xfs_trans_commit(sc->tp);
+ else
+ xfs_trans_cancel(sc->tp);
+ sc->tp = NULL;
+ }
+ if (sc->ip) {
+ if (sc->ilock_flags)
+ xfs_iunlock(sc->ip, sc->ilock_flags);
+ if (sc->ip != ip_in &&
+ !xfs_internal_inum(sc->mp, sc->ip->i_ino))
+ xfs_irele(sc->ip);
+ sc->ip = NULL;
+ }
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ mnt_drop_write_file(sc->file);
+ if (sc->buf) {
+ kmem_free(sc->buf);
+ sc->buf = NULL;
+ }
+ return error;
+}
+
+/* Scrubbing dispatch. */
+
+static const struct xchk_meta_ops meta_scrub_ops[] = {
+ [XFS_SCRUB_TYPE_PROBE] = { /* ioctl presence test */
+ .type = ST_NONE,
+ .setup = xchk_setup_fs,
+ .scrub = xchk_probe,
+ .repair = xrep_probe,
+ },
+ [XFS_SCRUB_TYPE_SB] = { /* superblock */
+ .type = ST_PERAG,
+ .setup = xchk_setup_fs,
+ .scrub = xchk_superblock,
+ .repair = xrep_superblock,
+ },
+ [XFS_SCRUB_TYPE_AGF] = { /* agf */
+ .type = ST_PERAG,
+ .setup = xchk_setup_fs,
+ .scrub = xchk_agf,
+ .repair = xrep_agf,
+ },
+ [XFS_SCRUB_TYPE_AGFL] = { /* agfl */
+ .type = ST_PERAG,
+ .setup = xchk_setup_fs,
+ .scrub = xchk_agfl,
+ .repair = xrep_agfl,
+ },
+ [XFS_SCRUB_TYPE_AGI] = { /* agi */
+ .type = ST_PERAG,
+ .setup = xchk_setup_fs,
+ .scrub = xchk_agi,
+ .repair = xrep_agi,
+ },
+ [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */
+ .type = ST_PERAG,
+ .setup = xchk_setup_ag_allocbt,
+ .scrub = xchk_bnobt,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */
+ .type = ST_PERAG,
+ .setup = xchk_setup_ag_allocbt,
+ .scrub = xchk_cntbt,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_INOBT] = { /* inobt */
+ .type = ST_PERAG,
+ .setup = xchk_setup_ag_iallocbt,
+ .scrub = xchk_inobt,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */
+ .type = ST_PERAG,
+ .setup = xchk_setup_ag_iallocbt,
+ .scrub = xchk_finobt,
+ .has = xfs_has_finobt,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */
+ .type = ST_PERAG,
+ .setup = xchk_setup_ag_rmapbt,
+ .scrub = xchk_rmapbt,
+ .has = xfs_has_rmapbt,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */
+ .type = ST_PERAG,
+ .setup = xchk_setup_ag_refcountbt,
+ .scrub = xchk_refcountbt,
+ .has = xfs_has_reflink,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_INODE] = { /* inode record */
+ .type = ST_INODE,
+ .setup = xchk_setup_inode,
+ .scrub = xchk_inode,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */
+ .type = ST_INODE,
+ .setup = xchk_setup_inode_bmap,
+ .scrub = xchk_bmap_data,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */
+ .type = ST_INODE,
+ .setup = xchk_setup_inode_bmap,
+ .scrub = xchk_bmap_attr,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */
+ .type = ST_INODE,
+ .setup = xchk_setup_inode_bmap,
+ .scrub = xchk_bmap_cow,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_DIR] = { /* directory */
+ .type = ST_INODE,
+ .setup = xchk_setup_directory,
+ .scrub = xchk_directory,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */
+ .type = ST_INODE,
+ .setup = xchk_setup_xattr,
+ .scrub = xchk_xattr,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */
+ .type = ST_INODE,
+ .setup = xchk_setup_symlink,
+ .scrub = xchk_symlink,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */
+ .type = ST_INODE,
+ .setup = xchk_setup_parent,
+ .scrub = xchk_parent,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
+ .type = ST_FS,
+ .setup = xchk_setup_rt,
+ .scrub = xchk_rtbitmap,
+ .has = xfs_has_realtime,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
+ .type = ST_FS,
+ .setup = xchk_setup_rt,
+ .scrub = xchk_rtsummary,
+ .has = xfs_has_realtime,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */
+ .type = ST_FS,
+ .setup = xchk_setup_quota,
+ .scrub = xchk_quota,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */
+ .type = ST_FS,
+ .setup = xchk_setup_quota,
+ .scrub = xchk_quota,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */
+ .type = ST_FS,
+ .setup = xchk_setup_quota,
+ .scrub = xchk_quota,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */
+ .type = ST_FS,
+ .setup = xchk_setup_fscounters,
+ .scrub = xchk_fscounters,
+ .repair = xrep_notsupported,
+ },
+};
+
+static int
+xchk_validate_inputs(
+ struct xfs_mount *mp,
+ struct xfs_scrub_metadata *sm)
+{
+ int error;
+ const struct xchk_meta_ops *ops;
+
+ error = -EINVAL;
+ /* Check our inputs. */
+ sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN)
+ goto out;
+ /* sm_reserved[] must be zero */
+ if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved)))
+ goto out;
+
+ error = -ENOENT;
+ /* Do we know about this type of metadata? */
+ if (sm->sm_type >= XFS_SCRUB_TYPE_NR)
+ goto out;
+ ops = &meta_scrub_ops[sm->sm_type];
+ if (ops->setup == NULL || ops->scrub == NULL)
+ goto out;
+ /* Does this fs even support this type of metadata? */
+ if (ops->has && !ops->has(mp))
+ goto out;
+
+ error = -EINVAL;
+ /* restricting fields must be appropriate for type */
+ switch (ops->type) {
+ case ST_NONE:
+ case ST_FS:
+ if (sm->sm_ino || sm->sm_gen || sm->sm_agno)
+ goto out;
+ break;
+ case ST_PERAG:
+ if (sm->sm_ino || sm->sm_gen ||
+ sm->sm_agno >= mp->m_sb.sb_agcount)
+ goto out;
+ break;
+ case ST_INODE:
+ if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino))
+ goto out;
+ break;
+ default:
+ goto out;
+ }
+
+ /*
+ * We only want to repair read-write v5+ filesystems. Defer the check
+ * for ops->repair until after our scrub confirms that we need to
+ * perform repairs so that we avoid failing due to not supporting
+ * repairing an object that doesn't need repairs.
+ */
+ if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
+ error = -EOPNOTSUPP;
+ if (!xfs_has_crc(mp))
+ goto out;
+
+ error = -EROFS;
+ if (xfs_is_readonly(mp))
+ goto out;
+ }
+
+ error = 0;
+out:
+ return error;
+}
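
The switch above is the authoritative statement of which identifying fields userspace may fill in for each scrubber type. Below is a hedged userspace-side sketch of the same rules; the type and field names come from this patch, while the helper functions themselves are hypothetical and again assume the xfsprogs headers provide the structure definition.

#include <string.h>
#include <xfs/xfs.h>	/* assumed: userspace struct xfs_scrub_metadata */

/*
 * Build a per-AG scrub request: only sm_type and sm_agno may be set,
 * and sm_agno must be below sb_agcount or the kernel returns -EINVAL.
 */
static void scrub_request_ag(struct xfs_scrub_metadata *sm,
			     unsigned int type, unsigned int agno)
{
	memset(sm, 0, sizeof(*sm));
	sm->sm_type = type;		/* e.g. XFS_SCRUB_TYPE_AGF */
	sm->sm_agno = agno;
}

/*
 * Build an inode scrub request: sm_agno must stay zero, and sm_gen is
 * only meaningful together with a nonzero sm_ino.  Leaving both at zero
 * is also valid; the kernel then falls back to the file that was used
 * to call the ioctl.
 */
static void scrub_request_inode(struct xfs_scrub_metadata *sm,
				unsigned int type,
				unsigned long long ino, unsigned int gen)
{
	memset(sm, 0, sizeof(*sm));
	sm->sm_type = type;		/* e.g. XFS_SCRUB_TYPE_BMBTD */
	sm->sm_ino = ino;
	sm->sm_gen = gen;
}

int main(void)
{
	struct xfs_scrub_metadata sm;

	scrub_request_ag(&sm, XFS_SCRUB_TYPE_AGF, 0);
	scrub_request_inode(&sm, XFS_SCRUB_TYPE_BMBTD, 0, 0);
	return 0;
}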
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+static inline void xchk_postmortem(struct xfs_scrub *sc)
+{
+ /*
+ * Userspace asked us to repair something, we repaired it, rescanned
+ * it, and the rescan says it's still broken. Scream about this in
+ * the system logs.
+ */
+ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
+ (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT)))
+ xrep_failure(sc->mp);
+}
+#else
+static inline void xchk_postmortem(struct xfs_scrub *sc)
+{
+ /*
+ * Userspace asked us to scrub something, it's broken, and we have no
+ * way of fixing it. Scream in the logs.
+ */
+ if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT))
+ xfs_alert_ratelimited(sc->mp,
+ "Corruption detected during scrub.");
+}
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+/* Dispatch metadata scrubbing. */
+int
+xfs_scrub_metadata(
+ struct file *file,
+ struct xfs_scrub_metadata *sm)
+{
+ struct xfs_scrub *sc;
+ struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
+ int error = 0;
+
+ BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
+ (sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR));
+
+ trace_xchk_start(XFS_I(file_inode(file)), sm, error);
+
+ /* Forbidden if we are shut down or mounted norecovery. */
+ error = -ESHUTDOWN;
+ if (xfs_is_shutdown(mp))
+ goto out;
+ error = -ENOTRECOVERABLE;
+ if (xfs_has_norecovery(mp))
+ goto out;
+
+ error = xchk_validate_inputs(mp, sm);
+ if (error)
+ goto out;
+
+ xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB,
+ "EXPERIMENTAL online scrub feature in use. Use at your own risk!");
+
+ sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL);
+ if (!sc) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ sc->mp = mp;
+ sc->file = file;
+ sc->sm = sm;
+ sc->ops = &meta_scrub_ops[sm->sm_type];
+ sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
+retry_op:
+ /*
+ * When repairs are allowed, prevent freezing or readonly remount while
+ * scrub is running with a real transaction.
+ */
+ if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
+ error = mnt_want_write_file(sc->file);
+ if (error)
+ goto out_sc;
+ }
+
+ /* Set up for the operation. */
+ error = sc->ops->setup(sc);
+ if (error)
+ goto out_teardown;
+
+ /* Scrub for errors. */
+ error = sc->ops->scrub(sc);
+ if (!(sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
+ /*
+ * Scrubbers return -EDEADLOCK to mean 'try harder'.
+ * Tear down everything we hold, then set up again with
+ * preparation for worst-case scenarios.
+ */
+ error = xchk_teardown(sc, 0);
+ if (error)
+ goto out_sc;
+ sc->flags |= XCHK_TRY_HARDER;
+ goto retry_op;
+ } else if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE))
+ goto out_teardown;
+
+ xchk_update_health(sc);
+
+ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
+ !(sc->flags & XREP_ALREADY_FIXED)) {
+ bool needs_fix;
+
+ /* Let debug users force us into the repair routines. */
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+
+ needs_fix = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT |
+ XFS_SCRUB_OFLAG_PREEN));
+ /*
+ * If userspace asked for a repair but it wasn't necessary,
+ * report that back to userspace.
+ */
+ if (!needs_fix) {
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
+ goto out_nofix;
+ }
+
+ /*
+ * If it's broken, userspace wants us to fix it, and we haven't
+ * already tried to fix it, then attempt a repair.
+ */
+ error = xrep_attempt(sc);
+ if (error == -EAGAIN) {
+ /*
+ * Either the repair function succeeded or it couldn't
+ * get all the resources it needs; either way, we go
+ * back to the beginning and call the scrub function.
+ */
+ error = xchk_teardown(sc, 0);
+ if (error) {
+ xrep_failure(mp);
+ goto out_sc;
+ }
+ goto retry_op;
+ }
+ }
+
+out_nofix:
+ xchk_postmortem(sc);
+out_teardown:
+ error = xchk_teardown(sc, error);
+out_sc:
+ kmem_free(sc);
+out:
+ trace_xchk_done(XFS_I(file_inode(file)), sm, error);
+ if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
+ sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ error = 0;
+ }
+ return error;
+}
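
Tying the dispatcher together from the caller's side: the comments above describe userspace re-invoking the ioctl with the REPAIR flag once a scrub reports a problem, then reading the OFLAG bits to learn the outcome. The sketch below is an illustrative userspace driver for that conversation, not part of the patch; it assumes the same xfsprogs-provided definitions as the earlier examples and uses -EUCLEAN merely as a convenient "still corrupt" return value.

#include <errno.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xfs/xfs.h>	/* assumed: xfsprogs userspace headers */

/*
 * Scrub one object; if the kernel flags it as corrupt, cross-referencing
 * corrupt, or merely preenable, call back in with the repair flag set and
 * report whether the object is still corrupt afterwards.
 */
static int scrub_and_maybe_repair(int fd, struct xfs_scrub_metadata *sm)
{
	sm->sm_flags &= ~XFS_SCRUB_IFLAG_REPAIR;
	if (ioctl(fd, XFS_IOC_SCRUB_METADATA, sm))
		return -errno;

	if (!(sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
			      XFS_SCRUB_OFLAG_XCORRUPT |
			      XFS_SCRUB_OFLAG_PREEN)))
		return 0;		/* clean; nothing to do */

	sm->sm_flags |= XFS_SCRUB_IFLAG_REPAIR;
	if (ioctl(fd, XFS_IOC_SCRUB_METADATA, sm))
		return -errno;

	return (sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) ? -EUCLEAN : 0;
}

int main(int argc, char **argv)
{
	struct xfs_scrub_metadata sm = { .sm_type = XFS_SCRUB_TYPE_SB };
	int fd, ret;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY | O_DIRECTORY);
	if (fd < 0)
		return 1;
	ret = scrub_and_maybe_repair(fd, &sm);	/* superblock of AG 0 */
	close(fd);
	return ret ? 1 : 0;
}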
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
new file mode 100644
index 000000000..4cb32c27d
--- /dev/null
+++ b/fs/xfs/scrub/scrub.h
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_SCRUB_H__
+#define __XFS_SCRUB_SCRUB_H__
+
+struct xfs_scrub;
+
+/* Type info and names for the scrub types. */
+enum xchk_type {
+ ST_NONE = 1, /* disabled */
+ ST_PERAG, /* per-AG metadata */
+ ST_FS, /* per-FS metadata */
+ ST_INODE, /* per-inode metadata */
+};
+
+struct xchk_meta_ops {
+ /* Acquire whatever resources are needed for the operation. */
+ int (*setup)(struct xfs_scrub *sc);
+
+ /* Examine metadata for errors. */
+ int (*scrub)(struct xfs_scrub *);
+
+ /* Repair or optimize the metadata. */
+ int (*repair)(struct xfs_scrub *);
+
+ /* Decide if we even have this piece of metadata. */
+ bool (*has)(struct xfs_mount *);
+
+ /* type describing required/allowed inputs */
+ enum xchk_type type;
+};
+
+/* Buffer pointers and btree cursors for an entire AG. */
+struct xchk_ag {
+ struct xfs_perag *pag;
+
+ /* AG btree roots */
+ struct xfs_buf *agf_bp;
+ struct xfs_buf *agfl_bp;
+ struct xfs_buf *agi_bp;
+
+ /* AG btrees */
+ struct xfs_btree_cur *bno_cur;
+ struct xfs_btree_cur *cnt_cur;
+ struct xfs_btree_cur *ino_cur;
+ struct xfs_btree_cur *fino_cur;
+ struct xfs_btree_cur *rmap_cur;
+ struct xfs_btree_cur *refc_cur;
+};
+
+struct xfs_scrub {
+ /* General scrub state. */
+ struct xfs_mount *mp;
+ struct xfs_scrub_metadata *sm;
+ const struct xchk_meta_ops *ops;
+ struct xfs_trans *tp;
+
+ /* File that scrub was called with. */
+ struct file *file;
+
+ /*
+ * File that is undergoing the scrub operation. This can differ from
+ * the file that scrub was called with if we're checking file-based fs
+ * metadata (e.g. rt bitmaps) or if we're doing a scrub-by-handle for
+ * something that can't be opened directly (e.g. symlinks).
+ */
+ struct xfs_inode *ip;
+
+ void *buf;
+ uint ilock_flags;
+
+ /* See the XCHK/XREP state flags below. */
+ unsigned int flags;
+
+ /*
+ * The XFS_SICK_* flags that correspond to the metadata being scrubbed
+ * or repaired. We will use this mask to update the in-core fs health
+ * status with whatever we find.
+ */
+ unsigned int sick_mask;
+
+ /* State tracking for single-AG operations. */
+ struct xchk_ag sa;
+};
+
+/* XCHK state flags grow up from zero, XREP state flags grow down from 2^31 */
+#define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */
+#define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */
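
Because the checker and repair code share the single sc->flags word, the two flag namespaces stay apart by growing from opposite ends of the word. Below is a tiny compile-time sketch of that invariant; only the two flag names above come from the patch, and the *_FLAGS_ALL masks are hypothetical.

#include <assert.h>

#define XCHK_TRY_HARDER		(1U << 0)	/* checker flags grow up from bit 0 */
#define XREP_ALREADY_FIXED	(1U << 31)	/* repair flags grow down from bit 31 */

/* Hypothetical masks covering every flag defined in each namespace. */
#define XCHK_FLAGS_ALL		(XCHK_TRY_HARDER)
#define XREP_FLAGS_ALL		(XREP_ALREADY_FIXED)

/* The two ranges must never collide inside the shared sc->flags word. */
static_assert((XCHK_FLAGS_ALL & XREP_FLAGS_ALL) == 0,
	      "checker and repair flag ranges overlap");

int main(void)
{
	return 0;
}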
+
+/* Metadata scrubbers */
+int xchk_tester(struct xfs_scrub *sc);
+int xchk_superblock(struct xfs_scrub *sc);
+int xchk_agf(struct xfs_scrub *sc);
+int xchk_agfl(struct xfs_scrub *sc);
+int xchk_agi(struct xfs_scrub *sc);
+int xchk_bnobt(struct xfs_scrub *sc);
+int xchk_cntbt(struct xfs_scrub *sc);
+int xchk_inobt(struct xfs_scrub *sc);
+int xchk_finobt(struct xfs_scrub *sc);
+int xchk_rmapbt(struct xfs_scrub *sc);
+int xchk_refcountbt(struct xfs_scrub *sc);
+int xchk_inode(struct xfs_scrub *sc);
+int xchk_bmap_data(struct xfs_scrub *sc);
+int xchk_bmap_attr(struct xfs_scrub *sc);
+int xchk_bmap_cow(struct xfs_scrub *sc);
+int xchk_directory(struct xfs_scrub *sc);
+int xchk_xattr(struct xfs_scrub *sc);
+int xchk_symlink(struct xfs_scrub *sc);
+int xchk_parent(struct xfs_scrub *sc);
+#ifdef CONFIG_XFS_RT
+int xchk_rtbitmap(struct xfs_scrub *sc);
+int xchk_rtsummary(struct xfs_scrub *sc);
+#else
+static inline int
+xchk_rtbitmap(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
+static inline int
+xchk_rtsummary(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
+#endif
+#ifdef CONFIG_XFS_QUOTA
+int xchk_quota(struct xfs_scrub *sc);
+#else
+static inline int
+xchk_quota(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
+#endif
+int xchk_fscounters(struct xfs_scrub *sc);
+
+/* cross-referencing helpers */
+void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno,
+ xfs_extlen_t len);
+void xchk_xref_is_not_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno,
+ xfs_extlen_t len);
+void xchk_xref_is_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno,
+ xfs_extlen_t len);
+void xchk_xref_is_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo);
+void xchk_xref_is_not_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo);
+void xchk_xref_has_no_owner(struct xfs_scrub *sc, xfs_agblock_t agbno,
+ xfs_extlen_t len);
+void xchk_xref_is_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno,
+ xfs_extlen_t len);
+void xchk_xref_is_not_shared(struct xfs_scrub *sc, xfs_agblock_t bno,
+ xfs_extlen_t len);
+#ifdef CONFIG_XFS_RT
+void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
+ xfs_extlen_t len);
+#else
+# define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
+#endif
+
+struct xchk_fscounters {
+ uint64_t icount;
+ uint64_t ifree;
+ uint64_t fdblocks;
+ unsigned long long icount_min;
+ unsigned long long icount_max;
+};
+
+#endif /* __XFS_SCRUB_SCRUB_H__ */
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
new file mode 100644
index 000000000..75311f8da
--- /dev/null
+++ b/fs/xfs/scrub/symlink.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_inode.h"
+#include "xfs_symlink.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+
+/* Set us up to scrub a symbolic link. */
+int
+xchk_setup_symlink(
+ struct xfs_scrub *sc)
+{
+ /* Allocate the buffer without the inode lock held. */
+ sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ return xchk_setup_inode_contents(sc, 0);
+}
+
+/* Symbolic links. */
+
+int
+xchk_symlink(
+ struct xfs_scrub *sc)
+{
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_ifork *ifp;
+ loff_t len;
+ int error = 0;
+
+ if (!S_ISLNK(VFS_I(ip)->i_mode))
+ return -ENOENT;
+ ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ len = ip->i_disk_size;
+
+ /* Plausible size? */
+ if (len > XFS_SYMLINK_MAXLEN || len <= 0) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ /* Inline symlink? */
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) {
+ if (len > xfs_inode_data_fork_size(ip) ||
+ len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip)))
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ /* Remote symlink; must read the contents. */
+ error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out;
+ if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+out:
+ return error;
+}
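
Both branches above reduce to one consistency rule: the target string that can be read back must not be shorter than the size recorded in the inode, and that size must lie in (0, XFS_SYMLINK_MAXLEN]. A standalone sketch of that rule follows; the 1024-byte limit is used here only as a stand-in for XFS_SYMLINK_MAXLEN and the helper is purely illustrative.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define SYMLINK_MAXLEN	1024	/* stand-in for XFS_SYMLINK_MAXLEN */

/*
 * Mirror the rule enforced by xchk_symlink(): the recorded size must be
 * plausible (0 < len <= max) and the readable target string must not be
 * shorter than that size, i.e. no stray NUL inside the recorded length.
 */
static bool symlink_target_consistent(const char *target, long long len)
{
	if (len <= 0 || len > SYMLINK_MAXLEN)
		return false;
	return strnlen(target, SYMLINK_MAXLEN) >= (size_t)len;
}

int main(void)
{
	printf("%d\n", symlink_target_consistent("a/b/c", 5));	/* 1: consistent */
	printf("%d\n", symlink_target_consistent("a/b", 5));	/* 0: too short */
	return 0;
}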
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
new file mode 100644
index 000000000..b5f94676c
--- /dev/null
+++ b/fs/xfs/scrub/trace.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "scrub/scrub.h"
+#include "xfs_ag.h"
+
+/* Figure out which block the btree cursor was pointing to. */
+static inline xfs_fsblock_t
+xchk_btree_cur_fsbno(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level < cur->bc_nlevels && cur->bc_levels[level].bp)
+ return XFS_DADDR_TO_FSB(cur->bc_mp,
+ xfs_buf_daddr(cur->bc_levels[level].bp));
+
+ if (level == cur->bc_nlevels - 1 &&
+ (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino);
+
+ return NULLFSBLOCK;
+}
+
+/*
+ * We include this last to have the helpers above available for the trace
+ * event implementations.
+ */
+#define CREATE_TRACE_POINTS
+#include "scrub/trace.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
new file mode 100644
index 000000000..93ece6df0
--- /dev/null
+++ b/fs/xfs/scrub/trace.h
@@ -0,0 +1,920 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * NOTE: none of these tracepoints shall be considered a stable kernel ABI
+ * as they can change at any time. See xfs_trace.h for documentation of
+ * specific units found in tracepoint output.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM xfs_scrub
+
+#if !defined(_TRACE_XFS_SCRUB_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_XFS_SCRUB_TRACE_H
+
+#include <linux/tracepoint.h>
+#include "xfs_bit.h"
+
+/*
+ * ftrace's __print_symbolic requires that all enum values be wrapped in the
+ * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
+ * ring buffer. Somehow this was only worth mentioning in the ftrace sample
+ * code.
+ */
+TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
+
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGFL);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGI);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BNOBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_CNTBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_INOBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FINOBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RMAPBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_REFCNTBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_INODE);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTD);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTC);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIR);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_XATTR);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SYMLINK);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PARENT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTBITMAP);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTSUM);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
+
+#define XFS_SCRUB_TYPE_STRINGS \
+ { XFS_SCRUB_TYPE_PROBE, "probe" }, \
+ { XFS_SCRUB_TYPE_SB, "sb" }, \
+ { XFS_SCRUB_TYPE_AGF, "agf" }, \
+ { XFS_SCRUB_TYPE_AGFL, "agfl" }, \
+ { XFS_SCRUB_TYPE_AGI, "agi" }, \
+ { XFS_SCRUB_TYPE_BNOBT, "bnobt" }, \
+ { XFS_SCRUB_TYPE_CNTBT, "cntbt" }, \
+ { XFS_SCRUB_TYPE_INOBT, "inobt" }, \
+ { XFS_SCRUB_TYPE_FINOBT, "finobt" }, \
+ { XFS_SCRUB_TYPE_RMAPBT, "rmapbt" }, \
+ { XFS_SCRUB_TYPE_REFCNTBT, "refcountbt" }, \
+ { XFS_SCRUB_TYPE_INODE, "inode" }, \
+ { XFS_SCRUB_TYPE_BMBTD, "bmapbtd" }, \
+ { XFS_SCRUB_TYPE_BMBTA, "bmapbta" }, \
+ { XFS_SCRUB_TYPE_BMBTC, "bmapbtc" }, \
+ { XFS_SCRUB_TYPE_DIR, "directory" }, \
+ { XFS_SCRUB_TYPE_XATTR, "xattr" }, \
+ { XFS_SCRUB_TYPE_SYMLINK, "symlink" }, \
+ { XFS_SCRUB_TYPE_PARENT, "parent" }, \
+ { XFS_SCRUB_TYPE_RTBITMAP, "rtbitmap" }, \
+ { XFS_SCRUB_TYPE_RTSUM, "rtsummary" }, \
+ { XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \
+ { XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \
+ { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \
+ { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }
+
+#define XFS_SCRUB_FLAG_STRINGS \
+ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \
+ { XFS_SCRUB_OFLAG_CORRUPT, "corrupt" }, \
+ { XFS_SCRUB_OFLAG_PREEN, "preen" }, \
+ { XFS_SCRUB_OFLAG_XFAIL, "xfail" }, \
+ { XFS_SCRUB_OFLAG_XCORRUPT, "xcorrupt" }, \
+ { XFS_SCRUB_OFLAG_INCOMPLETE, "incomplete" }, \
+ { XFS_SCRUB_OFLAG_WARNING, "warning" }, \
+ { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" }
+
+DECLARE_EVENT_CLASS(xchk_class,
+ TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
+ int error),
+ TP_ARGS(ip, sm, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, inum)
+ __field(unsigned int, gen)
+ __field(unsigned int, flags)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->type = sm->sm_type;
+ __entry->agno = sm->sm_agno;
+ __entry->inum = sm->sm_ino;
+ __entry->gen = sm->sm_gen;
+ __entry->flags = sm->sm_flags;
+ __entry->error = error;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx type %s agno 0x%x inum 0x%llx gen 0x%x flags (%s) error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __entry->agno,
+ __entry->inum,
+ __entry->gen,
+ __print_flags(__entry->flags, "|", XFS_SCRUB_FLAG_STRINGS),
+ __entry->error)
+)
+#define DEFINE_SCRUB_EVENT(name) \
+DEFINE_EVENT(xchk_class, name, \
+ TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, \
+ int error), \
+ TP_ARGS(ip, sm, error))
+
+DEFINE_SCRUB_EVENT(xchk_start);
+DEFINE_SCRUB_EVENT(xchk_done);
+DEFINE_SCRUB_EVENT(xchk_deadlock_retry);
+DEFINE_SCRUB_EVENT(xrep_attempt);
+DEFINE_SCRUB_EVENT(xrep_done);
+
+TRACE_EVENT(xchk_op_error,
+ TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno,
+ xfs_agblock_t bno, int error, void *ret_ip),
+ TP_ARGS(sc, agno, bno, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->agno = agno;
+ __entry->bno = bno;
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %s agno 0x%x agbno 0x%x error %d ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __entry->agno,
+ __entry->bno,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xchk_file_op_error,
+ TP_PROTO(struct xfs_scrub *sc, int whichfork,
+ xfs_fileoff_t offset, int error, void *ret_ip),
+ TP_ARGS(sc, whichfork, offset, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(unsigned int, type)
+ __field(xfs_fileoff_t, offset)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->ip->i_mount->m_super->s_dev;
+ __entry->ino = sc->ip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->type = sc->sm->sm_type;
+ __entry->offset = offset;
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s fileoff 0x%llx error %d ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __entry->offset,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+DECLARE_EVENT_CLASS(xchk_block_error_class,
+ TP_PROTO(struct xfs_scrub *sc, xfs_daddr_t daddr, void *ret_ip),
+ TP_ARGS(sc, daddr, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->agno = xfs_daddr_to_agno(sc->mp, daddr);
+ __entry->agbno = xfs_daddr_to_agbno(sc->mp, daddr);
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %s agno 0x%x agbno 0x%x ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __entry->agno,
+ __entry->agbno,
+ __entry->ret_ip)
+)
+
+#define DEFINE_SCRUB_BLOCK_ERROR_EVENT(name) \
+DEFINE_EVENT(xchk_block_error_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, xfs_daddr_t daddr, \
+ void *ret_ip), \
+ TP_ARGS(sc, daddr, ret_ip))
+
+DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_fs_error);
+DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_error);
+DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_preen);
+
+DECLARE_EVENT_CLASS(xchk_ino_error_class,
+ TP_PROTO(struct xfs_scrub *sc, xfs_ino_t ino, void *ret_ip),
+ TP_ARGS(sc, ino, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, type)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->type = sc->sm->sm_type;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx type %s ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __entry->ret_ip)
+)
+
+#define DEFINE_SCRUB_INO_ERROR_EVENT(name) \
+DEFINE_EVENT(xchk_ino_error_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, xfs_ino_t ino, \
+ void *ret_ip), \
+ TP_ARGS(sc, ino, ret_ip))
+
+DEFINE_SCRUB_INO_ERROR_EVENT(xchk_ino_error);
+DEFINE_SCRUB_INO_ERROR_EVENT(xchk_ino_preen);
+DEFINE_SCRUB_INO_ERROR_EVENT(xchk_ino_warning);
+
+DECLARE_EVENT_CLASS(xchk_fblock_error_class,
+ TP_PROTO(struct xfs_scrub *sc, int whichfork,
+ xfs_fileoff_t offset, void *ret_ip),
+ TP_ARGS(sc, whichfork, offset, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(unsigned int, type)
+ __field(xfs_fileoff_t, offset)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->ip->i_mount->m_super->s_dev;
+ __entry->ino = sc->ip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->type = sc->sm->sm_type;
+ __entry->offset = offset;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s fileoff 0x%llx ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __entry->offset,
+ __entry->ret_ip)
+);
+
+#define DEFINE_SCRUB_FBLOCK_ERROR_EVENT(name) \
+DEFINE_EVENT(xchk_fblock_error_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, int whichfork, \
+ xfs_fileoff_t offset, void *ret_ip), \
+ TP_ARGS(sc, whichfork, offset, ret_ip))
+
+DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error);
+DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning);
+
+TRACE_EVENT(xchk_incomplete,
+ TP_PROTO(struct xfs_scrub *sc, void *ret_ip),
+ TP_ARGS(sc, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %s ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xchk_btree_op_error,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
+ int level, int error, void *ret_ip),
+ TP_ARGS(sc, cur, level, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(int, level)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, ptr)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
+
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->ptr = cur->bc_levels[level].ptr;
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->level,
+ __entry->ptr,
+ __entry->agno,
+ __entry->bno,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xchk_ifork_btree_op_error,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
+ int level, int error, void *ret_ip),
+ TP_ARGS(sc, cur, level, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(unsigned int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(int, level)
+ __field(int, ptr)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->ip->i_ino;
+ __entry->whichfork = cur->bc_ino.whichfork;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->ptr = cur->bc_levels[level].ptr;
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->level,
+ __entry->ptr,
+ __entry->agno,
+ __entry->bno,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xchk_btree_error,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
+ int level, void *ret_ip),
+ TP_ARGS(sc, cur, level, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(int, level)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, ptr)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->ptr = cur->bc_levels[level].ptr;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->level,
+ __entry->ptr,
+ __entry->agno,
+ __entry->bno,
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xchk_ifork_btree_error,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
+ int level, void *ret_ip),
+ TP_ARGS(sc, cur, level, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(unsigned int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(int, level)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, ptr)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->ip->i_ino;
+ __entry->whichfork = cur->bc_ino.whichfork;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->ptr = cur->bc_levels[level].ptr;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->level,
+ __entry->ptr,
+ __entry->agno,
+ __entry->bno,
+ __entry->ret_ip)
+);
+
+DECLARE_EVENT_CLASS(xchk_sbtree_class,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
+ int level),
+ TP_ARGS(sc, cur, level),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, level)
+ __field(int, nlevels)
+ __field(int, ptr)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
+
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->level = level;
+ __entry->nlevels = cur->bc_nlevels;
+ __entry->ptr = cur->bc_levels[level].ptr;
+ ),
+ TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->agno,
+ __entry->bno,
+ __entry->level,
+ __entry->nlevels,
+ __entry->ptr)
+)
+#define DEFINE_SCRUB_SBTREE_EVENT(name) \
+DEFINE_EVENT(xchk_sbtree_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_btree_cur *cur, \
+ int level), \
+ TP_ARGS(sc, cur, level))
+
+DEFINE_SCRUB_SBTREE_EVENT(xchk_btree_rec);
+DEFINE_SCRUB_SBTREE_EVENT(xchk_btree_key);
+
+TRACE_EVENT(xchk_xref_error,
+ TP_PROTO(struct xfs_scrub *sc, int error, void *ret_ip),
+ TP_ARGS(sc, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, type)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %s xref error %d ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __entry->error,
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xchk_iallocbt_check_cluster,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t startino, xfs_daddr_t map_daddr,
+ unsigned short map_len, unsigned int chunk_ino,
+ unsigned int nr_inodes, uint16_t cluster_mask,
+ uint16_t holemask, unsigned int cluster_ino),
+ TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes,
+ cluster_mask, holemask, cluster_ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(xfs_daddr_t, map_daddr)
+ __field(unsigned short, map_len)
+ __field(unsigned int, chunk_ino)
+ __field(unsigned int, nr_inodes)
+ __field(unsigned int, cluster_ino)
+ __field(uint16_t, cluster_mask)
+ __field(uint16_t, holemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = startino;
+ __entry->map_daddr = map_daddr;
+ __entry->map_len = map_len;
+ __entry->chunk_ino = chunk_ino;
+ __entry->nr_inodes = nr_inodes;
+ __entry->cluster_mask = cluster_mask;
+ __entry->holemask = holemask;
+ __entry->cluster_ino = cluster_ino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x startino 0x%x daddr 0x%llx bbcount 0x%x chunkino 0x%x nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startino,
+ __entry->map_daddr,
+ __entry->map_len,
+ __entry->chunk_ino,
+ __entry->nr_inodes,
+ __entry->cluster_mask,
+ __entry->holemask,
+ __entry->cluster_ino)
+)
+
+TRACE_EVENT(xchk_fscounters_calc,
+ TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree,
+ uint64_t fdblocks, uint64_t delalloc),
+ TP_ARGS(mp, icount, ifree, fdblocks, delalloc),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int64_t, icount_sb)
+ __field(uint64_t, icount_calculated)
+ __field(int64_t, ifree_sb)
+ __field(uint64_t, ifree_calculated)
+ __field(int64_t, fdblocks_sb)
+ __field(uint64_t, fdblocks_calculated)
+ __field(uint64_t, delalloc)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->icount_sb = mp->m_sb.sb_icount;
+ __entry->icount_calculated = icount;
+ __entry->ifree_sb = mp->m_sb.sb_ifree;
+ __entry->ifree_calculated = ifree;
+ __entry->fdblocks_sb = mp->m_sb.sb_fdblocks;
+ __entry->fdblocks_calculated = fdblocks;
+ __entry->delalloc = delalloc;
+ ),
+ TP_printk("dev %d:%d icount %lld:%llu ifree %lld::%llu fdblocks %lld::%llu delalloc %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->icount_sb,
+ __entry->icount_calculated,
+ __entry->ifree_sb,
+ __entry->ifree_calculated,
+ __entry->fdblocks_sb,
+ __entry->fdblocks_calculated,
+ __entry->delalloc)
+)
+
+TRACE_EVENT(xchk_fscounters_within_range,
+ TP_PROTO(struct xfs_mount *mp, uint64_t expected, int64_t curr_value,
+ int64_t old_value),
+ TP_ARGS(mp, expected, curr_value, old_value),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint64_t, expected)
+ __field(int64_t, curr_value)
+ __field(int64_t, old_value)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->expected = expected;
+ __entry->curr_value = curr_value;
+ __entry->old_value = old_value;
+ ),
+ TP_printk("dev %d:%d expected %llu curr_value %lld old_value %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->expected,
+ __entry->curr_value,
+ __entry->old_value)
+)
+
+/* repair tracepoints */
+#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
+
+DECLARE_EVENT_CLASS(xrep_extent_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, agbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len)
+);
+#define DEFINE_REPAIR_EXTENT_EVENT(name) \
+DEFINE_EVENT(xrep_extent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len), \
+ TP_ARGS(mp, agno, agbno, len))
+DEFINE_REPAIR_EXTENT_EVENT(xrep_dispose_btree_extent);
+DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
+
+DECLARE_EVENT_CLASS(xrep_rmap_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len,
+ uint64_t owner, uint64_t offset, unsigned int flags),
+ TP_ARGS(mp, agno, agbno, len, owner, offset, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->owner = owner;
+ __entry->offset = offset;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+#define DEFINE_REPAIR_RMAP_EVENT(name) \
+DEFINE_EVENT(xrep_rmap_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len, \
+ uint64_t owner, uint64_t offset, unsigned int flags), \
+ TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
+DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn);
+
+TRACE_EVENT(xrep_refcount_extent_fn,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_refcount_irec *irec),
+ TP_ARGS(mp, agno, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, startblock)
+ __field(xfs_extlen_t, blockcount)
+ __field(xfs_nlink_t, refcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startblock = irec->rc_startblock;
+ __entry->blockcount = irec->rc_blockcount;
+ __entry->refcount = irec->rc_refcount;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startblock,
+ __entry->blockcount,
+ __entry->refcount)
+)
+
+TRACE_EVENT(xrep_init_btblock,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+ xfs_btnum_t btnum),
+ TP_ARGS(mp, agno, agbno, btnum),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(uint32_t, btnum)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->btnum = btnum;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x btree %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS))
+)
+TRACE_EVENT(xrep_findroot_block,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+ uint32_t magic, uint16_t level),
+ TP_ARGS(mp, agno, agbno, magic, level),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(uint32_t, magic)
+ __field(uint16_t, level)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->magic = magic;
+ __entry->level = level;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x magic 0x%x level %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->magic,
+ __entry->level)
+)
+TRACE_EVENT(xrep_calc_ag_resblks,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t icount, xfs_agblock_t aglen, xfs_agblock_t freelen,
+ xfs_agblock_t usedlen),
+ TP_ARGS(mp, agno, icount, aglen, freelen, usedlen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, icount)
+ __field(xfs_agblock_t, aglen)
+ __field(xfs_agblock_t, freelen)
+ __field(xfs_agblock_t, usedlen)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->icount = icount;
+ __entry->aglen = aglen;
+ __entry->freelen = freelen;
+ __entry->usedlen = usedlen;
+ ),
+ TP_printk("dev %d:%d agno 0x%x icount %u aglen %u freelen %u usedlen %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->icount,
+ __entry->aglen,
+ __entry->freelen,
+ __entry->usedlen)
+)
+TRACE_EVENT(xrep_calc_ag_resblks_btsize,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t bnobt_sz, xfs_agblock_t inobt_sz,
+ xfs_agblock_t rmapbt_sz, xfs_agblock_t refcbt_sz),
+ TP_ARGS(mp, agno, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bnobt_sz)
+ __field(xfs_agblock_t, inobt_sz)
+ __field(xfs_agblock_t, rmapbt_sz)
+ __field(xfs_agblock_t, refcbt_sz)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->bnobt_sz = bnobt_sz;
+ __entry->inobt_sz = inobt_sz;
+ __entry->rmapbt_sz = rmapbt_sz;
+ __entry->refcbt_sz = refcbt_sz;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bnobt %u inobt %u rmapbt %u refcountbt %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bnobt_sz,
+ __entry->inobt_sz,
+ __entry->rmapbt_sz,
+ __entry->refcbt_sz)
+)
+TRACE_EVENT(xrep_reset_counters,
+ TP_PROTO(struct xfs_mount *mp),
+ TP_ARGS(mp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ ),
+ TP_printk("dev %d:%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev))
+)
+
+TRACE_EVENT(xrep_ialloc_insert,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t startino, uint16_t holemask, uint8_t count,
+ uint8_t freecount, uint64_t freemask),
+ TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(uint16_t, holemask)
+ __field(uint8_t, count)
+ __field(uint8_t, freecount)
+ __field(uint64_t, freemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = startino;
+ __entry->holemask = holemask;
+ __entry->count = count;
+ __entry->freecount = freecount;
+ __entry->freemask = freemask;
+ ),
+ TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startino,
+ __entry->holemask,
+ __entry->count,
+ __entry->freecount,
+ __entry->freemask)
+)
+
+#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
+
+#endif /* _TRACE_XFS_SCRUB_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE scrub/trace
+#include <trace/define_trace.h>
diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h
new file mode 100644
index 000000000..2ceae614a
--- /dev/null
+++ b/fs/xfs/scrub/xfs_scrub.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_H__
+#define __XFS_SCRUB_H__
+
+#ifndef CONFIG_XFS_ONLINE_SCRUB
+# define xfs_scrub_metadata(file, sm) (-ENOTTY)
+#else
+int xfs_scrub_metadata(struct file *file, struct xfs_scrub_metadata *sm);
+#endif /* CONFIG_XFS_ONLINE_SCRUB */
+
+#endif /* __XFS_SCRUB_H__ */
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
new file mode 100644
index 000000000..f6ffb4f24
--- /dev/null
+++ b/fs/xfs/xfs.h
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_H__
+#define __XFS_H__
+
+#ifdef CONFIG_XFS_DEBUG
+#define DEBUG 1
+#endif
+
+#ifdef CONFIG_XFS_ASSERT_FATAL
+#define XFS_ASSERT_FATAL 1
+#endif
+
+#ifdef CONFIG_XFS_WARN
+#define XFS_WARN 1
+#endif
+
+
+#include "xfs_linux.h"
+
+#endif /* __XFS_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
new file mode 100644
index 000000000..b744c6205
--- /dev/null
+++ b/fs/xfs/xfs_acl.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2008, Christoph Hellwig
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+#include "xfs_acl.h"
+#include "xfs_trans.h"
+#include "xfs_xattr.h"
+
+#include <linux/posix_acl_xattr.h>
+
+/*
+ * Locking scheme:
+ * - all ACL updates are protected by inode->i_mutex, which is taken before
+ * calling into this file.
+ */
+
+STATIC struct posix_acl *
+xfs_acl_from_disk(
+ struct xfs_mount *mp,
+ const struct xfs_acl *aclp,
+ int len,
+ int max_entries)
+{
+ struct posix_acl_entry *acl_e;
+ struct posix_acl *acl;
+ const struct xfs_acl_entry *ace;
+ unsigned int count, i;
+
+ if (len < sizeof(*aclp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp,
+ len);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+
+ count = be32_to_cpu(aclp->acl_cnt);
+ if (count > max_entries || XFS_ACL_SIZE(count) != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp,
+ len);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+
+ acl = posix_acl_alloc(count, GFP_KERNEL);
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < count; i++) {
+ acl_e = &acl->a_entries[i];
+ ace = &aclp->acl_entry[i];
+
+ /*
+ * The tag is 32 bits on disk and 16 bits in core.
+ *
+ * Because every access to it goes through the core
+ * format first this is not a problem.
+ */
+ acl_e->e_tag = be32_to_cpu(ace->ae_tag);
+ acl_e->e_perm = be16_to_cpu(ace->ae_perm);
+
+ switch (acl_e->e_tag) {
+ case ACL_USER:
+ acl_e->e_uid = make_kuid(&init_user_ns,
+ be32_to_cpu(ace->ae_id));
+ break;
+ case ACL_GROUP:
+ acl_e->e_gid = make_kgid(&init_user_ns,
+ be32_to_cpu(ace->ae_id));
+ break;
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ break;
+ default:
+ goto fail;
+ }
+ }
+ return acl;
+
+fail:
+ posix_acl_release(acl);
+ return ERR_PTR(-EINVAL);
+}
+
+STATIC void
+xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
+{
+ const struct posix_acl_entry *acl_e;
+ struct xfs_acl_entry *ace;
+ int i;
+
+ aclp->acl_cnt = cpu_to_be32(acl->a_count);
+ for (i = 0; i < acl->a_count; i++) {
+ ace = &aclp->acl_entry[i];
+ acl_e = &acl->a_entries[i];
+
+ ace->ae_tag = cpu_to_be32(acl_e->e_tag);
+ switch (acl_e->e_tag) {
+ case ACL_USER:
+ ace->ae_id = cpu_to_be32(
+ from_kuid(&init_user_ns, acl_e->e_uid));
+ break;
+ case ACL_GROUP:
+ ace->ae_id = cpu_to_be32(
+ from_kgid(&init_user_ns, acl_e->e_gid));
+ break;
+ default:
+ ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
+ break;
+ }
+
+ ace->ae_perm = cpu_to_be16(acl_e->e_perm);
+ }
+}
+
+struct posix_acl *
+xfs_get_acl(struct inode *inode, int type, bool rcu)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct posix_acl *acl = NULL;
+ struct xfs_da_args args = {
+ .dp = ip,
+ .attr_filter = XFS_ATTR_ROOT,
+ .valuelen = XFS_ACL_MAX_SIZE(mp),
+ };
+ int error;
+
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
+ trace_xfs_get_acl(ip);
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ args.name = SGI_ACL_FILE;
+ break;
+ case ACL_TYPE_DEFAULT:
+ args.name = SGI_ACL_DEFAULT;
+ break;
+ default:
+ BUG();
+ }
+ args.namelen = strlen(args.name);
+
+ /*
+	 * If the attribute doesn't exist, make sure we have a negative cache
+	 * entry; for any other error assume it is transient.
+ */
+ error = xfs_attr_get(&args);
+ if (!error) {
+ acl = xfs_acl_from_disk(mp, args.value, args.valuelen,
+ XFS_ACL_MAX_ENTRIES(mp));
+ } else if (error != -ENOATTR) {
+ acl = ERR_PTR(error);
+ }
+
+ kmem_free(args.value);
+ return acl;
+}
+
+int
+__xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_da_args args = {
+ .dp = ip,
+ .attr_filter = XFS_ATTR_ROOT,
+ };
+ int error;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ args.name = SGI_ACL_FILE;
+ break;
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ args.name = SGI_ACL_DEFAULT;
+ break;
+ default:
+ return -EINVAL;
+ }
+ args.namelen = strlen(args.name);
+
+ if (acl) {
+ args.valuelen = XFS_ACL_SIZE(acl->a_count);
+ args.value = kvzalloc(args.valuelen, GFP_KERNEL);
+ if (!args.value)
+ return -ENOMEM;
+ xfs_acl_to_disk(args.value, acl);
+ }
+
+ error = xfs_attr_change(&args);
+ kmem_free(args.value);
+
+ /*
+ * If the attribute didn't exist to start with that's fine.
+ */
+ if (!acl && error == -ENOATTR)
+ error = 0;
+ if (!error)
+ set_cached_acl(inode, type, acl);
+ return error;
+}
+
+static int
+xfs_acl_set_mode(
+ struct inode *inode,
+ umode_t mode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ inode->i_mode = mode;
+ inode->i_ctime = current_time(inode);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ if (xfs_has_wsync(mp))
+ xfs_trans_set_sync(tp);
+ return xfs_trans_commit(tp);
+}
+
+int
+xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type)
+{
+ umode_t mode;
+ bool set_mode = false;
+ int error = 0;
+
+ if (!acl)
+ goto set_acl;
+
+ error = -E2BIG;
+ if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
+ return error;
+
+ if (type == ACL_TYPE_ACCESS) {
+ error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl);
+ if (error)
+ return error;
+ set_mode = true;
+ }
+
+ set_acl:
+ /*
+ * We set the mode after successfully updating the ACL xattr because the
+ * xattr update can fail at ENOSPC and we don't want to change the mode
+ * if the ACL update hasn't been applied.
+ */
+ error = __xfs_set_acl(inode, acl, type);
+ if (!error && set_mode && mode != inode->i_mode)
+ error = xfs_acl_set_mode(inode, mode);
+ return error;
+}
+
+/*
+ * Invalidate any cached ACLs if the user has bypassed the ACL interface.
+ * We don't validate the content at all, so it is the caller's responsibility
+ * to provide data in a valid format and ensure i_mode is consistent.
+ */
+void
+xfs_forget_acl(
+ struct inode *inode,
+ const char *name)
+{
+ if (!strcmp(name, SGI_ACL_FILE))
+ forget_cached_acl(inode, ACL_TYPE_ACCESS);
+ else if (!strcmp(name, SGI_ACL_DEFAULT))
+ forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+}
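
/*
 * Illustrative aside, not part of the patch above: a minimal, hedged
 * userspace sketch of the endian-conversion pattern used by
 * xfs_acl_from_disk() and xfs_acl_to_disk(), where the on-disk ACL entry is
 * big-endian with a 32-bit tag and the in-core entry is host-endian with a
 * 16-bit tag.  The struct and helper names here are hypothetical; the kernel
 * uses be32_to_cpu()/cpu_to_be32() where this sketch uses the glibc
 * be32toh()/htobe32() helpers from <endian.h>.
 */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct disk_acl_entry {			/* big-endian, as stored in the xattr */
	uint32_t	ae_tag;
	uint32_t	ae_id;
	uint16_t	ae_perm;
};

struct core_acl_entry {			/* host-endian, as used in memory */
	uint16_t	e_tag;		/* 16 bits in core, 32 bits on disk */
	uint16_t	e_perm;
	uint32_t	e_id;
};

static void acl_entry_from_disk(struct core_acl_entry *dst,
				const struct disk_acl_entry *src)
{
	dst->e_tag = (uint16_t)be32toh(src->ae_tag);
	dst->e_perm = be16toh(src->ae_perm);
	dst->e_id = be32toh(src->ae_id);
}

static void acl_entry_to_disk(struct disk_acl_entry *dst,
			      const struct core_acl_entry *src)
{
	dst->ae_tag = htobe32(src->e_tag);
	dst->ae_perm = htobe16(src->e_perm);
	dst->ae_id = htobe32(src->e_id);
}

int main(void)
{
	struct core_acl_entry in = { .e_tag = 2, .e_perm = 6, .e_id = 1000 };
	struct disk_acl_entry disk;
	struct core_acl_entry out;

	acl_entry_to_disk(&disk, &in);
	acl_entry_from_disk(&out, &disk);
	printf("round trip: tag=%u perm=%u id=%u\n", (unsigned)out.e_tag,
	       (unsigned)out.e_perm, (unsigned)out.e_id);
	return 0;
}
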
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
new file mode 100644
index 000000000..263404d0b
--- /dev/null
+++ b/fs/xfs/xfs_acl.h
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2001-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ACL_H__
+#define __XFS_ACL_H__
+
+struct inode;
+struct posix_acl;
+
+#ifdef CONFIG_XFS_POSIX_ACL
+extern struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu);
+extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type);
+extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+void xfs_forget_acl(struct inode *inode, const char *name);
+#else
+#define xfs_get_acl NULL
+#define xfs_set_acl NULL
+static inline int __xfs_set_acl(struct inode *inode, struct posix_acl *acl,
+ int type)
+{
+ return 0;
+}
+static inline void xfs_forget_acl(struct inode *inode, const char *name)
+{
+}
+#endif /* CONFIG_XFS_POSIX_ACL */
+
+#endif /* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
new file mode 100644
index 000000000..5d1a995b1
--- /dev/null
+++ b/fs/xfs/xfs_aops.c
@@ -0,0 +1,584 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2016-2018 Christoph Hellwig.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_iomap.h"
+#include "xfs_trace.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_reflink.h"
+
+struct xfs_writepage_ctx {
+ struct iomap_writepage_ctx ctx;
+ unsigned int data_seq;
+ unsigned int cow_seq;
+};
+
+static inline struct xfs_writepage_ctx *
+XFS_WPC(struct iomap_writepage_ctx *ctx)
+{
+ return container_of(ctx, struct xfs_writepage_ctx, ctx);
+}
+
+/*
+ * Fast and loose check if this write could update the on-disk inode size.
+ */
+static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
+{
+ return ioend->io_offset + ioend->io_size >
+ XFS_I(ioend->io_inode)->i_disk_size;
+}
+
+/*
+ * Update on-disk file size now that data has been written to disk.
+ */
+int
+xfs_setfilesize(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ size_t size)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ xfs_fsize_t isize;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ isize = xfs_new_eof(ip, offset + size);
+ if (!isize) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_cancel(tp);
+ return 0;
+ }
+
+ trace_xfs_setfilesize(ip, offset, size);
+
+ ip->i_disk_size = isize;
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ return xfs_trans_commit(tp);
+}
+
+/*
+ * IO write completion.
+ */
+STATIC void
+xfs_end_ioend(
+ struct iomap_ioend *ioend)
+{
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_off_t offset = ioend->io_offset;
+ size_t size = ioend->io_size;
+ unsigned int nofs_flag;
+ int error;
+
+ /*
+ * We can allocate memory here while doing writeback on behalf of
+ * memory reclaim. To avoid memory allocation deadlocks set the
+ * task-wide nofs context for the following operations.
+ */
+ nofs_flag = memalloc_nofs_save();
+
+ /*
+ * Just clean up the in-memory structures if the fs has been shut down.
+ */
+ if (xfs_is_shutdown(mp)) {
+ error = -EIO;
+ goto done;
+ }
+
+ /*
+ * Clean up all COW blocks and underlying data fork delalloc blocks on
+ * I/O error. The delalloc punch is required because this ioend was
+ * mapped to blocks in the COW fork and the associated pages are no
+ * longer dirty. If we don't remove delalloc blocks here, they become
+ * stale and can corrupt free space accounting on unmount.
+ */
+ error = blk_status_to_errno(ioend->io_bio->bi_status);
+ if (unlikely(error)) {
+ if (ioend->io_flags & IOMAP_F_SHARED) {
+ xfs_reflink_cancel_cow_range(ip, offset, size, true);
+ xfs_bmap_punch_delalloc_range(ip,
+ XFS_B_TO_FSBT(mp, offset),
+ XFS_B_TO_FSB(mp, size));
+ }
+ goto done;
+ }
+
+ /*
+ * Success: commit the COW or unwritten blocks if needed.
+ */
+ if (ioend->io_flags & IOMAP_F_SHARED)
+ error = xfs_reflink_end_cow(ip, offset, size);
+ else if (ioend->io_type == IOMAP_UNWRITTEN)
+ error = xfs_iomap_write_unwritten(ip, offset, size, false);
+
+ if (!error && xfs_ioend_is_append(ioend))
+ error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+done:
+ iomap_finish_ioends(ioend, error);
+ memalloc_nofs_restore(nofs_flag);
+}
+
+/*
+ * Finish all pending IO completions that require transactional modifications.
+ *
+ * We try to merge physically and logically contiguous ioends before completion
+ * to minimise the number of transactions we need to perform during IO
+ * completion.
+ * Both unwritten extent conversion and COW remapping need to iterate and modify
+ * one physical extent at a time, so we gain nothing by merging physically
+ * discontiguous extents here.
+ *
+ * The ioend chain that we can be processing here is largely unbound in length
+ * and we may have to perform significant amounts of work on each ioend
+ * to complete it. Hence we have to be careful about holding the CPU for too
+ * long in this loop.
+ */
+void
+xfs_end_io(
+ struct work_struct *work)
+{
+ struct xfs_inode *ip =
+ container_of(work, struct xfs_inode, i_ioend_work);
+ struct iomap_ioend *ioend;
+ struct list_head tmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ip->i_ioend_lock, flags);
+ list_replace_init(&ip->i_ioend_list, &tmp);
+ spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
+
+ iomap_sort_ioends(&tmp);
+ while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
+ io_list))) {
+ list_del_init(&ioend->io_list);
+ iomap_ioend_try_merge(ioend, &tmp);
+ xfs_end_ioend(ioend);
+ cond_resched();
+ }
+}
+
+STATIC void
+xfs_end_bio(
+ struct bio *bio)
+{
+ struct iomap_ioend *ioend = bio->bi_private;
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ unsigned long flags;
+
+ spin_lock_irqsave(&ip->i_ioend_lock, flags);
+ if (list_empty(&ip->i_ioend_list))
+ WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
+ &ip->i_ioend_work));
+ list_add_tail(&ioend->io_list, &ip->i_ioend_list);
+ spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
+}
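
/*
 * Illustrative aside, not part of the patch: xfs_end_bio() above queues the
 * per-inode completion worker only when the ioend list transitions from
 * empty to non-empty, so a single worker run drains everything added since.
 * This is a minimal, hedged userspace sketch of that decision under an
 * assumed mutex/counter model; the kernel uses a spinlock and a workqueue,
 * and all names here are hypothetical.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct ioend {
	struct ioend		*next;
};

struct inode_ctx {
	pthread_mutex_t		lock;
	struct ioend		*pending;
	int			worker_scheduled;	/* stand-in for queue_work() */
};

static void io_completed(struct inode_ctx *ctx, struct ioend *io)
{
	bool was_empty;

	pthread_mutex_lock(&ctx->lock);
	was_empty = (ctx->pending == NULL);
	io->next = ctx->pending;
	ctx->pending = io;
	if (was_empty)
		ctx->worker_scheduled++;	/* only the first completion kicks the worker */
	pthread_mutex_unlock(&ctx->lock);
}

int main(void)
{
	struct inode_ctx ctx = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct ioend a, b, c;

	io_completed(&ctx, &a);
	io_completed(&ctx, &b);
	io_completed(&ctx, &c);
	printf("worker scheduled %d time(s) for 3 completions\n",
	       ctx.worker_scheduled);
	return 0;
}
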
+
+/*
+ * Fast revalidation of the cached writeback mapping. Return true if the current
+ * mapping is valid, false otherwise.
+ */
+static bool
+xfs_imap_valid(
+ struct iomap_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ loff_t offset)
+{
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length)
+ return false;
+ /*
+ * If this is a COW mapping, it is sufficient to check that the mapping
+ * covers the offset. Be careful to check this first because the caller
+ * can revalidate a COW mapping without updating the data seqno.
+ */
+ if (wpc->iomap.flags & IOMAP_F_SHARED)
+ return true;
+
+ /*
+ * This is not a COW mapping. Check the sequence number of the data fork
+ * because concurrent changes could have invalidated the extent. Check
+ * the COW fork because concurrent changes since the last time we
+ * checked (and found nothing at this offset) could have added
+ * overlapping blocks.
+ */
+ if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
+ return false;
+ if (xfs_inode_has_cow_data(ip) &&
+ XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+ return false;
+ return true;
+}
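
/*
 * Illustrative aside, not part of the patch: xfs_imap_valid() above trusts a
 * cached mapping only while the fork sequence number it was built from is
 * unchanged.  A minimal, hedged userspace sketch of that generation-number
 * idea follows; all names are hypothetical and the single seq counter stands
 * in for the separate data and COW fork sequence numbers.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct extent_source {
	uint32_t	seq;		/* bumped on every modification */
};

struct cached_map {
	uint64_t	offset;		/* start of the cached range */
	uint64_t	length;		/* length of the cached range */
	uint32_t	seq;		/* source->seq when the map was built */
};

static bool map_valid(const struct cached_map *map,
		      const struct extent_source *src, uint64_t offset)
{
	if (offset < map->offset || offset >= map->offset + map->length)
		return false;			/* offset not covered at all */
	return map->seq == src->seq;		/* stale if the source changed */
}

int main(void)
{
	struct extent_source src = { .seq = 1 };
	struct cached_map map = { .offset = 0, .length = 100, .seq = src.seq };

	printf("valid before change: %d\n", map_valid(&map, &src, 50));
	src.seq++;				/* source modified: cache is stale */
	printf("valid after change:  %d\n", map_valid(&map, &src, 50));
	return 0;
}
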
+
+/*
+ * Pass in a delalloc extent and convert it to real extents, return the real
+ * extent that maps offset_fsb in wpc->iomap.
+ *
+ * The current page is held locked so nothing could have removed the block
+ * backing offset_fsb, although it could have moved from the COW to the data
+ * fork by another thread.
+ */
+static int
+xfs_convert_blocks(
+ struct iomap_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ int whichfork,
+ loff_t offset)
+{
+ int error;
+ unsigned *seq;
+
+ if (whichfork == XFS_COW_FORK)
+ seq = &XFS_WPC(wpc)->cow_seq;
+ else
+ seq = &XFS_WPC(wpc)->data_seq;
+
+ /*
+ * Attempt to allocate whatever delalloc extent currently backs offset
+ * and put the result into wpc->iomap. Allocate in a loop because it
+ * may take several attempts to allocate real blocks for a contiguous
+ * delalloc extent if free space is sufficiently fragmented.
+ */
+ do {
+ error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
+ &wpc->iomap, seq);
+ if (error)
+ return error;
+ } while (wpc->iomap.offset + wpc->iomap.length <= offset);
+
+ return 0;
+}
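
/*
 * Illustrative aside, not part of the patch: xfs_convert_blocks() above
 * keeps converting until the returned mapping actually covers the target
 * offset, since a fragmented allocator may only convert part of the delalloc
 * extent per call.  A minimal, hedged userspace sketch of that loop follows;
 * the allocator is simulated and all names are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

struct map {
	uint64_t	offset;
	uint64_t	length;
};

/*
 * Simulated allocator: each call converts four more blocks of a delayed
 * extent that starts at block 0 and reports the converted range so far.
 */
static int convert_some(uint64_t offset, struct map *out)
{
	static uint64_t	converted;

	(void)offset;			/* a real allocator would use this */
	converted += 4;
	out->offset = 0;
	out->length = converted;
	return 0;
}

static int convert_until_covered(uint64_t offset, struct map *out)
{
	do {
		int error = convert_some(offset, out);

		if (error)
			return error;
	} while (out->offset + out->length <= offset);

	return 0;
}

int main(void)
{
	struct map	m;

	convert_until_covered(10, &m);
	printf("mapping [%llu, %llu) covers offset 10\n",
	       (unsigned long long)m.offset,
	       (unsigned long long)(m.offset + m.length));
	return 0;
}
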
+
+static int
+xfs_map_blocks(
+ struct iomap_writepage_ctx *wpc,
+ struct inode *inode,
+ loff_t offset)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ ssize_t count = i_blocksize(inode);
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
+ xfs_fileoff_t cow_fsb;
+ int whichfork;
+ struct xfs_bmbt_irec imap;
+ struct xfs_iext_cursor icur;
+ int retries = 0;
+ int error = 0;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ /*
+ * COW fork blocks can overlap data fork blocks even if the blocks
+	 * aren't shared. COW I/O always takes precedence, so we must always
+ * check for overlap on reflink inodes unless the mapping is already a
+ * COW one, or the COW fork hasn't changed from the last time we looked
+ * at it.
+ *
+ * It's safe to check the COW fork if_seq here without the ILOCK because
+ * we've indirectly protected against concurrent updates: writeback has
+ * the page locked, which prevents concurrent invalidations by reflink
+ * and directio and prevents concurrent buffered writes to the same
+ * page. Changes to if_seq always happen under i_lock, which protects
+ * against concurrent updates and provides a memory barrier on the way
+ * out that ensures that we always see the current value.
+ */
+ if (xfs_imap_valid(wpc, ip, offset))
+ return 0;
+
+ /*
+ * If we don't have a valid map, now it's time to get a new one for this
+ * offset. This will convert delayed allocations (including COW ones)
+ * into real extents. If we return without a valid map, it means we
+ * landed in a hole and we skip the block.
+ */
+retry:
+ cow_fsb = NULLFILEOFF;
+ whichfork = XFS_DATA_FORK;
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ ASSERT(!xfs_need_iread_extents(&ip->i_df));
+
+ /*
+	 * Check if this offset is covered by a COW extent, and if so use
+ * it directly instead of looking up anything in the data fork.
+ */
+ if (xfs_inode_has_cow_data(ip) &&
+ xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
+ cow_fsb = imap.br_startoff;
+ if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
+ XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ whichfork = XFS_COW_FORK;
+ goto allocate_blocks;
+ }
+
+ /*
+ * No COW extent overlap. Revalidate now that we may have updated
+ * ->cow_seq. If the data mapping is still valid, we're done.
+ */
+ if (xfs_imap_valid(wpc, ip, offset)) {
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return 0;
+ }
+
+ /*
+ * If we don't have a valid map, now it's time to get a new one for this
+ * offset. This will convert delayed allocations (including COW ones)
+ * into real extents.
+ */
+ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
+ imap.br_startoff = end_fsb; /* fake a hole past EOF */
+ XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ /* landed in a hole or beyond EOF? */
+ if (imap.br_startoff > offset_fsb) {
+ imap.br_blockcount = imap.br_startoff - offset_fsb;
+ imap.br_startoff = offset_fsb;
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+ }
+
+ /*
+ * Truncate to the next COW extent if there is one. This is the only
+ * opportunity to do this because we can skip COW fork lookups for the
+ * subsequent blocks in the mapping; however, the requirement to treat
+ * the COW range separately remains.
+ */
+ if (cow_fsb != NULLFILEOFF &&
+ cow_fsb < imap.br_startoff + imap.br_blockcount)
+ imap.br_blockcount = cow_fsb - imap.br_startoff;
+
+ /* got a delalloc extent? */
+ if (imap.br_startblock != HOLESTARTBLOCK &&
+ isnullstartblock(imap.br_startblock))
+ goto allocate_blocks;
+
+ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
+ trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
+ return 0;
+allocate_blocks:
+ error = xfs_convert_blocks(wpc, ip, whichfork, offset);
+ if (error) {
+ /*
+ * If we failed to find the extent in the COW fork we might have
+ * raced with a COW to data fork conversion or truncate.
+ * Restart the lookup to catch the extent in the data fork for
+ * the former case, but prevent additional retries to avoid
+ * looping forever for the latter case.
+ */
+ if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
+ goto retry;
+ ASSERT(error != -EAGAIN);
+ return error;
+ }
+
+ /*
+	 * Due to merging, the returned real extent might be larger than the
+	 * original delalloc one. Trim the returned extent to the next COW
+ * boundary again to force a re-lookup.
+ */
+ if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
+ loff_t cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
+
+ if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
+ wpc->iomap.length = cow_offset - wpc->iomap.offset;
+ }
+
+ ASSERT(wpc->iomap.offset <= offset);
+ ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
+ trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
+ return 0;
+}
+
+static int
+xfs_prepare_ioend(
+ struct iomap_ioend *ioend,
+ int status)
+{
+ unsigned int nofs_flag;
+
+ /*
+ * We can allocate memory here while doing writeback on behalf of
+ * memory reclaim. To avoid memory allocation deadlocks set the
+ * task-wide nofs context for the following operations.
+ */
+ nofs_flag = memalloc_nofs_save();
+
+ /* Convert CoW extents to regular */
+ if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
+ status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
+ ioend->io_offset, ioend->io_size);
+ }
+
+ memalloc_nofs_restore(nofs_flag);
+
+ /* send ioends that might require a transaction to the completion wq */
+ if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
+ (ioend->io_flags & IOMAP_F_SHARED))
+ ioend->io_bio->bi_end_io = xfs_end_bio;
+ return status;
+}
+
+/*
+ * If the page has delalloc blocks on it, we need to punch them out before we
+ * invalidate the page. If we don't, we leave a stale delalloc mapping on the
+ * inode that can trip up a later direct I/O read operation on the same region.
+ *
+ * We prevent this by truncating away the delalloc regions on the page. Because
+ * they are delalloc, we can do this without needing a transaction. Indeed, if
+ * we get ENOSPC errors, we have to be able to do this truncation without a
+ * transaction as there is no space left for block reservation (typically why we
+ * see an ENOSPC in writeback).
+ */
+static void
+xfs_discard_folio(
+ struct folio *folio,
+ loff_t pos)
+{
+ struct inode *inode = folio->mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ size_t offset = offset_in_folio(folio, pos);
+ xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos);
+ xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset);
+ int error;
+
+ if (xfs_is_shutdown(mp))
+ return;
+
+ xfs_alert_ratelimited(mp,
+ "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
+ folio, ip->i_ino, pos);
+
+ error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+ i_blocks_per_folio(inode, folio) - pageoff_fsb);
+ if (error && !xfs_is_shutdown(mp))
+ xfs_alert(mp, "page discard unable to remove delalloc mapping.");
+}
+
+static const struct iomap_writeback_ops xfs_writeback_ops = {
+ .map_blocks = xfs_map_blocks,
+ .prepare_ioend = xfs_prepare_ioend,
+ .discard_folio = xfs_discard_folio,
+};
+
+STATIC int
+xfs_vm_writepages(
+ struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct xfs_writepage_ctx wpc = { };
+
+ /*
+ * Writing back data in a transaction context can result in recursive
+ * transactions. This is bad, so issue a warning and get out of here.
+ */
+ if (WARN_ON_ONCE(current->journal_info))
+ return 0;
+
+ xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+ return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
+}
+
+STATIC int
+xfs_dax_writepages(
+ struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct xfs_inode *ip = XFS_I(mapping->host);
+
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+ return dax_writeback_mapping_range(mapping,
+ xfs_inode_buftarg(ip)->bt_daxdev, wbc);
+}
+
+STATIC sector_t
+xfs_vm_bmap(
+ struct address_space *mapping,
+ sector_t block)
+{
+ struct xfs_inode *ip = XFS_I(mapping->host);
+
+ trace_xfs_vm_bmap(ip);
+
+ /*
+ * The swap code (ab-)uses ->bmap to get a block mapping and then
+ * bypasses the file system for actual I/O. We really can't allow
+	 * that on reflink inodes, so we have to skip out here.  And yes,
+ * 0 is the magic code for a bmap error.
+ *
+ * Since we don't pass back blockdev info, we can't return bmap
+ * information for rt files either.
+ */
+ if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
+ return 0;
+ return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
+}
+
+STATIC int
+xfs_vm_read_folio(
+ struct file *unused,
+ struct folio *folio)
+{
+ return iomap_read_folio(folio, &xfs_read_iomap_ops);
+}
+
+STATIC void
+xfs_vm_readahead(
+ struct readahead_control *rac)
+{
+ iomap_readahead(rac, &xfs_read_iomap_ops);
+}
+
+static int
+xfs_iomap_swapfile_activate(
+ struct swap_info_struct *sis,
+ struct file *swap_file,
+ sector_t *span)
+{
+ sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
+ return iomap_swapfile_activate(sis, swap_file, span,
+ &xfs_read_iomap_ops);
+}
+
+const struct address_space_operations xfs_address_space_operations = {
+ .read_folio = xfs_vm_read_folio,
+ .readahead = xfs_vm_readahead,
+ .writepages = xfs_vm_writepages,
+ .dirty_folio = filemap_dirty_folio,
+ .release_folio = iomap_release_folio,
+ .invalidate_folio = iomap_invalidate_folio,
+ .bmap = xfs_vm_bmap,
+ .direct_IO = noop_direct_IO,
+ .migrate_folio = filemap_migrate_folio,
+ .is_partially_uptodate = iomap_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
+ .swap_activate = xfs_iomap_swapfile_activate,
+};
+
+const struct address_space_operations xfs_dax_aops = {
+ .writepages = xfs_dax_writepages,
+ .direct_IO = noop_direct_IO,
+ .dirty_folio = noop_dirty_folio,
+ .swap_activate = xfs_iomap_swapfile_activate,
+};
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
new file mode 100644
index 000000000..e0bd68419
--- /dev/null
+++ b/fs/xfs/xfs_aops.h
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2005-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_AOPS_H__
+#define __XFS_AOPS_H__
+
+extern const struct address_space_operations xfs_address_space_operations;
+extern const struct address_space_operations xfs_dax_aops;
+
+int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+
+#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
new file mode 100644
index 000000000..5db87b34f
--- /dev/null
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -0,0 +1,392 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_attr.h"
+#include "xfs_attr_remote.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_quota.h"
+#include "xfs_dir2.h"
+#include "xfs_error.h"
+
+/*
+ * Invalidate any incore buffers associated with this remote attribute value
+ * extent. We never log remote attribute value buffers, which means that they
+ * won't be attached to a transaction and are therefore safe to mark stale.
+ * The actual bunmapi will be taken care of later.
+ */
+STATIC int
+xfs_attr3_rmt_stale(
+ struct xfs_inode *dp,
+ xfs_dablk_t blkno,
+ int blkcnt)
+{
+ struct xfs_bmbt_irec map;
+ int nmap;
+ int error;
+
+ /*
+ * Roll through the "value", invalidating the attribute value's
+ * blocks.
+ */
+ while (blkcnt > 0) {
+ /*
+ * Try to remember where we decided to put the value.
+ */
+ nmap = 1;
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)blkno, blkcnt,
+ &map, &nmap, XFS_BMAPI_ATTRFORK);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(dp->i_mount, nmap != 1))
+ return -EFSCORRUPTED;
+
+ /*
+ * Mark any incore buffers for the remote value as stale. We
+ * never log remote attr value buffers, so the buffer should be
+ * easy to kill.
+ */
+ error = xfs_attr_rmtval_stale(dp, &map, 0);
+ if (error)
+ return error;
+
+ blkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
+ }
+
+ return 0;
+}
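
/*
 * Illustrative aside, not part of the patch: xfs_attr3_rmt_stale() above
 * walks a logical block range one mapping at a time and advances by however
 * many blocks each mapping covered.  A minimal, hedged userspace sketch of
 * that chunked walk follows; the mapper is simulated and all names are
 * hypothetical (the kernel uses xfs_bmapi_read() here).
 */
#include <stdint.h>
#include <stdio.h>

struct mapping {
	uint64_t	start;		/* first block covered */
	uint64_t	count;		/* blocks covered */
};

/* Simulated mapper: pretends every extent is at most three blocks long. */
static int map_range(uint64_t blkno, uint64_t blkcnt, struct mapping *m)
{
	m->start = blkno;
	m->count = blkcnt < 3 ? blkcnt : 3;
	return 0;
}

static int walk_blocks(uint64_t blkno, uint64_t blkcnt)
{
	while (blkcnt > 0) {
		struct mapping	m;
		int		error = map_range(blkno, blkcnt, &m);

		if (error)
			return error;
		printf("processing blocks [%llu, %llu)\n",
		       (unsigned long long)m.start,
		       (unsigned long long)(m.start + m.count));
		blkno += m.count;
		blkcnt -= m.count;
	}
	return 0;
}

int main(void)
{
	return walk_blocks(100, 8);
}
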
+
+/*
+ * Invalidate all of the "remote" value regions pointed to by a particular
+ * leaf block.
+ * Note that we must release the lock on the buffer so that we are not
+ * caught holding something that the logging code wants to flush to disk.
+ */
+STATIC int
+xfs_attr3_leaf_inactive(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ int error = 0;
+ int i;
+
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
+
+ /*
+ * Find the remote value extents for this leaf and invalidate their
+ * incore buffers.
+ */
+ entry = xfs_attr3_leaf_entryp(leaf);
+ for (i = 0; i < ichdr.count; entry++, i++) {
+ int blkcnt;
+
+ if (!entry->nameidx || (entry->flags & XFS_ATTR_LOCAL))
+ continue;
+
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+ if (!name_rmt->valueblk)
+ continue;
+
+ blkcnt = xfs_attr3_rmt_blocks(dp->i_mount,
+ be32_to_cpu(name_rmt->valuelen));
+ error = xfs_attr3_rmt_stale(dp,
+ be32_to_cpu(name_rmt->valueblk), blkcnt);
+ if (error)
+ goto err;
+ }
+
+ xfs_trans_brelse(*trans, bp);
+err:
+ return error;
+}
+
+/*
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+STATIC int
+xfs_attr3_node_inactive(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ struct xfs_buf *bp,
+ int level)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_blkinfo *info;
+ xfs_dablk_t child_fsb;
+ xfs_daddr_t parent_blkno, child_blkno;
+ struct xfs_buf *child_bp;
+ struct xfs_da3_icnode_hdr ichdr;
+ int error, i;
+
+ /*
+ * Since this code is recursive (gasp!) we must protect ourselves.
+ */
+ if (level > XFS_DA_NODE_MAXDEPTH) {
+ xfs_buf_mark_corrupt(bp);
+ xfs_trans_brelse(*trans, bp); /* no locks for later trans */
+ return -EFSCORRUPTED;
+ }
+
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &ichdr, bp->b_addr);
+ parent_blkno = xfs_buf_daddr(bp);
+ if (!ichdr.count) {
+ xfs_trans_brelse(*trans, bp);
+ return 0;
+ }
+ child_fsb = be32_to_cpu(ichdr.btree[0].before);
+ xfs_trans_brelse(*trans, bp); /* no locks for later trans */
+ bp = NULL;
+
+ /*
+ * If this is the node level just above the leaves, simply loop
+ * over the leaves removing all of them. If this is higher up
+ * in the tree, recurse downward.
+ */
+ for (i = 0; i < ichdr.count; i++) {
+ /*
+ * Read the subsidiary block to see what we have to work with.
+ * Don't do this in a transaction. This is a depth-first
+ * traversal of the tree so we may deal with many blocks
+ * before we come back to this one.
+ */
+ error = xfs_da3_node_read(*trans, dp, child_fsb, &child_bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ /* save for re-read later */
+ child_blkno = xfs_buf_daddr(child_bp);
+
+ /*
+ * Invalidate the subtree, however we have to.
+ */
+ info = child_bp->b_addr;
+ switch (info->magic) {
+ case cpu_to_be16(XFS_DA_NODE_MAGIC):
+ case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+ error = xfs_attr3_node_inactive(trans, dp, child_bp,
+ level + 1);
+ break;
+ case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+ case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+ error = xfs_attr3_leaf_inactive(trans, dp, child_bp);
+ break;
+ default:
+ xfs_buf_mark_corrupt(child_bp);
+ xfs_trans_brelse(*trans, child_bp);
+ error = -EFSCORRUPTED;
+ break;
+ }
+ if (error)
+ return error;
+
+ /*
+ * Remove the subsidiary block from the cache and from the log.
+ */
+ error = xfs_trans_get_buf(*trans, mp->m_ddev_targp,
+ child_blkno,
+ XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0,
+ &child_bp);
+ if (error)
+ return error;
+ xfs_trans_binval(*trans, child_bp);
+ child_bp = NULL;
+
+ /*
+ * If we're not done, re-read the parent to get the next
+ * child block number.
+ */
+ if (i + 1 < ichdr.count) {
+ struct xfs_da3_icnode_hdr phdr;
+
+ error = xfs_da3_node_read_mapped(*trans, dp,
+ parent_blkno, &bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &phdr,
+ bp->b_addr);
+ child_fsb = be32_to_cpu(phdr.btree[i + 1].before);
+ xfs_trans_brelse(*trans, bp);
+ bp = NULL;
+ }
+ /*
+ * Atomically commit the whole invalidate stuff.
+ */
+ error = xfs_trans_roll_inode(trans, dp);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
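
/*
 * Illustrative aside, not part of the patch: xfs_attr3_node_inactive() above
 * rolls the transaction after each child so no single transaction has to
 * carry the whole invalidation, re-reading the parent afterwards because
 * nothing stays pinned across the roll.  A minimal, hedged userspace sketch
 * of the "commit per child to bound the batch" idea follows, under an
 * assumed toy transaction model with hypothetical names.
 */
#include <stdio.h>

struct txn {
	int	pending_ops;
};

static void txn_add(struct txn *tp)
{
	tp->pending_ops++;
}

/* Commit whatever is pending and start over with an empty transaction. */
static void txn_roll(struct txn *tp)
{
	printf("committing %d op(s)\n", tp->pending_ops);
	tp->pending_ops = 0;
}

int main(void)
{
	struct txn	tp = { 0 };
	int		child;

	for (child = 0; child < 4; child++) {
		txn_add(&tp);	/* invalidate one child block */
		txn_roll(&tp);	/* bound the transaction: commit per child */
	}
	return 0;
}
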
+
+/*
+ * Indiscriminately delete the entire attribute fork
+ *
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+static int
+xfs_attr3_root_inactive(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_blkinfo *info;
+ struct xfs_buf *bp;
+ xfs_daddr_t blkno;
+ int error;
+
+ /*
+ * Read block 0 to see what we have to work with.
+ * We only get here if we have extents, since we remove
+ * the extents in reverse order the extent containing
+ * block 0 must still be there.
+ */
+ error = xfs_da3_node_read(*trans, dp, 0, &bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ blkno = xfs_buf_daddr(bp);
+
+ /*
+ * Invalidate the tree, even if the "tree" is only a single leaf block.
+ * This is a depth-first traversal!
+ */
+ info = bp->b_addr;
+ switch (info->magic) {
+ case cpu_to_be16(XFS_DA_NODE_MAGIC):
+ case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+ error = xfs_attr3_node_inactive(trans, dp, bp, 1);
+ break;
+ case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+ case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+ error = xfs_attr3_leaf_inactive(trans, dp, bp);
+ break;
+ default:
+ error = -EFSCORRUPTED;
+ xfs_buf_mark_corrupt(bp);
+ xfs_trans_brelse(*trans, bp);
+ break;
+ }
+ if (error)
+ return error;
+
+ /*
+ * Invalidate the incore copy of the root block.
+ */
+ error = xfs_trans_get_buf(*trans, mp->m_ddev_targp, blkno,
+ XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, &bp);
+ if (error)
+ return error;
+ error = bp->b_error;
+ if (error) {
+ xfs_trans_brelse(*trans, bp);
+ return error;
+ }
+ xfs_trans_binval(*trans, bp); /* remove from cache */
+ /*
+ * Commit the invalidate and start the next transaction.
+ */
+ error = xfs_trans_roll_inode(trans, dp);
+
+ return error;
+}
+
+/*
+ * xfs_attr_inactive kills all traces of an attribute fork on an inode. It
+ * removes both the on-disk and in-memory inode fork. Note that this also has to
+ * handle the condition of inodes without attributes but with an attribute fork
+ * configured, so we can't use xfs_inode_hasattr() here.
+ *
+ * The in-memory attribute fork is removed even on error.
+ */
+int
+xfs_attr_inactive(
+ struct xfs_inode *dp)
+{
+ struct xfs_trans *trans;
+ struct xfs_mount *mp;
+ int lock_mode = XFS_ILOCK_SHARED;
+ int error = 0;
+
+ mp = dp->i_mount;
+ ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
+
+ xfs_ilock(dp, lock_mode);
+ if (!xfs_inode_has_attr_fork(dp))
+ goto out_destroy_fork;
+ xfs_iunlock(dp, lock_mode);
+
+ lock_mode = 0;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrinval, 0, 0, 0, &trans);
+ if (error)
+ goto out_destroy_fork;
+
+ lock_mode = XFS_ILOCK_EXCL;
+ xfs_ilock(dp, lock_mode);
+
+ if (!xfs_inode_has_attr_fork(dp))
+ goto out_cancel;
+
+ /*
+ * No need to make quota reservations here. We expect to release some
+ * blocks, not allocate, in the common case.
+ */
+ xfs_trans_ijoin(trans, dp, 0);
+
+ /*
+ * Invalidate and truncate the attribute fork extents. Make sure the
+ * fork actually has xattr blocks as otherwise the invalidation has no
+ * blocks to read and returns an error. In this case, just do the fork
+ * removal below.
+ */
+ if (dp->i_af.if_nextents > 0) {
+ error = xfs_attr3_root_inactive(&trans, dp);
+ if (error)
+ goto out_cancel;
+
+ error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
+ if (error)
+ goto out_cancel;
+ }
+
+ /* Reset the attribute fork - this also destroys the in-core fork */
+ xfs_attr_fork_remove(dp, trans);
+
+ error = xfs_trans_commit(trans);
+ xfs_iunlock(dp, lock_mode);
+ return error;
+
+out_cancel:
+ xfs_trans_cancel(trans);
+out_destroy_fork:
+ /* kill the in-core attr fork before we drop the inode lock */
+ xfs_ifork_zap_attr(dp);
+ if (lock_mode)
+ xfs_iunlock(dp, lock_mode);
+ return error;
+}
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
new file mode 100644
index 000000000..2788a6f2e
--- /dev/null
+++ b/fs/xfs/xfs_attr_item.c
@@ -0,0 +1,881 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022 Oracle. All Rights Reserved.
+ * Author: Allison Henderson <allison.henderson@oracle.com>
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_shared.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_item.h"
+#include "xfs_trace.h"
+#include "xfs_trans_space.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+
+struct kmem_cache *xfs_attri_cache;
+struct kmem_cache *xfs_attrd_cache;
+
+static const struct xfs_item_ops xfs_attri_item_ops;
+static const struct xfs_item_ops xfs_attrd_item_ops;
+static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip);
+
+static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_attri_log_item, attri_item);
+}
+
+/*
+ * Shared xattr name/value buffers for logged extended attribute operations
+ *
+ * When logging updates to extended attributes, we can create quite a few
+ * attribute log intent items for a single xattr update. To avoid cycling the
+ * memory allocator and memcpy overhead, the name (and value, for setxattr)
+ * are kept in a refcounted object that is shared across all related log items
+ * and the upper-level deferred work state structure. The shared buffer has
+ * a control structure, followed by the name, and then the value.
+ */
+
+static inline struct xfs_attri_log_nameval *
+xfs_attri_log_nameval_get(
+ struct xfs_attri_log_nameval *nv)
+{
+ if (!refcount_inc_not_zero(&nv->refcount))
+ return NULL;
+ return nv;
+}
+
+static inline void
+xfs_attri_log_nameval_put(
+ struct xfs_attri_log_nameval *nv)
+{
+ if (!nv)
+ return;
+ if (refcount_dec_and_test(&nv->refcount))
+ kvfree(nv);
+}
+
+static inline struct xfs_attri_log_nameval *
+xfs_attri_log_nameval_alloc(
+ const void *name,
+ unsigned int name_len,
+ const void *value,
+ unsigned int value_len)
+{
+ struct xfs_attri_log_nameval *nv;
+
+ /*
+ * This could be over 64kB in length, so we have to use kvmalloc() for
+ * this. But kvmalloc() utterly sucks, so we use our own version.
+ */
+ nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) +
+ name_len + value_len);
+
+ nv->name.i_addr = nv + 1;
+ nv->name.i_len = name_len;
+ nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME;
+ memcpy(nv->name.i_addr, name, name_len);
+
+ if (value_len) {
+ nv->value.i_addr = nv->name.i_addr + name_len;
+ nv->value.i_len = value_len;
+ memcpy(nv->value.i_addr, value, value_len);
+ } else {
+ nv->value.i_addr = NULL;
+ nv->value.i_len = 0;
+ }
+ nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE;
+
+ refcount_set(&nv->refcount, 1);
+ return nv;
+}
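
/*
 * Illustrative aside, not part of the patch: the name/value buffer above is
 * a single allocation with the payload trailing the control structure, kept
 * alive by a reference count shared between the log items and the deferred
 * work state.  A minimal, hedged userspace sketch of that pattern follows;
 * all names are hypothetical, and plain malloc()/C11 atomics stand in for
 * the kernel's xlog_kvmalloc() and refcount_t.
 */
#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct shared_buf {
	atomic_int	refcount;
	size_t		len;
	/* payload bytes follow the struct */
};

static struct shared_buf *shared_buf_alloc(const void *data, size_t len)
{
	struct shared_buf *buf = malloc(sizeof(*buf) + len);

	if (!buf)
		return NULL;
	atomic_init(&buf->refcount, 1);
	buf->len = len;
	memcpy(buf + 1, data, len);	/* payload lives right after the header */
	return buf;
}

static struct shared_buf *shared_buf_get(struct shared_buf *buf)
{
	atomic_fetch_add(&buf->refcount, 1);
	return buf;
}

static void shared_buf_put(struct shared_buf *buf)
{
	if (!buf)
		return;
	if (atomic_fetch_sub(&buf->refcount, 1) == 1)
		free(buf);		/* last reference frees the allocation */
}

int main(void)
{
	struct shared_buf *nv = shared_buf_alloc("user.comment", 12);
	struct shared_buf *extra;

	if (!nv)
		return 1;
	extra = shared_buf_get(nv);	/* second owner */
	shared_buf_put(nv);		/* first owner is done */
	shared_buf_put(extra);		/* last put frees the buffer */
	return 0;
}
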
+
+STATIC void
+xfs_attri_item_free(
+ struct xfs_attri_log_item *attrip)
+{
+ kmem_free(attrip->attri_item.li_lv_shadow);
+ xfs_attri_log_nameval_put(attrip->attri_nameval);
+ kmem_cache_free(xfs_attri_cache, attrip);
+}
+
+/*
+ * Freeing the attrip requires that we remove it from the AIL if it has already
+ * been placed there. However, the ATTRI may not yet have been placed in the
+ * AIL when called by xfs_attri_release() from ATTRD processing due to the
+ * ordering of committed vs unpin operations in bulk insert operations. Hence
+ * the reference count to ensure only the last caller frees the ATTRI.
+ */
+STATIC void
+xfs_attri_release(
+ struct xfs_attri_log_item *attrip)
+{
+ ASSERT(atomic_read(&attrip->attri_refcount) > 0);
+ if (!atomic_dec_and_test(&attrip->attri_refcount))
+ return;
+
+ xfs_trans_ail_delete(&attrip->attri_item, 0);
+ xfs_attri_item_free(attrip);
+}
+
+STATIC void
+xfs_attri_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
+
+ *nvecs += 2;
+ *nbytes += sizeof(struct xfs_attri_log_format) +
+ xlog_calc_iovec_len(nv->name.i_len);
+
+ if (!nv->value.i_len)
+ return;
+
+ *nvecs += 1;
+ *nbytes += xlog_calc_iovec_len(nv->value.i_len);
+}
+
+/*
+ * This is called to fill in the log iovecs for the given attri log
+ * item. We use 1 iovec for the attri_format_item, 1 for the name, and
+ * another for the value if it is present
+ */
+STATIC void
+xfs_attri_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+ struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
+
+ attrip->attri_format.alfi_type = XFS_LI_ATTRI;
+ attrip->attri_format.alfi_size = 1;
+
+ /*
+ * This size accounting must be done before copying the attrip into the
+ * iovec. If we do it after, the wrong size will be recorded to the log
+ * and we trip across assertion checks for bad region sizes later during
+	 * log recovery.
+ */
+
+ ASSERT(nv->name.i_len > 0);
+ attrip->attri_format.alfi_size++;
+
+ if (nv->value.i_len > 0)
+ attrip->attri_format.alfi_size++;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT,
+ &attrip->attri_format,
+ sizeof(struct xfs_attri_log_format));
+ xlog_copy_from_iovec(lv, &vecp, &nv->name);
+ if (nv->value.i_len > 0)
+ xlog_copy_from_iovec(lv, &vecp, &nv->value);
+}
+
+/*
+ * The unpin operation is the last place an ATTRI is manipulated in the log. It
+ * is either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the ATTRI transaction has been successfully committed to make
+ * it this far. Therefore, we expect whoever committed the ATTRI to either
+ * construct and commit the ATTRD or drop the ATTRD's reference in the event of
+ * error. Simply drop the log's ATTRI reference now that the log is done with
+ * it.
+ */
+STATIC void
+xfs_attri_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ xfs_attri_release(ATTRI_ITEM(lip));
+}
+
+
+STATIC void
+xfs_attri_item_release(
+ struct xfs_log_item *lip)
+{
+ xfs_attri_release(ATTRI_ITEM(lip));
+}
+
+/*
+ * Allocate and initialize an attri item. Caller may allocate an additional
+ * trailing buffer for name and value
+ */
+STATIC struct xfs_attri_log_item *
+xfs_attri_init(
+ struct xfs_mount *mp,
+ struct xfs_attri_log_nameval *nv)
+{
+ struct xfs_attri_log_item *attrip;
+
+ attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ /*
+ * Grab an extra reference to the name/value buffer for this log item.
+ * The caller retains its own reference!
+ */
+ attrip->attri_nameval = xfs_attri_log_nameval_get(nv);
+ ASSERT(attrip->attri_nameval);
+
+ xfs_log_item_init(mp, &attrip->attri_item, XFS_LI_ATTRI,
+ &xfs_attri_item_ops);
+ attrip->attri_format.alfi_id = (uintptr_t)(void *)attrip;
+ atomic_set(&attrip->attri_refcount, 2);
+
+ return attrip;
+}
+
+static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_attrd_log_item, attrd_item);
+}
+
+STATIC void
+xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp)
+{
+ kmem_free(attrdp->attrd_item.li_lv_shadow);
+ kmem_cache_free(xfs_attrd_cache, attrdp);
+}
+
+STATIC void
+xfs_attrd_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_attrd_log_format);
+}
+
+/*
+ * This is called to fill in the log iovecs for the given attrd log item. We use
+ * only 1 iovec for the attrd_format, and we point that at the attr_log_format
+ * structure embedded in the attrd item.
+ */
+STATIC void
+xfs_attrd_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ attrdp->attrd_format.alfd_type = XFS_LI_ATTRD;
+ attrdp->attrd_format.alfd_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRD_FORMAT,
+ &attrdp->attrd_format,
+ sizeof(struct xfs_attrd_log_format));
+}
+
+/*
+ * The ATTRD is either committed or aborted if the transaction is canceled. If
+ * the transaction is canceled, drop our reference to the ATTRI and free the
+ * ATTRD.
+ */
+STATIC void
+xfs_attrd_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip);
+
+ xfs_attri_release(attrdp->attrd_attrip);
+ xfs_attrd_item_free(attrdp);
+}
+
+static struct xfs_log_item *
+xfs_attrd_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &ATTRD_ITEM(lip)->attrd_attrip->attri_item;
+}
+
+/*
+ * Performs one step of an attribute update intent and marks the attrd item
+ * dirty. An attr operation may be a set or a remove. Note that the
+ * transaction is marked dirty regardless of whether the operation succeeds or
+ * fails to support the ATTRI/ATTRD lifecycle rules.
+ */
+STATIC int
+xfs_xattri_finish_update(
+ struct xfs_attr_intent *attr,
+ struct xfs_attrd_log_item *attrdp)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
+ error = -EIO;
+ goto out;
+ }
+
+ error = xfs_attr_set_iter(attr);
+ if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
+ error = -EAGAIN;
+out:
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the ATTRI and frees the ATTRD
+ * 2.) shuts down the filesystem
+ */
+ args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
+
+ /*
+ * attr intent/done items are null when logged attributes are disabled
+ */
+ if (attrdp)
+ set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
+
+ return error;
+}
+
+/* Log an attr to the intent item. */
+STATIC void
+xfs_attr_log_item(
+ struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip,
+ const struct xfs_attr_intent *attr)
+{
+ struct xfs_attri_log_format *attrp;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags);
+
+ /*
+ * At this point the xfs_attr_intent has been constructed, and we've
+ * created the log intent. Fill in the attri log item and log format
+ * structure with fields from this xfs_attr_intent
+ */
+ attrp = &attrip->attri_format;
+ attrp->alfi_ino = attr->xattri_da_args->dp->i_ino;
+ ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK));
+ attrp->alfi_op_flags = attr->xattri_op_flags;
+ attrp->alfi_value_len = attr->xattri_nameval->value.i_len;
+ attrp->alfi_name_len = attr->xattri_nameval->name.i_len;
+ ASSERT(!(attr->xattri_da_args->attr_filter & ~XFS_ATTRI_FILTER_MASK));
+ attrp->alfi_attr_filter = attr->xattri_da_args->attr_filter;
+}
+
+/* Get an ATTRI. */
+static struct xfs_log_item *
+xfs_attr_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_attri_log_item *attrip;
+ struct xfs_attr_intent *attr;
+ struct xfs_da_args *args;
+
+ ASSERT(count == 1);
+
+ /*
+ * Each attr item only performs one attribute operation at a time, so
+ * this is a list of one
+ */
+ attr = list_first_entry_or_null(items, struct xfs_attr_intent,
+ xattri_list);
+ args = attr->xattri_da_args;
+
+ if (!(args->op_flags & XFS_DA_OP_LOGGED))
+ return NULL;
+
+ /*
+ * Create a buffer to store the attribute name and value. This buffer
+ * will be shared between the higher level deferred xattr work state
+ * and the lower level xattr log items.
+ */
+ if (!attr->xattri_nameval) {
+ /*
+ * Transfer our reference to the name/value buffer to the
+ * deferred work state structure.
+ */
+ attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name,
+ args->namelen, args->value, args->valuelen);
+ }
+
+ attrip = xfs_attri_init(mp, attr->xattri_nameval);
+ xfs_trans_add_item(tp, &attrip->attri_item);
+ xfs_attr_log_item(tp, attrip, attr);
+
+ return &attrip->attri_item;
+}
+
+static inline void
+xfs_attr_free_item(
+ struct xfs_attr_intent *attr)
+{
+ if (attr->xattri_da_state)
+ xfs_da_state_free(attr->xattri_da_state);
+ xfs_attri_log_nameval_put(attr->xattri_nameval);
+ if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY)
+ kmem_free(attr);
+ else
+ kmem_cache_free(xfs_attr_intent_cache, attr);
+}
+
+/* Process an attr. */
+STATIC int
+xfs_attr_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_attr_intent *attr;
+ struct xfs_attrd_log_item *done_item = NULL;
+ int error;
+
+ attr = container_of(item, struct xfs_attr_intent, xattri_list);
+ if (done)
+ done_item = ATTRD_ITEM(done);
+
+ /*
+ * Always reset trans after EAGAIN cycle
+ * since the transaction is new
+ */
+ attr->xattri_da_args->trans = tp;
+
+ error = xfs_xattri_finish_update(attr, done_item);
+ if (error != -EAGAIN)
+ xfs_attr_free_item(attr);
+
+ return error;
+}
+
+/* Abort all pending ATTRs. */
+STATIC void
+xfs_attr_abort_intent(
+ struct xfs_log_item *intent)
+{
+ xfs_attri_release(ATTRI_ITEM(intent));
+}
+
+/* Cancel an attr */
+STATIC void
+xfs_attr_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_attr_intent *attr;
+
+ attr = container_of(item, struct xfs_attr_intent, xattri_list);
+ xfs_attr_free_item(attr);
+}
+
+STATIC bool
+xfs_attri_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return ATTRI_ITEM(lip)->attri_format.alfi_id == intent_id;
+}
+
+/* Is this recovered ATTRI format ok? */
+static inline bool
+xfs_attri_validate(
+ struct xfs_mount *mp,
+ struct xfs_attri_log_format *attrp)
+{
+ unsigned int op = attrp->alfi_op_flags &
+ XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+
+ if (attrp->__pad != 0)
+ return false;
+
+ if (attrp->alfi_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)
+ return false;
+
+ if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK)
+ return false;
+
+ /* alfi_op_flags should be either a set or remove */
+ switch (op) {
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ break;
+ default:
+ return false;
+ }
+
+ if (attrp->alfi_value_len > XATTR_SIZE_MAX)
+ return false;
+
+ if ((attrp->alfi_name_len > XATTR_NAME_MAX) ||
+ (attrp->alfi_name_len == 0))
+ return false;
+
+ return xfs_verify_ino(mp, attrp->alfi_ino);
+}
+
+/*
+ * Process an attr intent item that was recovered from the log. We need to
+ * replay the attr operation that it describes.
+ */
+STATIC int
+xfs_attri_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_attr_intent *attr;
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ struct xfs_inode *ip;
+ struct xfs_da_args *args;
+ struct xfs_trans *tp;
+ struct xfs_trans_res tres;
+ struct xfs_attri_log_format *attrp;
+ struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
+ int error;
+ int total;
+ int local;
+ struct xfs_attrd_log_item *done_item = NULL;
+
+ /*
+ * First check the validity of the attr described by the ATTRI. If any
+ * are bad, then assume that all are bad and just toss the ATTRI.
+ */
+ attrp = &attrip->attri_format;
+ if (!xfs_attri_validate(mp, attrp) ||
+ !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len))
+ return -EFSCORRUPTED;
+
+ error = xlog_recover_iget(mp, attrp->alfi_ino, &ip);
+ if (error)
+ return error;
+
+ attr = kmem_zalloc(sizeof(struct xfs_attr_intent) +
+ sizeof(struct xfs_da_args), KM_NOFS);
+ args = (struct xfs_da_args *)(attr + 1);
+
+ attr->xattri_da_args = args;
+ attr->xattri_op_flags = attrp->alfi_op_flags &
+ XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+
+ /*
+ * We're reconstructing the deferred work state structure from the
+ * recovered log item. Grab a reference to the name/value buffer and
+ * attach it to the new work state.
+ */
+ attr->xattri_nameval = xfs_attri_log_nameval_get(nv);
+ ASSERT(attr->xattri_nameval);
+
+ args->dp = ip;
+ args->geo = mp->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->name = nv->name.i_addr;
+ args->namelen = nv->name.i_len;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
+ args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK;
+ args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT |
+ XFS_DA_OP_LOGGED;
+
+ ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb));
+
+ switch (attr->xattri_op_flags) {
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ args->value = nv->value.i_addr;
+ args->valuelen = nv->value.i_len;
+ args->total = xfs_attr_calc_size(args, &local);
+ if (xfs_inode_hasattr(args->dp))
+ attr->xattri_dela_state = xfs_attr_init_replace_state(args);
+ else
+ attr->xattri_dela_state = xfs_attr_init_add_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ if (!xfs_inode_hasattr(args->dp))
+ goto out;
+ attr->xattri_dela_state = xfs_attr_init_remove_state(args);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ xfs_init_attr_trans(args, &tres, &total);
+ error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ goto out;
+
+ args->trans = tp;
+ done_item = xfs_trans_get_attrd(tp, attrip);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_xattri_finish_update(attr, done_item);
+ if (error == -EAGAIN) {
+ /*
+ * There's more work to do, so add the intent item to this
+ * transaction so that we can continue it later.
+ */
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list);
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+ if (error)
+ goto out_unlock;
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+ return 0;
+ }
+ if (error) {
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+ }
+
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+out:
+ xfs_attr_free_item(attr);
+ return error;
+}
+
+/* Re-log an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_attri_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_attrd_log_item *attrdp;
+ struct xfs_attri_log_item *old_attrip;
+ struct xfs_attri_log_item *new_attrip;
+ struct xfs_attri_log_format *new_attrp;
+ struct xfs_attri_log_format *old_attrp;
+
+ old_attrip = ATTRI_ITEM(intent);
+ old_attrp = &old_attrip->attri_format;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ attrdp = xfs_trans_get_attrd(tp, old_attrip);
+ set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
+
+ /*
+ * Create a new log item that shares the same name/value buffer as the
+ * old log item.
+ */
+ new_attrip = xfs_attri_init(tp->t_mountp, old_attrip->attri_nameval);
+ new_attrp = &new_attrip->attri_format;
+
+ new_attrp->alfi_ino = old_attrp->alfi_ino;
+ new_attrp->alfi_op_flags = old_attrp->alfi_op_flags;
+ new_attrp->alfi_value_len = old_attrp->alfi_value_len;
+ new_attrp->alfi_name_len = old_attrp->alfi_name_len;
+ new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter;
+
+ xfs_trans_add_item(tp, &new_attrip->attri_item);
+ set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags);
+
+ return &new_attrip->attri_item;
+}
+
+STATIC int
+xlog_recover_attri_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_attri_log_item *attrip;
+ struct xfs_attri_log_format *attri_formatp;
+ struct xfs_attri_log_nameval *nv;
+ const void *attr_value = NULL;
+ const void *attr_name;
+ size_t len;
+
+ attri_formatp = item->ri_buf[0].i_addr;
+ attr_name = item->ri_buf[1].i_addr;
+
+ /* Validate xfs_attri_log_format before the large memory allocation */
+ len = sizeof(struct xfs_attri_log_format);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_attri_validate(mp, attri_formatp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ /* Validate the attr name */
+ if (item->ri_buf[1].i_len !=
+ xlog_calc_iovec_len(attri_formatp->alfi_name_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[1].i_addr, item->ri_buf[1].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ /* Validate the attr value, if present */
+ if (attri_formatp->alfi_value_len != 0) {
+ if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr,
+ item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ attr_value = item->ri_buf[2].i_addr;
+ }
+
+ /*
+ * Memory alloc failure will cause replay to abort. We attach the
+ * name/value buffer to the recovered incore log item and drop our
+ * reference.
+ */
+ nv = xfs_attri_log_nameval_alloc(attr_name,
+ attri_formatp->alfi_name_len, attr_value,
+ attri_formatp->alfi_value_len);
+
+ attrip = xfs_attri_init(mp, nv);
+ memcpy(&attrip->attri_format, attri_formatp, len);
+
+ /*
+ * The ATTRI has two references. One for the ATTRD and one for ATTRI to
+ * ensure it makes it into the AIL. Insert the ATTRI into the AIL
+ * directly and drop the ATTRI reference. Note that
+ * xfs_trans_ail_update() drops the AIL lock.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn);
+ xfs_attri_release(attrip);
+ xfs_attri_log_nameval_put(nv);
+ return 0;
+}
+
+/*
+ * This routine is called to allocate an "attr free done" log item.
+ */
+static struct xfs_attrd_log_item *
+xfs_trans_get_attrd(struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip)
+{
+ struct xfs_attrd_log_item *attrdp;
+
+ ASSERT(tp != NULL);
+
+ attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD,
+ &xfs_attrd_item_ops);
+ attrdp->attrd_attrip = attrip;
+ attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id;
+
+ xfs_trans_add_item(tp, &attrdp->attrd_item);
+ return attrdp;
+}
+
+/* Get an ATTRD so we can process all the attrs. */
+static struct xfs_log_item *
+xfs_attr_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ if (!intent)
+ return NULL;
+
+ return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item;
+}
+
+const struct xfs_defer_op_type xfs_attr_defer_type = {
+ .max_items = 1,
+ .create_intent = xfs_attr_create_intent,
+ .abort_intent = xfs_attr_abort_intent,
+ .create_done = xfs_attr_create_done,
+ .finish_item = xfs_attr_finish_item,
+ .cancel_item = xfs_attr_cancel_item,
+};
+
+/*
+ * This routine is called when an ATTRD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding ATTRI if
+ * it was still in the log. To do this it searches the AIL for the ATTRI with
+ * an id equal to that in the ATTRD format structure. If we find it we drop
+ * the ATTRD reference, which removes the ATTRI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_attrd_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_attrd_log_format *attrd_formatp;
+
+ attrd_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_ATTRI,
+ attrd_formatp->alfd_alf_id);
+ return 0;
+}
+
+static const struct xfs_item_ops xfs_attri_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_attri_item_size,
+ .iop_format = xfs_attri_item_format,
+ .iop_unpin = xfs_attri_item_unpin,
+ .iop_release = xfs_attri_item_release,
+ .iop_recover = xfs_attri_item_recover,
+ .iop_match = xfs_attri_item_match,
+ .iop_relog = xfs_attri_item_relog,
+};
+
+const struct xlog_recover_item_ops xlog_attri_item_ops = {
+ .item_type = XFS_LI_ATTRI,
+ .commit_pass2 = xlog_recover_attri_commit_pass2,
+};
+
+static const struct xfs_item_ops xfs_attrd_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
+ .iop_size = xfs_attrd_item_size,
+ .iop_format = xfs_attrd_item_format,
+ .iop_release = xfs_attrd_item_release,
+ .iop_intent = xfs_attrd_item_intent,
+};
+
+const struct xlog_recover_item_ops xlog_attrd_item_ops = {
+ .item_type = XFS_LI_ATTRD,
+ .commit_pass2 = xlog_recover_attrd_commit_pass2,
+};
diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h
new file mode 100644
index 000000000..3280a7930
--- /dev/null
+++ b/fs/xfs/xfs_attr_item.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (C) 2022 Oracle. All Rights Reserved.
+ * Author: Allison Henderson <allison.henderson@oracle.com>
+ */
+#ifndef __XFS_ATTR_ITEM_H__
+#define __XFS_ATTR_ITEM_H__
+
+/* kernel only ATTRI/ATTRD definitions */
+
+struct xfs_mount;
+struct kmem_cache;
+
+struct xfs_attri_log_nameval {
+ struct xfs_log_iovec name;
+ struct xfs_log_iovec value;
+ refcount_t refcount;
+
+ /* name and value follow the end of this struct */
+};
+
+/*
+ * This is the "attr intention" log item. It is used to log the fact that some
+ * extended attribute operations need to be processed. An operation is
+ * currently either a set or remove. Set or remove operations are described by
+ * the xfs_attr_intent which may be logged to this intent.
+ *
+ * During a normal attr operation, name and value point to the name and value
+ * fields of the caller's xfs_da_args structure. During a recovery, the name
+ * and value buffers are copied from the log, and stored in a trailing buffer
+ * attached to the xfs_attr_intent until they are committed. They are freed
+ * when the xfs_attr_intent itself is freed when the work is done.
+ */
+struct xfs_attri_log_item {
+ struct xfs_log_item attri_item;
+ atomic_t attri_refcount;
+ struct xfs_attri_log_nameval *attri_nameval;
+ struct xfs_attri_log_format attri_format;
+};
+
+/*
+ * This is the "attr done" log item. It is used to log the fact that some attrs
+ * earlier mentioned in an attri item have been completed.
+ */
+struct xfs_attrd_log_item {
+ struct xfs_log_item attrd_item;
+ struct xfs_attri_log_item *attrd_attrip;
+ struct xfs_attrd_log_format attrd_format;
+};
+
+extern struct kmem_cache *xfs_attri_cache;
+extern struct kmem_cache *xfs_attrd_cache;
+
+#endif /* __XFS_ATTR_ITEM_H__ */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
new file mode 100644
index 000000000..99bbbe1a0
--- /dev/null
+++ b/fs/xfs/xfs_attr_list.c
@@ -0,0 +1,539 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_dir2.h"
+
+STATIC int
+xfs_attr_shortform_compare(const void *a, const void *b)
+{
+ xfs_attr_sf_sort_t *sa, *sb;
+
+ sa = (xfs_attr_sf_sort_t *)a;
+ sb = (xfs_attr_sf_sort_t *)b;
+ if (sa->hash < sb->hash) {
+ return -1;
+ } else if (sa->hash > sb->hash) {
+ return 1;
+ } else {
+ return sa->entno - sb->entno;
+ }
+}
+
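+/*
+ * An attr list cursor that has never been advanced is all zeroes; use that to
+ * detect the very start of a listing.
+ */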
+#define XFS_ISRESET_CURSOR(cursor) \
+ (!((cursor)->initted) && !((cursor)->hashval) && \
+ !((cursor)->blkno) && !((cursor)->offset))
+/*
+ * Copy out entries of shortform attribute lists for attr_list().
+ * Shortform attribute lists are not stored in hashval sorted order.
+ * If the output buffer is not large enough to hold them all, then
+ * we have to calculate each entry's hashvalue and sort them before
+ * we can begin returning them to the user.
+ */
+static int
+xfs_attr_shortform_list(
+ struct xfs_attr_list_context *context)
+{
+ struct xfs_attrlist_cursor_kern *cursor = &context->cursor;
+ struct xfs_inode *dp = context->dp;
+ struct xfs_attr_sf_sort *sbuf, *sbp;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int sbsize, nsbuf, count, i;
+ int error = 0;
+
+ sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
+ ASSERT(sf != NULL);
+ if (!sf->hdr.count)
+ return 0;
+
+ trace_xfs_attr_list_sf(context);
+
+ /*
+ * If the buffer is large enough and the cursor is at the start,
+ * do not bother with sorting since we will return everything in
+ * one buffer and another call using the cursor won't need to be
+ * made.
+ * Note the generous fudge factor of 16 overhead bytes per entry.
+ * If bufsize is zero then put_listent must be a search function
+ * and can just scan through what we have.
+ */
+ if (context->bufsize == 0 ||
+ (XFS_ISRESET_CURSOR(cursor) &&
+ (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) {
+ for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ if (XFS_IS_CORRUPT(context->dp->i_mount,
+ !xfs_attr_namecheck(sfe->nameval,
+ sfe->namelen)))
+ return -EFSCORRUPTED;
+ context->put_listent(context,
+ sfe->flags,
+ sfe->nameval,
+ (int)sfe->namelen,
+ (int)sfe->valuelen);
+ /*
+ * Either the search callback finished early or
+ * we didn't fit it all in the buffer after all.
+ */
+ if (context->seen_enough)
+ break;
+ sfe = xfs_attr_sf_nextentry(sfe);
+ }
+ trace_xfs_attr_list_sf_all(context);
+ return 0;
+ }
+
+ /* do no more for a search callback */
+ if (context->bufsize == 0)
+ return 0;
+
+ /*
+ * It didn't all fit, so we have to sort everything on hashval.
+ */
+ sbsize = sf->hdr.count * sizeof(*sbuf);
+ sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
+
+ /*
+ * Scan the attribute list for the rest of the entries, storing
+ * the relevant info from only those that match into a buffer.
+ */
+ nsbuf = 0;
+ for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
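+ /* Make sure the entry lies within the shortform data before using it. */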
+ if (unlikely(
+ ((char *)sfe < (char *)sf) ||
+ ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) {
+ XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
+ XFS_ERRLEVEL_LOW,
+ context->dp->i_mount, sfe,
+ sizeof(*sfe));
+ kmem_free(sbuf);
+ return -EFSCORRUPTED;
+ }
+
+ sbp->entno = i;
+ sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
+ sbp->name = sfe->nameval;
+ sbp->namelen = sfe->namelen;
+ /* These are bytes, and both on-disk, don't endian-flip */
+ sbp->valuelen = sfe->valuelen;
+ sbp->flags = sfe->flags;
+ sfe = xfs_attr_sf_nextentry(sfe);
+ sbp++;
+ nsbuf++;
+ }
+
+ /*
+ * Sort the entries on hash then entno.
+ */
+ xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
+
+ /*
+ * Re-find our place IN THE SORTED LIST.
+ */
+ count = 0;
+ cursor->initted = 1;
+ cursor->blkno = 0;
+ for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
+ if (sbp->hash == cursor->hashval) {
+ if (cursor->offset == count) {
+ break;
+ }
+ count++;
+ } else if (sbp->hash > cursor->hashval) {
+ break;
+ }
+ }
+ if (i == nsbuf)
+ goto out;
+
+ /*
+ * Loop putting entries into the user buffer.
+ */
+ for ( ; i < nsbuf; i++, sbp++) {
+ if (cursor->hashval != sbp->hash) {
+ cursor->hashval = sbp->hash;
+ cursor->offset = 0;
+ }
+ if (XFS_IS_CORRUPT(context->dp->i_mount,
+ !xfs_attr_namecheck(sbp->name,
+ sbp->namelen))) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ context->put_listent(context,
+ sbp->flags,
+ sbp->name,
+ sbp->namelen,
+ sbp->valuelen);
+ if (context->seen_enough)
+ break;
+ cursor->offset++;
+ }
+out:
+ kmem_free(sbuf);
+ return error;
+}
+
+/*
+ * We didn't find the block & hash mentioned in the cursor state, so
+ * walk down the attr btree looking for the hash.
+ */
+STATIC int
+xfs_attr_node_list_lookup(
+ struct xfs_attr_list_context *context,
+ struct xfs_attrlist_cursor_kern *cursor,
+ struct xfs_buf **pbp)
+{
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_inode *dp = context->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_trans *tp = context->tp;
+ struct xfs_buf *bp;
+ int i;
+ int error = 0;
+ unsigned int expected_level = 0;
+ uint16_t magic;
+
+ ASSERT(*pbp == NULL);
+ cursor->blkno = 0;
+ for (;;) {
+ error = xfs_da3_node_read(tp, dp, cursor->blkno, &bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return error;
+ node = bp->b_addr;
+ magic = be16_to_cpu(node->hdr.info.magic);
+ if (magic == XFS_ATTR_LEAF_MAGIC ||
+ magic == XFS_ATTR3_LEAF_MAGIC)
+ break;
+ if (magic != XFS_DA_NODE_MAGIC &&
+ magic != XFS_DA3_NODE_MAGIC) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ node, sizeof(*node));
+ goto out_corruptbuf;
+ }
+
+ xfs_da3_node_hdr_from_disk(mp, &nodehdr, node);
+
+ /* Tree taller than we can handle; bail out! */
+ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH)
+ goto out_corruptbuf;
+
+ /* Check the level from the root node. */
+ if (cursor->blkno == 0)
+ expected_level = nodehdr.level - 1;
+ else if (expected_level != nodehdr.level)
+ goto out_corruptbuf;
+ else
+ expected_level--;
+
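+ /* Find the first entry whose hashval is >= the one we want and descend. */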
+ btree = nodehdr.btree;
+ for (i = 0; i < nodehdr.count; btree++, i++) {
+ if (cursor->hashval <= be32_to_cpu(btree->hashval)) {
+ cursor->blkno = be32_to_cpu(btree->before);
+ trace_xfs_attr_list_node_descend(context,
+ btree);
+ break;
+ }
+ }
+ xfs_trans_brelse(tp, bp);
+
+ if (i == nodehdr.count)
+ return 0;
+
+ /* We can't point back to the root. */
+ if (XFS_IS_CORRUPT(mp, cursor->blkno == 0))
+ return -EFSCORRUPTED;
+ }
+
+ if (expected_level != 0)
+ goto out_corruptbuf;
+
+ *pbp = bp;
+ return 0;
+
+out_corruptbuf:
+ xfs_buf_mark_corrupt(bp);
+ xfs_trans_brelse(tp, bp);
+ return -EFSCORRUPTED;
+}
+
+STATIC int
+xfs_attr_node_list(
+ struct xfs_attr_list_context *context)
+{
+ struct xfs_attrlist_cursor_kern *cursor = &context->cursor;
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_da_intnode *node;
+ struct xfs_buf *bp;
+ struct xfs_inode *dp = context->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int error = 0;
+
+ trace_xfs_attr_node_list(context);
+
+ cursor->initted = 1;
+
+ /*
+ * Do all sorts of validation on the passed-in cursor structure.
+ * If anything is amiss, ignore the cursor and look up the hashval
+ * starting from the btree root.
+ */
+ bp = NULL;
+ if (cursor->blkno > 0) {
+ error = xfs_da3_node_read(context->tp, dp, cursor->blkno, &bp,
+ XFS_ATTR_FORK);
+ if ((error != 0) && (error != -EFSCORRUPTED))
+ return error;
+ if (bp) {
+ struct xfs_attr_leaf_entry *entries;
+
+ node = bp->b_addr;
+ switch (be16_to_cpu(node->hdr.info.magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(context->tp, bp);
+ bp = NULL;
+ break;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo,
+ &leafhdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if (cursor->hashval > be32_to_cpu(
+ entries[leafhdr.count - 1].hashval)) {
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(context->tp, bp);
+ bp = NULL;
+ } else if (cursor->hashval <= be32_to_cpu(
+ entries[0].hashval)) {
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(context->tp, bp);
+ bp = NULL;
+ }
+ break;
+ default:
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(context->tp, bp);
+ bp = NULL;
+ }
+ }
+ }
+
+ /*
+ * We did not find what we expected given the cursor's contents,
+ * so we start from the top and work down based on the hash value.
+ * Note that the start of a node block is the same as the start of a leaf block.
+ */
+ if (bp == NULL) {
+ error = xfs_attr_node_list_lookup(context, cursor, &bp);
+ if (error || !bp)
+ return error;
+ }
+ ASSERT(bp != NULL);
+
+ /*
+ * Roll upward through the blocks, processing each leaf block in
+ * order. As long as there is space in the result buffer, keep
+ * adding the information.
+ */
+ for (;;) {
+ leaf = bp->b_addr;
+ error = xfs_attr3_leaf_list_int(bp, context);
+ if (error)
+ break;
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+ if (context->seen_enough || leafhdr.forw == 0)
+ break;
+ cursor->blkno = leafhdr.forw;
+ xfs_trans_brelse(context->tp, bp);
+ error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno,
+ &bp);
+ if (error)
+ return error;
+ }
+ xfs_trans_brelse(context->tp, bp);
+ return error;
+}
+
+/*
+ * Copy out attribute list entries for attr_list(), for leaf attribute lists.
+ */
+int
+xfs_attr3_leaf_list_int(
+ struct xfs_buf *bp,
+ struct xfs_attr_list_context *context)
+{
+ struct xfs_attrlist_cursor_kern *cursor = &context->cursor;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_attr_leaf_entry *entry;
+ int i;
+ struct xfs_mount *mp = context->dp->i_mount;
+
+ trace_xfs_attr_list_leaf(context);
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+
+ cursor->initted = 1;
+
+ /*
+ * Re-find our place in the leaf block if this is a new syscall.
+ */
+ if (context->resynch) {
+ entry = &entries[0];
+ for (i = 0; i < ichdr.count; entry++, i++) {
+ if (be32_to_cpu(entry->hashval) == cursor->hashval) {
+ if (cursor->offset == context->dupcnt) {
+ context->dupcnt = 0;
+ break;
+ }
+ context->dupcnt++;
+ } else if (be32_to_cpu(entry->hashval) >
+ cursor->hashval) {
+ context->dupcnt = 0;
+ break;
+ }
+ }
+ if (i == ichdr.count) {
+ trace_xfs_attr_list_notfound(context);
+ return 0;
+ }
+ } else {
+ entry = &entries[0];
+ i = 0;
+ }
+ context->resynch = 0;
+
+ /*
+ * We have found our place, start copying out the new attributes.
+ */
+ for (; i < ichdr.count; entry++, i++) {
+ char *name;
+ int namelen, valuelen;
+
+ if (be32_to_cpu(entry->hashval) != cursor->hashval) {
+ cursor->hashval = be32_to_cpu(entry->hashval);
+ cursor->offset = 0;
+ }
+
+ if ((entry->flags & XFS_ATTR_INCOMPLETE) &&
+ !context->allow_incomplete)
+ continue;
+
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ xfs_attr_leaf_name_local_t *name_loc;
+
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
+ name = name_loc->nameval;
+ namelen = name_loc->namelen;
+ valuelen = be16_to_cpu(name_loc->valuelen);
+ } else {
+ xfs_attr_leaf_name_remote_t *name_rmt;
+
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+ name = name_rmt->name;
+ namelen = name_rmt->namelen;
+ valuelen = be32_to_cpu(name_rmt->valuelen);
+ }
+
+ if (XFS_IS_CORRUPT(context->dp->i_mount,
+ !xfs_attr_namecheck(name, namelen)))
+ return -EFSCORRUPTED;
+ context->put_listent(context, entry->flags,
+ name, namelen, valuelen);
+ if (context->seen_enough)
+ break;
+ cursor->offset++;
+ }
+ trace_xfs_attr_list_leaf_end(context);
+ return 0;
+}
+
+/*
+ * Copy out attribute entries for attr_list(), for leaf attribute lists.
+ */
+STATIC int
+xfs_attr_leaf_list(
+ struct xfs_attr_list_context *context)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ trace_xfs_attr_leaf_list(context);
+
+ context->cursor.blkno = 0;
+ error = xfs_attr3_leaf_read(context->tp, context->dp, 0, &bp);
+ if (error)
+ return error;
+
+ error = xfs_attr3_leaf_list_int(bp, context);
+ xfs_trans_brelse(context->tp, bp);
+ return error;
+}
+
+int
+xfs_attr_list_ilocked(
+ struct xfs_attr_list_context *context)
+{
+ struct xfs_inode *dp = context->dp;
+
+ ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+
+ /*
+ * Decide on what work routines to call based on the attr fork format.
+ */
+ if (!xfs_inode_hasattr(dp))
+ return 0;
+ if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
+ return xfs_attr_shortform_list(context);
+ if (xfs_attr_is_leaf(dp))
+ return xfs_attr_leaf_list(context);
+ return xfs_attr_node_list(context);
+}
+
+int
+xfs_attr_list(
+ struct xfs_attr_list_context *context)
+{
+ struct xfs_inode *dp = context->dp;
+ uint lock_mode;
+ int error;
+
+ XFS_STATS_INC(dp->i_mount, xs_attr_list);
+
+ if (xfs_is_shutdown(dp->i_mount))
+ return -EIO;
+
+ lock_mode = xfs_ilock_attr_map_shared(dp);
+ error = xfs_attr_list_ilocked(context);
+ xfs_iunlock(dp, lock_mode);
+ return error;
+}
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
new file mode 100644
index 000000000..fe21c76f7
--- /dev/null
+++ b/fs/xfs/xfs_bio_io.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Christoph Hellwig.
+ */
+#include "xfs.h"
+
+static inline unsigned int bio_max_vecs(unsigned int count)
+{
+ return bio_max_segs(howmany(count, PAGE_SIZE));
+}
+
+int
+xfs_rw_bdev(
+ struct block_device *bdev,
+ sector_t sector,
+ unsigned int count,
+ char *data,
+ enum req_op op)
+{
+ unsigned int is_vmalloc = is_vmalloc_addr(data);
+ unsigned int left = count;
+ int error;
+ struct bio *bio;
+
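+ /* Flush a vmalloc buffer's cache alias so the device reads current data. */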
+ if (is_vmalloc && op == REQ_OP_WRITE)
+ flush_kernel_vmap_range(data, count);
+
+ bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC,
+ GFP_KERNEL);
+ bio->bi_iter.bi_sector = sector;
+
+ do {
+ struct page *page = kmem_to_page(data);
+ unsigned int off = offset_in_page(data);
+ unsigned int len = min_t(unsigned, left, PAGE_SIZE - off);
+
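+ /* Bio full?  Chain a new one off it and submit the full bio. */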
+ while (bio_add_page(bio, page, len, off) != len) {
+ struct bio *prev = bio;
+
+ bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left),
+ prev->bi_opf, GFP_KERNEL);
+ bio->bi_iter.bi_sector = bio_end_sector(prev);
+ bio_chain(prev, bio);
+
+ submit_bio(prev);
+ }
+
+ data += len;
+ left -= len;
+ } while (left > 0);
+
+ error = submit_bio_wait(bio);
+ bio_put(bio);
+
+ if (is_vmalloc && op == REQ_OP_READ)
+ invalidate_kernel_vmap_range(data, count);
+ return error;
+}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
new file mode 100644
index 000000000..41323da52
--- /dev/null
+++ b/fs/xfs/xfs_bmap_item.c
@@ -0,0 +1,712 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_shared.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_bmap_item.h"
+#include "xfs_log.h"
+#include "xfs_bmap.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+
+struct kmem_cache *xfs_bui_cache;
+struct kmem_cache *xfs_bud_cache;
+
+static const struct xfs_item_ops xfs_bui_item_ops;
+
+static inline struct xfs_bui_log_item *BUI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_bui_log_item, bui_item);
+}
+
+STATIC void
+xfs_bui_item_free(
+ struct xfs_bui_log_item *buip)
+{
+ kmem_free(buip->bui_item.li_lv_shadow);
+ kmem_cache_free(xfs_bui_cache, buip);
+}
+
+/*
+ * Freeing the BUI requires that we remove it from the AIL if it has already
+ * been placed there. However, the BUI may not yet have been placed in the AIL
+ * when called by xfs_bui_release() from BUD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the BUI.
+ */
+STATIC void
+xfs_bui_release(
+ struct xfs_bui_log_item *buip)
+{
+ ASSERT(atomic_read(&buip->bui_refcount) > 0);
+ if (!atomic_dec_and_test(&buip->bui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&buip->bui_item, 0);
+ xfs_bui_item_free(buip);
+}
+
+
+STATIC void
+xfs_bui_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_bui_log_item *buip = BUI_ITEM(lip);
+
+ *nvecs += 1;
+ *nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given bui log item. We use only 1 iovec, and we point that
+ * at the bui_log_format structure embedded in the bui item.
+ * It is at this point that we assert that all of the extent
+ * slots in the bui item have been filled.
+ */
+STATIC void
+xfs_bui_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_bui_log_item *buip = BUI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ ASSERT(atomic_read(&buip->bui_next_extent) ==
+ buip->bui_format.bui_nextents);
+
+ buip->bui_format.bui_type = XFS_LI_BUI;
+ buip->bui_format.bui_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUI_FORMAT, &buip->bui_format,
+ xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents));
+}
+
+/*
+ * The unpin operation is the last place a BUI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the BUI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the BUI to either construct
+ * and commit the BUD or drop the BUD's reference in the event of error. Simply
+ * drop the log's BUI reference now that the log is done with it.
+ */
+STATIC void
+xfs_bui_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_bui_log_item *buip = BUI_ITEM(lip);
+
+ xfs_bui_release(buip);
+}
+
+/*
+ * The BUI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, a BUD isn't going to be
+ * constructed and thus we free the BUI here directly.
+ */
+STATIC void
+xfs_bui_item_release(
+ struct xfs_log_item *lip)
+{
+ xfs_bui_release(BUI_ITEM(lip));
+}
+
+/*
+ * Allocate and initialize a bui item.
+ */
+STATIC struct xfs_bui_log_item *
+xfs_bui_init(
+ struct xfs_mount *mp)
+{
+ struct xfs_bui_log_item *buip;
+
+ buip = kmem_cache_zalloc(xfs_bui_cache, GFP_KERNEL | __GFP_NOFAIL);
+
+ xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
+ buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
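+ /* The incore address doubles as a unique id for matching BUDs to this BUI. */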
+ buip->bui_format.bui_id = (uintptr_t)(void *)buip;
+ atomic_set(&buip->bui_next_extent, 0);
+ atomic_set(&buip->bui_refcount, 2);
+
+ return buip;
+}
+
+static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_bud_log_item, bud_item);
+}
+
+STATIC void
+xfs_bud_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_bud_log_format);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given bud log item. We use only 1 iovec, and we point that
+ * at the bud_log_format structure embedded in the bud item.
+ * It is at this point that we assert that all of the extent
+ * slots in the bud item have been filled.
+ */
+STATIC void
+xfs_bud_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_bud_log_item *budp = BUD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ budp->bud_format.bud_type = XFS_LI_BUD;
+ budp->bud_format.bud_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUD_FORMAT, &budp->bud_format,
+ sizeof(struct xfs_bud_log_format));
+}
+
+/*
+ * The BUD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the BUI and free the
+ * BUD.
+ */
+STATIC void
+xfs_bud_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_bud_log_item *budp = BUD_ITEM(lip);
+
+ xfs_bui_release(budp->bud_buip);
+ kmem_free(budp->bud_item.li_lv_shadow);
+ kmem_cache_free(xfs_bud_cache, budp);
+}
+
+static struct xfs_log_item *
+xfs_bud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &BUD_ITEM(lip)->bud_buip->bui_item;
+}
+
+static const struct xfs_item_ops xfs_bud_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
+ .iop_size = xfs_bud_item_size,
+ .iop_format = xfs_bud_item_format,
+ .iop_release = xfs_bud_item_release,
+ .iop_intent = xfs_bud_item_intent,
+};
+
+static struct xfs_bud_log_item *
+xfs_trans_get_bud(
+ struct xfs_trans *tp,
+ struct xfs_bui_log_item *buip)
+{
+ struct xfs_bud_log_item *budp;
+
+ budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
+ &xfs_bud_item_ops);
+ budp->bud_buip = buip;
+ budp->bud_format.bud_bui_id = buip->bui_format.bui_id;
+
+ xfs_trans_add_item(tp, &budp->bud_item);
+ return budp;
+}
+
+/*
+ * Finish a bmap update and log it to the BUD. Note that the
+ * transaction is marked dirty regardless of whether the bmap update
+ * succeeds or fails, in order to support the BUI/BUD lifecycle rules.
+ */
+static int
+xfs_trans_log_finish_bmap_update(
+ struct xfs_trans *tp,
+ struct xfs_bud_log_item *budp,
+ enum xfs_bmap_intent_type type,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t *blockcount,
+ xfs_exntst_t state)
+{
+ int error;
+
+ error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff,
+ startblock, blockcount, state);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the BUI and frees the BUD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
+ set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
+
+ return error;
+}
+
+/* Sort bmap intents by inode. */
+static int
+xfs_bmap_update_diff_items(
+ void *priv,
+ const struct list_head *a,
+ const struct list_head *b)
+{
+ struct xfs_bmap_intent *ba;
+ struct xfs_bmap_intent *bb;
+
+ ba = container_of(a, struct xfs_bmap_intent, bi_list);
+ bb = container_of(b, struct xfs_bmap_intent, bi_list);
+ return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
+}
+
+/* Set the map extent flags for this mapping. */
+static void
+xfs_trans_set_bmap_flags(
+ struct xfs_map_extent *bmap,
+ enum xfs_bmap_intent_type type,
+ int whichfork,
+ xfs_exntst_t state)
+{
+ bmap->me_flags = 0;
+ switch (type) {
+ case XFS_BMAP_MAP:
+ case XFS_BMAP_UNMAP:
+ bmap->me_flags = type;
+ break;
+ default:
+ ASSERT(0);
+ }
+ if (state == XFS_EXT_UNWRITTEN)
+ bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
+ if (whichfork == XFS_ATTR_FORK)
+ bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
+}
+
+/* Log bmap updates in the intent item. */
+STATIC void
+xfs_bmap_update_log_item(
+ struct xfs_trans *tp,
+ struct xfs_bui_log_item *buip,
+ struct xfs_bmap_intent *bmap)
+{
+ uint next_extent;
+ struct xfs_map_extent *map;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
+
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&buip->bui_next_extent) - 1;
+ ASSERT(next_extent < buip->bui_format.bui_nextents);
+ map = &buip->bui_format.bui_extents[next_extent];
+ map->me_owner = bmap->bi_owner->i_ino;
+ map->me_startblock = bmap->bi_bmap.br_startblock;
+ map->me_startoff = bmap->bi_bmap.br_startoff;
+ map->me_len = bmap->bi_bmap.br_blockcount;
+ xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork,
+ bmap->bi_bmap.br_state);
+}
+
+static struct xfs_log_item *
+xfs_bmap_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_bui_log_item *buip = xfs_bui_init(mp);
+ struct xfs_bmap_intent *bmap;
+
+ ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
+
+ xfs_trans_add_item(tp, &buip->bui_item);
+ if (sort)
+ list_sort(mp, items, xfs_bmap_update_diff_items);
+ list_for_each_entry(bmap, items, bi_list)
+ xfs_bmap_update_log_item(tp, buip, bmap);
+ return &buip->bui_item;
+}
+
+/* Get a BUD so we can process all the deferred bmap updates. */
+static struct xfs_log_item *
+xfs_bmap_update_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item;
+}
+
+/* Process a deferred bmap update. */
+STATIC int
+xfs_bmap_update_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_bmap_intent *bmap;
+ xfs_filblks_t count;
+ int error;
+
+ bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+ count = bmap->bi_bmap.br_blockcount;
+ error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done),
+ bmap->bi_type,
+ bmap->bi_owner, bmap->bi_whichfork,
+ bmap->bi_bmap.br_startoff,
+ bmap->bi_bmap.br_startblock,
+ &count,
+ bmap->bi_bmap.br_state);
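+ /* A partial unmap leaves work to do; requeue it by returning -EAGAIN. */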
+ if (!error && count > 0) {
+ ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
+ bmap->bi_bmap.br_blockcount = count;
+ return -EAGAIN;
+ }
+ kmem_cache_free(xfs_bmap_intent_cache, bmap);
+ return error;
+}
+
+/* Abort all pending BUIs. */
+STATIC void
+xfs_bmap_update_abort_intent(
+ struct xfs_log_item *intent)
+{
+ xfs_bui_release(BUI_ITEM(intent));
+}
+
+/* Cancel a deferred bmap update. */
+STATIC void
+xfs_bmap_update_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_bmap_intent *bmap;
+
+ bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+ kmem_cache_free(xfs_bmap_intent_cache, bmap);
+}
+
+const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
+ .max_items = XFS_BUI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_bmap_update_create_intent,
+ .abort_intent = xfs_bmap_update_abort_intent,
+ .create_done = xfs_bmap_update_create_done,
+ .finish_item = xfs_bmap_update_finish_item,
+ .cancel_item = xfs_bmap_update_cancel_item,
+};
+
+/* Is this recovered BUI ok? */
+static inline bool
+xfs_bui_validate(
+ struct xfs_mount *mp,
+ struct xfs_bui_log_item *buip)
+{
+ struct xfs_map_extent *bmap;
+
+ /* Only one mapping operation per BUI... */
+ if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
+ return false;
+
+ bmap = &buip->bui_format.bui_extents[0];
+
+ if (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)
+ return false;
+
+ switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) {
+ case XFS_BMAP_MAP:
+ case XFS_BMAP_UNMAP:
+ break;
+ default:
+ return false;
+ }
+
+ if (!xfs_verify_ino(mp, bmap->me_owner))
+ return false;
+
+ if (!xfs_verify_fileext(mp, bmap->me_startoff, bmap->me_len))
+ return false;
+
+ return xfs_verify_fsbext(mp, bmap->me_startblock, bmap->me_len);
+}
+
+/*
+ * Process a bmap update intent item that was recovered from the log.
+ * We need to update some inode's bmbt.
+ */
+STATIC int
+xfs_bui_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_bui_log_item *buip = BUI_ITEM(lip);
+ struct xfs_trans *tp;
+ struct xfs_inode *ip = NULL;
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ struct xfs_map_extent *bmap;
+ struct xfs_bud_log_item *budp;
+ xfs_filblks_t count;
+ xfs_exntst_t state;
+ unsigned int bui_type;
+ int whichfork;
+ int iext_delta;
+ int error = 0;
+
+ if (!xfs_bui_validate(mp, buip)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &buip->bui_format, sizeof(buip->bui_format));
+ return -EFSCORRUPTED;
+ }
+
+ bmap = &buip->bui_format.bui_extents[0];
+ state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+ whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
+
+ error = xlog_recover_iget(mp, bmap->me_owner, &ip);
+ if (error)
+ return error;
+
+ /* Allocate transaction and do the work. */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
+ if (error)
+ goto err_rele;
+
+ budp = xfs_trans_get_bud(tp, buip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ if (bui_type == XFS_BMAP_MAP)
+ iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT;
+ else
+ iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
+
+ error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, iext_delta);
+ if (error)
+ goto err_cancel;
+
+ count = bmap->me_len;
+ error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip,
+ whichfork, bmap->me_startoff, bmap->me_startblock,
+ &count, state);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bmap,
+ sizeof(*bmap));
+ if (error)
+ goto err_cancel;
+
+ if (count > 0) {
+ ASSERT(bui_type == XFS_BMAP_UNMAP);
+ irec.br_startblock = bmap->me_startblock;
+ irec.br_blockcount = count;
+ irec.br_startoff = bmap->me_startoff;
+ irec.br_state = state;
+ xfs_bmap_unmap_extent(tp, ip, &irec);
+ }
+
+ /*
+ * Commit transaction, which frees the transaction and saves the inode
+ * for later replay activities.
+ */
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+ if (error)
+ goto err_unlock;
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+ return 0;
+
+err_cancel:
+ xfs_trans_cancel(tp);
+err_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+err_rele:
+ xfs_irele(ip);
+ return error;
+}
+
+STATIC bool
+xfs_bui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return BUI_ITEM(lip)->bui_format.bui_id == intent_id;
+}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_bui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_bud_log_item *budp;
+ struct xfs_bui_log_item *buip;
+ struct xfs_map_extent *extp;
+ unsigned int count;
+
+ count = BUI_ITEM(intent)->bui_format.bui_nextents;
+ extp = BUI_ITEM(intent)->bui_format.bui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ budp = xfs_trans_get_bud(tp, BUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
+
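+ /* Log a new BUI with the same extents to move the intent forward. */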
+ buip = xfs_bui_init(tp->t_mountp);
+ memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp));
+ atomic_set(&buip->bui_next_extent, count);
+ xfs_trans_add_item(tp, &buip->bui_item);
+ set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
+ return &buip->bui_item;
+}
+
+static const struct xfs_item_ops xfs_bui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_bui_item_size,
+ .iop_format = xfs_bui_item_format,
+ .iop_unpin = xfs_bui_item_unpin,
+ .iop_release = xfs_bui_item_release,
+ .iop_recover = xfs_bui_item_recover,
+ .iop_match = xfs_bui_item_match,
+ .iop_relog = xfs_bui_item_relog,
+};
+
+static inline void
+xfs_bui_copy_format(
+ struct xfs_bui_log_format *dst,
+ const struct xfs_bui_log_format *src)
+{
+ unsigned int i;
+
+ memcpy(dst, src, offsetof(struct xfs_bui_log_format, bui_extents));
+
+ for (i = 0; i < src->bui_nextents; i++)
+ memcpy(&dst->bui_extents[i], &src->bui_extents[i],
+ sizeof(struct xfs_map_extent));
+}
+
+/*
+ * This routine is called to create an in-core bmap update intent
+ * item from the bui format structure which was logged on disk.
+ * It allocates an in-core bui, copies the extents from the format
+ * structure into it, and adds the bui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_bui_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_bui_log_item *buip;
+ struct xfs_bui_log_format *bui_formatp;
+ size_t len;
+
+ bui_formatp = item->ri_buf[0].i_addr;
+
+ if (item->ri_buf[0].i_len < xfs_bui_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ len = xfs_bui_log_format_sizeof(bui_formatp->bui_nextents);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ buip = xfs_bui_init(mp);
+ xfs_bui_copy_format(&buip->bui_format, bui_formatp);
+ atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
+ /*
+ * Insert the intent into the AIL directly and drop one reference so
+ * that finishing or canceling the work will drop the other.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &buip->bui_item, lsn);
+ xfs_bui_release(buip);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_bui_item_ops = {
+ .item_type = XFS_LI_BUI,
+ .commit_pass2 = xlog_recover_bui_commit_pass2,
+};
+
+/*
+ * This routine is called when a BUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding BUI if it
+ * was still in the log. To do this it searches the AIL for the BUI with an id
+ * equal to that in the BUD format structure. If we find it we drop the BUD
+ * reference, which removes the BUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_bud_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_bud_log_format *bud_formatp;
+
+ bud_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_BUI, bud_formatp->bud_bui_id);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_bud_item_ops = {
+ .item_type = XFS_LI_BUD,
+ .commit_pass2 = xlog_recover_bud_commit_pass2,
+};
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
new file mode 100644
index 000000000..3fafd3881
--- /dev/null
+++ b/fs/xfs/xfs_bmap_item.h
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_BMAP_ITEM_H__
+#define __XFS_BMAP_ITEM_H__
+
+/*
+ * There are (currently) two pairs of bmap btree redo item types: map & unmap.
+ * The common abbreviations for these are BUI (bmap update intent) and BUD
+ * (bmap update done). The redo item type is encoded in the flags field of
+ * each xfs_map_extent.
+ *
+ * *I items should be recorded in the *first* of a series of rolled
+ * transactions, and the *D items should be recorded in the same transaction
+ * that records the associated bmbt updates.
+ *
+ * Should the system crash after the commit of the first transaction but
+ * before the commit of the final transaction in a series, log recovery will
+ * use the redo information recorded by the intent items to replay the
+ * bmbt metadata updates in the non-first transaction.
+ */
+
+/* kernel only BUI/BUD definitions */
+
+struct xfs_mount;
+struct kmem_cache;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define XFS_BUI_MAX_FAST_EXTENTS 1
+
+/*
+ * This is the "bmap update intent" log item. It is used to log the fact that
+ * some file block mappings need to change. It is used in conjunction with the
+ * "bmap update done" log item described below.
+ *
+ * These log items follow the same rules as struct xfs_efi_log_item; see the
+ * comments about that structure (in xfs_extfree_item.h) for more details.
+ */
+struct xfs_bui_log_item {
+ struct xfs_log_item bui_item;
+ atomic_t bui_refcount;
+ atomic_t bui_next_extent;
+ struct xfs_bui_log_format bui_format;
+};
+
+static inline size_t
+xfs_bui_log_item_sizeof(
+ unsigned int nr)
+{
+ return offsetof(struct xfs_bui_log_item, bui_format) +
+ xfs_bui_log_format_sizeof(nr);
+}
+
+/*
+ * This is the "bmap update done" log item. It is used to log the fact that
+ * some bmbt updates mentioned in an earlier bui item have been performed.
+ */
+struct xfs_bud_log_item {
+ struct xfs_log_item bud_item;
+ struct xfs_bui_log_item *bud_buip;
+ struct xfs_bud_log_format bud_format;
+};
+
+extern struct kmem_cache *xfs_bui_cache;
+extern struct kmem_cache *xfs_bud_cache;
+
+#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
new file mode 100644
index 000000000..04d0c2bff
--- /dev/null
+++ b/fs/xfs/xfs_bmap_util.c
@@ -0,0 +1,1856 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_iomap.h"
+#include "xfs_reflink.h"
+
+/* Kernel only BMAP related definitions and functions */
+
+/*
+ * Convert the given file system block to a disk block. We have to treat it
+ * differently based on whether the file is a real time file or not, because the
+ * bmap code does.
+ */
+xfs_daddr_t
+xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
+{
+ if (XFS_IS_REALTIME_INODE(ip))
+ return XFS_FSB_TO_BB(ip->i_mount, fsb);
+ return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
+}
+
+/*
+ * Routine to zero an extent on disk allocated to the specific inode.
+ *
+ * The VFS functions take a linearised filesystem block offset, so we have to
+ * convert the sparse xfs fsb to the right format first.
+ * VFS types are real funky, too.
+ */
+int
+xfs_zero_extent(
+ struct xfs_inode *ip,
+ xfs_fsblock_t start_fsb,
+ xfs_off_t count_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
+ sector_t block = XFS_BB_TO_FSBT(mp, sector);
+
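+ /* blkdev_issue_zeroout() works in 512-byte sectors, hence the shifts. */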
+ return blkdev_issue_zeroout(target->bt_bdev,
+ block << (mp->m_super->s_blocksize_bits - 9),
+ count_fsb << (mp->m_super->s_blocksize_bits - 9),
+ GFP_NOFS, 0);
+}
+
+#ifdef CONFIG_XFS_RT
+int
+xfs_bmap_rtalloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ xfs_fileoff_t orig_offset = ap->offset;
+ xfs_rtblock_t rtb;
+ xfs_extlen_t prod = 0; /* product factor for allocators */
+ xfs_extlen_t mod = 0; /* offset remainder vs. alignment */
+ xfs_extlen_t ralen = 0; /* realtime allocation length */
+ xfs_extlen_t align; /* minimum allocation alignment */
+ xfs_extlen_t orig_length = ap->length;
+ xfs_extlen_t minlen = mp->m_sb.sb_rextsize;
+ xfs_extlen_t raminlen;
+ bool rtlocked = false;
+ bool ignore_locality = false;
+ int error;
+
+ align = xfs_get_extsz_hint(ap->ip);
+retry:
+ prod = align / mp->m_sb.sb_rextsize;
+ error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+ align, 1, ap->eof, 0,
+ ap->conv, &ap->offset, &ap->length);
+ if (error)
+ return error;
+ ASSERT(ap->length);
+ ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
+
+ /*
+ * If we shifted the file offset downward to satisfy an extent size
+ * hint, increase minlen by that amount so that the allocator won't
+ * give us an allocation that's too short to cover at least one of the
+ * blocks that the caller asked for.
+ */
+ if (ap->offset != orig_offset)
+ minlen += orig_offset - ap->offset;
+
+ /*
+ * If the offset & length are not perfectly aligned
+ * then kill prod, it will just get us in trouble.
+ */
+ div_u64_rem(ap->offset, align, &mod);
+ if (mod || ap->length % align)
+ prod = 1;
+ /*
+ * Set ralen to be the actual requested length in rtextents.
+ */
+ ralen = ap->length / mp->m_sb.sb_rextsize;
+ /*
+ * If the old value was close enough to XFS_MAX_BMBT_EXTLEN that
+ * we rounded up to it, cut it back so it's valid again.
+ * Note that if it's a really large request (bigger than
+ * XFS_MAX_BMBT_EXTLEN), we don't hear about that number, and can't
+ * adjust the starting point to match it.
+ */
+ if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN)
+ ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize;
+
+ /*
+ * Lock out modifications to both the RT bitmap and summary inodes
+ */
+ if (!rtlocked) {
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
+ xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
+ xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ rtlocked = true;
+ }
+
+ /*
+ * If it's an allocation to an empty file at offset 0,
+ * pick an extent that will space things out in the rt area.
+ */
+ if (ap->eof && ap->offset == 0) {
+ xfs_rtblock_t rtx; /* realtime extent no */
+
+ error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
+ if (error)
+ return error;
+ ap->blkno = rtx * mp->m_sb.sb_rextsize;
+ } else {
+ ap->blkno = 0;
+ }
+
+ xfs_bmap_adjacent(ap);
+
+ /*
+ * Realtime allocation, done through xfs_rtallocate_extent.
+ */
+ if (ignore_locality)
+ ap->blkno = 0;
+ else
+ do_div(ap->blkno, mp->m_sb.sb_rextsize);
+ rtb = ap->blkno;
+ ap->length = ralen;
+ raminlen = max_t(xfs_extlen_t, 1, minlen / mp->m_sb.sb_rextsize);
+ error = xfs_rtallocate_extent(ap->tp, ap->blkno, raminlen, ap->length,
+ &ralen, ap->wasdel, prod, &rtb);
+ if (error)
+ return error;
+
+ if (rtb != NULLRTBLOCK) {
+ ap->blkno = rtb * mp->m_sb.sb_rextsize;
+ ap->length = ralen * mp->m_sb.sb_rextsize;
+ ap->ip->i_nblocks += ap->length;
+ xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+ if (ap->wasdel)
+ ap->ip->i_delayed_blks -= ap->length;
+ /*
+ * Adjust the disk quota also. This was reserved
+ * earlier.
+ */
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+ ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
+ XFS_TRANS_DQ_RTBCOUNT, ap->length);
+ return 0;
+ }
+
+ if (align > mp->m_sb.sb_rextsize) {
+ /*
+ * We previously enlarged the request length to try to satisfy
+ * an extent size hint. The allocator didn't return anything,
+ * so reset the parameters to the original values and try again
+ * without alignment criteria.
+ */
+ ap->offset = orig_offset;
+ ap->length = orig_length;
+ minlen = align = mp->m_sb.sb_rextsize;
+ goto retry;
+ }
+
+ if (!ignore_locality && ap->blkno != 0) {
+ /*
+ * If we can't allocate near a specific rt extent, try again
+ * without locality criteria.
+ */
+ ignore_locality = true;
+ goto retry;
+ }
+
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ return 0;
+}
+#endif /* CONFIG_XFS_RT */
+
+/*
+ * Extent tree block counting routines.
+ */
+
+/*
+ * Count leaf blocks given a range of extent records. Delayed allocation
+ * extents are not counted towards the totals.
+ */
+xfs_extnum_t
+xfs_bmap_count_leaves(
+ struct xfs_ifork *ifp,
+ xfs_filblks_t *count)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ xfs_extnum_t numrecs = 0;
+
+ for_each_xfs_iext(ifp, &icur, &got) {
+ if (!isnullstartblock(got.br_startblock)) {
+ *count += got.br_blockcount;
+ numrecs++;
+ }
+ }
+
+ return numrecs;
+}
+
+/*
+ * Count fsblocks of the given fork. Delayed allocation extents are
+ * not counted towards the totals.
+ */
+int
+xfs_bmap_count_blocks(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_extnum_t *nextents,
+ xfs_filblks_t *count)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_cur *cur;
+ xfs_extlen_t btblocks = 0;
+ int error;
+
+ *nextents = 0;
+ *count = 0;
+
+ if (!ifp)
+ return 0;
+
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_BTREE:
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ error = xfs_btree_count_blocks(cur, &btblocks);
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ return error;
+
+ /*
+ * xfs_btree_count_blocks includes the root block contained in
+ * the inode fork in @btblocks, so subtract one because we're
+ * only interested in allocated disk blocks.
+ */
+ *count += btblocks - 1;
+
+ fallthrough;
+ case XFS_DINODE_FMT_EXTENTS:
+ *nextents = xfs_bmap_count_leaves(ifp, count);
+ break;
+ }
+
+ return 0;
+}
+
+static int
+xfs_getbmap_report_one(
+ struct xfs_inode *ip,
+ struct getbmapx *bmv,
+ struct kgetbmap *out,
+ int64_t bmv_end,
+ struct xfs_bmbt_irec *got)
+{
+ struct kgetbmap *p = out + bmv->bmv_entries;
+ bool shared = false;
+ int error;
+
+ error = xfs_reflink_trim_around_shared(ip, got, &shared);
+ if (error)
+ return error;
+
+ if (isnullstartblock(got->br_startblock) ||
+ got->br_startblock == DELAYSTARTBLOCK) {
+ /*
+ * Delalloc extents that start beyond EOF can occur due to
+ * speculative EOF allocation when the delalloc extent is larger
+ * than the largest freespace extent at conversion time. These
+ * extents cannot be converted by data writeback, so can exist
+ * here even if we are not supposed to be finding delalloc
+ * extents.
+ */
+ if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip)))
+ ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0);
+
+ p->bmv_oflags |= BMV_OF_DELALLOC;
+ p->bmv_block = -2;
+ } else {
+ p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock);
+ }
+
+ if (got->br_state == XFS_EXT_UNWRITTEN &&
+ (bmv->bmv_iflags & BMV_IF_PREALLOC))
+ p->bmv_oflags |= BMV_OF_PREALLOC;
+
+ if (shared)
+ p->bmv_oflags |= BMV_OF_SHARED;
+
+ p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff);
+ p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount);
+
+ bmv->bmv_offset = p->bmv_offset + p->bmv_length;
+ bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
+ bmv->bmv_entries++;
+ return 0;
+}
+
+static void
+xfs_getbmap_report_hole(
+ struct xfs_inode *ip,
+ struct getbmapx *bmv,
+ struct kgetbmap *out,
+ int64_t bmv_end,
+ xfs_fileoff_t bno,
+ xfs_fileoff_t end)
+{
+ struct kgetbmap *p = out + bmv->bmv_entries;
+
+ if (bmv->bmv_iflags & BMV_IF_NO_HOLES)
+ return;
+
+ p->bmv_block = -1;
+ p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno);
+ p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno);
+
+ bmv->bmv_offset = p->bmv_offset + p->bmv_length;
+ bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
+ bmv->bmv_entries++;
+}
+
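+/*
+ * The output is full once the requested length has been mapped or there is no
+ * room left for another entry in the caller's array.
+ */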
+static inline bool
+xfs_getbmap_full(
+ struct getbmapx *bmv)
+{
+ return bmv->bmv_length == 0 || bmv->bmv_entries >= bmv->bmv_count - 1;
+}
+
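+/*
+ * Advance the record past the piece that was just reported so shared and
+ * unshared parts of one mapping are returned as separate entries.
+ */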
+static bool
+xfs_getbmap_next_rec(
+ struct xfs_bmbt_irec *rec,
+ xfs_fileoff_t total_end)
+{
+ xfs_fileoff_t end = rec->br_startoff + rec->br_blockcount;
+
+ if (end == total_end)
+ return false;
+
+ rec->br_startoff += rec->br_blockcount;
+ if (!isnullstartblock(rec->br_startblock) &&
+ rec->br_startblock != DELAYSTARTBLOCK)
+ rec->br_startblock += rec->br_blockcount;
+ rec->br_blockcount = total_end - end;
+ return true;
+}
+
+/*
+ * Get inode's extents as described in bmv, and format for output.
+ * Calls formatter to fill the user's buffer until all extents
+ * are mapped, until the passed-in bmv->bmv_count slots have
+ * been filled, or until the formatter short-circuits the loop,
+ * if it is tracking filled-in extents on its own.
+ */
+int /* error code */
+xfs_getbmap(
+ struct xfs_inode *ip,
+ struct getbmapx *bmv, /* user bmap structure */
+ struct kgetbmap *out)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int iflags = bmv->bmv_iflags;
+ int whichfork, lock, error = 0;
+ int64_t bmv_end, max_len;
+ xfs_fileoff_t bno, first_bno;
+ struct xfs_ifork *ifp;
+ struct xfs_bmbt_irec got, rec;
+ xfs_filblks_t len;
+ struct xfs_iext_cursor icur;
+
+ if (bmv->bmv_iflags & ~BMV_IF_VALID)
+ return -EINVAL;
+#ifndef DEBUG
+ /* Only allow CoW fork queries if we're debugging. */
+ if (iflags & BMV_IF_COWFORK)
+ return -EINVAL;
+#endif
+ if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
+ return -EINVAL;
+
+ if (bmv->bmv_length < -1)
+ return -EINVAL;
+ bmv->bmv_entries = 0;
+ if (bmv->bmv_length == 0)
+ return 0;
+
+ if (iflags & BMV_IF_ATTRFORK)
+ whichfork = XFS_ATTR_FORK;
+ else if (iflags & BMV_IF_COWFORK)
+ whichfork = XFS_COW_FORK;
+ else
+ whichfork = XFS_DATA_FORK;
+
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ switch (whichfork) {
+ case XFS_ATTR_FORK:
+ lock = xfs_ilock_attr_map_shared(ip);
+ if (!xfs_inode_has_attr_fork(ip))
+ goto out_unlock_ilock;
+
+ max_len = 1LL << 32;
+ break;
+ case XFS_COW_FORK:
+ lock = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, lock);
+
+ /* No CoW fork? Just return */
+ if (!xfs_ifork_ptr(ip, whichfork))
+ goto out_unlock_ilock;
+
+ if (xfs_get_cowextsz_hint(ip))
+ max_len = mp->m_super->s_maxbytes;
+ else
+ max_len = XFS_ISIZE(ip);
+ break;
+ case XFS_DATA_FORK:
+ if (!(iflags & BMV_IF_DELALLOC) &&
+ (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_disk_size)) {
+ error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ if (error)
+ goto out_unlock_iolock;
+
+ /*
+ * Even after flushing the inode, there can still be
+ * delalloc blocks on the inode beyond EOF due to
+ * speculative preallocation. These are not removed
+ * until the release function is called or the inode
+ * is inactivated. Hence we cannot assert here that
+ * ip->i_delayed_blks == 0.
+ */
+ }
+
+ if (xfs_get_extsz_hint(ip) ||
+ (ip->i_diflags &
+ (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))
+ max_len = mp->m_super->s_maxbytes;
+ else
+ max_len = XFS_ISIZE(ip);
+
+ lock = xfs_ilock_data_map_shared(ip);
+ break;
+ }
+
+ ifp = xfs_ifork_ptr(ip, whichfork);
+
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ /* Local format inode forks report no extents. */
+ goto out_unlock_ilock;
+ default:
+ error = -EINVAL;
+ goto out_unlock_ilock;
+ }
+
+ if (bmv->bmv_length == -1) {
+ max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len));
+ bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset);
+ }
+
+ bmv_end = bmv->bmv_offset + bmv->bmv_length;
+
+ first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
+ len = XFS_BB_TO_FSB(mp, bmv->bmv_length);
+
+ error = xfs_iread_extents(NULL, ip, whichfork);
+ if (error)
+ goto out_unlock_ilock;
+
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
+ /*
+ * Report a whole-file hole if the delalloc flag is set to
+ * stay compatible with the old implementation.
+ */
+ if (iflags & BMV_IF_DELALLOC)
+ xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
+ XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
+ goto out_unlock_ilock;
+ }
+
+ while (!xfs_getbmap_full(bmv)) {
+ xfs_trim_extent(&got, first_bno, len);
+
+ /*
+ * Report an entry for a hole if this extent doesn't directly
+ * follow the previous one.
+ */
+ if (got.br_startoff > bno) {
+ xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
+ got.br_startoff);
+ if (xfs_getbmap_full(bmv))
+ break;
+ }
+
+ /*
+ * In order to report shared extents accurately, we report each
+ * distinct shared / unshared part of a single bmbt record with
+ * an individual getbmapx record.
+ */
+ bno = got.br_startoff + got.br_blockcount;
+ rec = got;
+ do {
+ error = xfs_getbmap_report_one(ip, bmv, out, bmv_end,
+ &rec);
+ if (error || xfs_getbmap_full(bmv))
+ goto out_unlock_ilock;
+ } while (xfs_getbmap_next_rec(&rec, bno));
+
+ if (!xfs_iext_next_extent(ifp, &icur, &got)) {
+ xfs_fileoff_t end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+
+ out[bmv->bmv_entries - 1].bmv_oflags |= BMV_OF_LAST;
+
+ if (whichfork != XFS_ATTR_FORK && bno < end &&
+ !xfs_getbmap_full(bmv)) {
+ xfs_getbmap_report_hole(ip, bmv, out, bmv_end,
+ bno, end);
+ }
+ break;
+ }
+
+ if (bno >= first_bno + len)
+ break;
+ }
+
+out_unlock_ilock:
+ xfs_iunlock(ip, lock);
+out_unlock_iolock:
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return error;
+}
+
+/*
+ * Dead simple method of punching delayed allocation blocks from a range in
+ * the inode. This will always punch out both the start and end blocks, even
+ * if the ranges only partially overlap them, so it is up to the caller to
+ * ensure that partial blocks are not passed in.
+ */
+int
+xfs_bmap_punch_delalloc_range(
+ struct xfs_inode *ip,
+ xfs_fileoff_t start_fsb,
+ xfs_fileoff_t length)
+{
+ struct xfs_ifork *ifp = &ip->i_df;
+ xfs_fileoff_t end_fsb = start_fsb + length;
+ struct xfs_bmbt_irec got, del;
+ struct xfs_iext_cursor icur;
+ int error = 0;
+
+ ASSERT(!xfs_need_iread_extents(ifp));
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
+ goto out_unlock;
+
+ while (got.br_startoff + got.br_blockcount > start_fsb) {
+ del = got;
+ xfs_trim_extent(&del, start_fsb, length);
+
+ /*
+ * A delete can push the cursor forward. Step back to the
+ * previous extent on non-delalloc or extents outside the
+ * target range.
+ */
+ if (!del.br_blockcount ||
+ !isnullstartblock(del.br_startblock)) {
+ if (!xfs_iext_prev_extent(ifp, &icur, &got))
+ break;
+ continue;
+ }
+
+ error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur,
+ &got, &del);
+ if (error || !xfs_iext_get_extent(ifp, &icur, &got))
+ break;
+ }
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Test whether it is appropriate to check an inode for and free post EOF
+ * blocks. The 'force' parameter determines whether we should also consider
+ * regular files that are marked preallocated or append-only.
+ */
+bool
+xfs_can_free_eofblocks(
+ struct xfs_inode *ip,
+ bool force)
+{
+ struct xfs_bmbt_irec imap;
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t end_fsb;
+ xfs_fileoff_t last_fsb;
+ int nimaps = 1;
+ int error;
+
+ /*
+ * Caller must either hold the exclusive io lock or be inactivating
+ * the inode, which guarantees there are no other users of the inode.
+ */
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL) ||
+ (VFS_I(ip)->i_state & I_FREEING));
+
+ /* prealloc/delalloc exists only on regular files */
+ if (!S_ISREG(VFS_I(ip)->i_mode))
+ return false;
+
+ /*
+ * Zero sized files with no cached pages and no delalloc blocks will not
+ * have speculative prealloc/delalloc blocks to remove.
+ */
+ if (VFS_I(ip)->i_size == 0 &&
+ VFS_I(ip)->i_mapping->nrpages == 0 &&
+ ip->i_delayed_blks == 0)
+ return false;
+
+ /* If we haven't read in the extent list, then don't do it now. */
+ if (xfs_need_iread_extents(&ip->i_df))
+ return false;
+
+ /*
+ * Do not free real preallocated or append-only files unless the file
+ * has delalloc blocks and we are forced to remove them.
+ */
+ if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
+ if (!force || ip->i_delayed_blks == 0)
+ return false;
+
+ /*
+ * Do not try to free post-EOF blocks if EOF is beyond the end of the
+ * range supported by the page cache, because the truncation will loop
+ * forever.
+ */
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
+ if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1)
+ end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize);
+ last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+ if (last_fsb <= end_fsb)
+ return false;
+
+ /*
+ * Look up the mapping for the first block past EOF. If we can't find
+ * it, there's nothing to free.
+ */
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ error = xfs_bmapi_read(ip, end_fsb, last_fsb - end_fsb, &imap, &nimaps,
+ 0);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (error || nimaps == 0)
+ return false;
+
+ /*
+ * If there's a real mapping there or there are delayed allocation
+ * reservations, then we have post-EOF blocks to try to free.
+ */
+ return imap.br_startblock != HOLESTARTBLOCK || ip->i_delayed_blks;
+}
+
+/*
+ * This is called to free any blocks beyond eof. The caller must hold
+ * IOLOCK_EXCL unless we are in the inode reclaim path and have the only
+ * reference to the inode.
+ */
+int
+xfs_free_eofblocks(
+ struct xfs_inode *ip)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ /* Attach the dquots to the inode up front. */
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
+ /* Wait on dio to ensure i_size has settled. */
+ inode_dio_wait(VFS_I(ip));
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ if (error) {
+ ASSERT(xfs_is_shutdown(mp));
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /*
+ * Do not update the on-disk file size. If we update the on-disk file
+ * size and then the system crashes before the contents of the file are
+ * flushed to disk then the files may be full of holes (i.e. the NULL
+ * files bug).
+ */
+ error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK,
+ XFS_ISIZE(ip), XFS_BMAPI_NODISCARD);
+ if (error)
+ goto err_cancel;
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_unlock;
+
+ xfs_inode_clear_eofblocks_tag(ip);
+ goto out_unlock;
+
+err_cancel:
+ /*
+ * If we get an error at this point we simply don't
+ * bother truncating the file.
+ */
+ xfs_trans_cancel(tp);
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
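+/*
+ * Preallocate blocks for the byte range [offset, offset + len) of the file.
+ * The work is split across a loop of transactions, each reserving at most
+ * XFS_MAX_BMBT_EXTLEN * nimaps blocks, and the inode is flagged with
+ * XFS_DIFLAG_PREALLOC once blocks have been allocated.
+ */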
+int
+xfs_alloc_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_off_t count;
+ xfs_filblks_t allocated_fsb;
+ xfs_filblks_t allocatesize_fsb;
+ xfs_extlen_t extsz, temp;
+ xfs_fileoff_t startoffset_fsb;
+ xfs_fileoff_t endoffset_fsb;
+ int nimaps;
+ int rt;
+ xfs_trans_t *tp;
+ xfs_bmbt_irec_t imaps[1], *imapp;
+ int error;
+
+ trace_xfs_alloc_file_space(ip);
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
+ if (len <= 0)
+ return -EINVAL;
+
+ rt = XFS_IS_REALTIME_INODE(ip);
+ extsz = xfs_get_extsz_hint(ip);
+
+ count = len;
+ imapp = &imaps[0];
+ nimaps = 1;
+ startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
+ endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
+ allocatesize_fsb = endoffset_fsb - startoffset_fsb;
+
+ /*
+ * Allocate file space until done or until there is an error
+ */
+ while (allocatesize_fsb && !error) {
+ xfs_fileoff_t s, e;
+ unsigned int dblocks, rblocks, resblks;
+
+ /*
+ * Determine space reservations for data/realtime.
+ */
+ if (unlikely(extsz)) {
+ s = startoffset_fsb;
+ do_div(s, extsz);
+ s *= extsz;
+ e = startoffset_fsb + allocatesize_fsb;
+ div_u64_rem(startoffset_fsb, extsz, &temp);
+ if (temp)
+ e += temp;
+ div_u64_rem(e, extsz, &temp);
+ if (temp)
+ e += extsz - temp;
+ } else {
+ s = 0;
+ e = allocatesize_fsb;
+ }
+
+ /*
+ * The transaction reservation is limited to a 32-bit block
+ * count, hence we need to limit the number of blocks we are
+ * trying to reserve to avoid an overflow. We can't allocate
+ * more than @nimaps extents, and an extent is limited on disk
+ * to XFS_BMBT_MAX_EXTLEN (21 bits), so use that to enforce the
+ * limit.
+ */
+ resblks = min_t(xfs_fileoff_t, (e - s),
+ (XFS_MAX_BMBT_EXTLEN * nimaps));
+ if (unlikely(rt)) {
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ rblocks = resblks;
+ } else {
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
+ rblocks = 0;
+ }
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ dblocks, rblocks, false, &tp);
+ if (error)
+ break;
+
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto error;
+
+ error = xfs_bmapi_write(tp, ip, startoffset_fsb,
+ allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
+ &nimaps);
+ if (error)
+ goto error;
+
+ ip->i_diflags |= XFS_DIFLAG_PREALLOC;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ break;
+
+ allocated_fsb = imapp->br_blockcount;
+
+ if (nimaps == 0) {
+ error = -ENOSPC;
+ break;
+ }
+
+ startoffset_fsb += allocated_fsb;
+ allocatesize_fsb -= allocated_fsb;
+ }
+
+ return error;
+
+error:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
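+/*
+ * Unmap up to two extents of the given range in a single transaction.
+ * *done is set once the whole range has been unmapped, so
+ * xfs_free_file_space() calls this repeatedly until it reports completion.
+ */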
+static int
+xfs_unmap_extent(
+ struct xfs_inode *ip,
+ xfs_fileoff_t startoffset_fsb,
+ xfs_filblks_t len_fsb,
+ int *done)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ int error;
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
+ false, &tp);
+ if (error)
+ return error;
+
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, done);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_trans_commit(tp);
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+}
+
+/* Caller must first wait for the completion of any pending DIOs if required. */
+int
+xfs_flush_unmap_range(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ xfs_off_t rounding, start, end;
+ int error;
+
+ rounding = max_t(xfs_off_t, mp->m_sb.sb_blocksize, PAGE_SIZE);
+ start = round_down(offset, rounding);
+ end = round_up(offset + len, rounding) - 1;
+
+ error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (error)
+ return error;
+ truncate_pagecache_range(inode, start, end);
+ return 0;
+}
+
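+/*
+ * Punch out the blocks backing the byte range [offset, offset + len),
+ * rounding to full realtime extents where required, and zero any partial
+ * blocks left at either end of the range.
+ */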
+int
+xfs_free_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t startoffset_fsb;
+ xfs_fileoff_t endoffset_fsb;
+ int done = 0, error;
+
+ trace_xfs_free_file_space(ip);
+
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
+ if (len <= 0) /* if nothing being freed */
+ return 0;
+
+ startoffset_fsb = XFS_B_TO_FSB(mp, offset);
+ endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
+
+ /* We can only free complete realtime extents. */
+ if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) {
+ startoffset_fsb = roundup_64(startoffset_fsb,
+ mp->m_sb.sb_rextsize);
+ endoffset_fsb = rounddown_64(endoffset_fsb,
+ mp->m_sb.sb_rextsize);
+ }
+
+ /*
+ * Need to zero the stuff we're not freeing, on disk.
+ */
+ if (endoffset_fsb > startoffset_fsb) {
+ while (!done) {
+ error = xfs_unmap_extent(ip, startoffset_fsb,
+ endoffset_fsb - startoffset_fsb, &done);
+ if (error)
+ return error;
+ }
+ }
+
+ /*
+ * Now that we've unmapped all full blocks we'll have to zero out any
+ * partial block at the beginning and/or end. xfs_zero_range is smart
+ * enough to skip any holes, including those we just created, but we
+ * must take care not to zero beyond EOF and enlarge i_size.
+ */
+ if (offset >= XFS_ISIZE(ip))
+ return 0;
+ if (offset + len > XFS_ISIZE(ip))
+ len = XFS_ISIZE(ip) - offset;
+ error = xfs_zero_range(ip, offset, len, NULL);
+ if (error)
+ return error;
+
+ /*
+ * If we zeroed right up to EOF and EOF straddles a page boundary we
+ * must make sure that the post-EOF area is also zeroed because the
+ * page could be mmap'd and xfs_zero_range doesn't do that for us.
+ * Writeback of the eof page will do this, albeit clumsily.
+ */
+ if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
+ error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ round_down(offset + len, PAGE_SIZE), LLONG_MAX);
+ }
+
+ return error;
+}
+
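+/*
+ * Common setup for collapse/insert range: trim post-EOF preallocations,
+ * write back and invalidate the page cache from just before the shift
+ * boundary to EOF, and cancel any COW fork extents over that range.
+ */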
+static int
+xfs_prepare_shift(
+ struct xfs_inode *ip,
+ loff_t offset)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ /*
+ * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
+ * into the accessible region of the file.
+ */
+ if (xfs_can_free_eofblocks(ip, true)) {
+ error = xfs_free_eofblocks(ip);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Shift operations must stabilize the start block offset boundary along
+ * with the full range of the operation. If we don't, a COW writeback
+ * completion could race with an insert, front merge with the start
+ * extent (after split) during the shift and corrupt the file. Start
+ * with the block just prior to the start to stabilize the boundary.
+ */
+ offset = round_down(offset, mp->m_sb.sb_blocksize);
+ if (offset)
+ offset -= mp->m_sb.sb_blocksize;
+
+ /*
+ * Writeback and invalidate cache for the remainder of the file as we're
+ * about to shift down every extent from offset to EOF.
+ */
+ error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
+ if (error)
+ return error;
+
+ /*
+ * Clean out anything hanging around in the cow fork now that
+ * we've flushed all the dirty data out to disk to avoid having
+ * CoW extents at the wrong offsets.
+ */
+ if (xfs_inode_has_cow_data(ip)) {
+ error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF,
+ true);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * xfs_collapse_file_space()
+ * This routine frees disk space and shifts extents for the given file.
+ * The first thing we do is free the data blocks in the specified range
+ * by calling xfs_free_file_space(), which also syncs dirty data and
+ * invalidates the page cache over the region on which the collapse
+ * range is working. We then shift the extent records to the left to
+ * cover the hole.
+ * RETURNS:
+ * 0 on success
+ * errno on error
+ */
+int
+xfs_collapse_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+ xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len);
+ xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
+ bool done = false;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+
+ trace_xfs_collapse_file_space(ip);
+
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ return error;
+
+ error = xfs_prepare_shift(ip, offset);
+ if (error)
+ return error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ while (!done) {
+ error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb,
+ &done);
+ if (error)
+ goto out_trans_cancel;
+ if (done)
+ break;
+
+ /* finish any deferred frees and roll the transaction */
+ error = xfs_defer_finish(&tp);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * xfs_insert_file_space()
+ * This routine creates a hole by shifting extents for the given file.
+ * The first thing we do is sync dirty data and invalidate the page cache
+ * over the region on which the insert range is working. We then split the
+ * extent at the given offset into two extents by calling
+ * xfs_bmap_split_extent, and shift all extent records lying between
+ * [offset, last allocated extent] to the right to make room for the hole.
+ * RETURNS:
+ * 0 on success
+ * errno on error
+ */
+int
+xfs_insert_file_space(
+ struct xfs_inode *ip,
+ loff_t offset,
+ loff_t len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+ xfs_fileoff_t stop_fsb = XFS_B_TO_FSB(mp, offset);
+ xfs_fileoff_t next_fsb = NULLFSBLOCK;
+ xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
+ bool done = false;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+
+ trace_xfs_insert_file_space(ip);
+
+ error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb);
+ if (error)
+ return error;
+
+ error = xfs_prepare_shift(ip, offset);
+ if (error)
+ return error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * The extent shifting code works on extent granularity. So, if stop_fsb
+ * is not the starting block of an extent, we need to split the extent at
+ * stop_fsb.
+ */
+ error = xfs_bmap_split_extent(tp, ip, stop_fsb);
+ if (error)
+ goto out_trans_cancel;
+
+ do {
+ error = xfs_defer_finish(&tp);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb,
+ &done, stop_fsb);
+ if (error)
+ goto out_trans_cancel;
+ } while (!done);
+
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * We need to check that the format of the data fork in the temporary inode is
+ * valid for the target inode before doing the swap. This is not a problem with
+ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
+ * data fork depending on the space the attribute fork is taking so we can get
+ * invalid formats on the target inode.
+ *
+ * E.g. target has space for 7 extents in extent format, temp inode only has
+ * space for 6. If we defragment down to 7 extents, then the tmp format is a
+ * btree, but when swapped it needs to be in extent format. Hence we can't just
+ * blindly swap data forks on attr2 filesystems.
+ *
+ * Note that we check the swap in both directions so that we don't end up with
+ * a corrupt temporary inode, either.
+ *
+ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
+ * inode will prevent this situation from occurring, so all we do here is
+ * reject and log the attempt. Basically, we are putting the responsibility on
+ * userspace to get this right.
+ */
+static int
+xfs_swap_extents_check_format(
+ struct xfs_inode *ip, /* target inode */
+ struct xfs_inode *tip) /* tmp inode */
+{
+ struct xfs_ifork *ifp = &ip->i_df;
+ struct xfs_ifork *tifp = &tip->i_df;
+
+ /* User/group/project quota ids must match if quotas are enforced. */
+ if (XFS_IS_QUOTA_ON(ip->i_mount) &&
+ (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) ||
+ !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) ||
+ ip->i_projid != tip->i_projid))
+ return -EINVAL;
+
+ /* Should never get a local format */
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
+ tifp->if_format == XFS_DINODE_FMT_LOCAL)
+ return -EINVAL;
+
+ /*
+ * If the target inode has fewer extents than the temporary inode,
+ * why did userspace call us?
+ */
+ if (ifp->if_nextents < tifp->if_nextents)
+ return -EINVAL;
+
+ /*
+ * If we have to use the (expensive) rmap swap method, we can
+ * handle any number of extents and any format.
+ */
+ if (xfs_has_rmapbt(ip->i_mount))
+ return 0;
+
+ /*
+ * If the target inode is in extent format and the temp inode is in btree
+ * format, then we will end up with the target inode in the wrong format,
+ * as we already know there are fewer extents in the temp inode.
+ */
+ if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ tifp->if_format == XFS_DINODE_FMT_BTREE)
+ return -EINVAL;
+
+ /* Check temp in extent form to max in target */
+ if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+ return -EINVAL;
+
+ /* Check target in extent form to max in temp */
+ if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+ return -EINVAL;
+
+ /*
+ * If we are in a btree format, check that the temp root block will fit
+ * in the target and that it has enough extents to be in btree format
+ * in the target.
+ *
+ * Note that we have to be careful to allow btree->extent conversions
+ * (a common defrag case) which will occur when the temp inode is in
+ * extent format...
+ */
+ if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
+ if (xfs_inode_has_attr_fork(ip) &&
+ XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip))
+ return -EINVAL;
+ if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+ return -EINVAL;
+ }
+
+ /* Reciprocal target->temp btree format checks */
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ if (xfs_inode_has_attr_fork(tip) &&
+ XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
+ return -EINVAL;
+ if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
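+/*
+ * Write back and invalidate the page cache of an inode taking part in an
+ * extent swap, then verify that no cached pages remain.
+ */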
+static int
+xfs_swap_extent_flush(
+ struct xfs_inode *ip)
+{
+ int error;
+
+ error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ if (error)
+ return error;
+ truncate_pagecache_range(VFS_I(ip), 0, -1);
+
+ /* Verify O_DIRECT for ftmp */
+ if (VFS_I(ip)->i_mapping->nrpages)
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * Move extents from one file to another, when rmap is enabled.
+ */
+STATIC int
+xfs_swap_extent_rmap(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ struct xfs_inode *tip)
+{
+ struct xfs_trans *tp = *tpp;
+ struct xfs_bmbt_irec irec;
+ struct xfs_bmbt_irec uirec;
+ struct xfs_bmbt_irec tirec;
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t end_fsb;
+ xfs_filblks_t count_fsb;
+ int error;
+ xfs_filblks_t ilen;
+ xfs_filblks_t rlen;
+ int nimaps;
+ uint64_t tip_flags2;
+
+ /*
+ * If the source file has shared blocks, we must flag the donor
+ * file as having shared blocks so that we get the shared-block
+ * rmap functions when we go to fix up the rmaps. The flags
+ * will be switched for real later.
+ */
+ tip_flags2 = tip->i_diflags2;
+ if (ip->i_diflags2 & XFS_DIFLAG2_REFLINK)
+ tip->i_diflags2 |= XFS_DIFLAG2_REFLINK;
+
+ offset_fsb = 0;
+ end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip)));
+ count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
+
+ while (count_fsb) {
+ /* Read extent from the donor file */
+ nimaps = 1;
+ error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec,
+ &nimaps, 0);
+ if (error)
+ goto out;
+ ASSERT(nimaps == 1);
+ ASSERT(tirec.br_startblock != DELAYSTARTBLOCK);
+
+ trace_xfs_swap_extent_rmap_remap(tip, &tirec);
+ ilen = tirec.br_blockcount;
+
+ /* Unmap the old blocks in the source file. */
+ while (tirec.br_blockcount) {
+ ASSERT(tp->t_firstblock == NULLFSBLOCK);
+ trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec);
+
+ /* Read extent from the source file */
+ nimaps = 1;
+ error = xfs_bmapi_read(ip, tirec.br_startoff,
+ tirec.br_blockcount, &irec,
+ &nimaps, 0);
+ if (error)
+ goto out;
+ ASSERT(nimaps == 1);
+ ASSERT(tirec.br_startoff == irec.br_startoff);
+ trace_xfs_swap_extent_rmap_remap_piece(ip, &irec);
+
+ /* Trim the extent. */
+ uirec = tirec;
+ uirec.br_blockcount = rlen = min_t(xfs_filblks_t,
+ tirec.br_blockcount,
+ irec.br_blockcount);
+ trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
+
+ if (xfs_bmap_is_real_extent(&uirec)) {
+ error = xfs_iext_count_may_overflow(ip,
+ XFS_DATA_FORK,
+ XFS_IEXT_SWAP_RMAP_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_SWAP_RMAP_CNT);
+ if (error)
+ goto out;
+ }
+
+ if (xfs_bmap_is_real_extent(&irec)) {
+ error = xfs_iext_count_may_overflow(tip,
+ XFS_DATA_FORK,
+ XFS_IEXT_SWAP_RMAP_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_SWAP_RMAP_CNT);
+ if (error)
+ goto out;
+ }
+
+ /* Remove the mapping from the donor file. */
+ xfs_bmap_unmap_extent(tp, tip, &uirec);
+
+ /* Remove the mapping from the source file. */
+ xfs_bmap_unmap_extent(tp, ip, &irec);
+
+ /* Map the donor file's blocks into the source file. */
+ xfs_bmap_map_extent(tp, ip, &uirec);
+
+ /* Map the source file's blocks into the donor file. */
+ xfs_bmap_map_extent(tp, tip, &irec);
+
+ error = xfs_defer_finish(tpp);
+ tp = *tpp;
+ if (error)
+ goto out;
+
+ tirec.br_startoff += rlen;
+ if (tirec.br_startblock != HOLESTARTBLOCK &&
+ tirec.br_startblock != DELAYSTARTBLOCK)
+ tirec.br_startblock += rlen;
+ tirec.br_blockcount -= rlen;
+ }
+
+ /* Roll on... */
+ count_fsb -= ilen;
+ offset_fsb += ilen;
+ }
+
+ tip->i_diflags2 = tip_flags2;
+ return 0;
+
+out:
+ trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_);
+ tip->i_diflags2 = tip_flags2;
+ return error;
+}
+
+/* Swap the extents of two files by swapping data forks. */
+STATIC int
+xfs_swap_extent_forks(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_inode *tip,
+ int *src_log_flags,
+ int *target_log_flags)
+{
+ xfs_filblks_t aforkblks = 0;
+ xfs_filblks_t taforkblks = 0;
+ xfs_extnum_t junk;
+ uint64_t tmp;
+ int error;
+
+ /*
+ * Count the number of extended attribute blocks
+ */
+ if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 &&
+ ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
+ error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk,
+ &aforkblks);
+ if (error)
+ return error;
+ }
+ if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 &&
+ tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
+ error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk,
+ &taforkblks);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Btree format (v3) inodes have the inode number stamped in the bmbt
+ * block headers. We can't start changing the bmbt blocks until the
+ * inode owner change is logged so recovery does the right thing in the
+ * event of a crash. Set the owner change log flags now and leave the
+ * bmbt scan as the last step.
+ */
+ if (xfs_has_v3inodes(ip->i_mount)) {
+ if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
+ (*target_log_flags) |= XFS_ILOG_DOWNER;
+ if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
+ (*src_log_flags) |= XFS_ILOG_DOWNER;
+ }
+
+ /*
+ * Swap the data forks of the inodes
+ */
+ swap(ip->i_df, tip->i_df);
+
+ /*
+ * Fix the on-disk inode values
+ */
+ tmp = (uint64_t)ip->i_nblocks;
+ ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks;
+ tip->i_nblocks = tmp + taforkblks - aforkblks;
+
+ /*
+ * The extents in the source inode could still contain speculative
+ * preallocation beyond EOF (e.g. the file is open but not modified
+ * while defrag is in progress). In that case, we need to copy over the
+ * number of delalloc blocks the data fork in the source inode is
+ * tracking beyond EOF so that when the fork is truncated away when the
+ * temporary inode is unlinked we don't underrun the i_delayed_blks
+ * counter on that inode.
+ */
+ ASSERT(tip->i_delayed_blks == 0);
+ tip->i_delayed_blks = ip->i_delayed_blks;
+ ip->i_delayed_blks = 0;
+
+ switch (ip->i_df.if_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ (*src_log_flags) |= XFS_ILOG_DEXT;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
+ (*src_log_flags & XFS_ILOG_DOWNER));
+ (*src_log_flags) |= XFS_ILOG_DBROOT;
+ break;
+ }
+
+ switch (tip->i_df.if_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ (*target_log_flags) |= XFS_ILOG_DEXT;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ (*target_log_flags) |= XFS_ILOG_DBROOT;
+ ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
+ (*target_log_flags & XFS_ILOG_DOWNER));
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * Fix up the owners of the bmbt blocks to refer to the current inode. The
+ * change owner scan attempts to order all modified buffers in the current
+ * transaction. In the event of ordered buffer failure, the offending buffer is
+ * physically logged as a fallback and the scan returns -EAGAIN. We must roll
+ * the transaction in this case to replenish the fallback log reservation and
+ * restart the scan. This process repeats until the scan completes.
+ */
+static int
+xfs_swap_change_owner(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ struct xfs_inode *tmpip)
+{
+ int error;
+ struct xfs_trans *tp = *tpp;
+
+ do {
+ error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
+ NULL);
+ /* success or fatal error */
+ if (error != -EAGAIN)
+ break;
+
+ error = xfs_trans_roll(tpp);
+ if (error)
+ break;
+ tp = *tpp;
+
+ /*
+ * Redirty both inodes so they can relog and keep the log tail
+ * moving forward.
+ */
+ xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_ijoin(tp, tmpip, 0);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
+ } while (true);
+
+ return error;
+}
+
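+/*
+ * Swap the data forks of the target inode @ip and the temporary donor
+ * inode @tip (and, where reflink is enabled, their CoW forks and reflink
+ * flags). This is the mechanism used by xfs_fsr for online defragmentation.
+ */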
+int
+xfs_swap_extents(
+ struct xfs_inode *ip, /* target inode */
+ struct xfs_inode *tip, /* tmp inode */
+ struct xfs_swapext *sxp)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ struct xfs_bstat *sbp = &sxp->sx_stat;
+ int src_log_flags, target_log_flags;
+ int error = 0;
+ uint64_t f;
+ int resblks = 0;
+ unsigned int flags = 0;
+
+ /*
+ * Lock the inodes against other IO, page faults and truncate to
+ * begin with. Then we can safely ensure the inodes are flushed and have
+ * no page cache. Once we have done this we can take the ilocks and
+ * do the rest of the checks.
+ */
+ lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
+ filemap_invalidate_lock_two(VFS_I(ip)->i_mapping,
+ VFS_I(tip)->i_mapping);
+
+ /* Verify that both files have the same format */
+ if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* Verify both files are either real-time or non-realtime */
+ if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ goto out_unlock;
+
+ error = xfs_qm_dqattach(tip);
+ if (error)
+ goto out_unlock;
+
+ error = xfs_swap_extent_flush(ip);
+ if (error)
+ goto out_unlock;
+ error = xfs_swap_extent_flush(tip);
+ if (error)
+ goto out_unlock;
+
+ if (xfs_inode_has_cow_data(tip)) {
+ error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
+ if (error)
+ goto out_unlock;
+ }
+
+ /*
+ * Extent "swapping" with rmap requires a permanent reservation and
+ * a block reservation because it's really just a remap operation
+ * performed with log redo items!
+ */
+ if (xfs_has_rmapbt(mp)) {
+ int w = XFS_DATA_FORK;
+ uint32_t ipnext = ip->i_df.if_nextents;
+ uint32_t tipnext = tip->i_df.if_nextents;
+
+ /*
+ * Conceptually this shouldn't affect the shape of either bmbt,
+ * but since we atomically move extents one by one, we reserve
+ * enough space to rebuild both trees.
+ */
+ resblks = XFS_SWAP_RMAP_SPACE_RES(mp, ipnext, w);
+ resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w);
+
+ /*
+ * If either inode straddles a bmapbt block allocation boundary,
+ * the rmapbt algorithm triggers repeated allocs and frees as
+ * extents are remapped. This can exhaust the block reservation
+ * prematurely and cause shutdown. Return freed blocks to the
+ * transaction reservation to counter this behavior.
+ */
+ flags |= XFS_TRANS_RES_FDBLKS;
+ }
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags,
+ &tp);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Lock and join the inodes to the transaction so that transaction commit
+ * or cancel will unlock the inodes from this point onwards.
+ */
+ xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_ijoin(tp, tip, 0);
+
+ /* Verify all data are being swapped */
+ if (sxp->sx_offset != 0 ||
+ sxp->sx_length != ip->i_disk_size ||
+ sxp->sx_length != tip->i_disk_size) {
+ error = -EFAULT;
+ goto out_trans_cancel;
+ }
+
+ trace_xfs_swap_extent_before(ip, 0);
+ trace_xfs_swap_extent_before(tip, 1);
+
+ /* check inode formats now that data is flushed */
+ error = xfs_swap_extents_check_format(ip, tip);
+ if (error) {
+ xfs_notice(mp,
+ "%s: inode 0x%llx format is incompatible for exchanging.",
+ __func__, ip->i_ino);
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Compare the current change & modify times with those
+ * passed in. If they differ, we abort this swap.
+ * This is the mechanism used to assure the calling
+ * process that the file was not changed out from
+ * under it.
+ */
+ if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+ (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+ (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+ (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+ error = -EBUSY;
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Note the trickiness in setting the log flags - we set the owner log
+ * flag on the opposite inode (i.e. the inode we are setting the new
+ * owner to be) because once we swap the forks and log that, log
+ * recovery is going to see the fork as owned by the swapped inode,
+ * not the pre-swapped inodes.
+ */
+ src_log_flags = XFS_ILOG_CORE;
+ target_log_flags = XFS_ILOG_CORE;
+
+ if (xfs_has_rmapbt(mp))
+ error = xfs_swap_extent_rmap(&tp, ip, tip);
+ else
+ error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags,
+ &target_log_flags);
+ if (error)
+ goto out_trans_cancel;
+
+ /* Do we have to swap reflink flags? */
+ if ((ip->i_diflags2 & XFS_DIFLAG2_REFLINK) ^
+ (tip->i_diflags2 & XFS_DIFLAG2_REFLINK)) {
+ f = ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
+ ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ ip->i_diflags2 |= tip->i_diflags2 & XFS_DIFLAG2_REFLINK;
+ tip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ tip->i_diflags2 |= f & XFS_DIFLAG2_REFLINK;
+ }
+
+ /* Swap the cow forks. */
+ if (xfs_has_reflink(mp)) {
+ ASSERT(!ip->i_cowfp ||
+ ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
+ ASSERT(!tip->i_cowfp ||
+ tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
+
+ swap(ip->i_cowfp, tip->i_cowfp);
+
+ if (ip->i_cowfp && ip->i_cowfp->if_bytes)
+ xfs_inode_set_cowblocks_tag(ip);
+ else
+ xfs_inode_clear_cowblocks_tag(ip);
+ if (tip->i_cowfp && tip->i_cowfp->if_bytes)
+ xfs_inode_set_cowblocks_tag(tip);
+ else
+ xfs_inode_clear_cowblocks_tag(tip);
+ }
+
+ xfs_trans_log_inode(tp, ip, src_log_flags);
+ xfs_trans_log_inode(tp, tip, target_log_flags);
+
+ /*
+ * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
+ * have inode number owner values in the bmbt blocks that still refer to
+ * the old inode. Scan each bmbt to fix up the owner values with the
+ * inode number of the current inode.
+ */
+ if (src_log_flags & XFS_ILOG_DOWNER) {
+ error = xfs_swap_change_owner(&tp, ip, tip);
+ if (error)
+ goto out_trans_cancel;
+ }
+ if (target_log_flags & XFS_ILOG_DOWNER) {
+ error = xfs_swap_change_owner(&tp, tip, ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * transaction goes to disk before returning to the user.
+ */
+ if (xfs_has_wsync(mp))
+ xfs_trans_set_sync(tp);
+
+ error = xfs_trans_commit(tp);
+
+ trace_xfs_swap_extent_after(ip, 0);
+ trace_xfs_swap_extent_after(tip, 1);
+
+out_unlock_ilock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(tip, XFS_ILOCK_EXCL);
+out_unlock:
+ filemap_invalidate_unlock_two(VFS_I(ip)->i_mapping,
+ VFS_I(tip)->i_mapping);
+ unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
+ return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ goto out_unlock_ilock;
+}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
new file mode 100644
index 000000000..24b37d211
--- /dev/null
+++ b/fs/xfs/xfs_bmap_util.h
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_BMAP_UTIL_H__
+#define __XFS_BMAP_UTIL_H__
+
+/* Kernel only BMAP related definitions and functions */
+
+struct xfs_bmbt_irec;
+struct xfs_extent_free_item;
+struct xfs_ifork;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_bmalloca;
+
+#ifdef CONFIG_XFS_RT
+int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
+#else /* !CONFIG_XFS_RT */
+/*
+ * An attempt to allocate RT extents when RT is disabled indicates corruption and
+ * should trigger a shutdown.
+ */
+static inline int
+xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
+{
+ return -EFSCORRUPTED;
+}
+#endif /* CONFIG_XFS_RT */
+
+int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
+ xfs_fileoff_t start_fsb, xfs_fileoff_t length);
+
+struct kgetbmap {
+ __s64 bmv_offset; /* file offset of segment in blocks */
+ __s64 bmv_block; /* starting block (64-bit daddr_t) */
+ __s64 bmv_length; /* length of segment, blocks */
+ __s32 bmv_oflags; /* output flags */
+};
+int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
+ struct kgetbmap *out);
+
+/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
+int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
+ struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
+ int rt, int eof, int delay, int convert,
+ xfs_fileoff_t *offp, xfs_extlen_t *lenp);
+void xfs_bmap_adjacent(struct xfs_bmalloca *ap);
+int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, struct xfs_bmbt_irec *rec,
+ int *is_empty);
+
+/* preallocation and hole punch interface */
+int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len);
+int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len);
+int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
+ xfs_off_t len);
+int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
+ xfs_off_t len);
+
+/* EOF block manipulation functions */
+bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
+int xfs_free_eofblocks(struct xfs_inode *ip);
+
+int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
+ struct xfs_swapext *sx);
+
+xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
+
+xfs_extnum_t xfs_bmap_count_leaves(struct xfs_ifork *ifp, xfs_filblks_t *count);
+int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, xfs_extnum_t *nextents,
+ xfs_filblks_t *count);
+
+int xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len);
+
+#endif /* __XFS_BMAP_UTIL_H__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
new file mode 100644
index 000000000..dde346450
--- /dev/null
+++ b/fs/xfs/xfs_buf.c
@@ -0,0 +1,2366 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include <linux/backing-dev.h>
+#include <linux/dax.h>
+
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_log_recover.h"
+#include "xfs_log_priv.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_ag.h"
+
+struct kmem_cache *xfs_buf_cache;
+
+/*
+ * Locking orders
+ *
+ * xfs_buf_ioacct_inc:
+ * xfs_buf_ioacct_dec:
+ * b_sema (caller holds)
+ * b_lock
+ *
+ * xfs_buf_stale:
+ * b_sema (caller holds)
+ * b_lock
+ * lru_lock
+ *
+ * xfs_buf_rele:
+ * b_lock
+ * pag_buf_lock
+ * lru_lock
+ *
+ * xfs_buftarg_drain_rele
+ * lru_lock
+ * b_lock (trylock due to inversion)
+ *
+ * xfs_buftarg_isolate
+ * lru_lock
+ * b_lock (trylock due to inversion)
+ */
+
+static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
+
+static inline int
+xfs_buf_submit(
+ struct xfs_buf *bp)
+{
+ return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
+}
+
+static inline int
+xfs_buf_is_vmapped(
+ struct xfs_buf *bp)
+{
+ /*
+ * Return true if the buffer is vmapped.
+ *
+ * b_addr is null if the buffer is not mapped, but the code is clever
+ * enough to know it doesn't have to map a single page, so the check has
+ * to be both for b_addr and bp->b_page_count > 1.
+ */
+ return bp->b_addr && bp->b_page_count > 1;
+}
+
+static inline int
+xfs_buf_vmap_len(
+ struct xfs_buf *bp)
+{
+ return (bp->b_page_count * PAGE_SIZE);
+}
+
+/*
+ * Bump the I/O in flight count on the buftarg if we haven't yet done so for
+ * this buffer. The count is incremented once per buffer (per hold cycle)
+ * because the corresponding decrement is deferred to buffer release. Buffers
+ * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
+ * tracking adds unnecessary overhead. This is used for synchronization purposes
+ * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
+ * in-flight buffers.
+ *
+ * Buffers that are never released (e.g., superblock, iclog buffers) must set
+ * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
+ * never reaches zero and unmount hangs indefinitely.
+ */
+static inline void
+xfs_buf_ioacct_inc(
+ struct xfs_buf *bp)
+{
+ if (bp->b_flags & XBF_NO_IOACCT)
+ return;
+
+ ASSERT(bp->b_flags & XBF_ASYNC);
+ spin_lock(&bp->b_lock);
+ if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
+ bp->b_state |= XFS_BSTATE_IN_FLIGHT;
+ percpu_counter_inc(&bp->b_target->bt_io_count);
+ }
+ spin_unlock(&bp->b_lock);
+}
+
+/*
+ * Clear the in-flight state on a buffer about to be released to the LRU or
+ * freed and unaccount from the buftarg.
+ */
+static inline void
+__xfs_buf_ioacct_dec(
+ struct xfs_buf *bp)
+{
+ lockdep_assert_held(&bp->b_lock);
+
+ if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
+ bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
+ percpu_counter_dec(&bp->b_target->bt_io_count);
+ }
+}
+
+static inline void
+xfs_buf_ioacct_dec(
+ struct xfs_buf *bp)
+{
+ spin_lock(&bp->b_lock);
+ __xfs_buf_ioacct_dec(bp);
+ spin_unlock(&bp->b_lock);
+}
+
+/*
+ * When we mark a buffer stale, we remove the buffer from the LRU and clear the
+ * b_lru_ref count so that the buffer is freed immediately when the buffer
+ * reference count falls to zero. If the buffer is already on the LRU, we need
+ * to remove the reference that LRU holds on the buffer.
+ *
+ * This prevents build-up of stale buffers on the LRU.
+ */
+void
+xfs_buf_stale(
+ struct xfs_buf *bp)
+{
+ ASSERT(xfs_buf_islocked(bp));
+
+ bp->b_flags |= XBF_STALE;
+
+ /*
+ * Clear the delwri status so that a delwri queue walker will not
+ * flush this buffer to disk now that it is stale. The delwri queue has
+ * a reference to the buffer, so this is safe to do.
+ */
+ bp->b_flags &= ~_XBF_DELWRI_Q;
+
+ /*
+ * Once the buffer is marked stale and unlocked, a subsequent lookup
+ * could reset b_flags. There is no guarantee that the buffer is
+ * unaccounted (released to LRU) before that occurs. Drop in-flight
+ * status now to preserve accounting consistency.
+ */
+ spin_lock(&bp->b_lock);
+ __xfs_buf_ioacct_dec(bp);
+
+ atomic_set(&bp->b_lru_ref, 0);
+ if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
+ (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
+ atomic_dec(&bp->b_hold);
+
+ ASSERT(atomic_read(&bp->b_hold) >= 1);
+ spin_unlock(&bp->b_lock);
+}
+
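+/*
+ * Set up the buffer map array. A single-segment buffer uses the map
+ * embedded in the buffer itself; multi-segment buffers allocate a
+ * separate array.
+ */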
+static int
+xfs_buf_get_maps(
+ struct xfs_buf *bp,
+ int map_count)
+{
+ ASSERT(bp->b_maps == NULL);
+ bp->b_map_count = map_count;
+
+ if (map_count == 1) {
+ bp->b_maps = &bp->__b_map;
+ return 0;
+ }
+
+ bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
+ KM_NOFS);
+ if (!bp->b_maps)
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Frees b_maps if it was allocated separately from the embedded map.
+ */
+static void
+xfs_buf_free_maps(
+ struct xfs_buf *bp)
+{
+ if (bp->b_maps != &bp->__b_map) {
+ kmem_free(bp->b_maps);
+ bp->b_maps = NULL;
+ }
+}
+
+static int
+_xfs_buf_alloc(
+ struct xfs_buftarg *target,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
+{
+ struct xfs_buf *bp;
+ int error;
+ int i;
+
+ *bpp = NULL;
+ bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ /*
+ * We don't want certain flags to appear in b_flags unless they are
+ * specifically set by later operations on the buffer.
+ */
+ flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
+
+ atomic_set(&bp->b_hold, 1);
+ atomic_set(&bp->b_lru_ref, 1);
+ init_completion(&bp->b_iowait);
+ INIT_LIST_HEAD(&bp->b_lru);
+ INIT_LIST_HEAD(&bp->b_list);
+ INIT_LIST_HEAD(&bp->b_li_list);
+ sema_init(&bp->b_sema, 0); /* held, no waiters */
+ spin_lock_init(&bp->b_lock);
+ bp->b_target = target;
+ bp->b_mount = target->bt_mount;
+ bp->b_flags = flags;
+
+ /*
+ * Set length and io_length to the same value initially.
+ * I/O routines should use io_length, which will be the same in
+ * most cases but may be reset (e.g. XFS recovery).
+ */
+ error = xfs_buf_get_maps(bp, nmaps);
+ if (error) {
+ kmem_cache_free(xfs_buf_cache, bp);
+ return error;
+ }
+
+ bp->b_rhash_key = map[0].bm_bn;
+ bp->b_length = 0;
+ for (i = 0; i < nmaps; i++) {
+ bp->b_maps[i].bm_bn = map[i].bm_bn;
+ bp->b_maps[i].bm_len = map[i].bm_len;
+ bp->b_length += map[i].bm_len;
+ }
+
+ atomic_set(&bp->b_pin_count, 0);
+ init_waitqueue_head(&bp->b_waiters);
+
+ XFS_STATS_INC(bp->b_mount, xb_create);
+ trace_xfs_buf_init(bp, _RET_IP_);
+
+ *bpp = bp;
+ return 0;
+}
+
+static void
+xfs_buf_free_pages(
+ struct xfs_buf *bp)
+{
+ uint i;
+
+ ASSERT(bp->b_flags & _XBF_PAGES);
+
+ if (xfs_buf_is_vmapped(bp))
+ vm_unmap_ram(bp->b_addr, bp->b_page_count);
+
+ for (i = 0; i < bp->b_page_count; i++) {
+ if (bp->b_pages[i])
+ __free_page(bp->b_pages[i]);
+ }
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += bp->b_page_count;
+
+ if (bp->b_pages != bp->b_page_array)
+ kmem_free(bp->b_pages);
+ bp->b_pages = NULL;
+ bp->b_flags &= ~_XBF_PAGES;
+}
+
+static void
+xfs_buf_free_callback(
+ struct callback_head *cb)
+{
+ struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu);
+
+ xfs_buf_free_maps(bp);
+ kmem_cache_free(xfs_buf_cache, bp);
+}
+
+static void
+xfs_buf_free(
+ struct xfs_buf *bp)
+{
+ trace_xfs_buf_free(bp, _RET_IP_);
+
+ ASSERT(list_empty(&bp->b_lru));
+
+ if (bp->b_flags & _XBF_PAGES)
+ xfs_buf_free_pages(bp);
+ else if (bp->b_flags & _XBF_KMEM)
+ kmem_free(bp->b_addr);
+
+ call_rcu(&bp->b_rcu, xfs_buf_free_callback);
+}
+
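+/*
+ * Try to back a small buffer with heap memory. This fails if the
+ * allocation would span a page boundary, in which case the caller falls
+ * back to the page allocator.
+ */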
+static int
+xfs_buf_alloc_kmem(
+ struct xfs_buf *bp,
+ xfs_buf_flags_t flags)
+{
+ xfs_km_flags_t kmflag_mask = KM_NOFS;
+ size_t size = BBTOB(bp->b_length);
+
+ /* Assure zeroed buffer for non-read cases. */
+ if (!(flags & XBF_READ))
+ kmflag_mask |= KM_ZERO;
+
+ bp->b_addr = kmem_alloc(size, kmflag_mask);
+ if (!bp->b_addr)
+ return -ENOMEM;
+
+ if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
+ ((unsigned long)bp->b_addr & PAGE_MASK)) {
+ /* b_addr spans two pages - use alloc_page instead */
+ kmem_free(bp->b_addr);
+ bp->b_addr = NULL;
+ return -ENOMEM;
+ }
+ bp->b_offset = offset_in_page(bp->b_addr);
+ bp->b_pages = bp->b_page_array;
+ bp->b_pages[0] = kmem_to_page(bp->b_addr);
+ bp->b_page_count = 1;
+ bp->b_flags |= _XBF_KMEM;
+ return 0;
+}
+
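+/*
+ * Allocate the pages backing a buffer. We keep retrying the bulk page
+ * allocation until the page array is fully populated, except for
+ * readahead buffers, which give up as soon as an allocation pass makes
+ * no progress.
+ */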
+static int
+xfs_buf_alloc_pages(
+ struct xfs_buf *bp,
+ xfs_buf_flags_t flags)
+{
+ gfp_t gfp_mask = __GFP_NOWARN;
+ long filled = 0;
+
+ if (flags & XBF_READ_AHEAD)
+ gfp_mask |= __GFP_NORETRY;
+ else
+ gfp_mask |= GFP_NOFS;
+
+ /* Make sure that we have a page list */
+ bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
+ if (bp->b_page_count <= XB_PAGES) {
+ bp->b_pages = bp->b_page_array;
+ } else {
+ bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
+ gfp_mask);
+ if (!bp->b_pages)
+ return -ENOMEM;
+ }
+ bp->b_flags |= _XBF_PAGES;
+
+ /* Assure zeroed buffer for non-read cases. */
+ if (!(flags & XBF_READ))
+ gfp_mask |= __GFP_ZERO;
+
+ /*
+ * Bulk filling of pages can take multiple calls. Not filling the entire
+ * array is not an allocation failure, so don't back off if we get at
+ * least one extra page.
+ */
+ for (;;) {
+ long last = filled;
+
+ filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count,
+ bp->b_pages);
+ if (filled == bp->b_page_count) {
+ XFS_STATS_INC(bp->b_mount, xb_page_found);
+ break;
+ }
+
+ if (filled != last)
+ continue;
+
+ if (flags & XBF_READ_AHEAD) {
+ xfs_buf_free_pages(bp);
+ return -ENOMEM;
+ }
+
+ XFS_STATS_INC(bp->b_mount, xb_page_retries);
+ memalloc_retry_wait(gfp_mask);
+ }
+ return 0;
+}
+
+/*
+ * Map buffer into kernel address-space if necessary.
+ */
+STATIC int
+_xfs_buf_map_pages(
+ struct xfs_buf *bp,
+ xfs_buf_flags_t flags)
+{
+ ASSERT(bp->b_flags & _XBF_PAGES);
+ if (bp->b_page_count == 1) {
+ /* A single page buffer is always mappable */
+ bp->b_addr = page_address(bp->b_pages[0]);
+ } else if (flags & XBF_UNMAPPED) {
+ bp->b_addr = NULL;
+ } else {
+ int retried = 0;
+ unsigned nofs_flag;
+
+ /*
+ * vm_map_ram() will allocate auxiliary structures (e.g.
+ * pagetables) with GFP_KERNEL, yet we are likely to be under
+ * GFP_NOFS context here. Hence we need to tell memory reclaim
+ * that we are in such a context via PF_MEMALLOC_NOFS to prevent
+ * memory reclaim re-entering the filesystem here and
+ * potentially deadlocking.
+ */
+ nofs_flag = memalloc_nofs_save();
+ do {
+ bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+ -1);
+ if (bp->b_addr)
+ break;
+ vm_unmap_aliases();
+ } while (retried++ <= 1);
+ memalloc_nofs_restore(nofs_flag);
+
+ if (!bp->b_addr)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Finding and Reading Buffers
+ */
+static int
+_xfs_buf_obj_cmp(
+ struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct xfs_buf_map *map = arg->key;
+ const struct xfs_buf *bp = obj;
+
+ /*
+ * The key hashing in the lookup path depends on the key being the
+ * first element of the compare_arg, make sure to assert this.
+ */
+ BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
+
+ if (bp->b_rhash_key != map->bm_bn)
+ return 1;
+
+ if (unlikely(bp->b_length != map->bm_len)) {
+ /*
+ * found a block number match. If the range doesn't
+ * match, the only way this is allowed is if the buffer
+ * in the cache is stale and the transaction that made
+ * it stale has not yet committed. i.e. we are
+ * reallocating a busy extent. Skip this buffer and
+ * continue searching for an exact match.
+ */
+ ASSERT(bp->b_flags & XBF_STALE);
+ return 1;
+ }
+ return 0;
+}
+
+static const struct rhashtable_params xfs_buf_hash_params = {
+ .min_size = 32, /* empty AGs have minimal footprint */
+ .nelem_hint = 16,
+ .key_len = sizeof(xfs_daddr_t),
+ .key_offset = offsetof(struct xfs_buf, b_rhash_key),
+ .head_offset = offsetof(struct xfs_buf, b_rhash_head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = _xfs_buf_obj_cmp,
+};
+
+int
+xfs_buf_hash_init(
+ struct xfs_perag *pag)
+{
+ spin_lock_init(&pag->pag_buf_lock);
+ return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
+}
+
+void
+xfs_buf_hash_destroy(
+ struct xfs_perag *pag)
+{
+ rhashtable_destroy(&pag->pag_buf_hash);
+}
+
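+/*
+ * Sanity check a buffer map against the buftarg: the I/O must be at least
+ * sector sized and sector aligned, and the start daddr must lie within
+ * the filesystem.
+ */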
+static int
+xfs_buf_map_verify(
+ struct xfs_buftarg *btp,
+ struct xfs_buf_map *map)
+{
+ xfs_daddr_t eofs;
+
+ /* Check for IOs smaller than the sector size / not sector aligned */
+ ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
+ ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
+
+ /*
+ * Corrupted block numbers can get through to here, unfortunately, so we
+ * have to check that the buffer falls within the filesystem bounds.
+ */
+ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
+ if (map->bm_bn < 0 || map->bm_bn >= eofs) {
+ xfs_alert(btp->bt_mount,
+ "%s: daddr 0x%llx out of range, EOFS 0x%llx",
+ __func__, map->bm_bn, eofs);
+ WARN_ON(1);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
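+/*
+ * Lock a buffer returned by a cache lookup, either trying the lock
+ * (XBF_TRYLOCK) or blocking for it, and clear stale state left over from
+ * a previous use of the buffer.
+ */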
+static int
+xfs_buf_find_lock(
+ struct xfs_buf *bp,
+ xfs_buf_flags_t flags)
+{
+ if (flags & XBF_TRYLOCK) {
+ if (!xfs_buf_trylock(bp)) {
+ XFS_STATS_INC(bp->b_mount, xb_busy_locked);
+ return -EAGAIN;
+ }
+ } else {
+ xfs_buf_lock(bp);
+ XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
+ }
+
+ /*
+ * if the buffer is stale, clear all the external state associated with
+ * it. We need to keep flags such as how we allocated the buffer memory
+ * intact here.
+ */
+ if (bp->b_flags & XBF_STALE) {
+ ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
+ bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+ bp->b_ops = NULL;
+ }
+ return 0;
+}
+
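+/*
+ * Look up a buffer in the per-AG rhashtable under RCU, take a hold on it
+ * and lock it before returning it to the caller.
+ */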
+static inline int
+xfs_buf_lookup(
+ struct xfs_perag *pag,
+ struct xfs_buf_map *map,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ rcu_read_lock();
+ bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
+ if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+ rcu_read_unlock();
+
+ error = xfs_buf_find_lock(bp, flags);
+ if (error) {
+ xfs_buf_rele(bp);
+ return error;
+ }
+
+ trace_xfs_buf_find(bp, flags, _RET_IP_);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Insert the new_bp into the hash table. This consumes the perag reference
+ * taken for the lookup regardless of the result of the insert.
+ */
+static int
+xfs_buf_find_insert(
+ struct xfs_buftarg *btp,
+ struct xfs_perag *pag,
+ struct xfs_buf_map *cmap,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
+{
+ struct xfs_buf *new_bp;
+ struct xfs_buf *bp;
+ int error;
+
+ error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
+ if (error)
+ goto out_drop_pag;
+
+ /*
+ * For buffers that fit entirely within a single page, first attempt to
+ * allocate the memory from the heap to minimise memory usage. If we
+ * can't get heap memory for these small buffers, we fall back to using
+ * the page allocator.
+ */
+ if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
+ xfs_buf_alloc_kmem(new_bp, flags) < 0) {
+ error = xfs_buf_alloc_pages(new_bp, flags);
+ if (error)
+ goto out_free_buf;
+ }
+
+ spin_lock(&pag->pag_buf_lock);
+ bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
+ &new_bp->b_rhash_head, xfs_buf_hash_params);
+ if (IS_ERR(bp)) {
+ error = PTR_ERR(bp);
+ spin_unlock(&pag->pag_buf_lock);
+ goto out_free_buf;
+ }
+ if (bp) {
+ /* found an existing buffer */
+ atomic_inc(&bp->b_hold);
+ spin_unlock(&pag->pag_buf_lock);
+ error = xfs_buf_find_lock(bp, flags);
+ if (error)
+ xfs_buf_rele(bp);
+ else
+ *bpp = bp;
+ goto out_free_buf;
+ }
+
+ /* The new buffer keeps the perag reference until it is freed. */
+ new_bp->b_pag = pag;
+ spin_unlock(&pag->pag_buf_lock);
+ *bpp = new_bp;
+ return 0;
+
+out_free_buf:
+ xfs_buf_free(new_bp);
+out_drop_pag:
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Assembles a buffer covering the specified range. The code is optimised for
+ * cache hits, as metadata intensive workloads will see 3 orders of magnitude
+ * more hits than misses.
+ */
+int
+xfs_buf_get_map(
+ struct xfs_buftarg *btp,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
+{
+ struct xfs_perag *pag;
+ struct xfs_buf *bp = NULL;
+ struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
+ int error;
+ int i;
+
+ for (i = 0; i < nmaps; i++)
+ cmap.bm_len += map[i].bm_len;
+
+ error = xfs_buf_map_verify(btp, &cmap);
+ if (error)
+ return error;
+
+ pag = xfs_perag_get(btp->bt_mount,
+ xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
+
+ error = xfs_buf_lookup(pag, &cmap, flags, &bp);
+ if (error && error != -ENOENT)
+ goto out_put_perag;
+
+ /* cache hits always outnumber misses by at least 10:1 */
+ if (unlikely(!bp)) {
+ XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
+
+ if (flags & XBF_INCORE)
+ goto out_put_perag;
+
+ /* xfs_buf_find_insert() consumes the perag reference. */
+ error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
+ flags, &bp);
+ if (error)
+ return error;
+ } else {
+ XFS_STATS_INC(btp->bt_mount, xb_get_locked);
+ xfs_perag_put(pag);
+ }
+
+ /* We do not hold a perag reference anymore. */
+ if (!bp->b_addr) {
+ error = _xfs_buf_map_pages(bp, flags);
+ if (unlikely(error)) {
+ xfs_warn_ratelimited(btp->bt_mount,
+ "%s: failed to map %u pages", __func__,
+ bp->b_page_count);
+ xfs_buf_relse(bp);
+ return error;
+ }
+ }
+
+ /*
+ * Clear b_error if this is a lookup from a caller that doesn't expect
+ * valid data to be found in the buffer.
+ */
+ if (!(flags & XBF_READ))
+ xfs_buf_ioerror(bp, 0);
+
+ XFS_STATS_INC(btp->bt_mount, xb_get);
+ trace_xfs_buf_get(bp, flags, _RET_IP_);
+ *bpp = bp;
+ return 0;
+
+out_put_perag:
+ xfs_perag_put(pag);
+ return error;
+}
+
+int
+_xfs_buf_read(
+ struct xfs_buf *bp,
+ xfs_buf_flags_t flags)
+{
+ ASSERT(!(flags & XBF_WRITE));
+ ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
+
+ bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
+ bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
+
+ return xfs_buf_submit(bp);
+}
+
+/*
+ * Reverify a buffer found in cache without an attached ->b_ops.
+ *
+ * If the caller passed an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use it to verify the contents. If verification
+ * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
+ * already in XBF_DONE state on entry.
+ *
+ * Under normal operations, every in-core buffer is verified on read I/O
+ * completion. There are two scenarios that can lead to in-core buffers without
+ * an assigned ->b_ops. The first is during log recovery of buffers on a V4
+ * filesystem, though these buffers are purged at the end of recovery. The
+ * other is online repair, which intentionally reads with a NULL buffer ops to
+ * run several verifiers across an in-core buffer in order to establish buffer
+ * type. If repair can't establish that, the buffer will be left in memory
+ * with NULL buffer ops.
+ */
+int
+xfs_buf_reverify(
+ struct xfs_buf *bp,
+ const struct xfs_buf_ops *ops)
+{
+ ASSERT(bp->b_flags & XBF_DONE);
+ ASSERT(bp->b_error == 0);
+
+ if (!ops || bp->b_ops)
+ return 0;
+
+ bp->b_ops = ops;
+ bp->b_ops->verify_read(bp);
+ if (bp->b_error)
+ bp->b_flags &= ~XBF_DONE;
+ return bp->b_error;
+}
+
+int
+xfs_buf_read_map(
+ struct xfs_buftarg *target,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops,
+ xfs_failaddr_t fa)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ flags |= XBF_READ;
+ *bpp = NULL;
+
+ error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
+ if (error)
+ return error;
+
+ trace_xfs_buf_read(bp, flags, _RET_IP_);
+
+ if (!(bp->b_flags & XBF_DONE)) {
+ /* Initiate the buffer read and wait. */
+ XFS_STATS_INC(target->bt_mount, xb_get_read);
+ bp->b_ops = ops;
+ error = _xfs_buf_read(bp, flags);
+
+ /* Readahead iodone already dropped the buffer, so exit. */
+ if (flags & XBF_ASYNC)
+ return 0;
+ } else {
+ /* Buffer already read; all we need to do is check it. */
+ error = xfs_buf_reverify(bp, ops);
+
+ /* Readahead already finished; drop the buffer and exit. */
+ if (flags & XBF_ASYNC) {
+ xfs_buf_relse(bp);
+ return 0;
+ }
+
+ /* We do not want read in the flags */
+ bp->b_flags &= ~XBF_READ;
+ ASSERT(bp->b_ops != NULL || ops == NULL);
+ }
+
+ /*
+ * If we've had a read error, then the contents of the buffer are
+ * invalid and should not be used. To ensure that a followup read tries
+ * to pull the buffer from disk again, we clear the XBF_DONE flag and
+ * mark the buffer stale. This ensures that anyone who has a current
+ * reference to the buffer will interpret its contents correctly and
+ * future cache lookups will also treat it as an empty, uninitialised
+ * buffer.
+ */
+ if (error) {
+ /*
+ * Check against log shutdown for error reporting because
+ * metadata writeback may require a read first and we need to
+ * report errors in metadata writeback until the log is shut
+ * down. High level transaction read functions already check
+ * against mount shutdown, anyway, so we only need to be
+ * concerned about low level IO interactions here.
+ */
+ if (!xlog_is_shutdown(target->bt_mount->m_log))
+ xfs_buf_ioerror_alert(bp, fa);
+
+ bp->b_flags &= ~XBF_DONE;
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
+
+ /* bad CRC means corrupted metadata */
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
+ return error;
+ }
+
+ *bpp = bp;
+ return 0;
+}
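+
+/*
+ * Illustrative sketch, not part of this change: a typical cached metadata
+ * read through the single-block wrapper looks roughly like this, where btp,
+ * blkno, numblks and ops stand in for caller-supplied values:
+ *
+ *	struct xfs_buf	*bp;
+ *	int		error;
+ *
+ *	error = xfs_buf_read(btp, blkno, numblks, 0, &bp, ops);
+ *	if (error)
+ *		return error;
+ *	... bp is returned locked and verified; access it via bp->b_addr ...
+ *	xfs_buf_relse(bp);
+ */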
+
+/*
+ * Speculative read ahead of a range of blocks. This is issued with trylock
+ * and async semantics so that readahead can never block or deadlock the
+ * caller; if the buffer is already locked or memory is tight, the readahead
+ * is simply dropped.
+ */
+void
+xfs_buf_readahead_map(
+ struct xfs_buftarg *target,
+ struct xfs_buf_map *map,
+ int nmaps,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp;
+
+ xfs_buf_read_map(target, map, nmaps,
+ XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
+ __this_address);
+}
+
+/*
+ * Read an uncached buffer from disk. Allocates and returns a locked
+ * buffer containing the disk contents or nothing. Uncached buffers always have
+ * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer
+ * is cached or uncached during fault diagnosis.
+ */
+int
+xfs_buf_read_uncached(
+ struct xfs_buftarg *target,
+ xfs_daddr_t daddr,
+ size_t numblks,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ *bpp = NULL;
+
+ error = xfs_buf_get_uncached(target, numblks, flags, &bp);
+ if (error)
+ return error;
+
+ /* set up the buffer for a read IO */
+ ASSERT(bp->b_map_count == 1);
+ bp->b_rhash_key = XFS_BUF_DADDR_NULL;
+ bp->b_maps[0].bm_bn = daddr;
+ bp->b_flags |= XBF_READ;
+ bp->b_ops = ops;
+
+ xfs_buf_submit(bp);
+ if (bp->b_error) {
+ error = bp->b_error;
+ xfs_buf_relse(bp);
+ return error;
+ }
+
+ *bpp = bp;
+ return 0;
+}
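+
+/*
+ * Illustrative sketch, not part of this change: uncached reads are used for
+ * one-off accesses outside the normal cache index, e.g. probing a region of
+ * the device during mount or growfs. btp, daddr and numblks are
+ * caller-supplied placeholders:
+ *
+ *	error = xfs_buf_read_uncached(btp, daddr, numblks, 0, &bp, NULL);
+ *	if (error)
+ *		return error;
+ *	... inspect bp->b_addr ...
+ *	xfs_buf_relse(bp);
+ */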
+
+int
+xfs_buf_get_uncached(
+ struct xfs_buftarg *target,
+ size_t numblks,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
+{
+ int error;
+ struct xfs_buf *bp;
+ DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
+
+ *bpp = NULL;
+
+ /* flags might contain irrelevant bits, pass only what we care about */
+ error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
+ if (error)
+ return error;
+
+ error = xfs_buf_alloc_pages(bp, flags);
+ if (error)
+ goto fail_free_buf;
+
+ error = _xfs_buf_map_pages(bp, 0);
+ if (unlikely(error)) {
+ xfs_warn(target->bt_mount,
+ "%s: failed to map pages", __func__);
+ goto fail_free_buf;
+ }
+
+ trace_xfs_buf_get_uncached(bp, _RET_IP_);
+ *bpp = bp;
+ return 0;
+
+fail_free_buf:
+ xfs_buf_free(bp);
+ return error;
+}
+
+/*
+ * Increment reference count on buffer, to hold the buffer concurrently
+ * with another thread which may release (free) the buffer asynchronously.
+ * The caller must already hold a reference to the buffer.
+ */
+void
+xfs_buf_hold(
+ struct xfs_buf *bp)
+{
+ trace_xfs_buf_hold(bp, _RET_IP_);
+ atomic_inc(&bp->b_hold);
+}
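+
+/*
+ * Illustrative sketch, not part of this change: a caller that hands a buffer
+ * to another context takes its own hold first and drops it when done, in the
+ * same way __xfs_buf_submit() brackets I/O submission:
+ *
+ *	xfs_buf_hold(bp);
+ *	... pass bp to asynchronous work ...
+ *	xfs_buf_rele(bp);
+ */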
+
+/*
+ * Release a hold on the specified buffer. When the last hold is dropped, the
+ * buffer is either placed on the LRU or freed, depending on b_lru_ref.
+ */
+void
+xfs_buf_rele(
+ struct xfs_buf *bp)
+{
+ struct xfs_perag *pag = bp->b_pag;
+ bool release;
+ bool freebuf = false;
+
+ trace_xfs_buf_rele(bp, _RET_IP_);
+
+ if (!pag) {
+ ASSERT(list_empty(&bp->b_lru));
+ if (atomic_dec_and_test(&bp->b_hold)) {
+ xfs_buf_ioacct_dec(bp);
+ xfs_buf_free(bp);
+ }
+ return;
+ }
+
+ ASSERT(atomic_read(&bp->b_hold) > 0);
+
+ /*
+ * We grab the b_lock here first to serialise racing xfs_buf_rele()
+ * calls. The pag_buf_lock being taken on the last reference only
+ * serialises against racing lookups in xfs_buf_find(). IOWs, the second
+ * to last reference we drop here is not serialised against the last
+ * reference until we take bp->b_lock. Hence if we don't grab b_lock
+ * first, the last "release" reference can win the race to the lock and
+ * free the buffer before the second-to-last reference is processed,
+ * leading to a use-after-free scenario.
+ */
+ spin_lock(&bp->b_lock);
+ release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+ if (!release) {
+ /*
+ * Drop the in-flight state if the buffer is already on the LRU
+ * and it holds the only reference. This is racy because we
+ * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
+ * ensures the decrement occurs only once per-buf.
+ */
+ if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
+ __xfs_buf_ioacct_dec(bp);
+ goto out_unlock;
+ }
+
+ /* the last reference has been dropped ... */
+ __xfs_buf_ioacct_dec(bp);
+ if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+ /*
+ * If the buffer is added to the LRU take a new reference to the
+ * buffer for the LRU and clear the (now stale) dispose list
+ * state flag
+ */
+ if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+ bp->b_state &= ~XFS_BSTATE_DISPOSE;
+ atomic_inc(&bp->b_hold);
+ }
+ spin_unlock(&pag->pag_buf_lock);
+ } else {
+ /*
+ * most of the time buffers will already be removed from the
+ * LRU, so optimise that case by checking for the
+ * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
+ * was on was the disposal list
+ */
+ if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
+ list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+ } else {
+ ASSERT(list_empty(&bp->b_lru));
+ }
+
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+ rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
+ xfs_buf_hash_params);
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
+ freebuf = true;
+ }
+
+out_unlock:
+ spin_unlock(&bp->b_lock);
+
+ if (freebuf)
+ xfs_buf_free(bp);
+}
+
+
+/*
+ * Lock a buffer object, if it is not already locked.
+ *
+ * Note that this in no way locks the underlying pages, so it is only useful
+ * for synchronising concurrent use of the buffer object, not for
+ * synchronising independent access to the underlying pages. Unlike
+ * xfs_buf_lock() below, a stale pinned buffer does not trigger a log push
+ * here; a failed trylock is simply reported back to the caller.
+ */
+int
+xfs_buf_trylock(
+ struct xfs_buf *bp)
+{
+ int locked;
+
+ locked = down_trylock(&bp->b_sema) == 0;
+ if (locked)
+ trace_xfs_buf_trylock(bp, _RET_IP_);
+ else
+ trace_xfs_buf_trylock_fail(bp, _RET_IP_);
+ return locked;
+}
+
+/*
+ * Lock a buffer object.
+ *
+ * If we come across a stale, pinned, locked buffer, we know that we
+ * are being asked to lock a buffer that has been reallocated. Because
+ * it is pinned, we know that the log has not been pushed to disk and
+ * hence it will still be locked. Rather than sleeping until someone
+ * else pushes the log, push it ourselves before trying to get the lock.
+ */
+void
+xfs_buf_lock(
+ struct xfs_buf *bp)
+{
+ trace_xfs_buf_lock(bp, _RET_IP_);
+
+ if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+ xfs_log_force(bp->b_mount, 0);
+ down(&bp->b_sema);
+
+ trace_xfs_buf_lock_done(bp, _RET_IP_);
+}
+
+void
+xfs_buf_unlock(
+ struct xfs_buf *bp)
+{
+ ASSERT(xfs_buf_islocked(bp));
+
+ up(&bp->b_sema);
+ trace_xfs_buf_unlock(bp, _RET_IP_);
+}
+
+STATIC void
+xfs_buf_wait_unpin(
+ struct xfs_buf *bp)
+{
+ DECLARE_WAITQUEUE (wait, current);
+
+ if (atomic_read(&bp->b_pin_count) == 0)
+ return;
+
+ add_wait_queue(&bp->b_waiters, &wait);
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&bp->b_pin_count) == 0)
+ break;
+ io_schedule();
+ }
+ remove_wait_queue(&bp->b_waiters, &wait);
+ set_current_state(TASK_RUNNING);
+}
+
+static void
+xfs_buf_ioerror_alert_ratelimited(
+ struct xfs_buf *bp)
+{
+ static unsigned long lasttime;
+ static struct xfs_buftarg *lasttarg;
+
+ if (bp->b_target != lasttarg ||
+ time_after(jiffies, (lasttime + 5*HZ))) {
+ lasttime = jiffies;
+ xfs_buf_ioerror_alert(bp, __this_address);
+ }
+ lasttarg = bp->b_target;
+}
+
+/*
+ * Account for this latest trip around the retry handler, and decide if
+ * we've failed enough times to constitute a permanent failure.
+ */
+static bool
+xfs_buf_ioerror_permanent(
+ struct xfs_buf *bp,
+ struct xfs_error_cfg *cfg)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
+ ++bp->b_retries > cfg->max_retries)
+ return true;
+ if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+ time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
+ return true;
+
+ /* At unmount we may treat errors differently */
+ if (xfs_is_unmounting(mp) && mp->m_fail_unmount)
+ return true;
+
+ return false;
+}
+
+/*
+ * On a sync write or shutdown we just want to stale the buffer and let the
+ * caller handle the error in bp->b_error appropriately.
+ *
+ * If the write was asynchronous then no one will be looking for the error. If
+ * this is the first failure of this type, clear the error state and write the
+ * buffer out again. This means we always retry an async write failure at least
+ * once, but we also need to set the buffer up to behave correctly now for
+ * repeated failures.
+ *
+ * If we get repeated async write failures, then we take action according to the
+ * error configuration we have been set up to use.
+ *
+ * Returns true if this function took care of error handling and the caller must
+ * not touch the buffer again. Returns false if the caller should proceed with
+ * normal I/O completion handling.
+ */
+static bool
+xfs_buf_ioend_handle_error(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_error_cfg *cfg;
+
+ /*
+ * If we've already shutdown the journal because of I/O errors, there's
+ * no point in giving this a retry.
+ */
+ if (xlog_is_shutdown(mp->m_log))
+ goto out_stale;
+
+ xfs_buf_ioerror_alert_ratelimited(bp);
+
+ /*
+ * We're not going to bother about retrying this during recovery.
+ * One strike!
+ */
+ if (bp->b_flags & _XBF_LOGRECOVERY) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ return false;
+ }
+
+ /*
+ * Synchronous writes will have callers process the error.
+ */
+ if (!(bp->b_flags & XBF_ASYNC))
+ goto out_stale;
+
+ trace_xfs_buf_iodone_async(bp, _RET_IP_);
+
+ cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
+ if (bp->b_last_error != bp->b_error ||
+ !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
+ bp->b_last_error = bp->b_error;
+ if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+ !bp->b_first_retry_time)
+ bp->b_first_retry_time = jiffies;
+ goto resubmit;
+ }
+
+ /*
+ * Permanent error - we need to trigger a shutdown if we haven't already
+ * to indicate that inconsistency will result from this action.
+ */
+ if (xfs_buf_ioerror_permanent(bp, cfg)) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ goto out_stale;
+ }
+
+ /* Still considered a transient error. Caller will schedule retries. */
+ if (bp->b_flags & _XBF_INODES)
+ xfs_buf_inode_io_fail(bp);
+ else if (bp->b_flags & _XBF_DQUOTS)
+ xfs_buf_dquot_io_fail(bp);
+ else
+ ASSERT(list_empty(&bp->b_li_list));
+ xfs_buf_ioerror(bp, 0);
+ xfs_buf_relse(bp);
+ return true;
+
+resubmit:
+ xfs_buf_ioerror(bp, 0);
+ bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
+ xfs_buf_submit(bp);
+ return true;
+out_stale:
+ xfs_buf_stale(bp);
+ bp->b_flags |= XBF_DONE;
+ bp->b_flags &= ~XBF_WRITE;
+ trace_xfs_buf_error_relse(bp, _RET_IP_);
+ return false;
+}
+
+static void
+xfs_buf_ioend(
+ struct xfs_buf *bp)
+{
+ trace_xfs_buf_iodone(bp, _RET_IP_);
+
+ /*
+ * Pull in IO completion errors now. We are guaranteed to be running
+ * single threaded, so we don't need the lock to read b_io_error.
+ */
+ if (!bp->b_error && bp->b_io_error)
+ xfs_buf_ioerror(bp, bp->b_io_error);
+
+ if (bp->b_flags & XBF_READ) {
+ if (!bp->b_error && bp->b_ops)
+ bp->b_ops->verify_read(bp);
+ if (!bp->b_error)
+ bp->b_flags |= XBF_DONE;
+ } else {
+ if (!bp->b_error) {
+ bp->b_flags &= ~XBF_WRITE_FAIL;
+ bp->b_flags |= XBF_DONE;
+ }
+
+ if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
+ return;
+
+ /* clear the retry state */
+ bp->b_last_error = 0;
+ bp->b_retries = 0;
+ bp->b_first_retry_time = 0;
+
+ /*
+ * Note that for things like remote attribute buffers, there may
+ * not be a buffer log item here, so processing the buffer log
+ * item must remain optional.
+ */
+ if (bp->b_log_item)
+ xfs_buf_item_done(bp);
+
+ if (bp->b_flags & _XBF_INODES)
+ xfs_buf_inode_iodone(bp);
+ else if (bp->b_flags & _XBF_DQUOTS)
+ xfs_buf_dquot_iodone(bp);
+
+ }
+
+ bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
+ _XBF_LOGRECOVERY);
+
+ if (bp->b_flags & XBF_ASYNC)
+ xfs_buf_relse(bp);
+ else
+ complete(&bp->b_iowait);
+}
+
+static void
+xfs_buf_ioend_work(
+ struct work_struct *work)
+{
+ struct xfs_buf *bp =
+ container_of(work, struct xfs_buf, b_ioend_work);
+
+ xfs_buf_ioend(bp);
+}
+
+static void
+xfs_buf_ioend_async(
+ struct xfs_buf *bp)
+{
+ INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
+ queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
+}
+
+void
+__xfs_buf_ioerror(
+ struct xfs_buf *bp,
+ int error,
+ xfs_failaddr_t failaddr)
+{
+ ASSERT(error <= 0 && error >= -1000);
+ bp->b_error = error;
+ trace_xfs_buf_ioerror(bp, error, failaddr);
+}
+
+void
+xfs_buf_ioerror_alert(
+ struct xfs_buf *bp,
+ xfs_failaddr_t func)
+{
+ xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
+ "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
+ func, (uint64_t)xfs_buf_daddr(bp),
+ bp->b_length, -bp->b_error);
+}
+
+/*
+ * To simulate an I/O failure, the buffer must be locked and held with at least
+ * three references. The LRU reference is dropped by the stale call. The buf
+ * item reference is dropped via ioend processing. The third reference is owned
+ * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
+ */
+void
+xfs_buf_ioend_fail(
+ struct xfs_buf *bp)
+{
+ bp->b_flags &= ~XBF_DONE;
+ xfs_buf_stale(bp);
+ xfs_buf_ioerror(bp, -EIO);
+ xfs_buf_ioend(bp);
+}
+
+int
+xfs_bwrite(
+ struct xfs_buf *bp)
+{
+ int error;
+
+ ASSERT(xfs_buf_islocked(bp));
+
+ bp->b_flags |= XBF_WRITE;
+ bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
+ XBF_DONE);
+
+ error = xfs_buf_submit(bp);
+ if (error)
+ xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
+ return error;
+}
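+
+/*
+ * Illustrative sketch, not part of this change: xfs_bwrite() expects a
+ * locked buffer and returns with it still locked, so a typical synchronous
+ * write sequence is:
+ *
+ *	... obtain bp locked, modify bp->b_addr ...
+ *	error = xfs_bwrite(bp);
+ *	xfs_buf_relse(bp);
+ */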
+
+static void
+xfs_buf_bio_end_io(
+ struct bio *bio)
+{
+ struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private;
+
+ if (!bio->bi_status &&
+ (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
+ XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
+ bio->bi_status = BLK_STS_IOERR;
+
+ /*
+ * don't overwrite existing errors - otherwise we can lose errors on
+ * buffers that require multiple bios to complete.
+ */
+ if (bio->bi_status) {
+ int error = blk_status_to_errno(bio->bi_status);
+
+ cmpxchg(&bp->b_io_error, 0, error);
+ }
+
+ if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
+ invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
+
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+ xfs_buf_ioend_async(bp);
+ bio_put(bio);
+}
+
+static void
+xfs_buf_ioapply_map(
+ struct xfs_buf *bp,
+ int map,
+ int *buf_offset,
+ int *count,
+ blk_opf_t op)
+{
+ int page_index;
+ unsigned int total_nr_pages = bp->b_page_count;
+ int nr_pages;
+ struct bio *bio;
+ sector_t sector = bp->b_maps[map].bm_bn;
+ int size;
+ int offset;
+
+ /* skip the pages in the buffer before the start offset */
+ page_index = 0;
+ offset = *buf_offset;
+ while (offset >= PAGE_SIZE) {
+ page_index++;
+ offset -= PAGE_SIZE;
+ }
+
+ /*
+ * Limit the IO size to the length of the current vector, and update the
+ * remaining IO count for the next time around.
+ */
+ size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
+ *count -= size;
+ *buf_offset += size;
+
+next_chunk:
+ atomic_inc(&bp->b_io_remaining);
+ nr_pages = bio_max_segs(total_nr_pages);
+
+ bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_end_io = xfs_buf_bio_end_io;
+ bio->bi_private = bp;
+
+ for (; size && nr_pages; nr_pages--, page_index++) {
+ int rbytes, nbytes = PAGE_SIZE - offset;
+
+ if (nbytes > size)
+ nbytes = size;
+
+ rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
+ offset);
+ if (rbytes < nbytes)
+ break;
+
+ offset = 0;
+ sector += BTOBB(nbytes);
+ size -= nbytes;
+ total_nr_pages--;
+ }
+
+ if (likely(bio->bi_iter.bi_size)) {
+ if (xfs_buf_is_vmapped(bp)) {
+ flush_kernel_vmap_range(bp->b_addr,
+ xfs_buf_vmap_len(bp));
+ }
+ submit_bio(bio);
+ if (size)
+ goto next_chunk;
+ } else {
+ /*
+ * This is guaranteed not to be the last io reference count
+ * because the caller (xfs_buf_submit) holds a count itself.
+ */
+ atomic_dec(&bp->b_io_remaining);
+ xfs_buf_ioerror(bp, -EIO);
+ bio_put(bio);
+ }
+
+}
+
+STATIC void
+_xfs_buf_ioapply(
+ struct xfs_buf *bp)
+{
+ struct blk_plug plug;
+ blk_opf_t op;
+ int offset;
+ int size;
+ int i;
+
+ /*
+ * Make sure we capture only current IO errors rather than stale errors
+ * left over from previous use of the buffer (e.g. failed readahead).
+ */
+ bp->b_error = 0;
+
+ if (bp->b_flags & XBF_WRITE) {
+ op = REQ_OP_WRITE;
+
+ /*
+ * Run the write verifier callback function if it exists. If
+ * this function fails it will mark the buffer with an error and
+ * the IO should not be dispatched.
+ */
+ if (bp->b_ops) {
+ bp->b_ops->verify_write(bp);
+ if (bp->b_error) {
+ xfs_force_shutdown(bp->b_mount,
+ SHUTDOWN_CORRUPT_INCORE);
+ return;
+ }
+ } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) {
+ struct xfs_mount *mp = bp->b_mount;
+
+ /*
+ * non-crc filesystems don't attach verifiers during
+ * log recovery, so don't warn for such filesystems.
+ */
+ if (xfs_has_crc(mp)) {
+ xfs_warn(mp,
+ "%s: no buf ops on daddr 0x%llx len %d",
+ __func__, xfs_buf_daddr(bp),
+ bp->b_length);
+ xfs_hex_dump(bp->b_addr,
+ XFS_CORRUPTION_DUMP_LEN);
+ dump_stack();
+ }
+ }
+ } else {
+ op = REQ_OP_READ;
+ if (bp->b_flags & XBF_READ_AHEAD)
+ op |= REQ_RAHEAD;
+ }
+
+ /* we only use the buffer cache for meta-data */
+ op |= REQ_META;
+
+ /*
+ * Walk all the vectors issuing IO on them. Set up the initial offset
+ * into the buffer and the desired IO size before we start -
+ * xfs_buf_ioapply_map() will modify them appropriately for each
+ * subsequent call.
+ */
+ offset = bp->b_offset;
+ size = BBTOB(bp->b_length);
+ blk_start_plug(&plug);
+ for (i = 0; i < bp->b_map_count; i++) {
+ xfs_buf_ioapply_map(bp, i, &offset, &size, op);
+ if (bp->b_error)
+ break;
+ if (size <= 0)
+ break; /* all done */
+ }
+ blk_finish_plug(&plug);
+}
+
+/*
+ * Wait for I/O completion of a sync buffer and return the I/O error code.
+ */
+static int
+xfs_buf_iowait(
+ struct xfs_buf *bp)
+{
+ ASSERT(!(bp->b_flags & XBF_ASYNC));
+
+ trace_xfs_buf_iowait(bp, _RET_IP_);
+ wait_for_completion(&bp->b_iowait);
+ trace_xfs_buf_iowait_done(bp, _RET_IP_);
+
+ return bp->b_error;
+}
+
+/*
+ * Buffer I/O submission path, read or write. Asynchronous submission transfers
+ * the buffer lock ownership and the current reference to the IO. It is not
+ * safe to reference the buffer after a call to this function unless the caller
+ * holds an additional reference itself.
+ */
+static int
+__xfs_buf_submit(
+ struct xfs_buf *bp,
+ bool wait)
+{
+ int error = 0;
+
+ trace_xfs_buf_submit(bp, _RET_IP_);
+
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+
+ /*
+ * On log shutdown we stale and complete the buffer immediately. We can
+ * be called to read the superblock before the log has been set up, so
+ * be careful checking the log state.
+ *
+ * Checking the mount shutdown state here can result in the log tail
+ * moving inappropriately on disk as the log may not yet be shut down.
+ * i.e. failing this buffer on mount shutdown can remove it from the AIL
+ * and move the tail of the log forwards without having written this
+ * buffer to disk. This corrupts the log tail state in memory, and
+ * because the log may not be shut down yet, it can then be propagated
+ * to disk before the log is shutdown. Hence we check log shutdown
+ * state here rather than mount state to avoid corrupting the log tail
+ * on shutdown.
+ */
+ if (bp->b_mount->m_log &&
+ xlog_is_shutdown(bp->b_mount->m_log)) {
+ xfs_buf_ioend_fail(bp);
+ return -EIO;
+ }
+
+ /*
+ * Grab a reference so the buffer does not go away underneath us. For
+	 * async buffers, I/O completion drops the caller's reference, which
+ * could occur before submission returns.
+ */
+ xfs_buf_hold(bp);
+
+ if (bp->b_flags & XBF_WRITE)
+ xfs_buf_wait_unpin(bp);
+
+ /* clear the internal error state to avoid spurious errors */
+ bp->b_io_error = 0;
+
+ /*
+	 * Set the count to 1 initially, so that an I/O completion callout
+	 * which happens before we have started all the I/O cannot call
+	 * xfs_buf_ioend too early.
+ */
+ atomic_set(&bp->b_io_remaining, 1);
+ if (bp->b_flags & XBF_ASYNC)
+ xfs_buf_ioacct_inc(bp);
+ _xfs_buf_ioapply(bp);
+
+ /*
+ * If _xfs_buf_ioapply failed, we can get back here with only the IO
+ * reference we took above. If we drop it to zero, run completion so
+ * that we don't return to the caller with completion still pending.
+ */
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
+ if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
+ xfs_buf_ioend(bp);
+ else
+ xfs_buf_ioend_async(bp);
+ }
+
+ if (wait)
+ error = xfs_buf_iowait(bp);
+
+ /*
+ * Release the hold that keeps the buffer referenced for the entire
+ * I/O. Note that if the buffer is async, it is not safe to reference
+ * after this release.
+ */
+ xfs_buf_rele(bp);
+ return error;
+}
+
+void *
+xfs_buf_offset(
+ struct xfs_buf *bp,
+ size_t offset)
+{
+ struct page *page;
+
+ if (bp->b_addr)
+ return bp->b_addr + offset;
+
+ page = bp->b_pages[offset >> PAGE_SHIFT];
+ return page_address(page) + (offset & (PAGE_SIZE-1));
+}
+
+void
+xfs_buf_zero(
+ struct xfs_buf *bp,
+ size_t boff,
+ size_t bsize)
+{
+ size_t bend;
+
+ bend = boff + bsize;
+ while (boff < bend) {
+ struct page *page;
+ int page_index, page_offset, csize;
+
+ page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
+ page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
+ page = bp->b_pages[page_index];
+ csize = min_t(size_t, PAGE_SIZE - page_offset,
+ BBTOB(bp->b_length) - boff);
+
+ ASSERT((csize + page_offset) <= PAGE_SIZE);
+
+ memset(page_address(page) + page_offset, 0, csize);
+
+ boff += csize;
+ }
+}
+
+/*
+ * Log a message about and stale a buffer that a caller has decided is corrupt.
+ *
+ * This function should be called for the kinds of metadata corruption that
+ * cannot be detected by a verifier, such as incorrect inter-block relationship
+ * data. Do /not/ call this function from a verifier function.
+ *
+ * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
+ * be marked stale, but b_error will not be set. The caller is responsible for
+ * releasing the buffer or fixing it.
+ */
+void
+__xfs_buf_mark_corrupt(
+ struct xfs_buf *bp,
+ xfs_failaddr_t fa)
+{
+ ASSERT(bp->b_flags & XBF_DONE);
+
+ xfs_buf_corruption_error(bp, fa);
+ xfs_buf_stale(bp);
+}
+
+/*
+ * Handling of buffer targets (buftargs).
+ */
+
+/*
+ * Wait for any bufs with callbacks that have been submitted but have not yet
+ * returned. These buffers will have an elevated hold count, so wait on those
+ * while freeing all the buffers only held by the LRU.
+ */
+static enum lru_status
+xfs_buftarg_drain_rele(
+ struct list_head *item,
+ struct list_lru_one *lru,
+ spinlock_t *lru_lock,
+ void *arg)
+
+{
+ struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
+ struct list_head *dispose = arg;
+
+ if (atomic_read(&bp->b_hold) > 1) {
+ /* need to wait, so skip it this pass */
+ trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
+ return LRU_SKIP;
+ }
+ if (!spin_trylock(&bp->b_lock))
+ return LRU_SKIP;
+
+ /*
+ * clear the LRU reference count so the buffer doesn't get
+ * ignored in xfs_buf_rele().
+ */
+ atomic_set(&bp->b_lru_ref, 0);
+ bp->b_state |= XFS_BSTATE_DISPOSE;
+ list_lru_isolate_move(lru, item, dispose);
+ spin_unlock(&bp->b_lock);
+ return LRU_REMOVED;
+}
+
+/*
+ * Wait for outstanding I/O on the buftarg to complete.
+ */
+void
+xfs_buftarg_wait(
+ struct xfs_buftarg *btp)
+{
+ /*
+ * First wait on the buftarg I/O count for all in-flight buffers to be
+ * released. This is critical as new buffers do not make the LRU until
+ * they are released.
+ *
+ * Next, flush the buffer workqueue to ensure all completion processing
+ * has finished. Just waiting on buffer locks is not sufficient for
+ * async IO as the reference count held over IO is not released until
+ * after the buffer lock is dropped. Hence we need to ensure here that
+ * all reference counts have been dropped before we start walking the
+ * LRU list.
+ */
+ while (percpu_counter_sum(&btp->bt_io_count))
+ delay(100);
+ flush_workqueue(btp->bt_mount->m_buf_workqueue);
+}
+
+void
+xfs_buftarg_drain(
+ struct xfs_buftarg *btp)
+{
+ LIST_HEAD(dispose);
+ int loop = 0;
+ bool write_fail = false;
+
+ xfs_buftarg_wait(btp);
+
+ /* loop until there is nothing left on the lru list. */
+ while (list_lru_count(&btp->bt_lru)) {
+ list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele,
+ &dispose, LONG_MAX);
+
+ while (!list_empty(&dispose)) {
+ struct xfs_buf *bp;
+ bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+ list_del_init(&bp->b_lru);
+ if (bp->b_flags & XBF_WRITE_FAIL) {
+ write_fail = true;
+ xfs_buf_alert_ratelimited(bp,
+ "XFS: Corruption Alert",
+"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
+ (long long)xfs_buf_daddr(bp));
+ }
+ xfs_buf_rele(bp);
+ }
+ if (loop++ != 0)
+ delay(100);
+ }
+
+ /*
+ * If one or more failed buffers were freed, that means dirty metadata
+ * was thrown away. This should only ever happen after I/O completion
+	 * handling has elevated I/O error(s) to permanent failures and shut
+ * down the journal.
+ */
+ if (write_fail) {
+ ASSERT(xlog_is_shutdown(btp->bt_mount->m_log));
+ xfs_alert(btp->bt_mount,
+ "Please run xfs_repair to determine the extent of the problem.");
+ }
+}
+
+static enum lru_status
+xfs_buftarg_isolate(
+ struct list_head *item,
+ struct list_lru_one *lru,
+ spinlock_t *lru_lock,
+ void *arg)
+{
+ struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
+ struct list_head *dispose = arg;
+
+ /*
+ * we are inverting the lru lock/bp->b_lock here, so use a trylock.
+ * If we fail to get the lock, just skip it.
+ */
+ if (!spin_trylock(&bp->b_lock))
+ return LRU_SKIP;
+ /*
+ * Decrement the b_lru_ref count unless the value is already
+ * zero. If the value is already zero, we need to reclaim the
+ * buffer, otherwise it gets another trip through the LRU.
+ */
+ if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+ spin_unlock(&bp->b_lock);
+ return LRU_ROTATE;
+ }
+
+ bp->b_state |= XFS_BSTATE_DISPOSE;
+ list_lru_isolate_move(lru, item, dispose);
+ spin_unlock(&bp->b_lock);
+ return LRU_REMOVED;
+}
+
+static unsigned long
+xfs_buftarg_shrink_scan(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_buftarg *btp = container_of(shrink,
+ struct xfs_buftarg, bt_shrinker);
+ LIST_HEAD(dispose);
+ unsigned long freed;
+
+ freed = list_lru_shrink_walk(&btp->bt_lru, sc,
+ xfs_buftarg_isolate, &dispose);
+
+ while (!list_empty(&dispose)) {
+ struct xfs_buf *bp;
+ bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+ list_del_init(&bp->b_lru);
+ xfs_buf_rele(bp);
+ }
+
+ return freed;
+}
+
+static unsigned long
+xfs_buftarg_shrink_count(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_buftarg *btp = container_of(shrink,
+ struct xfs_buftarg, bt_shrinker);
+ return list_lru_shrink_count(&btp->bt_lru, sc);
+}
+
+void
+xfs_free_buftarg(
+ struct xfs_buftarg *btp)
+{
+ unregister_shrinker(&btp->bt_shrinker);
+ ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
+ percpu_counter_destroy(&btp->bt_io_count);
+ list_lru_destroy(&btp->bt_lru);
+
+ blkdev_issue_flush(btp->bt_bdev);
+ fs_put_dax(btp->bt_daxdev, btp->bt_mount);
+
+ kmem_free(btp);
+}
+
+int
+xfs_setsize_buftarg(
+ xfs_buftarg_t *btp,
+ unsigned int sectorsize)
+{
+ /* Set up metadata sector size info */
+ btp->bt_meta_sectorsize = sectorsize;
+ btp->bt_meta_sectormask = sectorsize - 1;
+
+ if (set_blocksize(btp->bt_bdev, sectorsize)) {
+ xfs_warn(btp->bt_mount,
+ "Cannot set_blocksize to %u on device %pg",
+ sectorsize, btp->bt_bdev);
+ return -EINVAL;
+ }
+
+ /* Set up device logical sector size mask */
+ btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
+ btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
+
+ return 0;
+}
+
+/*
+ * When allocating the initial buffer target we have not yet read in the
+ * superblock, so we do not know what sector size is in use at this early
+ * stage. Play it safe and use the device logical sector size.
+ */
+STATIC int
+xfs_setsize_buftarg_early(
+ xfs_buftarg_t *btp,
+ struct block_device *bdev)
+{
+ return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
+}
+
+struct xfs_buftarg *
+xfs_alloc_buftarg(
+ struct xfs_mount *mp,
+ struct block_device *bdev)
+{
+ xfs_buftarg_t *btp;
+ const struct dax_holder_operations *ops = NULL;
+
+#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
+ ops = &xfs_dax_holder_operations;
+#endif
+ btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
+
+ btp->bt_mount = mp;
+ btp->bt_dev = bdev->bd_dev;
+ btp->bt_bdev = bdev;
+ btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
+ mp, ops);
+
+ /*
+ * Buffer IO error rate limiting. Limit it to no more than 10 messages
+ * per 30 seconds so as to not spam logs too much on repeated errors.
+ */
+ ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
+ DEFAULT_RATELIMIT_BURST);
+
+ if (xfs_setsize_buftarg_early(btp, bdev))
+ goto error_free;
+
+ if (list_lru_init(&btp->bt_lru))
+ goto error_free;
+
+ if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+ goto error_lru;
+
+ btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
+ btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
+ btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+ btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
+ if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s",
+ mp->m_super->s_id))
+ goto error_pcpu;
+ return btp;
+
+error_pcpu:
+ percpu_counter_destroy(&btp->bt_io_count);
+error_lru:
+ list_lru_destroy(&btp->bt_lru);
+error_free:
+ kmem_free(btp);
+ return NULL;
+}
+
+/*
+ * Cancel a delayed write list.
+ *
+ * Remove each buffer from the list, clear the delwri queue flag and drop the
+ * associated buffer reference.
+ */
+void
+xfs_buf_delwri_cancel(
+ struct list_head *list)
+{
+ struct xfs_buf *bp;
+
+ while (!list_empty(list)) {
+ bp = list_first_entry(list, struct xfs_buf, b_list);
+
+ xfs_buf_lock(bp);
+ bp->b_flags &= ~_XBF_DELWRI_Q;
+ list_del_init(&bp->b_list);
+ xfs_buf_relse(bp);
+ }
+}
+
+/*
+ * Add a buffer to the delayed write list.
+ *
+ * This queues a buffer for writeout if it hasn't already been. Note that
+ * neither this routine nor the buffer list submission functions perform
+ * any internal synchronization. It is expected that the lists are thread-local
+ * to the callers.
+ *
+ * Returns true if we queued up the buffer, or false if it was already on
+ * the buffer list.
+ */
+bool
+xfs_buf_delwri_queue(
+ struct xfs_buf *bp,
+ struct list_head *list)
+{
+ ASSERT(xfs_buf_islocked(bp));
+ ASSERT(!(bp->b_flags & XBF_READ));
+
+ /*
+	 * If the buffer is already marked delwri, it is already queued up
+	 * by someone else for immediate writeout. Just ignore it in that
+ * case.
+ */
+ if (bp->b_flags & _XBF_DELWRI_Q) {
+ trace_xfs_buf_delwri_queued(bp, _RET_IP_);
+ return false;
+ }
+
+ trace_xfs_buf_delwri_queue(bp, _RET_IP_);
+
+ /*
+ * If a buffer gets written out synchronously or marked stale while it
+ * is on a delwri list we lazily remove it. To do this, the other party
+ * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
+ * It remains referenced and on the list. In a rare corner case it
+	 * might get re-added to a delwri list after the synchronous writeout, in
+	 * which case we just need to re-add the flag here.
+ */
+ bp->b_flags |= _XBF_DELWRI_Q;
+ if (list_empty(&bp->b_list)) {
+ atomic_inc(&bp->b_hold);
+ list_add_tail(&bp->b_list, list);
+ }
+
+ return true;
+}
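+
+/*
+ * Illustrative sketch, not part of this change: delwri queues are plain
+ * list_heads owned by the caller. Buffers are queued while locked, released,
+ * and then written out in one batch later:
+ *
+ *	LIST_HEAD(buffer_list);
+ *
+ *	... for each locked, dirty buffer bp ...
+ *		xfs_buf_delwri_queue(bp, &buffer_list);
+ *		xfs_buf_relse(bp);
+ *	...
+ *	error = xfs_buf_delwri_submit(&buffer_list);
+ */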
+
+/*
+ * Compare function is more complex than it needs to be because
+ * the return value is only 32 bits and we are doing comparisons
+ * on 64 bit values
+ */
+static int
+xfs_buf_cmp(
+ void *priv,
+ const struct list_head *a,
+ const struct list_head *b)
+{
+ struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
+ struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
+ xfs_daddr_t diff;
+
+ diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
+ if (diff < 0)
+ return -1;
+ if (diff > 0)
+ return 1;
+ return 0;
+}
+
+/*
+ * Submit buffers for write. If wait_list is specified, the buffers are
+ * submitted using sync I/O and placed on the wait list such that the caller can
+ * iowait each buffer. Otherwise async I/O is used and the buffers are released
+ * at I/O completion time. In either case, buffers remain locked until I/O
+ * completes and the buffer is released from the queue.
+ */
+static int
+xfs_buf_delwri_submit_buffers(
+ struct list_head *buffer_list,
+ struct list_head *wait_list)
+{
+ struct xfs_buf *bp, *n;
+ int pinned = 0;
+ struct blk_plug plug;
+
+ list_sort(NULL, buffer_list, xfs_buf_cmp);
+
+ blk_start_plug(&plug);
+ list_for_each_entry_safe(bp, n, buffer_list, b_list) {
+ if (!wait_list) {
+ if (!xfs_buf_trylock(bp))
+ continue;
+ if (xfs_buf_ispinned(bp)) {
+ xfs_buf_unlock(bp);
+ pinned++;
+ continue;
+ }
+ } else {
+ xfs_buf_lock(bp);
+ }
+
+ /*
+ * Someone else might have written the buffer synchronously or
+ * marked it stale in the meantime. In that case only the
+ * _XBF_DELWRI_Q flag got cleared, and we have to drop the
+ * reference and remove it from the list here.
+ */
+ if (!(bp->b_flags & _XBF_DELWRI_Q)) {
+ list_del_init(&bp->b_list);
+ xfs_buf_relse(bp);
+ continue;
+ }
+
+ trace_xfs_buf_delwri_split(bp, _RET_IP_);
+
+ /*
+ * If we have a wait list, each buffer (and associated delwri
+ * queue reference) transfers to it and is submitted
+ * synchronously. Otherwise, drop the buffer from the delwri
+ * queue and submit async.
+ */
+ bp->b_flags &= ~_XBF_DELWRI_Q;
+ bp->b_flags |= XBF_WRITE;
+ if (wait_list) {
+ bp->b_flags &= ~XBF_ASYNC;
+ list_move_tail(&bp->b_list, wait_list);
+ } else {
+ bp->b_flags |= XBF_ASYNC;
+ list_del_init(&bp->b_list);
+ }
+ __xfs_buf_submit(bp, false);
+ }
+ blk_finish_plug(&plug);
+
+ return pinned;
+}
+
+/*
+ * Write out a buffer list asynchronously.
+ *
+ * This will take the @buffer_list, write all non-locked and non-pinned buffers
+ * out and not wait for I/O completion on any of the buffers. This interface
+ * is only safely useable for callers that can track I/O completion by higher
+ * level means, e.g. AIL pushing as the @buffer_list is consumed in this
+ * function.
+ *
+ * Note: this function will skip buffers it would block on, and in doing so
+ * leaves them on @buffer_list so they can be retried on a later pass. As such,
+ * it is up to the caller to ensure that the buffer list is fully submitted or
+ * cancelled appropriately when they are finished with the list. Failure to
+ * cancel or resubmit the list until it is empty will result in leaked buffers
+ * at unmount time.
+ */
+int
+xfs_buf_delwri_submit_nowait(
+ struct list_head *buffer_list)
+{
+ return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
+}
+
+/*
+ * Write out a buffer list synchronously.
+ *
+ * This will take the @buffer_list, write all buffers out and wait for I/O
+ * completion on all of the buffers. @buffer_list is consumed by the function,
+ * so callers must have some other way of tracking buffers if they require such
+ * functionality.
+ */
+int
+xfs_buf_delwri_submit(
+ struct list_head *buffer_list)
+{
+ LIST_HEAD (wait_list);
+ int error = 0, error2;
+ struct xfs_buf *bp;
+
+ xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
+
+ /* Wait for IO to complete. */
+ while (!list_empty(&wait_list)) {
+ bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
+
+ list_del_init(&bp->b_list);
+
+ /*
+ * Wait on the locked buffer, check for errors and unlock and
+ * release the delwri queue reference.
+ */
+ error2 = xfs_buf_iowait(bp);
+ xfs_buf_relse(bp);
+ if (!error)
+ error = error2;
+ }
+
+ return error;
+}
+
+/*
+ * Push a single buffer on a delwri queue.
+ *
+ * The purpose of this function is to submit a single buffer of a delwri queue
+ * and return with the buffer still on the original queue. The waiting delwri
+ * buffer submission infrastructure guarantees transfer of the delwri queue
+ * buffer reference to a temporary wait list. We reuse this infrastructure to
+ * transfer the buffer back to the original queue.
+ *
+ * Note the buffer transitions from the queued state, to the submitted and wait
+ * listed state and back to the queued state during this call. The buffer
+ * locking and queue management logic between _delwri_pushbuf() and
+ * _delwri_queue() guarantee that the buffer cannot be queued to another list
+ * before returning.
+ */
+int
+xfs_buf_delwri_pushbuf(
+ struct xfs_buf *bp,
+ struct list_head *buffer_list)
+{
+ LIST_HEAD (submit_list);
+ int error;
+
+ ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+
+ trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
+
+ /*
+ * Isolate the buffer to a new local list so we can submit it for I/O
+ * independently from the rest of the original list.
+ */
+ xfs_buf_lock(bp);
+ list_move(&bp->b_list, &submit_list);
+ xfs_buf_unlock(bp);
+
+ /*
+ * Delwri submission clears the DELWRI_Q buffer flag and returns with
+ * the buffer on the wait list with the original reference. Rather than
+ * bounce the buffer from a local wait list back to the original list
+ * after I/O completion, reuse the original list as the wait list.
+ */
+ xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
+
+ /*
+ * The buffer is now locked, under I/O and wait listed on the original
+ * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
+ * return with the buffer unlocked and on the original queue.
+ */
+ error = xfs_buf_iowait(bp);
+ bp->b_flags |= _XBF_DELWRI_Q;
+ xfs_buf_unlock(bp);
+
+ return error;
+}
+
+void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
+{
+ /*
+ * Set the lru reference count to 0 based on the error injection tag.
+ * This allows userspace to disrupt buffer caching for debug/testing
+ * purposes.
+ */
+ if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
+ lru_ref = 0;
+
+ atomic_set(&bp->b_lru_ref, lru_ref);
+}
+
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic(
+ struct xfs_buf *bp,
+ __be32 dmagic)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ int idx;
+
+ idx = xfs_has_crc(mp);
+ if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
+ return false;
+ return dmagic == bp->b_ops->magic[idx];
+}
+
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic16(
+ struct xfs_buf *bp,
+ __be16 dmagic)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ int idx;
+
+ idx = xfs_has_crc(mp);
+ if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
+ return false;
+ return dmagic == bp->b_ops->magic16[idx];
+}
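+
+/*
+ * Illustrative sketch, not part of this change: a read verifier typically
+ * checks the on-disk magic with one of the helpers above and marks the
+ * buffer in error on mismatch. The header layout and function name below
+ * are placeholders; real verifiers usually report through
+ * xfs_verifier_error() rather than setting the error directly:
+ *
+ *	static void
+ *	xfs_foo_read_verify(
+ *		struct xfs_buf		*bp)
+ *	{
+ *		struct xfs_foo_hdr	*hdr = bp->b_addr;
+ *
+ *		if (!xfs_verify_magic(bp, hdr->magic))
+ *			xfs_buf_ioerror(bp, -EFSCORRUPTED);
+ *	}
+ */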
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
new file mode 100644
index 000000000..549c60942
--- /dev/null
+++ b/fs/xfs/xfs_buf.h
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_BUF_H__
+#define __XFS_BUF_H__
+
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/dax.h>
+#include <linux/uio.h>
+#include <linux/list_lru.h>
+
+extern struct kmem_cache *xfs_buf_cache;
+
+/*
+ * Base types
+ */
+struct xfs_buf;
+
+#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
+
+#define XBF_READ (1u << 0) /* buffer intended for reading from device */
+#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */
+#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */
+#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
+#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */
+#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */
+#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */
+#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */
+
+/* buffer type flags for write callbacks */
+#define _XBF_INODES (1u << 16)/* inode buffer */
+#define _XBF_DQUOTS (1u << 17)/* dquot buffer */
+#define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
+
+/* flags used only internally */
+#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */
+#define _XBF_KMEM (1u << 21)/* backed by heap memory */
+#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */
+
+/* flags used only as arguments to access routines */
+#define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */
+#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */
+#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */
+
+
+typedef unsigned int xfs_buf_flags_t;
+
+#define XFS_BUF_FLAGS \
+ { XBF_READ, "READ" }, \
+ { XBF_WRITE, "WRITE" }, \
+ { XBF_READ_AHEAD, "READ_AHEAD" }, \
+ { XBF_NO_IOACCT, "NO_IOACCT" }, \
+ { XBF_ASYNC, "ASYNC" }, \
+ { XBF_DONE, "DONE" }, \
+ { XBF_STALE, "STALE" }, \
+ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \
+ { _XBF_INODES, "INODES" }, \
+ { _XBF_DQUOTS, "DQUOTS" }, \
+ { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
+ { _XBF_PAGES, "PAGES" }, \
+ { _XBF_KMEM, "KMEM" }, \
+ { _XBF_DELWRI_Q, "DELWRI_Q" }, \
+ /* The following interface flags should never be set */ \
+ { XBF_INCORE, "INCORE" }, \
+ { XBF_TRYLOCK, "TRYLOCK" }, \
+ { XBF_UNMAPPED, "UNMAPPED" }
+
+/*
+ * Internal state flags.
+ */
+#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
+#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */
+
+/*
+ * The xfs_buftarg contains 2 notions of "sector size" -
+ *
+ * 1) The metadata sector size, which is the minimum unit and
+ * alignment of IO which will be performed by metadata operations.
+ * 2) The device logical sector size
+ *
+ * The first is specified at mkfs time, and is stored on-disk in the
+ * superblock's sb_sectsize.
+ *
+ * The latter is derived from the underlying device, and controls direct IO
+ * alignment constraints.
+ */
+typedef struct xfs_buftarg {
+ dev_t bt_dev;
+ struct block_device *bt_bdev;
+ struct dax_device *bt_daxdev;
+ u64 bt_dax_part_off;
+ struct xfs_mount *bt_mount;
+ unsigned int bt_meta_sectorsize;
+ size_t bt_meta_sectormask;
+ size_t bt_logical_sectorsize;
+ size_t bt_logical_sectormask;
+
+ /* LRU control structures */
+ struct shrinker bt_shrinker;
+ struct list_lru bt_lru;
+
+ struct percpu_counter bt_io_count;
+ struct ratelimit_state bt_ioerror_rl;
+} xfs_buftarg_t;
+
+#define XB_PAGES 2
+
+struct xfs_buf_map {
+ xfs_daddr_t bm_bn; /* block number for I/O */
+ int bm_len; /* size of I/O */
+};
+
+#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \
+ struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
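+
+/*
+ * Illustrative sketch, not part of this change: a discontiguous (compound)
+ * buffer is described by an array of maps and read with xfs_buf_read_map().
+ * blkno1/blkno2, len1/len2, btp and ops are caller-supplied placeholders:
+ *
+ *	struct xfs_buf_map map[2] = {
+ *		{ .bm_bn = blkno1, .bm_len = len1 },
+ *		{ .bm_bn = blkno2, .bm_len = len2 },
+ *	};
+ *	struct xfs_buf *bp;
+ *	int error = xfs_buf_read_map(btp, map, 2, 0, &bp, ops,
+ *			__this_address);
+ */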
+
+struct xfs_buf_ops {
+ char *name;
+ union {
+ __be32 magic[2]; /* v4 and v5 on disk magic values */
+ __be16 magic16[2]; /* v4 and v5 on disk magic values */
+ };
+ void (*verify_read)(struct xfs_buf *);
+ void (*verify_write)(struct xfs_buf *);
+ xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp);
+};
+
+struct xfs_buf {
+ /*
+ * first cacheline holds all the fields needed for an uncontended cache
+ * hit to be fully processed. The semaphore straddles the cacheline
+	 * boundary, but the counter and lock sit on the first cacheline,
+ * which is the only bit that is touched if we hit the semaphore
+ * fast-path on locking.
+ */
+ struct rhash_head b_rhash_head; /* pag buffer hash node */
+
+ xfs_daddr_t b_rhash_key; /* buffer cache index */
+ int b_length; /* size of buffer in BBs */
+ atomic_t b_hold; /* reference count */
+ atomic_t b_lru_ref; /* lru reclaim ref count */
+ xfs_buf_flags_t b_flags; /* status flags */
+ struct semaphore b_sema; /* semaphore for lockables */
+
+ /*
+	 * Concurrent access to b_lru and b_lru_flags is protected by
+	 * bt_lru_lock and not by b_sema.
+ */
+ struct list_head b_lru; /* lru list */
+ spinlock_t b_lock; /* internal state lock */
+ unsigned int b_state; /* internal state flags */
+ int b_io_error; /* internal IO error state */
+ wait_queue_head_t b_waiters; /* unpin waiters */
+ struct list_head b_list;
+ struct xfs_perag *b_pag; /* contains rbtree root */
+ struct xfs_mount *b_mount;
+ struct xfs_buftarg *b_target; /* buffer target (device) */
+ void *b_addr; /* virtual address of buffer */
+ struct work_struct b_ioend_work;
+ struct completion b_iowait; /* queue for I/O waiters */
+ struct xfs_buf_log_item *b_log_item;
+ struct list_head b_li_list; /* Log items list head */
+ struct xfs_trans *b_transp;
+ struct page **b_pages; /* array of page pointers */
+ struct page *b_page_array[XB_PAGES]; /* inline pages */
+ struct xfs_buf_map *b_maps; /* compound buffer map */
+ struct xfs_buf_map __b_map; /* inline compound buffer map */
+ int b_map_count;
+ atomic_t b_pin_count; /* pin count */
+ atomic_t b_io_remaining; /* #outstanding I/O requests */
+ unsigned int b_page_count; /* size of page array */
+ unsigned int b_offset; /* page offset of b_addr,
+ only for _XBF_KMEM buffers */
+ int b_error; /* error code on I/O */
+
+ /*
+	 * Async write failure retry count. Initialised to zero on the first
+	 * failure; when it exceeds the configured maximum without a success,
+	 * the write is considered to have failed permanently and the iodone
+	 * handler will take appropriate action.
+ *
+ * For retry timeouts, we record the jiffie of the first failure. This
+ * means that we can change the retry timeout for buffers already under
+ * I/O and thus avoid getting stuck in a retry loop with a long timeout.
+ *
+ * last_error is used to ensure that we are getting repeated errors, not
+ * different errors. e.g. a block device might change ENOSPC to EIO when
+ * a failure timeout occurs, so we want to re-initialise the error
+ * retry behaviour appropriately when that happens.
+ */
+ int b_retries;
+ unsigned long b_first_retry_time; /* in jiffies */
+ int b_last_error;
+
+ const struct xfs_buf_ops *b_ops;
+ struct rcu_head b_rcu;
+};
+
+/* Finding and Reading Buffers */
+int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
+ int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp);
+int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
+ int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops, xfs_failaddr_t fa);
+void xfs_buf_readahead_map(struct xfs_buftarg *target,
+ struct xfs_buf_map *map, int nmaps,
+ const struct xfs_buf_ops *ops);
+
+static inline int
+xfs_buf_incore(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+
+ return xfs_buf_get_map(target, &map, 1, XBF_INCORE | flags, bpp);
+}
+
+static inline int
+xfs_buf_get(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ struct xfs_buf **bpp)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+
+ return xfs_buf_get_map(target, &map, 1, 0, bpp);
+}
+
+static inline int
+xfs_buf_read(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+
+ return xfs_buf_read_map(target, &map, 1, flags, bpp, ops,
+ __builtin_return_address(0));
+}
+
+static inline void
+xfs_buf_readahead(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ const struct xfs_buf_ops *ops)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+ return xfs_buf_readahead_map(target, &map, 1, ops);
+}
+
+int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
+ xfs_buf_flags_t flags, struct xfs_buf **bpp);
+int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
+ size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops);
+int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags);
+void xfs_buf_hold(struct xfs_buf *bp);
+
+/* Releasing Buffers */
+extern void xfs_buf_rele(struct xfs_buf *);
+
+/* Locking and Unlocking Buffers */
+extern int xfs_buf_trylock(struct xfs_buf *);
+extern void xfs_buf_lock(struct xfs_buf *);
+extern void xfs_buf_unlock(struct xfs_buf *);
+#define xfs_buf_islocked(bp) \
+ ((bp)->b_sema.count <= 0)
+
+static inline void xfs_buf_relse(struct xfs_buf *bp)
+{
+ xfs_buf_unlock(bp);
+ xfs_buf_rele(bp);
+}
+
+/* Buffer Read and Write Routines */
+extern int xfs_bwrite(struct xfs_buf *bp);
+
+extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error,
+ xfs_failaddr_t failaddr);
+#define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address)
+extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa);
+void xfs_buf_ioend_fail(struct xfs_buf *);
+void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize);
+void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa);
+#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address)
+
+/* Buffer Utility Routines */
+extern void *xfs_buf_offset(struct xfs_buf *, size_t);
+extern void xfs_buf_stale(struct xfs_buf *bp);
+
+/* Delayed Write Buffer Routines */
+extern void xfs_buf_delwri_cancel(struct list_head *);
+extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
+extern int xfs_buf_delwri_submit(struct list_head *);
+extern int xfs_buf_delwri_submit_nowait(struct list_head *);
+extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
+
+static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp)
+{
+ return bp->b_maps[0].bm_bn;
+}
+
+void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref);
+
+/*
+ * If the buffer is already on the LRU, do nothing. Otherwise set the buffer
+ * up with a reference count of 0 so it will be tossed from the cache when
+ * released.
+ */
+static inline void xfs_buf_oneshot(struct xfs_buf *bp)
+{
+ if (!list_empty(&bp->b_lru) || atomic_read(&bp->b_lru_ref) > 1)
+ return;
+ atomic_set(&bp->b_lru_ref, 0);
+}
+
+static inline int xfs_buf_ispinned(struct xfs_buf *bp)
+{
+ return atomic_read(&bp->b_pin_count);
+}
+
+static inline int
+xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
+{
+ return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ cksum_offset);
+}
+
+static inline void
+xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
+{
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+ cksum_offset);
+}
+
+/*
+ * Handling of buftargs.
+ */
+struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
+ struct block_device *bdev);
+extern void xfs_free_buftarg(struct xfs_buftarg *);
+extern void xfs_buftarg_wait(struct xfs_buftarg *);
+extern void xfs_buftarg_drain(struct xfs_buftarg *);
+extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int);
+
+#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
+#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
+
+int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
+bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
+
+#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
new file mode 100644
index 000000000..522d450a9
--- /dev/null
+++ b/fs/xfs/xfs_buf_item.c
@@ -0,0 +1,1046 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_quota.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+
+
+struct kmem_cache *xfs_buf_item_cache;
+
+static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_buf_log_item, bli_item);
+}
+
+/* Is this log iovec plausibly large enough to contain the buffer log format? */
+bool
+xfs_buf_log_check_iovec(
+ struct xfs_log_iovec *iovec)
+{
+ struct xfs_buf_log_format *blfp = iovec->i_addr;
+ char *bmp_end;
+ char *item_end;
+
+ if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
+ return false;
+
+ item_end = (char *)iovec->i_addr + iovec->i_len;
+ bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
+ return bmp_end <= item_end;
+}
+
+static inline int
+xfs_buf_log_format_size(
+ struct xfs_buf_log_format *blfp)
+{
+ return offsetof(struct xfs_buf_log_format, blf_data_map) +
+ (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
+}
+
+static inline bool
+xfs_buf_item_straddle(
+ struct xfs_buf *bp,
+ uint offset,
+ int first_bit,
+ int nbits)
+{
+ void *first, *last;
+
+ first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
+ last = xfs_buf_offset(bp,
+ offset + ((first_bit + nbits) << XFS_BLF_SHIFT));
+
+ if (last - first != nbits * XFS_BLF_CHUNK)
+ return true;
+ return false;
+}
+
+/*
+ * Return the number of log iovecs and space needed to log the given buf log
+ * item segment.
+ *
+ * It calculates this as 1 iovec for the buf log format structure and 1 for each
+ * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
+ * in a single iovec.
+ */
+STATIC void
+xfs_buf_item_size_segment(
+ struct xfs_buf_log_item *bip,
+ struct xfs_buf_log_format *blfp,
+ uint offset,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_buf *bp = bip->bli_buf;
+ int first_bit;
+ int nbits;
+ int next_bit;
+ int last_bit;
+
+ first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
+ if (first_bit == -1)
+ return;
+
+ (*nvecs)++;
+ *nbytes += xfs_buf_log_format_size(blfp);
+
+ do {
+ nbits = xfs_contig_bits(blfp->blf_data_map,
+ blfp->blf_map_size, first_bit);
+ ASSERT(nbits > 0);
+
+ /*
+ * Straddling a page is rare because we don't log contiguous
+ * chunks of unmapped buffers anywhere.
+ */
+ if (nbits > 1 &&
+ xfs_buf_item_straddle(bp, offset, first_bit, nbits))
+ goto slow_scan;
+
+ (*nvecs)++;
+ *nbytes += nbits * XFS_BLF_CHUNK;
+
+ /*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there. It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ */
+ first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
+ (uint)first_bit + nbits + 1);
+ } while (first_bit != -1);
+
+ return;
+
+slow_scan:
+ /* Count the first bit we jumped out of the above loop from */
+ (*nvecs)++;
+ *nbytes += XFS_BLF_CHUNK;
+ last_bit = first_bit;
+ while (last_bit != -1) {
+ /*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there. It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ */
+ next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
+ last_bit + 1);
+ /*
+ * If we run out of bits, leave the loop,
+ * else if we find a new set of bits bump the number of vecs,
+ * else keep scanning the current set of bits.
+ */
+ if (next_bit == -1) {
+ break;
+ } else if (next_bit != last_bit + 1 ||
+ xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
+ last_bit = next_bit;
+ first_bit = next_bit;
+ (*nvecs)++;
+ nbits = 1;
+ } else {
+ last_bit++;
+ nbits++;
+ }
+ *nbytes += XFS_BLF_CHUNK;
+ }
+}
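+
+/*
+ * Worked example, for illustration only: with one bit per XFS_BLF_CHUNK
+ * bytes, a blf_data_map of 0b110011 describes two dirty runs of two chunks
+ * each. The fast path above sizes that as one iovec for the buf log format
+ * header plus two iovecs of 2 * XFS_BLF_CHUNK bytes, assuming neither run
+ * straddles a page of an unmapped buffer.
+ */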
+
+/*
+ * Return the number of log iovecs and space needed to log the given buf log
+ * item.
+ *
+ * Discontiguous buffers need a format structure per region that is being
+ * logged. This makes the changes in the buffer appear to log recovery as though
+ * they came from separate buffers, just as would occur if multiple buffers
+ * were used instead of a single discontiguous buffer. This enables
+ * discontiguous buffers to be in-memory constructs, completely transparent to
+ * what ends up on disk.
+ *
+ * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
+ * format structures. If the item has previously been logged and has dirty
+ * regions, we do not relog them in stale buffers. This has the effect of
+ * reducing the size of the relogged item by the amount of dirty data tracked
+ * by the log item. This can result in the committing transaction reducing the
+ * amount of space being consumed by the CIL.
+ */
+STATIC void
+xfs_buf_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ int i;
+ int bytes;
+ uint offset = 0;
+
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ if (bip->bli_flags & XFS_BLI_STALE) {
+ /*
+ * The buffer is stale, so all we need to log is the buf log
+ * format structure with the cancel flag in it as we are never
+ * going to replay the changes tracked in the log item.
+ */
+ trace_xfs_buf_item_size_stale(bip);
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+ *nvecs += bip->bli_format_count;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
+ }
+ return;
+ }
+
+ ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
+
+ if (bip->bli_flags & XFS_BLI_ORDERED) {
+ /*
+ * The buffer has been logged just to order it. It is not being
+ * included in the transaction commit, so no vectors are used at
+ * all.
+ */
+ trace_xfs_buf_item_size_ordered(bip);
+ *nvecs = XFS_LOG_VEC_ORDERED;
+ return;
+ }
+
+ /*
+ * The vector count is based on the number of buffer vectors we have
+ * dirty bits in. This will only be greater than one when we have a
+ * compound buffer with more than one segment dirty. Hence for compound
+ * buffers we need to track which segment the dirty bits correspond to,
+ * and when we move from one segment to the next increment the vector
+ * count for the extra buf log format structure that will need to be
+ * written.
+ */
+ bytes = 0;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset,
+ nvecs, &bytes);
+ offset += BBTOB(bp->b_maps[i].bm_len);
+ }
+
+ /*
+ * Round up the buffer size required to minimise the number of memory
+ * allocations that need to be done as this item grows when relogged by
+ * repeated modifications.
+ */
+ *nbytes = round_up(bytes, 512);
+ trace_xfs_buf_item_size(bip);
+}
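+
+/*
+ * Editor's note (illustrative only): the 512-byte round up affects the space
+ * reservation, not what is written. If the segments above add up to, say,
+ * 700 bytes of format headers and dirty chunks, *nbytes becomes 1024; the
+ * slack means the log vector buffer does not have to be reallocated every
+ * time another chunk is dirtied and the item is relogged.
+ */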
+
+static inline void
+xfs_buf_item_copy_iovec(
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp,
+ struct xfs_buf *bp,
+ uint offset,
+ int first_bit,
+ uint nbits)
+{
+ offset += first_bit * XFS_BLF_CHUNK;
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
+ xfs_buf_offset(bp, offset),
+ nbits * XFS_BLF_CHUNK);
+}
+
+static void
+xfs_buf_item_format_segment(
+ struct xfs_buf_log_item *bip,
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp,
+ uint offset,
+ struct xfs_buf_log_format *blfp)
+{
+ struct xfs_buf *bp = bip->bli_buf;
+ uint base_size;
+ int first_bit;
+ int last_bit;
+ int next_bit;
+ uint nbits;
+
+ /* copy the flags across from the base format item */
+ blfp->blf_flags = bip->__bli_format.blf_flags;
+
+ /*
+ * Base size is the actual size of the ondisk structure - it reflects
+ * the actual size of the dirty bitmap rather than the size of the in
+ * memory structure.
+ */
+ base_size = xfs_buf_log_format_size(blfp);
+
+ first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
+ if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
+ /*
+ * If the map is not dirty in the transaction, there is nothing to
+ * format; return without consuming a vector.
+ */
+ return;
+ }
+
+ blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
+ blfp->blf_size = 1;
+
+ if (bip->bli_flags & XFS_BLI_STALE) {
+ /*
+ * The buffer is stale, so all we need to log
+ * is the buf log format structure with the
+ * cancel flag in it.
+ */
+ trace_xfs_buf_item_format_stale(bip);
+ ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
+ return;
+ }
+
+ /*
+ * Fill in an iovec for each set of contiguous chunks.
+ */
+ do {
+ ASSERT(first_bit >= 0);
+ nbits = xfs_contig_bits(blfp->blf_data_map,
+ blfp->blf_map_size, first_bit);
+ ASSERT(nbits > 0);
+
+ /*
+ * Straddling a page is rare because we don't log contiguous
+ * chunks of unmapped buffers anywhere.
+ */
+ if (nbits > 1 &&
+ xfs_buf_item_straddle(bp, offset, first_bit, nbits))
+ goto slow_scan;
+
+ xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+ first_bit, nbits);
+ blfp->blf_size++;
+
+ /*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there. It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ */
+ first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
+ (uint)first_bit + nbits + 1);
+ } while (first_bit != -1);
+
+ return;
+
+slow_scan:
+ ASSERT(bp->b_addr == NULL);
+ last_bit = first_bit;
+ nbits = 1;
+ for (;;) {
+ /*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there. It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ */
+ next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
+ (uint)last_bit + 1);
+ /*
+ * If we run out of bits fill in the last iovec and get out of
+ * the loop. Else if we start a new set of bits then fill in
+ * the iovec for the series we were looking at and start
+ * counting the bits in the new one. Else we're still in the
+ * same set of bits so just keep counting and scanning.
+ */
+ if (next_bit == -1) {
+ xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+ first_bit, nbits);
+ blfp->blf_size++;
+ break;
+ } else if (next_bit != last_bit + 1 ||
+ xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
+ xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+ first_bit, nbits);
+ blfp->blf_size++;
+ first_bit = next_bit;
+ last_bit = next_bit;
+ nbits = 1;
+ } else {
+ last_bit++;
+ nbits++;
+ }
+ }
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given log buf item. It fills the first entry with a buf log
+ * format structure, and the rest point to contiguous chunks
+ * within the buffer.
+ */
+STATIC void
+xfs_buf_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ struct xfs_log_iovec *vecp = NULL;
+ uint offset = 0;
+ int i;
+
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+ (bip->bli_flags & XFS_BLI_STALE));
+ ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
+ (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
+ && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
+ ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
+ (bip->bli_flags & XFS_BLI_STALE));
+
+ /*
+ * If it is an inode buffer, transfer the in-memory state to the
+ * format flags and clear the in-memory state.
+ *
+ * For buffer based inode allocation, we do not transfer
+ * this state if the inode buffer allocation has not yet been committed
+ * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
+ * correct replay of the inode allocation.
+ *
+ * For icreate item based inode allocation, the buffers aren't written
+ * to the journal during allocation, and hence we should always tag the
+ * buffer as an inode buffer so that the correct unlinked list replay
+ * occurs during recovery.
+ */
+ if (bip->bli_flags & XFS_BLI_INODE_BUF) {
+ if (xfs_has_v3inodes(lip->li_log->l_mp) ||
+ !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+ xfs_log_item_in_current_chkpt(lip)))
+ bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
+ bip->bli_flags &= ~XFS_BLI_INODE_BUF;
+ }
+
+ for (i = 0; i < bip->bli_format_count; i++) {
+ xfs_buf_item_format_segment(bip, lv, &vecp, offset,
+ &bip->bli_formats[i]);
+ offset += BBTOB(bp->b_maps[i].bm_len);
+ }
+
+ /*
+ * Check to make sure everything is consistent.
+ */
+ trace_xfs_buf_item_format(bip);
+}
+
+/*
+ * This is called to pin the buffer associated with the buf log item in memory
+ * so it cannot be written out.
+ *
+ * We also always take a reference to the buffer log item here so that the bli
+ * is held while the item is pinned in memory. This means that we can
+ * unconditionally drop the reference count a transaction holds when the
+ * transaction is completed.
+ */
+STATIC void
+xfs_buf_item_pin(
+ struct xfs_log_item *lip)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+ (bip->bli_flags & XFS_BLI_ORDERED) ||
+ (bip->bli_flags & XFS_BLI_STALE));
+
+ trace_xfs_buf_item_pin(bip);
+
+ atomic_inc(&bip->bli_refcount);
+ atomic_inc(&bip->bli_buf->b_pin_count);
+}
+
+/*
+ * This is called to unpin the buffer associated with the buf log item which
+ * was previously pinned with a call to xfs_buf_item_pin().
+ */
+STATIC void
+xfs_buf_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ int stale = bip->bli_flags & XFS_BLI_STALE;
+ int freed;
+
+ ASSERT(bp->b_log_item == bip);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ trace_xfs_buf_item_unpin(bip);
+
+ /*
+ * Drop the bli ref associated with the pin and grab the hold required
+ * for the I/O simulation failure in the abort case. We have to do this
+ * before the pin count drops because the AIL doesn't acquire a bli
+ * reference. Therefore if the refcount drops to zero, the bli could
+ * still be AIL resident and the buffer submitted for I/O (and freed on
+ * completion) at any point before we return. This can be removed once
+ * the AIL properly holds a reference on the bli.
+ */
+ freed = atomic_dec_and_test(&bip->bli_refcount);
+ if (freed && !stale && remove)
+ xfs_buf_hold(bp);
+ if (atomic_dec_and_test(&bp->b_pin_count))
+ wake_up_all(&bp->b_waiters);
+
+ /* nothing to do but drop the pin count if the bli is active */
+ if (!freed)
+ return;
+
+ if (stale) {
+ ASSERT(bip->bli_flags & XFS_BLI_STALE);
+ ASSERT(xfs_buf_islocked(bp));
+ ASSERT(bp->b_flags & XBF_STALE);
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+ ASSERT(list_empty(&lip->li_trans));
+ ASSERT(!bp->b_transp);
+
+ trace_xfs_buf_item_unpin_stale(bip);
+
+ /*
+ * If we get called here because of an IO error, we may or may
+ * not have the item on the AIL. xfs_trans_ail_delete() will
+ * take care of that situation. xfs_trans_ail_delete() drops
+ * the AIL lock.
+ */
+ if (bip->bli_flags & XFS_BLI_STALE_INODE) {
+ xfs_buf_item_done(bp);
+ xfs_buf_inode_iodone(bp);
+ ASSERT(list_empty(&bp->b_li_list));
+ } else {
+ xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
+ xfs_buf_item_relse(bp);
+ ASSERT(bp->b_log_item == NULL);
+ }
+ xfs_buf_relse(bp);
+ } else if (remove) {
+ /*
+ * The buffer must be locked and held by the caller to simulate
+ * an async I/O failure. We acquired the hold for this case
+ * before the buffer was unpinned.
+ */
+ xfs_buf_lock(bp);
+ bp->b_flags |= XBF_ASYNC;
+ xfs_buf_ioend_fail(bp);
+ }
+}
+
+STATIC uint
+xfs_buf_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ uint rval = XFS_ITEM_SUCCESS;
+
+ if (xfs_buf_ispinned(bp))
+ return XFS_ITEM_PINNED;
+ if (!xfs_buf_trylock(bp)) {
+ /*
+ * If we have just raced with a buffer being pinned and it has
+ * been marked stale, we could end up stalling until someone else
+ * issues a log force to unpin the stale buffer. Check for the
+ * race condition here so xfsaild recognizes the buffer is pinned
+ * and queues a log force to move it along.
+ */
+ if (xfs_buf_ispinned(bp))
+ return XFS_ITEM_PINNED;
+ return XFS_ITEM_LOCKED;
+ }
+
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+
+ trace_xfs_buf_item_push(bip);
+
+ /* has a previous flush failed due to IO errors? */
+ if (bp->b_flags & XBF_WRITE_FAIL) {
+ xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
+ "Failing async write on buffer block 0x%llx. Retrying async write.",
+ (long long)xfs_buf_daddr(bp));
+ }
+
+ if (!xfs_buf_delwri_queue(bp, buffer_list))
+ rval = XFS_ITEM_FLUSHING;
+ xfs_buf_unlock(bp);
+ return rval;
+}
+
+/*
+ * Drop the buffer log item refcount and take appropriate action. This helper
+ * determines whether the bli must be freed or not, since a decrement to zero
+ * does not necessarily mean the bli is unused.
+ *
+ * Return true if the bli is freed, false otherwise.
+ */
+bool
+xfs_buf_item_put(
+ struct xfs_buf_log_item *bip)
+{
+ struct xfs_log_item *lip = &bip->bli_item;
+ bool aborted;
+ bool dirty;
+
+ /* drop the bli ref and return if it wasn't the last one */
+ if (!atomic_dec_and_test(&bip->bli_refcount))
+ return false;
+
+ /*
+ * We dropped the last ref and must free the item if clean or aborted.
+ * If the bli is dirty and non-aborted, the buffer was clean in the
+ * transaction but still awaiting writeback from previous changes. In
+ * that case, the bli is freed on buffer writeback completion.
+ */
+ aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
+ xlog_is_shutdown(lip->li_log);
+ dirty = bip->bli_flags & XFS_BLI_DIRTY;
+ if (dirty && !aborted)
+ return false;
+
+ /*
+ * The bli is aborted or clean. An aborted item may be in the AIL
+ * regardless of dirty state. For example, consider an aborted
+ * transaction that invalidated a dirty bli and cleared the dirty
+ * state.
+ */
+ if (aborted)
+ xfs_trans_ail_delete(lip, 0);
+ xfs_buf_item_relse(bip->bli_buf);
+ return true;
+}
+
+/*
+ * Release the buffer associated with the buf log item. If there is no dirty
+ * logged data associated with the buffer recorded in the buf log item, then
+ * free the buf log item and remove the reference to it in the buffer.
+ *
+ * This call ignores the recursion count. It is only called when the buffer
+ * should REALLY be unlocked, regardless of the recursion count.
+ *
+ * We unconditionally drop the transaction's reference to the log item. If the
+ * item was logged, then another reference was taken when it was pinned, so we
+ * can safely drop the transaction reference now. This also allows us to avoid
+ * potential races with the unpin code freeing the bli by not referencing the
+ * bli after we've dropped the reference count.
+ *
+ * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
+ * if necessary but do not unlock the buffer. This is for support of
+ * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD flag is cleared if we don't
+ * free the item.
+ */
+STATIC void
+xfs_buf_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ bool released;
+ bool hold = bip->bli_flags & XFS_BLI_HOLD;
+ bool stale = bip->bli_flags & XFS_BLI_STALE;
+#if defined(DEBUG) || defined(XFS_WARN)
+ bool ordered = bip->bli_flags & XFS_BLI_ORDERED;
+ bool dirty = bip->bli_flags & XFS_BLI_DIRTY;
+ bool aborted = test_bit(XFS_LI_ABORTED,
+ &lip->li_flags);
+#endif
+
+ trace_xfs_buf_item_release(bip);
+
+ /*
+ * The bli dirty state should match whether the blf has logged segments
+ * except for ordered buffers, where only the bli should be dirty.
+ */
+ ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
+ (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
+ ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
+
+ /*
+ * Clear the buffer's association with this transaction and
+ * per-transaction state from the bli, which has been copied above.
+ */
+ bp->b_transp = NULL;
+ bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
+
+ /*
+ * Unref the item and unlock the buffer unless held or stale. Stale
+ * buffers remain locked until final unpin unless the bli is freed by
+ * the unref call. The latter implies shutdown because buffer
+ * invalidation dirties the bli and transaction.
+ */
+ released = xfs_buf_item_put(bip);
+ if (hold || (stale && !released))
+ return;
+ ASSERT(!stale || aborted);
+ xfs_buf_relse(bp);
+}
+
+STATIC void
+xfs_buf_item_committing(
+ struct xfs_log_item *lip,
+ xfs_csn_t seq)
+{
+ return xfs_buf_item_release(lip);
+}
+
+/*
+ * This is called to find out where the oldest active copy of the
+ * buf log item in the on disk log resides now that the last log
+ * write of it completed at the given lsn.
+ * We always re-log all the dirty data in a buffer, so usually the
+ * latest copy in the on disk log is the only one that matters. For
+ * those cases we simply return the given lsn.
+ *
+ * The one exception to this is for buffers full of newly allocated
+ * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF
+ * flag set, indicating that only the di_next_unlinked fields from the
+ * inodes in the buffers will be replayed during recovery. If the
+ * original newly allocated inode images have not yet been flushed
+ * when the buffer is so relogged, then we need to make sure that we
+ * keep the old images in the 'active' portion of the log. We do this
+ * by returning the original lsn of that transaction here rather than
+ * the current one.
+ */
+STATIC xfs_lsn_t
+xfs_buf_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+
+ trace_xfs_buf_item_committed(bip);
+
+ if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
+ return lip->li_lsn;
+ return lsn;
+}
+
+static const struct xfs_item_ops xfs_buf_item_ops = {
+ .iop_size = xfs_buf_item_size,
+ .iop_format = xfs_buf_item_format,
+ .iop_pin = xfs_buf_item_pin,
+ .iop_unpin = xfs_buf_item_unpin,
+ .iop_release = xfs_buf_item_release,
+ .iop_committing = xfs_buf_item_committing,
+ .iop_committed = xfs_buf_item_committed,
+ .iop_push = xfs_buf_item_push,
+};
+
+STATIC void
+xfs_buf_item_get_format(
+ struct xfs_buf_log_item *bip,
+ int count)
+{
+ ASSERT(bip->bli_formats == NULL);
+ bip->bli_format_count = count;
+
+ if (count == 1) {
+ bip->bli_formats = &bip->__bli_format;
+ return;
+ }
+
+ bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
+ 0);
+}
+
+STATIC void
+xfs_buf_item_free_format(
+ struct xfs_buf_log_item *bip)
+{
+ if (bip->bli_formats != &bip->__bli_format) {
+ kmem_free(bip->bli_formats);
+ bip->bli_formats = NULL;
+ }
+}
+
+/*
+ * Allocate a new buf log item to go with the given buffer.
+ * Set the buffer's b_log_item field to point to the new
+ * buf log item.
+ */
+int
+xfs_buf_item_init(
+ struct xfs_buf *bp,
+ struct xfs_mount *mp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ int chunks;
+ int map_size;
+ int i;
+
+ /*
+ * Check to see if there is already a buf log item for
+ * this buffer. If we do already have one, there is
+ * nothing to do here so return.
+ */
+ ASSERT(bp->b_mount == mp);
+ if (bip) {
+ ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+ ASSERT(!bp->b_transp);
+ ASSERT(bip->bli_buf == bp);
+ return 0;
+ }
+
+ bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
+ bip->bli_buf = bp;
+
+ /*
+ * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
+ * can be divided into. Make sure not to truncate any pieces.
+ * map_size is the size of the bitmap needed to describe the
+ * chunks of the buffer.
+ *
+ * Discontiguous buffer support follows the layout of the underlying
+ * buffer. This makes the implementation as simple as possible.
+ */
+ xfs_buf_item_get_format(bip, bp->b_map_count);
+
+ for (i = 0; i < bip->bli_format_count; i++) {
+ chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
+ XFS_BLF_CHUNK);
+ map_size = DIV_ROUND_UP(chunks, NBWORD);
+
+ if (map_size > XFS_BLF_DATAMAP_SIZE) {
+ kmem_cache_free(xfs_buf_item_cache, bip);
+ xfs_err(mp,
+ "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
+ map_size,
+ BBTOB(bp->b_maps[i].bm_len));
+ return -EFSCORRUPTED;
+ }
+
+ bip->bli_formats[i].blf_type = XFS_LI_BUF;
+ bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
+ bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
+ bip->bli_formats[i].blf_map_size = map_size;
+ }
+
+ bp->b_log_item = bip;
+ xfs_buf_hold(bp);
+ return 0;
+}
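+
+/*
+ * Editor's worked example (assuming 128-byte chunks and 32-bit bitmap
+ * words): a single-segment 4096-byte buffer (bm_len of 8 basic blocks) is
+ * split into DIV_ROUND_UP(4096, 128) == 32 chunks, which fit into
+ * DIV_ROUND_UP(32, 32) == 1 bitmap word, so blf_map_size is 1. A 64k
+ * buffer needs 512 chunks and hence 16 bitmap words.
+ */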
+
+/*
+ * Mark bytes first through last inclusive as dirty in the buf
+ * item's bitmap.
+ */
+static void
+xfs_buf_item_log_segment(
+ uint first,
+ uint last,
+ uint *map)
+{
+ uint first_bit;
+ uint last_bit;
+ uint bits_to_set;
+ uint bits_set;
+ uint word_num;
+ uint *wordp;
+ uint bit;
+ uint end_bit;
+ uint mask;
+
+ ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
+ ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
+
+ /*
+ * Convert byte offsets to bit numbers.
+ */
+ first_bit = first >> XFS_BLF_SHIFT;
+ last_bit = last >> XFS_BLF_SHIFT;
+
+ /*
+ * Calculate the total number of bits to be set.
+ */
+ bits_to_set = last_bit - first_bit + 1;
+
+ /*
+ * Get a pointer to the first word in the bitmap
+ * to set a bit in.
+ */
+ word_num = first_bit >> BIT_TO_WORD_SHIFT;
+ wordp = &map[word_num];
+
+ /*
+ * Calculate the starting bit in the first word.
+ */
+ bit = first_bit & (uint)(NBWORD - 1);
+
+ /*
+ * First set any bits in the first word of our range.
+ * If it starts at bit 0 of the word, it will be
+ * set below rather than here. That is what the variable
+ * bit tells us. The variable bits_set tracks the number
+ * of bits that have been set so far. End_bit is the number
+ * of the last bit to be set in this word plus one.
+ */
+ if (bit) {
+ end_bit = min(bit + bits_to_set, (uint)NBWORD);
+ mask = ((1U << (end_bit - bit)) - 1) << bit;
+ *wordp |= mask;
+ wordp++;
+ bits_set = end_bit - bit;
+ } else {
+ bits_set = 0;
+ }
+
+ /*
+ * Now set bits a whole word at a time that are between
+ * first_bit and last_bit.
+ */
+ while ((bits_to_set - bits_set) >= NBWORD) {
+ *wordp = 0xffffffff;
+ bits_set += NBWORD;
+ wordp++;
+ }
+
+ /*
+ * Finally, set any bits left to be set in one last partial word.
+ */
+ end_bit = bits_to_set - bits_set;
+ if (end_bit) {
+ mask = (1U << end_bit) - 1;
+ *wordp |= mask;
+ }
+}
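+
+/*
+ * Editor's worked example (assuming 128-byte chunks and 32-bit words):
+ * logging bytes 100..300 of a segment gives first_bit == 100 >> 7 == 0 and
+ * last_bit == 300 >> 7 == 2, so bits_to_set == 3. The range starts at bit 0
+ * of word 0, so the leading partial-word branch is skipped and the final
+ * partial-word mask (1 << 3) - 1 == 0x7 is ORed in, marking the chunks
+ * covering bytes 0..383 of the segment dirty.
+ */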
+
+/*
+ * Mark bytes first through last inclusive as dirty in the buf
+ * item's bitmap.
+ */
+void
+xfs_buf_item_log(
+ struct xfs_buf_log_item *bip,
+ uint first,
+ uint last)
+{
+ int i;
+ uint start;
+ uint end;
+ struct xfs_buf *bp = bip->bli_buf;
+
+ /*
+ * walk each buffer segment and mark them dirty appropriately.
+ */
+ start = 0;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ if (start > last)
+ break;
+ end = start + BBTOB(bp->b_maps[i].bm_len) - 1;
+
+ /* skip to the map that includes the first byte to log */
+ if (first > end) {
+ start += BBTOB(bp->b_maps[i].bm_len);
+ continue;
+ }
+
+ /*
+ * Trim the range to this segment and mark it in the bitmap.
+ * Note that we must convert buffer offsets to segment relative
+ * offsets (e.g., the first byte of each segment is byte 0 of
+ * that segment).
+ */
+ if (first < start)
+ first = start;
+ if (end > last)
+ end = last;
+ xfs_buf_item_log_segment(first - start, end - start,
+ &bip->bli_formats[i].blf_data_map[0]);
+
+ start += BBTOB(bp->b_maps[i].bm_len);
+ }
+}
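+
+/*
+ * Editor's usage note (hedged): callers do not normally invoke
+ * xfs_buf_item_log() directly. A transaction that modified bytes 100..300
+ * of a locked buffer would typically go through the transaction layer,
+ * e.g. xfs_trans_log_buf(tp, bp, 100, 300), which marks the bli dirty and
+ * ends up here to set the corresponding bitmap bits in each affected
+ * segment.
+ */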
+
+/*
+ * Return true if the buffer has any ranges logged/dirtied by a transaction,
+ * false otherwise.
+ */
+bool
+xfs_buf_item_dirty_format(
+ struct xfs_buf_log_item *bip)
+{
+ int i;
+
+ for (i = 0; i < bip->bli_format_count; i++) {
+ if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+ bip->bli_formats[i].blf_map_size))
+ return true;
+ }
+
+ return false;
+}
+
+STATIC void
+xfs_buf_item_free(
+ struct xfs_buf_log_item *bip)
+{
+ xfs_buf_item_free_format(bip);
+ kmem_free(bip->bli_item.li_lv_shadow);
+ kmem_cache_free(xfs_buf_item_cache, bip);
+}
+
+/*
+ * xfs_buf_item_relse() is called when the buf log item is no longer needed.
+ */
+void
+xfs_buf_item_relse(
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ trace_xfs_buf_item_relse(bp, _RET_IP_);
+ ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
+
+ bp->b_log_item = NULL;
+ xfs_buf_rele(bp);
+ xfs_buf_item_free(bip);
+}
+
+void
+xfs_buf_item_done(
+ struct xfs_buf *bp)
+{
+ /*
+ * If we are forcibly shutting down, this may well be off the AIL
+ * already. That's because we simulate the log-committed callbacks to
+ * unpin these buffers. Or we may never have put this item on the AIL
+ * because the transaction was aborted forcibly.
+ * xfs_trans_ail_delete() takes care of these.
+ *
+ * Either way, AIL is useless if we're forcing a shutdown.
+ *
+ * Note that log recovery writes might have buffer items that are not on
+ * the AIL even when the file system is not shut down.
+ */
+ xfs_trans_ail_delete(&bp->b_log_item->bli_item,
+ (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
+ SHUTDOWN_CORRUPT_INCORE);
+ xfs_buf_item_relse(bp);
+}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
new file mode 100644
index 000000000..4d8a6aece
--- /dev/null
+++ b/fs/xfs/xfs_buf_item.h
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_BUF_ITEM_H__
+#define __XFS_BUF_ITEM_H__
+
+/* kernel only definitions */
+
+struct xfs_buf;
+struct xfs_mount;
+
+/* buf log item flags */
+#define XFS_BLI_HOLD (1u << 0)
+#define XFS_BLI_DIRTY (1u << 1)
+#define XFS_BLI_STALE (1u << 2)
+#define XFS_BLI_LOGGED (1u << 3)
+#define XFS_BLI_INODE_ALLOC_BUF (1u << 4)
+#define XFS_BLI_STALE_INODE (1u << 5)
+#define XFS_BLI_INODE_BUF (1u << 6)
+#define XFS_BLI_ORDERED (1u << 7)
+
+#define XFS_BLI_FLAGS \
+ { XFS_BLI_HOLD, "HOLD" }, \
+ { XFS_BLI_DIRTY, "DIRTY" }, \
+ { XFS_BLI_STALE, "STALE" }, \
+ { XFS_BLI_LOGGED, "LOGGED" }, \
+ { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
+ { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
+ { XFS_BLI_INODE_BUF, "INODE_BUF" }, \
+ { XFS_BLI_ORDERED, "ORDERED" }
+
+/*
+ * This is the in core log item structure used to track information
+ * needed to log buffers. It tracks how many times the lock has been
+ * locked, and which 128 byte chunks of the buffer are dirty.
+ */
+struct xfs_buf_log_item {
+ struct xfs_log_item bli_item; /* common item structure */
+ struct xfs_buf *bli_buf; /* real buffer pointer */
+ unsigned int bli_flags; /* misc flags */
+ unsigned int bli_recur; /* lock recursion count */
+ atomic_t bli_refcount; /* cnt of tp refs */
+ int bli_format_count; /* count of headers */
+ struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */
+ struct xfs_buf_log_format __bli_format; /* embedded in-log header */
+};
+
+int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+void xfs_buf_item_done(struct xfs_buf *bp);
+void xfs_buf_item_relse(struct xfs_buf *);
+bool xfs_buf_item_put(struct xfs_buf_log_item *);
+void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
+bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
+void xfs_buf_inode_iodone(struct xfs_buf *);
+void xfs_buf_inode_io_fail(struct xfs_buf *bp);
+#ifdef CONFIG_XFS_QUOTA
+void xfs_buf_dquot_iodone(struct xfs_buf *);
+void xfs_buf_dquot_io_fail(struct xfs_buf *bp);
+#else
+static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp)
+{
+}
+static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp)
+{
+}
+#endif /* CONFIG_XFS_QUOTA */
+void xfs_buf_iodone(struct xfs_buf *);
+bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
+
+extern struct kmem_cache *xfs_buf_item_cache;
+
+#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
new file mode 100644
index 000000000..43167f543
--- /dev/null
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -0,0 +1,1071 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+#include "xfs_error.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_quota.h"
+
+/*
+ * This is the number of entries in the l_buf_cancel_table used during
+ * recovery.
+ */
+#define XLOG_BC_TABLE_SIZE 64
+
+#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
+ ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))
+
+/*
+ * This structure is used during recovery to record the buf log items which
+ * have been canceled and should not be replayed.
+ */
+struct xfs_buf_cancel {
+ xfs_daddr_t bc_blkno;
+ uint bc_len;
+ int bc_refcount;
+ struct list_head bc_list;
+};
+
+static struct xfs_buf_cancel *
+xlog_find_buffer_cancelled(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len)
+{
+ struct list_head *bucket;
+ struct xfs_buf_cancel *bcp;
+
+ if (!log->l_buf_cancel_table)
+ return NULL;
+
+ bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
+ list_for_each_entry(bcp, bucket, bc_list) {
+ if (bcp->bc_blkno == blkno && bcp->bc_len == len)
+ return bcp;
+ }
+
+ return NULL;
+}
+
+static bool
+xlog_add_buffer_cancelled(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len)
+{
+ struct xfs_buf_cancel *bcp;
+
+ /*
+ * If we find an existing cancel record, this indicates that the buffer
+ * was cancelled multiple times. To ensure that during pass 2 we keep
+ * the record in the table until we reach its last occurrence in the
+ * log, a reference count is kept to tell how many times we expect to
+ * see this record during the second pass.
+ */
+ bcp = xlog_find_buffer_cancelled(log, blkno, len);
+ if (bcp) {
+ bcp->bc_refcount++;
+ return false;
+ }
+
+ bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
+ bcp->bc_blkno = blkno;
+ bcp->bc_len = len;
+ bcp->bc_refcount = 1;
+ list_add_tail(&bcp->bc_list, XLOG_BUF_CANCEL_BUCKET(log, blkno));
+ return true;
+}
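+
+/*
+ * Editor's worked example: if the same buffer (blkno, len) is invalidated
+ * twice in the log, pass 1 leaves a single record with bc_refcount == 2 via
+ * the increment above. Each cancel item seen in pass 2 drops the count in
+ * xlog_put_buffer_cancelled(); once it hits zero the record is removed, so
+ * a later reuse of those blocks in the log is replayed normally instead of
+ * being skipped.
+ */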
+
+/*
+ * Check if there is an entry for blkno, len in the buffer cancel record table.
+ */
+bool
+xlog_is_buffer_cancelled(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len)
+{
+ return xlog_find_buffer_cancelled(log, blkno, len) != NULL;
+}
+
+/*
+ * Check if there is an entry for blkno, len in the buffer cancel record table,
+ * and decrement the reference count on it if there is one.
+ *
+ * Remove the cancel record once the refcount hits zero, so that if the same
+ * buffer is re-used again after its last cancellation we actually replay the
+ * changes made at that point.
+ */
+static bool
+xlog_put_buffer_cancelled(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len)
+{
+ struct xfs_buf_cancel *bcp;
+
+ bcp = xlog_find_buffer_cancelled(log, blkno, len);
+ if (!bcp) {
+ ASSERT(0);
+ return false;
+ }
+
+ if (--bcp->bc_refcount == 0) {
+ list_del(&bcp->bc_list);
+ kmem_free(bcp);
+ }
+ return true;
+}
+
+/* log buffer item recovery */
+
+/*
+ * Sort buffer items for log recovery. Most buffer items should end up on the
+ * buffer list and are recovered first, with the following exceptions:
+ *
+ * 1. XFS_BLF_CANCEL buffers must be processed last because some log items
+ * might depend on the incore cancellation record, and replaying a cancelled
+ * buffer item can remove the incore record.
+ *
+ * 2. XFS_BLF_INODE_BUF buffers are handled after most regular items so that
+ * we replay di_next_unlinked only after flushing the inode 'free' state
+ * to the inode buffer.
+ *
+ * See xlog_recover_reorder_trans for more details.
+ */
+STATIC enum xlog_recover_reorder
+xlog_recover_buf_reorder(
+ struct xlog_recover_item *item)
+{
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+
+ if (buf_f->blf_flags & XFS_BLF_CANCEL)
+ return XLOG_REORDER_CANCEL_LIST;
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
+ return XLOG_REORDER_INODE_BUFFER_LIST;
+ return XLOG_REORDER_BUFFER_LIST;
+}
+
+STATIC void
+xlog_recover_buf_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+
+ xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL);
+}
+
+/*
+ * Build up the table of buf cancel records so that we don't replay cancelled
+ * data in the second pass.
+ */
+static int
+xlog_recover_buf_commit_pass1(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr;
+
+ if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) {
+ xfs_err(log->l_mp, "bad buffer log item size (%d)",
+ item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ if (!(bf->blf_flags & XFS_BLF_CANCEL))
+ trace_xfs_log_recover_buf_not_cancel(log, bf);
+ else if (xlog_add_buffer_cancelled(log, bf->blf_blkno, bf->blf_len))
+ trace_xfs_log_recover_buf_cancel_add(log, bf);
+ else
+ trace_xfs_log_recover_buf_cancel_ref_inc(log, bf);
+ return 0;
+}
+
+/*
+ * Validate the recovered buffer is of the correct type and attach the
+ * appropriate buffer operations to them for writeback. Magic numbers are in a
+ * few places:
+ * the first 16 bits of the buffer (inode buffer, dquot buffer),
+ * the first 32 bits of the buffer (most blocks),
+ * inside a struct xfs_da_blkinfo at the start of the buffer.
+ */
+static void
+xlog_recover_validate_buf_type(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f,
+ xfs_lsn_t current_lsn)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+ uint32_t magic32;
+ uint16_t magic16;
+ uint16_t magicda;
+ char *warnmsg = NULL;
+
+ /*
+ * We can only do post-recovery validation on CRC-enabled filesystems, as
+ * we need to know when the buffer was written to determine whether we
+ * should have replayed the item. If we replay old metadata over a newer
+ * buffer, it will enter a temporarily inconsistent state that results in
+ * verification failures. Hence for now just avoid the verification stage
+ * for non-CRC filesystems.
+ */
+ if (!xfs_has_crc(mp))
+ return;
+
+ magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
+ magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
+ magicda = be16_to_cpu(info->magic);
+ switch (xfs_blft_from_flags(buf_f)) {
+ case XFS_BLFT_BTREE_BUF:
+ switch (magic32) {
+ case XFS_ABTB_CRC_MAGIC:
+ case XFS_ABTB_MAGIC:
+ bp->b_ops = &xfs_bnobt_buf_ops;
+ break;
+ case XFS_ABTC_CRC_MAGIC:
+ case XFS_ABTC_MAGIC:
+ bp->b_ops = &xfs_cntbt_buf_ops;
+ break;
+ case XFS_IBT_CRC_MAGIC:
+ case XFS_IBT_MAGIC:
+ bp->b_ops = &xfs_inobt_buf_ops;
+ break;
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_FIBT_MAGIC:
+ bp->b_ops = &xfs_finobt_buf_ops;
+ break;
+ case XFS_BMAP_CRC_MAGIC:
+ case XFS_BMAP_MAGIC:
+ bp->b_ops = &xfs_bmbt_buf_ops;
+ break;
+ case XFS_RMAP_CRC_MAGIC:
+ bp->b_ops = &xfs_rmapbt_buf_ops;
+ break;
+ case XFS_REFC_CRC_MAGIC:
+ bp->b_ops = &xfs_refcountbt_buf_ops;
+ break;
+ default:
+ warnmsg = "Bad btree block magic!";
+ break;
+ }
+ break;
+ case XFS_BLFT_AGF_BUF:
+ if (magic32 != XFS_AGF_MAGIC) {
+ warnmsg = "Bad AGF block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_agf_buf_ops;
+ break;
+ case XFS_BLFT_AGFL_BUF:
+ if (magic32 != XFS_AGFL_MAGIC) {
+ warnmsg = "Bad AGFL block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_agfl_buf_ops;
+ break;
+ case XFS_BLFT_AGI_BUF:
+ if (magic32 != XFS_AGI_MAGIC) {
+ warnmsg = "Bad AGI block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_agi_buf_ops;
+ break;
+ case XFS_BLFT_UDQUOT_BUF:
+ case XFS_BLFT_PDQUOT_BUF:
+ case XFS_BLFT_GDQUOT_BUF:
+#ifdef CONFIG_XFS_QUOTA
+ if (magic16 != XFS_DQUOT_MAGIC) {
+ warnmsg = "Bad DQUOT block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dquot_buf_ops;
+#else
+ xfs_alert(mp,
+ "Trying to recover dquots without QUOTA support built in!");
+ ASSERT(0);
+#endif
+ break;
+ case XFS_BLFT_DINO_BUF:
+ if (magic16 != XFS_DINODE_MAGIC) {
+ warnmsg = "Bad INODE block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_inode_buf_ops;
+ break;
+ case XFS_BLFT_SYMLINK_BUF:
+ if (magic32 != XFS_SYMLINK_MAGIC) {
+ warnmsg = "Bad symlink block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_symlink_buf_ops;
+ break;
+ case XFS_BLFT_DIR_BLOCK_BUF:
+ if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
+ magic32 != XFS_DIR3_BLOCK_MAGIC) {
+ warnmsg = "Bad dir block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dir3_block_buf_ops;
+ break;
+ case XFS_BLFT_DIR_DATA_BUF:
+ if (magic32 != XFS_DIR2_DATA_MAGIC &&
+ magic32 != XFS_DIR3_DATA_MAGIC) {
+ warnmsg = "Bad dir data magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ break;
+ case XFS_BLFT_DIR_FREE_BUF:
+ if (magic32 != XFS_DIR2_FREE_MAGIC &&
+ magic32 != XFS_DIR3_FREE_MAGIC) {
+ warnmsg = "Bad dir3 free magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dir3_free_buf_ops;
+ break;
+ case XFS_BLFT_DIR_LEAF1_BUF:
+ if (magicda != XFS_DIR2_LEAF1_MAGIC &&
+ magicda != XFS_DIR3_LEAF1_MAGIC) {
+ warnmsg = "Bad dir leaf1 magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ break;
+ case XFS_BLFT_DIR_LEAFN_BUF:
+ if (magicda != XFS_DIR2_LEAFN_MAGIC &&
+ magicda != XFS_DIR3_LEAFN_MAGIC) {
+ warnmsg = "Bad dir leafn magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ break;
+ case XFS_BLFT_DA_NODE_BUF:
+ if (magicda != XFS_DA_NODE_MAGIC &&
+ magicda != XFS_DA3_NODE_MAGIC) {
+ warnmsg = "Bad da node magic!";
+ break;
+ }
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ break;
+ case XFS_BLFT_ATTR_LEAF_BUF:
+ if (magicda != XFS_ATTR_LEAF_MAGIC &&
+ magicda != XFS_ATTR3_LEAF_MAGIC) {
+ warnmsg = "Bad attr leaf magic!";
+ break;
+ }
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ break;
+ case XFS_BLFT_ATTR_RMT_BUF:
+ if (magic32 != XFS_ATTR3_RMT_MAGIC) {
+ warnmsg = "Bad attr remote magic!";
+ break;
+ }
+ bp->b_ops = &xfs_attr3_rmt_buf_ops;
+ break;
+ case XFS_BLFT_SB_BUF:
+ if (magic32 != XFS_SB_MAGIC) {
+ warnmsg = "Bad SB block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_sb_buf_ops;
+ break;
+#ifdef CONFIG_XFS_RT
+ case XFS_BLFT_RTBITMAP_BUF:
+ case XFS_BLFT_RTSUMMARY_BUF:
+ /* no magic numbers for verification of RT buffers */
+ bp->b_ops = &xfs_rtbuf_ops;
+ break;
+#endif /* CONFIG_XFS_RT */
+ default:
+ xfs_warn(mp, "Unknown buffer type %d!",
+ xfs_blft_from_flags(buf_f));
+ break;
+ }
+
+ /*
+ * Nothing else to do in the case of a NULL current LSN as this means
+ * the buffer is more recent than the change in the log and will be
+ * skipped.
+ */
+ if (current_lsn == NULLCOMMITLSN)
+ return;
+
+ if (warnmsg) {
+ xfs_warn(mp, warnmsg);
+ ASSERT(0);
+ }
+
+ /*
+ * We must update the metadata LSN of the buffer as it is written out to
+ * ensure that older transactions never replay over this one and corrupt
+ * the buffer. This can occur if log recovery is interrupted at some
+ * point after the current transaction completes, at which point a
+ * subsequent mount starts recovery from the beginning.
+ *
+ * Write verifiers update the metadata LSN from log items attached to
+ * the buffer. Therefore, initialize a bli purely to carry the LSN to
+ * the verifier.
+ */
+ if (bp->b_ops) {
+ struct xfs_buf_log_item *bip;
+
+ bp->b_flags |= _XBF_LOGRECOVERY;
+ xfs_buf_item_init(bp, mp);
+ bip = bp->b_log_item;
+ bip->bli_item.li_lsn = current_lsn;
+ }
+}
+
+/*
+ * Perform a 'normal' buffer recovery. Each logged region of the
+ * buffer should be copied over the corresponding region in the
+ * given buffer. The bitmap in the buf log format structure indicates
+ * where to place the logged data.
+ */
+STATIC void
+xlog_recover_do_reg_buffer(
+ struct xfs_mount *mp,
+ struct xlog_recover_item *item,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f,
+ xfs_lsn_t current_lsn)
+{
+ int i;
+ int bit;
+ int nbits;
+ xfs_failaddr_t fa;
+ const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot);
+
+ trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
+
+ bit = 0;
+ i = 1; /* 0 is the buf format structure */
+ while (1) {
+ bit = xfs_next_bit(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+ if (bit == -1)
+ break;
+ nbits = xfs_contig_bits(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+ ASSERT(nbits > 0);
+ ASSERT(item->ri_buf[i].i_addr != NULL);
+ ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
+ ASSERT(BBTOB(bp->b_length) >=
+ ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
+
+ /*
+ * The dirty regions logged in the buffer, even though
+ * contiguous, may span multiple chunks. This is because the
+ * dirty region may span a physical page boundary in a buffer
+ * and hence be split into two separate vectors for writing into
+ * the log. Hence we need to trim nbits back to the length of
+ * the current region being copied out of the log.
+ */
+ if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
+ nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
+
+ /*
+ * Do a sanity check if this is a dquot buffer. Just checking
+ * the first dquot in the buffer should do. XXX This is
+ * probably a good thing to do for other buf types also.
+ */
+ fa = NULL;
+ if (buf_f->blf_flags &
+ (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
+ if (item->ri_buf[i].i_addr == NULL) {
+ xfs_alert(mp,
+ "XFS: NULL dquot in %s.", __func__);
+ goto next;
+ }
+ if (item->ri_buf[i].i_len < size_disk_dquot) {
+ xfs_alert(mp,
+ "XFS: dquot too small (%d) in %s.",
+ item->ri_buf[i].i_len, __func__);
+ goto next;
+ }
+ fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1);
+ if (fa) {
+ xfs_alert(mp,
+ "dquot corrupt at %pS trying to replay into block 0x%llx",
+ fa, xfs_buf_daddr(bp));
+ goto next;
+ }
+ }
+
+ memcpy(xfs_buf_offset(bp,
+ (uint)bit << XFS_BLF_SHIFT), /* dest */
+ item->ri_buf[i].i_addr, /* source */
+ nbits<<XFS_BLF_SHIFT); /* length */
+ next:
+ i++;
+ bit += nbits;
+ }
+
+ /* Shouldn't be any more regions */
+ ASSERT(i == item->ri_total);
+
+ xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
+}
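+
+/*
+ * Editor's worked example (assuming 128-byte chunks): if the format bitmap
+ * has a run of 4 bits starting at bit 2, the loop above copies
+ * item->ri_buf[1] (the first region after the format structure) into the
+ * buffer at byte offset 2 << 7 == 256 for up to 4 * 128 == 512 bytes,
+ * trimmed to the region's actual i_len when the dirty run was split across
+ * log iovecs.
+ */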
+
+/*
+ * Perform a dquot buffer recovery.
+ * Simple algorithm: if we have found a QUOTAOFF log item of the same type
+ * (ie. USR or GRP), then just toss this buffer away; don't recover it.
+ * Else, treat it as a regular buffer and do recovery.
+ *
+ * Return false if the buffer was tossed and true if we recovered the buffer to
+ * indicate to the caller if the buffer needs writing.
+ */
+STATIC bool
+xlog_recover_do_dquot_buffer(
+ struct xfs_mount *mp,
+ struct xlog *log,
+ struct xlog_recover_item *item,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f)
+{
+ uint type;
+
+ trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
+
+ /*
+ * Filesystems are required to send in quota flags at mount time.
+ */
+ if (!mp->m_qflags)
+ return false;
+
+ type = 0;
+ if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
+ type |= XFS_DQTYPE_USER;
+ if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
+ type |= XFS_DQTYPE_PROJ;
+ if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
+ type |= XFS_DQTYPE_GROUP;
+ /*
+ * This type of quotas was turned off, so ignore this buffer
+ */
+ if (log->l_quotaoffs_flag & type)
+ return false;
+
+ xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
+ return true;
+}
+
+/*
+ * Perform recovery for a buffer full of inodes. In these buffers, the only
+ * data which should be recovered is that which corresponds to the
+ * di_next_unlinked pointers in the on disk inode structures. The rest of the
+ * data for the inodes is always logged through the inodes themselves rather
+ * than the inode buffer and is recovered in xlog_recover_inode_pass2().
+ *
+ * The only time when buffers full of inodes are fully recovered is when the
+ * buffer is full of newly allocated inodes. In this case the buffer will
+ * not be marked as an inode buffer and so will be sent to
+ * xlog_recover_do_reg_buffer() below during recovery.
+ */
+STATIC int
+xlog_recover_do_inode_buffer(
+ struct xfs_mount *mp,
+ struct xlog_recover_item *item,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f)
+{
+ int i;
+ int item_index = 0;
+ int bit = 0;
+ int nbits = 0;
+ int reg_buf_offset = 0;
+ int reg_buf_bytes = 0;
+ int next_unlinked_offset;
+ int inodes_per_buf;
+ xfs_agino_t *logged_nextp;
+ xfs_agino_t *buffer_nextp;
+
+ trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
+
+ /*
+ * Post recovery validation only works properly on CRC enabled
+ * filesystems.
+ */
+ if (xfs_has_crc(mp))
+ bp->b_ops = &xfs_inode_buf_ops;
+
+ inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
+ for (i = 0; i < inodes_per_buf; i++) {
+ next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
+ offsetof(struct xfs_dinode, di_next_unlinked);
+
+ while (next_unlinked_offset >=
+ (reg_buf_offset + reg_buf_bytes)) {
+ /*
+ * The next di_next_unlinked field is beyond
+ * the current logged region. Find the next
+ * logged region that contains or is beyond
+ * the current di_next_unlinked field.
+ */
+ bit += nbits;
+ bit = xfs_next_bit(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+
+ /*
+ * If there are no more logged regions in the
+ * buffer, then we're done.
+ */
+ if (bit == -1)
+ return 0;
+
+ nbits = xfs_contig_bits(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+ ASSERT(nbits > 0);
+ reg_buf_offset = bit << XFS_BLF_SHIFT;
+ reg_buf_bytes = nbits << XFS_BLF_SHIFT;
+ item_index++;
+ }
+
+ /*
+ * If the current logged region starts after the current
+ * di_next_unlinked field, then move on to the next
+ * di_next_unlinked field.
+ */
+ if (next_unlinked_offset < reg_buf_offset)
+ continue;
+
+ ASSERT(item->ri_buf[item_index].i_addr != NULL);
+ ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
+ ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));
+
+ /*
+ * The current logged region contains a copy of the
+ * current di_next_unlinked field. Extract its value
+ * and copy it to the buffer copy.
+ */
+ logged_nextp = item->ri_buf[item_index].i_addr +
+ next_unlinked_offset - reg_buf_offset;
+ if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) {
+ xfs_alert(mp,
+ "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). "
+ "Trying to replay bad (0) inode di_next_unlinked field.",
+ item, bp);
+ return -EFSCORRUPTED;
+ }
+
+ buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
+ *buffer_nextp = *logged_nextp;
+
+ /*
+ * If necessary, recalculate the CRC in the on-disk inode. We
+ * have to leave the inode in a consistent state for whoever
+ * reads it next....
+ */
+ xfs_dinode_calc_crc(mp,
+ xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
+
+ }
+
+ return 0;
+}
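+
+/*
+ * Editor's worked example (illustrative inode size): with 512-byte inodes,
+ * the di_next_unlinked field of inode 3 in the buffer lives at byte offset
+ * 3 * 512 + offsetof(struct xfs_dinode, di_next_unlinked). The loop above
+ * walks the logged regions until it reaches or passes that offset, then
+ * copies only the logged value over the on-disk field and recalculates the
+ * inode CRC, leaving the rest of the inode core untouched.
+ */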
+
+/*
+ * V5 filesystems know the age of the buffer on disk being recovered. We can
+ * have newer objects on disk than we are replaying, and so for these cases we
+ * don't want to replay the current change as that will make the buffer contents
+ * temporarily invalid on disk.
+ *
+ * The magic number might not match the buffer type we are going to recover
+ * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
+ * extract the LSN of the existing object in the buffer based on its current
+ * magic number. If we don't recognise the magic number in the buffer, then
+ * return an LSN of -1 so that the caller knows it was an unrecognised block and
+ * so can recover the buffer.
+ *
+ * Note: we cannot rely solely on magic number matches to determine that the
+ * buffer has a valid LSN - we also need to verify that it belongs to this
+ * filesystem, so we also extract the object's UUID and compare it to the one
+ * read from the superblock. If the UUIDs don't match, then we've got a
+ * stale metadata block from an old filesystem instance that we need to recover
+ * over the top of.
+ */
+static xfs_lsn_t
+xlog_recover_get_buf_lsn(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f)
+{
+ uint32_t magic32;
+ uint16_t magic16;
+ uint16_t magicda;
+ void *blk = bp->b_addr;
+ uuid_t *uuid;
+ xfs_lsn_t lsn = -1;
+ uint16_t blft;
+
+ /* v4 filesystems always recover immediately */
+ if (!xfs_has_crc(mp))
+ goto recover_immediately;
+
+ /*
+ * realtime bitmap and summary file blocks do not have magic numbers or
+ * UUIDs, so we must recover them immediately.
+ */
+ blft = xfs_blft_from_flags(buf_f);
+ if (blft == XFS_BLFT_RTBITMAP_BUF || blft == XFS_BLFT_RTSUMMARY_BUF)
+ goto recover_immediately;
+
+ magic32 = be32_to_cpu(*(__be32 *)blk);
+ switch (magic32) {
+ case XFS_ABTB_CRC_MAGIC:
+ case XFS_ABTC_CRC_MAGIC:
+ case XFS_ABTB_MAGIC:
+ case XFS_ABTC_MAGIC:
+ case XFS_RMAP_CRC_MAGIC:
+ case XFS_REFC_CRC_MAGIC:
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_FIBT_MAGIC:
+ case XFS_IBT_CRC_MAGIC:
+ case XFS_IBT_MAGIC: {
+ struct xfs_btree_block *btb = blk;
+
+ lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
+ uuid = &btb->bb_u.s.bb_uuid;
+ break;
+ }
+ case XFS_BMAP_CRC_MAGIC:
+ case XFS_BMAP_MAGIC: {
+ struct xfs_btree_block *btb = blk;
+
+ lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
+ uuid = &btb->bb_u.l.bb_uuid;
+ break;
+ }
+ case XFS_AGF_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+ uuid = &((struct xfs_agf *)blk)->agf_uuid;
+ break;
+ case XFS_AGFL_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+ uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
+ break;
+ case XFS_AGI_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+ uuid = &((struct xfs_agi *)blk)->agi_uuid;
+ break;
+ case XFS_SYMLINK_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+ uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
+ break;
+ case XFS_DIR3_BLOCK_MAGIC:
+ case XFS_DIR3_DATA_MAGIC:
+ case XFS_DIR3_FREE_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+ uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
+ break;
+ case XFS_ATTR3_RMT_MAGIC:
+ /*
+ * Remote attr blocks are written synchronously, rather than
+ * being logged. That means they do not contain a valid LSN
+ * (i.e. transactionally ordered) in them, and hence any time we
+ * see a buffer to replay over the top of a remote attribute
+ * block we should simply do so.
+ */
+ goto recover_immediately;
+ case XFS_SB_MAGIC:
+ /*
+ * superblock uuids are magic. We may or may not have a
+ * sb_meta_uuid on disk, but it will be set in the in-core
+ * superblock. We set the uuid pointer for verification
+ * according to the superblock feature mask to ensure we check
+ * the relevant UUID in the superblock.
+ */
+ lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
+ if (xfs_has_metauuid(mp))
+ uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
+ else
+ uuid = &((struct xfs_dsb *)blk)->sb_uuid;
+ break;
+ default:
+ break;
+ }
+
+ if (lsn != (xfs_lsn_t)-1) {
+ if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
+ goto recover_immediately;
+ return lsn;
+ }
+
+ magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
+ switch (magicda) {
+ case XFS_DIR3_LEAF1_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+ uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
+ break;
+ default:
+ break;
+ }
+
+ if (lsn != (xfs_lsn_t)-1) {
+ if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
+ goto recover_immediately;
+ return lsn;
+ }
+
+ /*
+ * We do individual object checks on dquot and inode buffers as they
+ * have their own individual LSN records. Also, we could have a stale
+ * buffer here, so we have to at least recognise these buffer types.
+ *
+ * A noted complexity here is inode unlinked list processing - it logs
+ * the inode directly in the buffer, but we don't know which inodes have
+ * been modified, and there is no global buffer LSN. Hence we need to
+ * recover all inode buffer types immediately. This problem will be
+ * fixed by logical logging of the unlinked list modifications.
+ */
+ magic16 = be16_to_cpu(*(__be16 *)blk);
+ switch (magic16) {
+ case XFS_DQUOT_MAGIC:
+ case XFS_DINODE_MAGIC:
+ goto recover_immediately;
+ default:
+ break;
+ }
+
+ /* unknown buffer contents, recover immediately */
+
+recover_immediately:
+ return (xfs_lsn_t)-1;
+}
+
+/*
+ * This routine replays a modification made to a buffer at runtime.
+ * There are actually two types of buffer, regular and inode, which
+ * are handled differently. Inode buffers are handled differently
+ * in that we only recover a specific set of data from them, namely
+ * the inode di_next_unlinked fields. This is because all other inode
+ * data is actually logged via inode records and any data we replay
+ * here which overlaps that may be stale.
+ *
+ * When meta-data buffers are freed at run time we log a buffer item
+ * with the XFS_BLF_CANCEL bit set to indicate that previous copies
+ * of the buffer in the log should not be replayed at recovery time.
+ * This is so that if the blocks covered by the buffer are reused for
+ * file data before we crash we don't end up replaying old, freed
+ * meta-data into a user's file.
+ *
+ * To handle the cancellation of buffer log items, we make two passes
+ * over the log during recovery. During the first we build a table of
+ * those buffers which have been cancelled, and during the second we
+ * only replay those buffers which do not have corresponding cancel
+ * records in the table. See xlog_recover_buf_commit_pass[1,2] above
+ * for more details on the implementation of the table of cancel records.
+ */
+STATIC int
+xlog_recover_buf_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
+{
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_buf *bp;
+ int error;
+ uint buf_flags;
+ xfs_lsn_t lsn;
+
+ /*
+ * In this pass we only want to recover all the buffers which have
+ * not been cancelled and are not cancellation buffers themselves.
+ */
+ if (buf_f->blf_flags & XFS_BLF_CANCEL) {
+ if (xlog_put_buffer_cancelled(log, buf_f->blf_blkno,
+ buf_f->blf_len))
+ goto cancelled;
+ } else {
+ if (xlog_is_buffer_cancelled(log, buf_f->blf_blkno,
+ buf_f->blf_len))
+ goto cancelled;
+ }
+
+ trace_xfs_log_recover_buf_recover(log, buf_f);
+
+ buf_flags = 0;
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
+ buf_flags |= XBF_UNMAPPED;
+
+ error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
+ buf_flags, &bp, NULL);
+ if (error)
+ return error;
+
+ /*
+ * Recover the buffer only if we get an LSN from it and it's less than
+ * the lsn of the transaction we are replaying.
+ *
+ * Note that we have to be extremely careful of readahead here.
+ * Readahead does not attach verifiers to the buffers, so if we skip
+ * replay after readahead because the LSN found in the buffer is more
+ * recent than the current transaction, we need to attach the verifier
+ * directly. Failure to do so can lead to future recovery actions (e.g.
+ * EFI and unlinked list recovery) operating on buffers that never had a
+ * verifier attached, which can leave blocks on disk with the correct
+ * content but a stale CRC.
+ *
+ * It is safe to assume these clean buffers are currently up to date.
+ * If the buffer is dirtied by a later transaction being replayed, then
+ * the verifier will be reset to match whatever recover turns that
+ * buffer into.
+ */
+ lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f);
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ trace_xfs_log_recover_buf_skip(log, buf_f);
+ xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
+
+ /*
+ * We're skipping replay of this buffer log item due to the log
+ * item LSN being behind the ondisk buffer. Verify the buffer
+ * contents since we aren't going to run the write verifier.
+ */
+ if (bp->b_ops) {
+ bp->b_ops->verify_read(bp);
+ error = bp->b_error;
+ }
+ goto out_release;
+ }
+
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+ error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
+ if (error)
+ goto out_release;
+ } else if (buf_f->blf_flags &
+ (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
+ bool dirty;
+
+ dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
+ if (!dirty)
+ goto out_release;
+ } else {
+ xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
+ }
+
+ /*
+ * Perform delayed write on the buffer. Asynchronous writes will be
+ * slower when taking into account all the buffers to be flushed.
+ *
+ * Also make sure that only inode buffers with good sizes stay in
+ * the buffer cache. The kernel moves inodes in buffers of 1 block
+ * or inode_cluster_size bytes, whichever is bigger. The inode
+ * buffers in the log can be a different size if the log was generated
+ * by an older kernel using unclustered inode buffers or a newer kernel
+ * running with a different inode cluster size. Regardless, if
+ * the inode buffer size isn't max(blocksize, inode_cluster_size)
+ * for *our* value of inode_cluster_size, then we need to keep
+ * the buffer out of the buffer cache so that the buffer won't
+ * overlap with future reads of those inodes.
+ */
+ if (XFS_DINODE_MAGIC ==
+ be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
+ (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
+ xfs_buf_stale(bp);
+ error = xfs_bwrite(bp);
+ } else {
+ ASSERT(bp->b_mount == mp);
+ bp->b_flags |= _XBF_LOGRECOVERY;
+ xfs_buf_delwri_queue(bp, buffer_list);
+ }
+
+out_release:
+ xfs_buf_relse(bp);
+ return error;
+cancelled:
+ trace_xfs_log_recover_buf_cancel(log, buf_f);
+ return 0;
+}
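+
+/*
+ * Worked example (editor's illustration, not part of the original patch):
+ * if the ondisk buffer carries LSN 0x500 but the transaction being
+ * replayed committed at LSN 0x480, the buffer already holds newer
+ * contents, so commit_pass2 above only re-attaches and runs the verifier
+ * and skips replay; had the buffer LSN been 0x400 the item would be
+ * replayed and the buffer queued for delayed write.
+ */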
+
+const struct xlog_recover_item_ops xlog_buf_item_ops = {
+ .item_type = XFS_LI_BUF,
+ .reorder = xlog_recover_buf_reorder,
+ .ra_pass2 = xlog_recover_buf_ra_pass2,
+ .commit_pass1 = xlog_recover_buf_commit_pass1,
+ .commit_pass2 = xlog_recover_buf_commit_pass2,
+};
+
+#ifdef DEBUG
+void
+xlog_check_buf_cancel_table(
+ struct xlog *log)
+{
+ int i;
+
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+ ASSERT(list_empty(&log->l_buf_cancel_table[i]));
+}
+#endif
+
+int
+xlog_alloc_buf_cancel_table(
+ struct xlog *log)
+{
+ void *p;
+ int i;
+
+ ASSERT(log->l_buf_cancel_table == NULL);
+
+ p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ log->l_buf_cancel_table = p;
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+
+ return 0;
+}
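+
+/*
+ * Illustrative sketch (editor's addition, not part of the original patch):
+ * the cancel table above is an open hash of list heads. A lookup would
+ * select a bucket from the disk block number roughly as below; the helper
+ * name and the modulo hash are assumptions made for illustration only.
+ */
+static inline struct list_head *
+example_buf_cancel_bucket(
+ struct xlog *log,
+ xfs_daddr_t blkno)
+{
+ /* assumed hash: block number modulo the table size */
+ return &log->l_buf_cancel_table[(uint64_t)blkno % XLOG_BC_TABLE_SIZE];
+}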
+
+void
+xlog_free_buf_cancel_table(
+ struct xlog *log)
+{
+ int i;
+
+ if (!log->l_buf_cancel_table)
+ return;
+
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) {
+ struct xfs_buf_cancel *bc;
+
+ while ((bc = list_first_entry_or_null(
+ &log->l_buf_cancel_table[i],
+ struct xfs_buf_cancel, bc_list))) {
+ list_del(&bc->bc_list);
+ kmem_free(bc);
+ }
+ }
+
+ kmem_free(log->l_buf_cancel_table);
+ log->l_buf_cancel_table = NULL;
+}
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
new file mode 100644
index 000000000..9f3ceb461
--- /dev/null
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trace.h"
+#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_error.h"
+
+/*
+ * Directory file type support functions
+ */
+static unsigned char xfs_dir3_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK,
+ DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
+};
+
+unsigned char
+xfs_dir3_get_dtype(
+ struct xfs_mount *mp,
+ uint8_t filetype)
+{
+ if (!xfs_has_ftype(mp))
+ return DT_UNKNOWN;
+
+ if (filetype >= XFS_DIR3_FT_MAX)
+ return DT_UNKNOWN;
+
+ return xfs_dir3_filetype_table[filetype];
+}
+
+STATIC int
+xfs_dir2_sf_getdents(
+ struct xfs_da_args *args,
+ struct dir_context *ctx)
+{
+ int i; /* shortform entry number */
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
+ struct xfs_mount *mp = dp->i_mount;
+ xfs_dir2_dataptr_t off; /* current entry's offset */
+ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ xfs_dir2_dataptr_t dot_offset;
+ xfs_dir2_dataptr_t dotdot_offset;
+ xfs_ino_t ino;
+ struct xfs_da_geometry *geo = args->geo;
+
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+
+ /*
+ * If the block number in the offset is out of range, we're done.
+ */
+ if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
+ return 0;
+
+ /*
+ * Precalculate offsets for "." and ".." as we will always need them.
+ * This relies on the fact that directories always start with the
+ * entries for "." and "..".
+ */
+ dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ geo->data_entry_offset);
+ dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ geo->data_entry_offset +
+ xfs_dir2_data_entsize(mp, sizeof(".") - 1));
+
+ /*
+ * Put . entry unless we're starting past it.
+ */
+ if (ctx->pos <= dot_offset) {
+ ctx->pos = dot_offset & 0x7fffffff;
+ if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
+ return 0;
+ }
+
+ /*
+ * Put .. entry unless we're starting past it.
+ */
+ if (ctx->pos <= dotdot_offset) {
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
+ ctx->pos = dotdot_offset & 0x7fffffff;
+ if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
+ return 0;
+ }
+
+ /*
+ * Loop while there are more entries and put'ing works.
+ */
+ sfep = xfs_dir2_sf_firstentry(sfp);
+ for (i = 0; i < sfp->count; i++) {
+ uint8_t filetype;
+
+ off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ xfs_dir2_sf_get_offset(sfep));
+
+ if (ctx->pos > off) {
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
+ continue;
+ }
+
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
+ filetype = xfs_dir2_sf_get_ftype(mp, sfep);
+ ctx->pos = off & 0x7fffffff;
+ if (XFS_IS_CORRUPT(dp->i_mount,
+ !xfs_dir2_namecheck(sfep->name,
+ sfep->namelen)))
+ return -EFSCORRUPTED;
+ if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
+ xfs_dir3_get_dtype(mp, filetype)))
+ return 0;
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
+ }
+
+ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
+ 0x7fffffff;
+ return 0;
+}
+
+/*
+ * Readdir for block directories.
+ */
+STATIC int
+xfs_dir2_block_getdents(
+ struct xfs_da_args *args,
+ struct dir_context *ctx,
+ unsigned int *lock_mode)
+{
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
+ struct xfs_buf *bp; /* buffer for block */
+ int error; /* error return value */
+ int wantoff; /* starting block offset */
+ xfs_off_t cook;
+ struct xfs_da_geometry *geo = args->geo;
+ unsigned int offset, next_offset;
+ unsigned int end;
+
+ /*
+ * If the block number in the offset is out of range, we're done.
+ */
+ if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
+ return 0;
+
+ error = xfs_dir3_block_read(args->trans, dp, &bp);
+ if (error)
+ return error;
+
+ xfs_iunlock(dp, *lock_mode);
+ *lock_mode = 0;
+
+ /*
+ * Extract the byte offset we start at from the seek pointer.
+ * We'll skip entries before this.
+ */
+ wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos);
+ xfs_dir3_data_check(dp, bp);
+
+ /*
+ * Loop over the data portion of the block.
+ * Each object is a real entry (dep) or an unused one (dup).
+ */
+ end = xfs_dir3_data_end_offset(geo, bp->b_addr);
+ for (offset = geo->data_entry_offset;
+ offset < end;
+ offset = next_offset) {
+ struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
+ uint8_t filetype;
+
+ /*
+ * Unused, skip it.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ next_offset = offset + be16_to_cpu(dup->length);
+ continue;
+ }
+
+ /*
+ * Bump pointer for the next iteration.
+ */
+ next_offset = offset +
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen);
+
+ /*
+ * The entry is before the desired starting point, skip it.
+ */
+ if (offset < wantoff)
+ continue;
+
+ cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, offset);
+
+ ctx->pos = cook & 0x7fffffff;
+ filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep);
+ /*
+ * If it didn't fit, set the final offset to here & return.
+ */
+ if (XFS_IS_CORRUPT(dp->i_mount,
+ !xfs_dir2_namecheck(dep->name,
+ dep->namelen))) {
+ error = -EFSCORRUPTED;
+ goto out_rele;
+ }
+ if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+ be64_to_cpu(dep->inumber),
+ xfs_dir3_get_dtype(dp->i_mount, filetype)))
+ goto out_rele;
+ }
+
+ /*
+ * Reached the end of the block.
+ * Set the offset to a non-existent block 1 and return.
+ */
+ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
+ 0x7fffffff;
+out_rele:
+ xfs_trans_brelse(args->trans, bp);
+ return error;
+}
+
+/*
+ * Read a directory block and initiate readahead for blocks beyond that.
+ * We maintain a sliding readahead window of the remaining space in the
+ * buffer rounded up to the nearest block.
+ */
+STATIC int
+xfs_dir2_leaf_readbuf(
+ struct xfs_da_args *args,
+ size_t bufsize,
+ xfs_dir2_off_t *cur_off,
+ xfs_dablk_t *ra_blk,
+ struct xfs_buf **bpp)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = NULL;
+ struct xfs_da_geometry *geo = args->geo;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
+ struct xfs_bmbt_irec map;
+ struct blk_plug plug;
+ xfs_dir2_off_t new_off;
+ xfs_dablk_t next_ra;
+ xfs_dablk_t map_off;
+ xfs_dablk_t last_da;
+ struct xfs_iext_cursor icur;
+ int ra_want;
+ int error = 0;
+
+ error = xfs_iread_extents(args->trans, dp, XFS_DATA_FORK);
+ if (error)
+ goto out;
+
+ /*
+ * Look for mapped directory blocks at or above the current offset.
+ * Truncate down to the nearest directory block to start the scanning
+ * operation.
+ */
+ last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET);
+ map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *cur_off));
+ if (!xfs_iext_lookup_extent(dp, ifp, map_off, &icur, &map))
+ goto out;
+ if (map.br_startoff >= last_da)
+ goto out;
+ xfs_trim_extent(&map, map_off, last_da - map_off);
+
+ /* Read the directory block of that first mapping. */
+ new_off = xfs_dir2_da_to_byte(geo, map.br_startoff);
+ if (new_off > *cur_off)
+ *cur_off = new_off;
+ error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, 0, &bp);
+ if (error)
+ goto out;
+
+ /*
+ * Start readahead for the next bufsize's worth of dir data blocks.
+ * We may have already issued readahead for some of that range;
+ * ra_blk tracks the last block we tried to read(ahead).
+ */
+ ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
+ if (*ra_blk >= last_da)
+ goto out;
+ else if (*ra_blk == 0)
+ *ra_blk = map.br_startoff;
+ next_ra = map.br_startoff + geo->fsbcount;
+ if (next_ra >= last_da)
+ goto out_no_ra;
+ if (map.br_blockcount < geo->fsbcount &&
+ !xfs_iext_next_extent(ifp, &icur, &map))
+ goto out_no_ra;
+ if (map.br_startoff >= last_da)
+ goto out_no_ra;
+ xfs_trim_extent(&map, next_ra, last_da - next_ra);
+
+ /* Start ra for each dir (not fs) block that has a mapping. */
+ blk_start_plug(&plug);
+ while (ra_want > 0) {
+ next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount);
+ while (ra_want > 0 &&
+ next_ra < map.br_startoff + map.br_blockcount) {
+ if (next_ra >= last_da) {
+ *ra_blk = last_da;
+ break;
+ }
+ if (next_ra > *ra_blk) {
+ xfs_dir3_data_readahead(dp, next_ra,
+ XFS_DABUF_MAP_HOLE_OK);
+ *ra_blk = next_ra;
+ }
+ ra_want -= geo->fsbcount;
+ next_ra += geo->fsbcount;
+ }
+ if (!xfs_iext_next_extent(ifp, &icur, &map)) {
+ *ra_blk = last_da;
+ break;
+ }
+ }
+ blk_finish_plug(&plug);
+
+out:
+ *bpp = bp;
+ return error;
+out_no_ra:
+ *ra_blk = last_da;
+ goto out;
+}
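+
+/*
+ * Worked example (editor's illustration, not part of the original patch):
+ * assuming 4k filesystem and directory blocks (fsblog = 12) and a 32k
+ * getdents buffer, ra_want = howmany(32768 + 4096, 4096) = 9, so up to
+ * nine further filesystem blocks of directory data are pushed into
+ * readahead beyond the block returned in *bpp.
+ */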
+
+/*
+ * Getdents (readdir) for leaf and node directories.
+ * This reads the data blocks only, so is the same for both forms.
+ */
+STATIC int
+xfs_dir2_leaf_getdents(
+ struct xfs_da_args *args,
+ struct dir_context *ctx,
+ size_t bufsize,
+ unsigned int *lock_mode)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp = NULL; /* data block buffer */
+ xfs_dir2_data_entry_t *dep; /* data entry */
+ xfs_dir2_data_unused_t *dup; /* unused entry */
+ struct xfs_da_geometry *geo = args->geo;
+ xfs_dablk_t rablk = 0; /* current readahead block */
+ xfs_dir2_off_t curoff; /* current overall offset */
+ int length; /* temporary length value */
+ int byteoff; /* offset in current block */
+ unsigned int offset = 0;
+ int error = 0; /* error return value */
+
+ /*
+ * If the offset is at or past the largest allowed value,
+ * give up right away.
+ */
+ if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
+ return 0;
+
+ /*
+ * Inside the loop we keep the main offset value as a byte offset
+ * in the directory file.
+ */
+ curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
+
+ /*
+ * Loop over directory entries until we reach the end offset.
+ * Get more blocks and readahead as necessary.
+ */
+ while (curoff < XFS_DIR2_LEAF_OFFSET) {
+ uint8_t filetype;
+
+ /*
+ * If we have no buffer, or we're off the end of the
+ * current buffer, need to get another one.
+ */
+ if (!bp || offset >= geo->blksize) {
+ if (bp) {
+ xfs_trans_brelse(args->trans, bp);
+ bp = NULL;
+ }
+
+ if (*lock_mode == 0)
+ *lock_mode = xfs_ilock_data_map_shared(dp);
+ error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff,
+ &rablk, &bp);
+ if (error || !bp)
+ break;
+
+ xfs_iunlock(dp, *lock_mode);
+ *lock_mode = 0;
+
+ xfs_dir3_data_check(dp, bp);
+ /*
+ * Find our position in the block.
+ */
+ offset = geo->data_entry_offset;
+ byteoff = xfs_dir2_byte_to_off(geo, curoff);
+ /*
+ * Skip past the header.
+ */
+ if (byteoff == 0)
+ curoff += geo->data_entry_offset;
+ /*
+ * Skip past entries until we reach our offset.
+ */
+ else {
+ while (offset < byteoff) {
+ dup = bp->b_addr + offset;
+
+ if (be16_to_cpu(dup->freetag)
+ == XFS_DIR2_DATA_FREE_TAG) {
+
+ length = be16_to_cpu(dup->length);
+ offset += length;
+ continue;
+ }
+ dep = bp->b_addr + offset;
+ length = xfs_dir2_data_entsize(mp,
+ dep->namelen);
+ offset += length;
+ }
+ /*
+ * Now set our real offset.
+ */
+ curoff =
+ xfs_dir2_db_off_to_byte(geo,
+ xfs_dir2_byte_to_db(geo, curoff),
+ offset);
+ if (offset >= geo->blksize)
+ continue;
+ }
+ }
+
+ /*
+ * We have a pointer to an entry. Is it a live one?
+ */
+ dup = bp->b_addr + offset;
+
+ /*
+ * No, it's unused, skip over it.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ length = be16_to_cpu(dup->length);
+ offset += length;
+ curoff += length;
+ continue;
+ }
+
+ dep = bp->b_addr + offset;
+ length = xfs_dir2_data_entsize(mp, dep->namelen);
+ filetype = xfs_dir2_data_get_ftype(mp, dep);
+
+ ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
+ if (XFS_IS_CORRUPT(dp->i_mount,
+ !xfs_dir2_namecheck(dep->name,
+ dep->namelen))) {
+ error = -EFSCORRUPTED;
+ break;
+ }
+ if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+ be64_to_cpu(dep->inumber),
+ xfs_dir3_get_dtype(dp->i_mount, filetype)))
+ break;
+
+ /*
+ * Advance to next entry in the block.
+ */
+ offset += length;
+ curoff += length;
+ /* bufsize may have just been a guess; don't go negative */
+ bufsize = bufsize > length ? bufsize - length : 0;
+ }
+
+ /*
+ * All done. Set output offset value to current offset.
+ */
+ if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR))
+ ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
+ else
+ ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
+ if (bp)
+ xfs_trans_brelse(args->trans, bp);
+ return error;
+}
+
+/*
+ * Read a directory.
+ *
+ * If supplied, the transaction collects locked dir buffers to avoid
+ * nested buffer deadlocks. This function does not dirty the
+ * transaction. The caller must hold the IOLOCK (shared or exclusive)
+ * before calling this function.
+ */
+int
+xfs_readdir(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ struct dir_context *ctx,
+ size_t bufsize)
+{
+ struct xfs_da_args args = { NULL };
+ unsigned int lock_mode;
+ bool isblock;
+ int error;
+
+ trace_xfs_readdir(dp);
+
+ if (xfs_is_shutdown(dp->i_mount))
+ return -EIO;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
+
+ args.dp = dp;
+ args.geo = dp->i_mount->m_dir_geo;
+ args.trans = tp;
+
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
+ return xfs_dir2_sf_getdents(&args, ctx);
+
+ lock_mode = xfs_ilock_data_map_shared(dp);
+ error = xfs_dir2_isblock(&args, &isblock);
+ if (error)
+ goto out_unlock;
+
+ if (isblock) {
+ error = xfs_dir2_block_getdents(&args, ctx, &lock_mode);
+ goto out_unlock;
+ }
+
+ error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode);
+
+out_unlock:
+ if (lock_mode)
+ xfs_iunlock(dp, lock_mode);
+ return error;
+}
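+
+/*
+ * Usage sketch (editor's addition, not part of the original patch): a VFS
+ * ->iterate_shared style caller would look roughly like the helper below.
+ * The helper name and the fixed 32k buffer-size estimate are assumptions
+ * for illustration; only xfs_readdir() itself is defined in this file.
+ */
+static inline int
+example_iterate_dir(
+ struct file *file,
+ struct dir_context *ctx)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+
+ /* rough estimate of the user buffer size, used only for readahead sizing */
+ return xfs_readdir(NULL, ip, ctx, 32768);
+}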
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
new file mode 100644
index 000000000..bfc829c07
--- /dev/null
+++ b/fs/xfs/xfs_discard.c
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_discard.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+
+STATIC int
+xfs_trim_extents(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_daddr_t start,
+ xfs_daddr_t end,
+ xfs_daddr_t minlen,
+ uint64_t *blocks_trimmed)
+{
+ struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp;
+ struct xfs_agf *agf;
+ struct xfs_perag *pag;
+ int error;
+ int i;
+
+ pag = xfs_perag_get(mp, agno);
+
+ /*
+ * Force out the log. This means any transactions that might have freed
+ * space before we take the AGF buffer lock are now on disk, and the
+ * volatile disk cache is flushed.
+ */
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
+ if (error)
+ goto out_put_perag;
+ agf = agbp->b_addr;
+
+ cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
+
+ /*
+ * Look up the longest btree in the AGF and start with it.
+ */
+ error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i);
+ if (error)
+ goto out_del_cursor;
+
+ /*
+ * Loop until we are done with all extents that are large
+ * enough to be worth discarding.
+ */
+ while (i) {
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ xfs_daddr_t dbno;
+ xfs_extlen_t dlen;
+
+ error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
+ if (error)
+ goto out_del_cursor;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_del_cursor;
+ }
+ ASSERT(flen <= be32_to_cpu(agf->agf_longest));
+
+ /*
+ * use daddr format for all range/len calculations as that is
+ * the format the range/len variables are supplied in by
+ * userspace.
+ */
+ dbno = XFS_AGB_TO_DADDR(mp, agno, fbno);
+ dlen = XFS_FSB_TO_BB(mp, flen);
+
+ /*
+ * Too small? Give up.
+ */
+ if (dlen < minlen) {
+ trace_xfs_discard_toosmall(mp, agno, fbno, flen);
+ goto out_del_cursor;
+ }
+
+ /*
+ * If the extent is entirely outside of the range we are
+ * supposed to discard skip it. Do not bother to trim
+ * down partially overlapping ranges for now.
+ */
+ if (dbno + dlen < start || dbno > end) {
+ trace_xfs_discard_exclude(mp, agno, fbno, flen);
+ goto next_extent;
+ }
+
+ /*
+ * If any blocks in the range are still busy, skip the
+ * discard and try again the next time.
+ */
+ if (xfs_extent_busy_search(mp, pag, fbno, flen)) {
+ trace_xfs_discard_busy(mp, agno, fbno, flen);
+ goto next_extent;
+ }
+
+ trace_xfs_discard_extent(mp, agno, fbno, flen);
+ error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS);
+ if (error)
+ goto out_del_cursor;
+ *blocks_trimmed += flen;
+
+next_extent:
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto out_del_cursor;
+
+ if (fatal_signal_pending(current)) {
+ error = -ERESTARTSYS;
+ goto out_del_cursor;
+ }
+ }
+
+out_del_cursor:
+ xfs_btree_del_cursor(cur, error);
+ xfs_buf_relse(agbp);
+out_put_perag:
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * trim a range of the filesystem.
+ *
+ * Note: the parameters passed from userspace are byte ranges into the
+ * filesystem which does not match to the format we use for filesystem block
+ * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
+ * is a linear address range. Hence we need to use DADDR based conversions and
+ * comparisons for determining the correct offset and regions to trim.
+ */
+int
+xfs_ioc_trim(
+ struct xfs_mount *mp,
+ struct fstrim_range __user *urange)
+{
+ unsigned int granularity =
+ bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
+ struct fstrim_range range;
+ xfs_daddr_t start, end, minlen;
+ xfs_agnumber_t start_agno, end_agno, agno;
+ uint64_t blocks_trimmed = 0;
+ int error, last_error = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev))
+ return -EOPNOTSUPP;
+
+ /*
+ * We haven't recovered the log, so we cannot use our bnobt-guided
+ * storage zapping commands.
+ */
+ if (xfs_has_norecovery(mp))
+ return -EROFS;
+
+ if (copy_from_user(&range, urange, sizeof(range)))
+ return -EFAULT;
+
+ range.minlen = max_t(u64, granularity, range.minlen);
+ minlen = BTOBB(range.minlen);
+ /*
+ * Truncating down the len isn't actually quite correct, but using
+ * BBTOB would mean we trivially get overflows for values
+ * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
+ * used by the fstrim application. In the end it really doesn't
+ * matter as trimming blocks is an advisory interface.
+ */
+ if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
+ range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
+ range.len < mp->m_sb.sb_blocksize)
+ return -EINVAL;
+
+ start = BTOBB(range.start);
+ end = start + BTOBBT(range.len) - 1;
+
+ if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
+ end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1;
+
+ start_agno = xfs_daddr_to_agno(mp, start);
+ end_agno = xfs_daddr_to_agno(mp, end);
+
+ for (agno = start_agno; agno <= end_agno; agno++) {
+ error = xfs_trim_extents(mp, agno, start, end, minlen,
+ &blocks_trimmed);
+ if (error) {
+ last_error = error;
+ if (error == -ERESTARTSYS)
+ break;
+ }
+ }
+
+ if (last_error)
+ return last_error;
+
+ range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
+ if (copy_to_user(urange, &range, sizeof(range)))
+ return -EFAULT;
+ return 0;
+}
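+
+/*
+ * Worked example (editor's illustration, not part of the original patch):
+ * fstrim passes byte values, so a range.start of 1GiB becomes
+ * BTOBB(1073741824) = 2097152 512-byte sectors, which xfs_daddr_to_agno()
+ * then maps to an AG. BTOBB rounds up to a whole sector while BTOBBT
+ * truncates, which is why start and len use different conversions above.
+ */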
diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h
new file mode 100644
index 000000000..de92d9cc9
--- /dev/null
+++ b/fs/xfs/xfs_discard.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef XFS_DISCARD_H
+#define XFS_DISCARD_H 1
+
+struct fstrim_range;
+struct list_head;
+
+extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+
+#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
new file mode 100644
index 000000000..8fb90da89
--- /dev/null
+++ b/fs/xfs/xfs_dquot.c
@@ -0,0 +1,1395 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_space.h"
+#include "xfs_trans_priv.h"
+#include "xfs_qm.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_error.h"
+
+/*
+ * Lock order:
+ *
+ * ip->i_lock
+ * qi->qi_tree_lock
+ * dquot->q_qlock (xfs_dqlock() and friends)
+ * dquot->q_flush (xfs_dqflock() and friends)
+ * qi->qi_lru_lock
+ *
+ * If two dquots need to be locked the order is user before group/project,
+ * otherwise by the lowest id first, see xfs_dqlock2.
+ */
+
+struct kmem_cache *xfs_dqtrx_cache;
+static struct kmem_cache *xfs_dquot_cache;
+
+static struct lock_class_key xfs_dquot_group_class;
+static struct lock_class_key xfs_dquot_project_class;
+
+/*
+ * This is called to free all the memory associated with a dquot
+ */
+void
+xfs_qm_dqdestroy(
+ struct xfs_dquot *dqp)
+{
+ ASSERT(list_empty(&dqp->q_lru));
+
+ kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
+ mutex_destroy(&dqp->q_qlock);
+
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
+ kmem_cache_free(xfs_dquot_cache, dqp);
+}
+
+/*
+ * If default limits are in force, push them into the dquot now.
+ * We overwrite the dquot limits only if they are zero and this
+ * is not the root dquot.
+ */
+void
+xfs_qm_adjust_dqlimits(
+ struct xfs_dquot *dq)
+{
+ struct xfs_mount *mp = dq->q_mount;
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ struct xfs_def_quota *defq;
+ int prealloc = 0;
+
+ ASSERT(dq->q_id);
+ defq = xfs_get_defquota(q, xfs_dquot_type(dq));
+
+ if (!dq->q_blk.softlimit) {
+ dq->q_blk.softlimit = defq->blk.soft;
+ prealloc = 1;
+ }
+ if (!dq->q_blk.hardlimit) {
+ dq->q_blk.hardlimit = defq->blk.hard;
+ prealloc = 1;
+ }
+ if (!dq->q_ino.softlimit)
+ dq->q_ino.softlimit = defq->ino.soft;
+ if (!dq->q_ino.hardlimit)
+ dq->q_ino.hardlimit = defq->ino.hard;
+ if (!dq->q_rtb.softlimit)
+ dq->q_rtb.softlimit = defq->rtb.soft;
+ if (!dq->q_rtb.hardlimit)
+ dq->q_rtb.hardlimit = defq->rtb.hard;
+
+ if (prealloc)
+ xfs_dquot_set_prealloc_limits(dq);
+}
+
+/* Set the expiration time of a quota's grace period. */
+time64_t
+xfs_dquot_set_timeout(
+ struct xfs_mount *mp,
+ time64_t timeout)
+{
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+
+ return clamp_t(time64_t, timeout, qi->qi_expiry_min,
+ qi->qi_expiry_max);
+}
+
+/* Set the length of the default grace period. */
+time64_t
+xfs_dquot_set_grace_period(
+ time64_t grace)
+{
+ return clamp_t(time64_t, grace, XFS_DQ_GRACE_MIN, XFS_DQ_GRACE_MAX);
+}
+
+/*
+ * Determine if this quota counter is over either limit and set the quota
+ * timers as appropriate.
+ */
+static inline void
+xfs_qm_adjust_res_timer(
+ struct xfs_mount *mp,
+ struct xfs_dquot_res *res,
+ struct xfs_quota_limits *qlim)
+{
+ ASSERT(res->hardlimit == 0 || res->softlimit <= res->hardlimit);
+
+ if ((res->softlimit && res->count > res->softlimit) ||
+ (res->hardlimit && res->count > res->hardlimit)) {
+ if (res->timer == 0)
+ res->timer = xfs_dquot_set_timeout(mp,
+ ktime_get_real_seconds() + qlim->time);
+ } else {
+ res->timer = 0;
+ }
+}
+
+/*
+ * Check the limits and timers of a dquot and start or reset timers
+ * if necessary.
+ * This gets called even when quota enforcement is OFF, which makes our
+ * life a little less complicated. (We just don't reject any quota
+ * reservations in that case, when enforcement is off).
+ * We also return 0 as the values of the timers in Q_GETQUOTA calls, when
+ * enforcement's off.
+ * In contrast, warnings are a little different in that they don't
+ * 'automatically' get started when limits get exceeded. They do
+ * get reset to zero, however, when we find the count to be under
+ * the soft limit (they are only ever set non-zero via userspace).
+ */
+void
+xfs_qm_adjust_dqtimers(
+ struct xfs_dquot *dq)
+{
+ struct xfs_mount *mp = dq->q_mount;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_def_quota *defq;
+
+ ASSERT(dq->q_id);
+ defq = xfs_get_defquota(qi, xfs_dquot_type(dq));
+
+ xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_blk, &defq->blk);
+ xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_ino, &defq->ino);
+ xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_rtb, &defq->rtb);
+}
+
+/*
+ * initialize a buffer full of dquots and log the whole thing
+ */
+STATIC void
+xfs_qm_init_dquot_blk(
+ struct xfs_trans *tp,
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type,
+ struct xfs_buf *bp)
+{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ struct xfs_dqblk *d;
+ xfs_dqid_t curid;
+ unsigned int qflag;
+ unsigned int blftype;
+ int i;
+
+ ASSERT(tp);
+ ASSERT(xfs_buf_islocked(bp));
+
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ qflag = XFS_UQUOTA_CHKD;
+ blftype = XFS_BLF_UDQUOT_BUF;
+ break;
+ case XFS_DQTYPE_PROJ:
+ qflag = XFS_PQUOTA_CHKD;
+ blftype = XFS_BLF_PDQUOT_BUF;
+ break;
+ case XFS_DQTYPE_GROUP:
+ qflag = XFS_GQUOTA_CHKD;
+ blftype = XFS_BLF_GDQUOT_BUF;
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ d = bp->b_addr;
+
+ /*
+ * ID of the first dquot in the block - id's are zero based.
+ */
+ curid = id - (id % q->qi_dqperchunk);
+ memset(d, 0, BBTOB(q->qi_dqchunklen));
+ for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) {
+ d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+ d->dd_diskdq.d_id = cpu_to_be32(curid);
+ d->dd_diskdq.d_type = type;
+ if (curid > 0 && xfs_has_bigtime(mp))
+ d->dd_diskdq.d_type |= XFS_DQTYPE_BIGTIME;
+ if (xfs_has_crc(mp)) {
+ uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
+ xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+ }
+
+ xfs_trans_dquot_buf(tp, bp, blftype);
+
+ /*
+ * quotacheck uses delayed writes to update all the dquots on disk in an
+ * efficient manner instead of logging the individual dquot changes as
+ * they are made. However if we log the buffer allocated here and crash
+ * after quotacheck while the logged initialisation is still in the
+ * active region of the log, log recovery can replay the dquot buffer
+ * initialisation over the top of the checked dquots and corrupt quota
+ * accounting.
+ *
+ * To avoid this problem, quotacheck cannot log the initialised buffer.
+ * We must still dirty the buffer and write it back before the
+ * allocation transaction clears the log. Therefore, mark the buffer as
+ * ordered instead of logging it directly. This is safe for quotacheck
+ * because it detects and repairs allocated but uninitialized dquot blocks
+ * in the quota inodes.
+ */
+ if (!(mp->m_qflags & qflag))
+ xfs_trans_ordered_buf(tp, bp);
+ else
+ xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
+}
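+
+/*
+ * Worked example (editor's illustration, not part of the original patch):
+ * assuming qi_dqperchunk = 30 dquots per chunk, initialising the block
+ * that holds id 100 gives curid = 100 - (100 % 30) = 90, so the chunk
+ * covers ids 90..119 and each on-disk dquot is stamped accordingly.
+ */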
+
+/*
+ * Initialize the dynamic speculative preallocation thresholds. The lo/hi
+ * watermarks correspond to the soft and hard limits by default. If a soft limit
+ * is not specified, we use 95% of the hard limit.
+ */
+void
+xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
+{
+ uint64_t space;
+
+ dqp->q_prealloc_hi_wmark = dqp->q_blk.hardlimit;
+ dqp->q_prealloc_lo_wmark = dqp->q_blk.softlimit;
+ if (!dqp->q_prealloc_lo_wmark) {
+ dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark;
+ do_div(dqp->q_prealloc_lo_wmark, 100);
+ dqp->q_prealloc_lo_wmark *= 95;
+ }
+
+ space = dqp->q_prealloc_hi_wmark;
+
+ do_div(space, 100);
+ dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space;
+ dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3;
+ dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5;
+}
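+
+/*
+ * Worked example (editor's illustration, not part of the original patch):
+ * with a 1000-block hard limit and no soft limit, the low watermark
+ * becomes 1000 / 100 * 95 = 950 blocks, and q_low_space ends up holding
+ * {10, 30, 50} blocks for the 1%, 3% and 5% thresholds respectively.
+ */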
+
+/*
+ * Ensure that the given in-core dquot has a buffer on disk backing it, and
+ * return the buffer locked and held. This is called when the bmapi finds a
+ * hole.
+ */
+STATIC int
+xfs_dquot_disk_alloc(
+ struct xfs_dquot *dqp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_bmbt_irec map;
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_buf *bp;
+ xfs_dqtype_t qtype = xfs_dquot_type(dqp);
+ struct xfs_inode *quotip = xfs_quota_inode(mp, qtype);
+ int nmaps = 1;
+ int error;
+
+ trace_xfs_dqalloc(dqp);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
+ XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(quotip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, quotip, 0);
+
+ if (!xfs_this_quota_on(dqp->q_mount, qtype)) {
+ /*
+ * Return if this type of quota was turned off while we didn't
+ * have the inode lock.
+ */
+ error = -ESRCH;
+ goto err_cancel;
+ }
+
+ error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, quotip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto err_cancel;
+
+ /* Create the block mapping. */
+ error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset,
+ XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map,
+ &nmaps);
+ if (error)
+ goto err_cancel;
+
+ ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
+ ASSERT(nmaps == 1);
+ ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+ (map.br_startblock != HOLESTARTBLOCK));
+
+ /*
+ * Keep track of the blkno to save a lookup later
+ */
+ dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+
+ /* now we can just get the buffer (there's nothing to read yet) */
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ if (error)
+ goto err_cancel;
+ bp->b_ops = &xfs_dquot_buf_ops;
+
+ /*
+ * Make a chunk of dquots out of this buffer and log
+ * the entire thing.
+ */
+ xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp);
+ xfs_buf_set_ref(bp, XFS_DQUOT_REF);
+
+ /*
+ * Hold the buffer and join it to the dfops so that we'll still own
+ * the buffer when we return to the caller. The buffer disposal on
+ * error must be paid attention to very carefully, as it has been
+ * broken since commit efa092f3d4c6 "[XFS] Fixes a bug in the quota
+ * code when allocating a new dquot record" in 2005, and the later
+ * conversion to xfs_defer_ops in commit 310a75a3c6c747 failed to keep
+ * the buffer locked across the _defer_finish call. We can now do
+ * this correctly with xfs_defer_bjoin.
+ *
+ * Above, we allocated a disk block for the dquot information and used
+ * get_buf to initialize the dquot. If the _defer_finish fails, the old
+ * transaction is gone but the new buffer is not joined or held to any
+ * transaction, so we must _buf_relse it.
+ *
+ * If everything succeeds, the caller of this function is returned a
+ * buffer that is locked and held to the transaction. The caller
+ * is responsible for unlocking any buffer passed back, either
+ * manually or by committing the transaction. On error, the buffer is
+ * released and not passed back.
+ *
+ * Keep the quota inode ILOCKed until after the transaction commit to
+ * maintain the atomicity of bmap/rmap updates.
+ */
+ xfs_trans_bhold(tp, bp);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+ if (error) {
+ xfs_buf_relse(bp);
+ return error;
+ }
+
+ *bpp = bp;
+ return 0;
+
+err_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Read in the in-core dquot's on-disk metadata and return the buffer.
+ * Returns ENOENT to signal a hole.
+ */
+STATIC int
+xfs_dquot_disk_read(
+ struct xfs_mount *mp,
+ struct xfs_dquot *dqp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_bmbt_irec map;
+ struct xfs_buf *bp;
+ xfs_dqtype_t qtype = xfs_dquot_type(dqp);
+ struct xfs_inode *quotip = xfs_quota_inode(mp, qtype);
+ uint lock_mode;
+ int nmaps = 1;
+ int error;
+
+ lock_mode = xfs_ilock_data_map_shared(quotip);
+ if (!xfs_this_quota_on(mp, qtype)) {
+ /*
+ * Return if this type of quota was turned off while we
+ * didn't have the quota inode lock.
+ */
+ xfs_iunlock(quotip, lock_mode);
+ return -ESRCH;
+ }
+
+ /*
+ * Find the block map; no allocations yet
+ */
+ error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
+ XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
+ xfs_iunlock(quotip, lock_mode);
+ if (error)
+ return error;
+
+ ASSERT(nmaps == 1);
+ ASSERT(map.br_blockcount >= 1);
+ ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+ if (map.br_startblock == HOLESTARTBLOCK)
+ return -ENOENT;
+
+ trace_xfs_dqtobp_read(dqp);
+
+ /*
+ * store the blkno etc so that we don't have to do the
+ * mapping all the time
+ */
+ dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+ &xfs_dquot_buf_ops);
+ if (error) {
+ ASSERT(bp == NULL);
+ return error;
+ }
+
+ ASSERT(xfs_buf_islocked(bp));
+ xfs_buf_set_ref(bp, XFS_DQUOT_REF);
+ *bpp = bp;
+
+ return 0;
+}
+
+/* Allocate and initialize everything we need for an incore dquot. */
+STATIC struct xfs_dquot *
+xfs_dquot_alloc(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type)
+{
+ struct xfs_dquot *dqp;
+
+ dqp = kmem_cache_zalloc(xfs_dquot_cache, GFP_KERNEL | __GFP_NOFAIL);
+
+ dqp->q_type = type;
+ dqp->q_id = id;
+ dqp->q_mount = mp;
+ INIT_LIST_HEAD(&dqp->q_lru);
+ mutex_init(&dqp->q_qlock);
+ init_waitqueue_head(&dqp->q_pinwait);
+ dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
+ /*
+ * Offset of dquot in the (fixed sized) dquot chunk.
+ */
+ dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
+ sizeof(struct xfs_dqblk);
+
+ /*
+ * Because we want to use a counting completion, complete
+ * the flush completion once to allow a single access to
+ * the flush completion without blocking.
+ */
+ init_completion(&dqp->q_flush);
+ complete(&dqp->q_flush);
+
+ /*
+ * Make sure group quotas have a different lock class than user
+ * quotas.
+ */
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ /* uses the default lock class */
+ break;
+ case XFS_DQTYPE_GROUP:
+ lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class);
+ break;
+ case XFS_DQTYPE_PROJ:
+ lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ xfs_qm_dquot_logitem_init(dqp);
+
+ XFS_STATS_INC(mp, xs_qm_dquot);
+ return dqp;
+}
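+
+/*
+ * Worked example (editor's illustration, not part of the original patch):
+ * assuming qi_dqperchunk = 30, dquot id 37 lands at q_fileoffset = 1
+ * (the second chunk of the quota file) and at q_bufoffset =
+ * 7 * sizeof(struct xfs_dqblk) within that chunk's buffer.
+ */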
+
+/* Check the ondisk dquot's id and type match what the incore dquot expects. */
+static bool
+xfs_dquot_check_type(
+ struct xfs_dquot *dqp,
+ struct xfs_disk_dquot *ddqp)
+{
+ uint8_t ddqp_type;
+ uint8_t dqp_type;
+
+ ddqp_type = ddqp->d_type & XFS_DQTYPE_REC_MASK;
+ dqp_type = xfs_dquot_type(dqp);
+
+ if (be32_to_cpu(ddqp->d_id) != dqp->q_id)
+ return false;
+
+ /*
+ * V5 filesystems always expect an exact type match. V4 filesystems
+ * expect an exact match for user dquots and for non-root group and
+ * project dquots.
+ */
+ if (xfs_has_crc(dqp->q_mount) ||
+ dqp_type == XFS_DQTYPE_USER || dqp->q_id != 0)
+ return ddqp_type == dqp_type;
+
+ /*
+ * V4 filesystems support either group or project quotas, but not both
+ * at the same time. The non-user quota file can be switched between
+ * group and project quota uses depending on the mount options, which
+ * means that we can encounter the other type when we try to load quota
+ * defaults. Quotacheck will soon reset the entire quota file
+ * (including the root dquot) anyway, but don't log scary corruption
+ * reports to dmesg.
+ */
+ return ddqp_type == XFS_DQTYPE_GROUP || ddqp_type == XFS_DQTYPE_PROJ;
+}
+
+/* Copy the in-core quota fields in from the on-disk buffer. */
+STATIC int
+xfs_dquot_from_disk(
+ struct xfs_dquot *dqp,
+ struct xfs_buf *bp)
+{
+ struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset;
+
+ /*
+ * Ensure that we got the type and ID we were looking for.
+ * Everything else was checked by the dquot buffer verifier.
+ */
+ if (!xfs_dquot_check_type(dqp, ddqp)) {
+ xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR,
+ "Metadata corruption detected at %pS, quota %u",
+ __this_address, dqp->q_id);
+ xfs_alert(bp->b_mount, "Unmount and run xfs_repair");
+ return -EFSCORRUPTED;
+ }
+
+ /* copy everything from disk dquot to the incore dquot */
+ dqp->q_type = ddqp->d_type;
+ dqp->q_blk.hardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
+ dqp->q_blk.softlimit = be64_to_cpu(ddqp->d_blk_softlimit);
+ dqp->q_ino.hardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
+ dqp->q_ino.softlimit = be64_to_cpu(ddqp->d_ino_softlimit);
+ dqp->q_rtb.hardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
+ dqp->q_rtb.softlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+
+ dqp->q_blk.count = be64_to_cpu(ddqp->d_bcount);
+ dqp->q_ino.count = be64_to_cpu(ddqp->d_icount);
+ dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount);
+
+ dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer);
+ dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer);
+ dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer);
+
+ /*
+ * Reservation counters are defined as reservation plus current usage
+ * to avoid having to add every time.
+ */
+ dqp->q_blk.reserved = dqp->q_blk.count;
+ dqp->q_ino.reserved = dqp->q_ino.count;
+ dqp->q_rtb.reserved = dqp->q_rtb.count;
+
+ /* initialize the dquot speculative prealloc thresholds */
+ xfs_dquot_set_prealloc_limits(dqp);
+ return 0;
+}
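+
+/*
+ * Worked example (editor's illustration, not part of the original patch):
+ * if the ondisk dquot records 100 allocated blocks, the incore dquot
+ * starts with count = reserved = 100; a transaction that later reserves
+ * 8 more blocks bumps reserved to 108, and count catches up only when
+ * those blocks are actually allocated.
+ */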
+
+/* Copy the in-core quota fields into the on-disk buffer. */
+void
+xfs_dquot_to_disk(
+ struct xfs_disk_dquot *ddqp,
+ struct xfs_dquot *dqp)
+{
+ ddqp->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ ddqp->d_version = XFS_DQUOT_VERSION;
+ ddqp->d_type = dqp->q_type;
+ ddqp->d_id = cpu_to_be32(dqp->q_id);
+ ddqp->d_pad0 = 0;
+ ddqp->d_pad = 0;
+
+ ddqp->d_blk_hardlimit = cpu_to_be64(dqp->q_blk.hardlimit);
+ ddqp->d_blk_softlimit = cpu_to_be64(dqp->q_blk.softlimit);
+ ddqp->d_ino_hardlimit = cpu_to_be64(dqp->q_ino.hardlimit);
+ ddqp->d_ino_softlimit = cpu_to_be64(dqp->q_ino.softlimit);
+ ddqp->d_rtb_hardlimit = cpu_to_be64(dqp->q_rtb.hardlimit);
+ ddqp->d_rtb_softlimit = cpu_to_be64(dqp->q_rtb.softlimit);
+
+ ddqp->d_bcount = cpu_to_be64(dqp->q_blk.count);
+ ddqp->d_icount = cpu_to_be64(dqp->q_ino.count);
+ ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count);
+
+ ddqp->d_bwarns = 0;
+ ddqp->d_iwarns = 0;
+ ddqp->d_rtbwarns = 0;
+
+ ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer);
+ ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer);
+ ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer);
+}
+
+/*
+ * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
+ * and release the buffer immediately. If @can_alloc is true, fill any
+ * holes in the on-disk metadata.
+ */
+static int
+xfs_qm_dqread(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type,
+ bool can_alloc,
+ struct xfs_dquot **dqpp)
+{
+ struct xfs_dquot *dqp;
+ struct xfs_buf *bp;
+ int error;
+
+ dqp = xfs_dquot_alloc(mp, id, type);
+ trace_xfs_dqread(dqp);
+
+ /* Try to read the buffer, allocating if necessary. */
+ error = xfs_dquot_disk_read(mp, dqp, &bp);
+ if (error == -ENOENT && can_alloc)
+ error = xfs_dquot_disk_alloc(dqp, &bp);
+ if (error)
+ goto err;
+
+ /*
+ * At this point we should have a clean locked buffer. Copy the data
+ * to the incore dquot and release the buffer since the incore dquot
+ * has its own locking protocol so we needn't tie up the buffer any
+ * further.
+ */
+ ASSERT(xfs_buf_islocked(bp));
+ error = xfs_dquot_from_disk(dqp, bp);
+ xfs_buf_relse(bp);
+ if (error)
+ goto err;
+
+ *dqpp = dqp;
+ return error;
+
+err:
+ trace_xfs_dqread_fail(dqp);
+ xfs_qm_dqdestroy(dqp);
+ *dqpp = NULL;
+ return error;
+}
+
+/*
+ * Advance to the next id in the current chunk, or if at the
+ * end of the chunk, skip ahead to first id in next allocated chunk
+ * using the SEEK_DATA interface.
+ */
+static int
+xfs_dq_get_next_id(
+ struct xfs_mount *mp,
+ xfs_dqtype_t type,
+ xfs_dqid_t *id)
+{
+ struct xfs_inode *quotip = xfs_quota_inode(mp, type);
+ xfs_dqid_t next_id = *id + 1; /* simple advance */
+ uint lock_flags;
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor cur;
+ xfs_fsblock_t start;
+ int error = 0;
+
+ /* If we'd wrap past the max ID, stop */
+ if (next_id < *id)
+ return -ENOENT;
+
+ /* If new ID is within the current chunk, advancing it sufficed */
+ if (next_id % mp->m_quotainfo->qi_dqperchunk) {
+ *id = next_id;
+ return 0;
+ }
+
+ /* Nope, next_id is now past the current chunk, so find the next one */
+ start = (xfs_fsblock_t)next_id / mp->m_quotainfo->qi_dqperchunk;
+
+ lock_flags = xfs_ilock_data_map_shared(quotip);
+ error = xfs_iread_extents(NULL, quotip, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ if (xfs_iext_lookup_extent(quotip, &quotip->i_df, start, &cur, &got)) {
+ /* contiguous chunk, bump startoff for the id calculation */
+ if (got.br_startoff < start)
+ got.br_startoff = start;
+ *id = got.br_startoff * mp->m_quotainfo->qi_dqperchunk;
+ } else {
+ error = -ENOENT;
+ }
+
+ xfs_iunlock(quotip, lock_flags);
+
+ return error;
+}
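+
+/*
+ * Worked example (editor's illustration, not part of the original patch):
+ * assuming qi_dqperchunk = 30, advancing from id 29 gives next_id = 30,
+ * which starts a new chunk, so the extent lookup begins at quota-file
+ * block 1. If the next mapped extent starts at block 5, *id becomes
+ * 5 * 30 = 150 and the unallocated ids in between are skipped.
+ */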
+
+/*
+ * Look up the dquot in the in-core cache. If found, the dquot is returned
+ * locked and ready to go.
+ */
+static struct xfs_dquot *
+xfs_qm_dqget_cache_lookup(
+ struct xfs_mount *mp,
+ struct xfs_quotainfo *qi,
+ struct radix_tree_root *tree,
+ xfs_dqid_t id)
+{
+ struct xfs_dquot *dqp;
+
+restart:
+ mutex_lock(&qi->qi_tree_lock);
+ dqp = radix_tree_lookup(tree, id);
+ if (!dqp) {
+ mutex_unlock(&qi->qi_tree_lock);
+ XFS_STATS_INC(mp, xs_qm_dqcachemisses);
+ return NULL;
+ }
+
+ xfs_dqlock(dqp);
+ if (dqp->q_flags & XFS_DQFLAG_FREEING) {
+ xfs_dqunlock(dqp);
+ mutex_unlock(&qi->qi_tree_lock);
+ trace_xfs_dqget_freeing(dqp);
+ delay(1);
+ goto restart;
+ }
+
+ dqp->q_nrefs++;
+ mutex_unlock(&qi->qi_tree_lock);
+
+ trace_xfs_dqget_hit(dqp);
+ XFS_STATS_INC(mp, xs_qm_dqcachehits);
+ return dqp;
+}
+
+/*
+ * Try to insert a new dquot into the in-core cache. If an error occurs the
+ * caller should throw away the dquot and start over. Otherwise, the dquot
+ * is returned locked (and held by the cache) as if there had been a cache
+ * hit.
+ */
+static int
+xfs_qm_dqget_cache_insert(
+ struct xfs_mount *mp,
+ struct xfs_quotainfo *qi,
+ struct radix_tree_root *tree,
+ xfs_dqid_t id,
+ struct xfs_dquot *dqp)
+{
+ int error;
+
+ mutex_lock(&qi->qi_tree_lock);
+ error = radix_tree_insert(tree, id, dqp);
+ if (unlikely(error)) {
+ /* Duplicate found! Caller must try again. */
+ WARN_ON(error != -EEXIST);
+ mutex_unlock(&qi->qi_tree_lock);
+ trace_xfs_dqget_dup(dqp);
+ return error;
+ }
+
+ /* Return a locked dquot to the caller, with a reference taken. */
+ xfs_dqlock(dqp);
+ dqp->q_nrefs = 1;
+
+ qi->qi_dquots++;
+ mutex_unlock(&qi->qi_tree_lock);
+
+ return 0;
+}
+
+/* Check our input parameters. */
+static int
+xfs_qm_dqget_checks(
+ struct xfs_mount *mp,
+ xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ if (!XFS_IS_UQUOTA_ON(mp))
+ return -ESRCH;
+ return 0;
+ case XFS_DQTYPE_GROUP:
+ if (!XFS_IS_GQUOTA_ON(mp))
+ return -ESRCH;
+ return 0;
+ case XFS_DQTYPE_PROJ:
+ if (!XFS_IS_PQUOTA_ON(mp))
+ return -ESRCH;
+ return 0;
+ default:
+ WARN_ON_ONCE(0);
+ return -EINVAL;
+ }
+}
+
+/*
+ * Given the file system, id, and type (UDQUOT/GDQUOT/PDQUOT), return a
+ * locked dquot, doing an allocation (if requested) as needed.
+ */
+int
+xfs_qm_dqget(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type,
+ bool can_alloc,
+ struct xfs_dquot **O_dqpp)
+{
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
+ struct xfs_dquot *dqp;
+ int error;
+
+ error = xfs_qm_dqget_checks(mp, type);
+ if (error)
+ return error;
+
+restart:
+ dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id);
+ if (dqp) {
+ *O_dqpp = dqp;
+ return 0;
+ }
+
+ error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp);
+ if (error)
+ return error;
+
+ error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp);
+ if (error) {
+ /*
+ * Duplicate found. Just throw away the new dquot and start
+ * over.
+ */
+ xfs_qm_dqdestroy(dqp);
+ XFS_STATS_INC(mp, xs_qm_dquot_dups);
+ goto restart;
+ }
+
+ trace_xfs_dqget_miss(dqp);
+ *O_dqpp = dqp;
+ return 0;
+}
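+
+/*
+ * Usage sketch (editor's addition, not part of the original patch): a
+ * typical caller takes a reference, reads what it needs under the dquot
+ * lock and drops the reference again. The helper name is an assumption
+ * for illustration; only the xfs_qm_* calls are defined in this file.
+ */
+static inline uint64_t
+example_user_block_hardlimit(
+ struct xfs_mount *mp,
+ xfs_dqid_t uid)
+{
+ struct xfs_dquot *dqp;
+ uint64_t limit;
+
+ /* returns a locked, referenced dquot; allocate ondisk space if absent */
+ if (xfs_qm_dqget(mp, uid, XFS_DQTYPE_USER, true, &dqp))
+ return 0;
+
+ limit = dqp->q_blk.hardlimit;
+ xfs_qm_dqput(dqp); /* unlocks the dquot and drops the reference */
+ return limit;
+}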
+
+/*
+ * Given a dquot id and type, read and initialize a dquot from the on-disk
+ * metadata. This function is only for use during quota initialization so
+ * it ignores the dquot cache assuming that the dquot shrinker isn't set up.
+ * The caller is responsible for _qm_dqdestroy'ing the returned dquot.
+ */
+int
+xfs_qm_dqget_uncached(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type,
+ struct xfs_dquot **dqpp)
+{
+ int error;
+
+ error = xfs_qm_dqget_checks(mp, type);
+ if (error)
+ return error;
+
+ return xfs_qm_dqread(mp, id, type, 0, dqpp);
+}
+
+/* Return the quota id for a given inode and type. */
+xfs_dqid_t
+xfs_qm_id_for_quotatype(
+ struct xfs_inode *ip,
+ xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ return i_uid_read(VFS_I(ip));
+ case XFS_DQTYPE_GROUP:
+ return i_gid_read(VFS_I(ip));
+ case XFS_DQTYPE_PROJ:
+ return ip->i_projid;
+ }
+ ASSERT(0);
+ return 0;
+}
+
+/*
+ * Return the dquot for a given inode and type. If @can_alloc is true, then
+ * allocate blocks if needed. The inode's ILOCK must be held and it must not
+ * have already had an inode attached.
+ */
+int
+xfs_qm_dqget_inode(
+ struct xfs_inode *ip,
+ xfs_dqtype_t type,
+ bool can_alloc,
+ struct xfs_dquot **O_dqpp)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
+ struct xfs_dquot *dqp;
+ xfs_dqid_t id;
+ int error;
+
+ error = xfs_qm_dqget_checks(mp, type);
+ if (error)
+ return error;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(xfs_inode_dquot(ip, type) == NULL);
+
+ id = xfs_qm_id_for_quotatype(ip, type);
+
+restart:
+ dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id);
+ if (dqp) {
+ *O_dqpp = dqp;
+ return 0;
+ }
+
+ /*
+ * Dquot cache miss. We don't want to keep the inode lock across
+ * a (potential) disk read. Also we don't want to deal with the lock
+ * ordering between quotainode and this inode. OTOH, dropping the inode
+ * lock here means dealing with a chown that can happen before
+ * we re-acquire the lock.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ /*
+ * A dquot could be attached to this inode by now, since we had
+ * dropped the ilock.
+ */
+ if (xfs_this_quota_on(mp, type)) {
+ struct xfs_dquot *dqp1;
+
+ dqp1 = xfs_inode_dquot(ip, type);
+ if (dqp1) {
+ xfs_qm_dqdestroy(dqp);
+ dqp = dqp1;
+ xfs_dqlock(dqp);
+ goto dqret;
+ }
+ } else {
+ /* inode stays locked on return */
+ xfs_qm_dqdestroy(dqp);
+ return -ESRCH;
+ }
+
+ error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp);
+ if (error) {
+ /*
+ * Duplicate found. Just throw away the new dquot and start
+ * over.
+ */
+ xfs_qm_dqdestroy(dqp);
+ XFS_STATS_INC(mp, xs_qm_dquot_dups);
+ goto restart;
+ }
+
+dqret:
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ trace_xfs_dqget_miss(dqp);
+ *O_dqpp = dqp;
+ return 0;
+}
+
+/*
+ * Starting at @id and progressing upwards, look for an initialized incore
+ * dquot, lock it, and return it.
+ */
+int
+xfs_qm_dqget_next(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type,
+ struct xfs_dquot **dqpp)
+{
+ struct xfs_dquot *dqp;
+ int error = 0;
+
+ *dqpp = NULL;
+ for (; !error; error = xfs_dq_get_next_id(mp, type, &id)) {
+ error = xfs_qm_dqget(mp, id, type, false, &dqp);
+ if (error == -ENOENT)
+ continue;
+ else if (error != 0)
+ break;
+
+ if (!XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+ *dqpp = dqp;
+ return 0;
+ }
+
+ xfs_qm_dqput(dqp);
+ }
+
+ return error;
+}
+
+/*
+ * Release a reference to the dquot (decrement ref-count) and unlock it.
+ *
+ * If there is a group quota attached to this dquot, carefully release that
+ * too without tripping over deadlocks'n'stuff.
+ */
+void
+xfs_qm_dqput(
+ struct xfs_dquot *dqp)
+{
+ ASSERT(dqp->q_nrefs > 0);
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+ trace_xfs_dqput(dqp);
+
+ if (--dqp->q_nrefs == 0) {
+ struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
+ trace_xfs_dqput_free(dqp);
+
+ if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused);
+ }
+ xfs_dqunlock(dqp);
+}
+
+/*
+ * Release a dquot. Flush it if dirty, then dqput() it.
+ * dquot must not be locked.
+ */
+void
+xfs_qm_dqrele(
+ struct xfs_dquot *dqp)
+{
+ if (!dqp)
+ return;
+
+ trace_xfs_dqrele(dqp);
+
+ xfs_dqlock(dqp);
+ /*
+ * We don't care to flush it if the dquot is dirty here.
+ * That will create stutters that we want to avoid.
+ * Instead we do a delayed write when we try to reclaim
+ * a dirty dquot. Also xfs_sync will take part of the burden...
+ */
+ xfs_qm_dqput(dqp);
+}
+
+/*
+ * This is the dquot flushing I/O completion routine. It is called
+ * from interrupt level when the buffer containing the dquot is
+ * flushed to disk. It is responsible for removing the dquot logitem
+ * from the AIL if it has not been re-logged, and unlocking the dquot's
+ * flush lock. This behavior is very similar to that of inodes.
+ */
+static void
+xfs_qm_dqflush_done(
+ struct xfs_log_item *lip)
+{
+ struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip;
+ struct xfs_dquot *dqp = qip->qli_dquot;
+ struct xfs_ail *ailp = lip->li_ailp;
+ xfs_lsn_t tail_lsn;
+
+ /*
+ * We only want to pull the item from the AIL if its
+ * location in the log has not changed since we started the flush.
+ * Thus, we only bother if the dquot's lsn has
+ * not changed. First we check the lsn outside the lock
+ * since it's cheaper, and then we recheck while
+ * holding the lock before removing the dquot from the AIL.
+ */
+ if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) &&
+ ((lip->li_lsn == qip->qli_flush_lsn) ||
+ test_bit(XFS_LI_FAILED, &lip->li_flags))) {
+
+ spin_lock(&ailp->ail_lock);
+ xfs_clear_li_failed(lip);
+ if (lip->li_lsn == qip->qli_flush_lsn) {
+ /* xfs_ail_update_finish() drops the AIL lock */
+ tail_lsn = xfs_ail_delete_one(ailp, lip);
+ xfs_ail_update_finish(ailp, tail_lsn);
+ } else {
+ spin_unlock(&ailp->ail_lock);
+ }
+ }
+
+ /*
+ * Release the dq's flush lock since we're done with it.
+ */
+ xfs_dqfunlock(dqp);
+}
+
+void
+xfs_buf_dquot_iodone(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip, *n;
+
+ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+ list_del_init(&lip->li_bio_list);
+ xfs_qm_dqflush_done(lip);
+ }
+}
+
+void
+xfs_buf_dquot_io_fail(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip;
+
+ spin_lock(&bp->b_mount->m_ail->ail_lock);
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
+ xfs_set_li_failed(lip, bp);
+ spin_unlock(&bp->b_mount->m_ail->ail_lock);
+}
+
+/* Check incore dquot for errors before we flush. */
+static xfs_failaddr_t
+xfs_qm_dqflush_check(
+ struct xfs_dquot *dqp)
+{
+ xfs_dqtype_t type = xfs_dquot_type(dqp);
+
+ if (type != XFS_DQTYPE_USER &&
+ type != XFS_DQTYPE_GROUP &&
+ type != XFS_DQTYPE_PROJ)
+ return __this_address;
+
+ if (dqp->q_id == 0)
+ return NULL;
+
+ if (dqp->q_blk.softlimit && dqp->q_blk.count > dqp->q_blk.softlimit &&
+ !dqp->q_blk.timer)
+ return __this_address;
+
+ if (dqp->q_ino.softlimit && dqp->q_ino.count > dqp->q_ino.softlimit &&
+ !dqp->q_ino.timer)
+ return __this_address;
+
+ if (dqp->q_rtb.softlimit && dqp->q_rtb.count > dqp->q_rtb.softlimit &&
+ !dqp->q_rtb.timer)
+ return __this_address;
+
+ /* bigtime flag should never be set on root dquots */
+ if (dqp->q_type & XFS_DQTYPE_BIGTIME) {
+ if (!xfs_has_bigtime(dqp->q_mount))
+ return __this_address;
+ if (dqp->q_id == 0)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+/*
+ * Write a modified dquot to disk.
+ * The dquot must be locked and the flush lock held by the caller.
+ * The flush lock will not be unlocked until the dquot reaches the disk,
+ * but the dquot is free to be unlocked and modified by the caller
+ * in the interim. Dquot is still locked on return. This behavior is
+ * identical to that of inodes.
+ */
+int
+xfs_qm_dqflush(
+ struct xfs_dquot *dqp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_log_item *lip = &dqp->q_logitem.qli_item;
+ struct xfs_buf *bp;
+ struct xfs_dqblk *dqblk;
+ xfs_failaddr_t fa;
+ int error;
+
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ ASSERT(!completion_done(&dqp->q_flush));
+
+ trace_xfs_dqflush(dqp);
+
+ *bpp = NULL;
+
+ xfs_qm_dqunpin_wait(dqp);
+
+ /*
+ * Get the buffer containing the on-disk dquot
+ */
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK,
+ &bp, &xfs_dquot_buf_ops);
+ if (error == -EAGAIN)
+ goto out_unlock;
+ if (error)
+ goto out_abort;
+
+ fa = xfs_qm_dqflush_check(dqp);
+ if (fa) {
+ xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
+ dqp->q_id, fa);
+ xfs_buf_relse(bp);
+ error = -EFSCORRUPTED;
+ goto out_abort;
+ }
+
+ /* Flush the incore dquot to the ondisk buffer. */
+ dqblk = bp->b_addr + dqp->q_bufoffset;
+ xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp);
+
+ /*
+ * Clear the dirty field and remember the flush lsn for later use.
+ */
+ dqp->q_flags &= ~XFS_DQFLAG_DIRTY;
+
+ xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
+ &dqp->q_logitem.qli_item.li_lsn);
+
+ /*
+ * copy the lsn into the on-disk dquot now while we have the in memory
+ * dquot here. This can't be done later in the write verifier as we
+ * can't get access to the log item at that point in time.
+ *
+ * We also calculate the CRC here so that the on-disk dquot in the
+ * buffer always has a valid CRC. This ensures there is no possibility
+ * of a dquot without an up-to-date CRC getting to disk.
+ */
+ if (xfs_has_crc(mp)) {
+ dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
+ xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+
+ /*
+ * Attach the dquot to the buffer so that we can remove this dquot from
+ * the AIL and release the flush lock once the dquot is synced to disk.
+ */
+ bp->b_flags |= _XBF_DQUOTS;
+ list_add_tail(&dqp->q_logitem.qli_item.li_bio_list, &bp->b_li_list);
+
+ /*
+ * If the buffer is pinned then push on the log so we won't
+ * get stuck waiting in the write for too long.
+ */
+ if (xfs_buf_ispinned(bp)) {
+ trace_xfs_dqflush_force(dqp);
+ xfs_log_force(mp, 0);
+ }
+
+ trace_xfs_dqflush_done(dqp);
+ *bpp = bp;
+ return 0;
+
+out_abort:
+ dqp->q_flags &= ~XFS_DQFLAG_DIRTY;
+ xfs_trans_ail_delete(lip, 0);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+out_unlock:
+ xfs_dqfunlock(dqp);
+ return error;
+}
+
+/*
+ * Lock two xfs_dquot structures.
+ *
+ * To avoid deadlocks we always lock the quota structure with
+ * the lowest id first.
+ */
+void
+xfs_dqlock2(
+ struct xfs_dquot *d1,
+ struct xfs_dquot *d2)
+{
+ if (d1 && d2) {
+ ASSERT(d1 != d2);
+ if (d1->q_id > d2->q_id) {
+ mutex_lock(&d2->q_qlock);
+ mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
+ } else {
+ mutex_lock(&d1->q_qlock);
+ mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED);
+ }
+ } else if (d1) {
+ mutex_lock(&d1->q_qlock);
+ } else if (d2) {
+ mutex_lock(&d2->q_qlock);
+ }
+}
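A brief usage sketch of the ordering rule above; the function and the idea of adjusting two dquots together are purely illustrative. Either argument may be NULL, which xfs_dqlock2() tolerates.

/* Hypothetical illustration, not part of this patch. */
static void example_adjust_two_dquots(struct xfs_dquot *src,
				      struct xfs_dquot *dst)
{
	xfs_dqlock2(src, dst);		/* always locks the lower q_id first */

	/* ... adjust counters on src and dst while both are locked ... */

	if (dst)
		xfs_dqunlock(dst);	/* unlock order does not matter */
	if (src)
		xfs_dqunlock(src);
}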
+
+int __init
+xfs_qm_init(void)
+{
+ xfs_dquot_cache = kmem_cache_create("xfs_dquot",
+ sizeof(struct xfs_dquot),
+ 0, 0, NULL);
+ if (!xfs_dquot_cache)
+ goto out;
+
+ xfs_dqtrx_cache = kmem_cache_create("xfs_dqtrx",
+ sizeof(struct xfs_dquot_acct),
+ 0, 0, NULL);
+ if (!xfs_dqtrx_cache)
+ goto out_free_dquot_cache;
+
+ return 0;
+
+out_free_dquot_cache:
+ kmem_cache_destroy(xfs_dquot_cache);
+out:
+ return -ENOMEM;
+}
+
+void
+xfs_qm_exit(void)
+{
+ kmem_cache_destroy(xfs_dqtrx_cache);
+ kmem_cache_destroy(xfs_dquot_cache);
+}
+
+/*
+ * Iterate every dquot of a particular type. The caller must ensure that the
+ * particular quota type is active. iter_fn can return negative error codes,
+ * or -ECANCELED to indicate that it wants to stop iterating.
+ */
+int
+xfs_qm_dqiterate(
+ struct xfs_mount *mp,
+ xfs_dqtype_t type,
+ xfs_qm_dqiterate_fn iter_fn,
+ void *priv)
+{
+ struct xfs_dquot *dq;
+ xfs_dqid_t id = 0;
+ int error;
+
+ do {
+ error = xfs_qm_dqget_next(mp, id, type, &dq);
+ if (error == -ENOENT)
+ return 0;
+ if (error)
+ return error;
+
+ error = iter_fn(dq, type, priv);
+ id = dq->q_id;
+ xfs_qm_dqput(dq);
+ } while (error == 0 && id != 0);
+
+ return error;
+}
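As an illustration of the callback contract spelled out above (a negative errno or -ECANCELED stops the walk and is passed back to the caller), here is a hypothetical iterator that counts dquots and bails out after a fixed number; none of these names are part of the patch.

/* Hypothetical illustration, not part of this patch. */
struct example_dquot_count {
	unsigned int		seen;
	unsigned int		limit;
};

static int example_count_dquots(struct xfs_dquot *dq, xfs_dqtype_t type,
				void *priv)
{
	struct example_dquot_count *ec = priv;

	if (++ec->seen >= ec->limit)
		return -ECANCELED;	/* stop iterating; not a hard error */
	return 0;
}

/*
 * Invoked as:
 *	error = xfs_qm_dqiterate(mp, XFS_DQTYPE_USER, example_count_dquots, &ec);
 * where -ECANCELED is returned if the limit was hit before the walk ended.
 */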
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
new file mode 100644
index 000000000..80c8f851a
--- /dev/null
+++ b/fs/xfs/xfs_dquot.h
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_DQUOT_H__
+#define __XFS_DQUOT_H__
+
+/*
+ * Dquots are structures that hold quota information about a user or a group,
+ * much like inodes are for files. In fact, dquots share many characteristics
+ * with inodes. However, dquots can also be a centralized resource, relative
+ * to a collection of inodes. In this respect, dquots share some characteristics
+ * of the superblock.
+ * XFS dquots exploit both of these traits in their algorithms. They make
+ * every attempt not to be a bottleneck when quotas are on and to have
+ * minimal impact, if any, when quotas are off.
+ */
+
+struct xfs_mount;
+struct xfs_trans;
+
+enum {
+ XFS_QLOWSP_1_PCNT = 0,
+ XFS_QLOWSP_3_PCNT,
+ XFS_QLOWSP_5_PCNT,
+ XFS_QLOWSP_MAX
+};
+
+struct xfs_dquot_res {
+ /* Total resources allocated and reserved. */
+ xfs_qcnt_t reserved;
+
+ /* Total resources allocated. */
+ xfs_qcnt_t count;
+
+ /* Absolute and preferred limits. */
+ xfs_qcnt_t hardlimit;
+ xfs_qcnt_t softlimit;
+
+ /*
+ * For root dquots, this is the default grace period, in seconds.
+ * Otherwise, this is when the quota grace period expires,
+ * in seconds since the Unix epoch.
+ */
+ time64_t timer;
+};
+
+static inline bool
+xfs_dquot_res_over_limits(
+ const struct xfs_dquot_res *qres)
+{
+ if ((qres->softlimit && qres->softlimit < qres->reserved) ||
+ (qres->hardlimit && qres->hardlimit < qres->reserved))
+ return true;
+ return false;
+}
+
+/*
+ * The incore dquot structure
+ */
+struct xfs_dquot {
+ struct list_head q_lru;
+ struct xfs_mount *q_mount;
+ xfs_dqtype_t q_type;
+ uint16_t q_flags;
+ xfs_dqid_t q_id;
+ uint q_nrefs;
+ int q_bufoffset;
+ xfs_daddr_t q_blkno;
+ xfs_fileoff_t q_fileoffset;
+
+ struct xfs_dquot_res q_blk; /* regular blocks */
+ struct xfs_dquot_res q_ino; /* inodes */
+ struct xfs_dquot_res q_rtb; /* realtime blocks */
+
+ struct xfs_dq_logitem q_logitem;
+
+ xfs_qcnt_t q_prealloc_lo_wmark;
+ xfs_qcnt_t q_prealloc_hi_wmark;
+ int64_t q_low_space[XFS_QLOWSP_MAX];
+ struct mutex q_qlock;
+ struct completion q_flush;
+ atomic_t q_pincount;
+ struct wait_queue_head q_pinwait;
+};
+
+/*
+ * Lock hierarchy for q_qlock:
+ * XFS_QLOCK_NORMAL is the implicit default,
+ * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
+ */
+enum {
+ XFS_QLOCK_NORMAL = 0,
+ XFS_QLOCK_NESTED,
+};
+
+/*
+ * Manage the q_flush completion queue embedded in the dquot. This completion
+ * queue synchronizes processes attempting to flush the in-core dquot back to
+ * disk.
+ */
+static inline void xfs_dqflock(struct xfs_dquot *dqp)
+{
+ wait_for_completion(&dqp->q_flush);
+}
+
+static inline bool xfs_dqflock_nowait(struct xfs_dquot *dqp)
+{
+ return try_wait_for_completion(&dqp->q_flush);
+}
+
+static inline void xfs_dqfunlock(struct xfs_dquot *dqp)
+{
+ complete(&dqp->q_flush);
+}
+
+static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp)
+{
+ return mutex_trylock(&dqp->q_qlock);
+}
+
+static inline void xfs_dqlock(struct xfs_dquot *dqp)
+{
+ mutex_lock(&dqp->q_qlock);
+}
+
+static inline void xfs_dqunlock(struct xfs_dquot *dqp)
+{
+ mutex_unlock(&dqp->q_qlock);
+}
+
+static inline int
+xfs_dquot_type(const struct xfs_dquot *dqp)
+{
+ return dqp->q_type & XFS_DQTYPE_REC_MASK;
+}
+
+static inline int xfs_this_quota_on(struct xfs_mount *mp, xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ return XFS_IS_UQUOTA_ON(mp);
+ case XFS_DQTYPE_GROUP:
+ return XFS_IS_GQUOTA_ON(mp);
+ case XFS_DQTYPE_PROJ:
+ return XFS_IS_PQUOTA_ON(mp);
+ default:
+ return 0;
+ }
+}
+
+static inline struct xfs_dquot *xfs_inode_dquot(
+ struct xfs_inode *ip,
+ xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ return ip->i_udquot;
+ case XFS_DQTYPE_GROUP:
+ return ip->i_gdquot;
+ case XFS_DQTYPE_PROJ:
+ return ip->i_pdquot;
+ default:
+ return NULL;
+ }
+}
+
+/* Decide if the dquot's limits are actually being enforced. */
+static inline bool
+xfs_dquot_is_enforced(
+ const struct xfs_dquot *dqp)
+{
+ switch (xfs_dquot_type(dqp)) {
+ case XFS_DQTYPE_USER:
+ return XFS_IS_UQUOTA_ENFORCED(dqp->q_mount);
+ case XFS_DQTYPE_GROUP:
+ return XFS_IS_GQUOTA_ENFORCED(dqp->q_mount);
+ case XFS_DQTYPE_PROJ:
+ return XFS_IS_PQUOTA_ENFORCED(dqp->q_mount);
+ }
+ ASSERT(0);
+ return false;
+}
+
+/*
+ * Check whether a dquot is under low free space conditions. We assume the quota
+ * is enabled and enforced.
+ */
+static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp)
+{
+ int64_t freesp;
+
+ freesp = dqp->q_blk.hardlimit - dqp->q_blk.reserved;
+ if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT])
+ return true;
+
+ return false;
+}
+
+void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp);
+
+#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
+#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY)
+
+void xfs_qm_dqdestroy(struct xfs_dquot *dqp);
+int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp);
+void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp);
+void xfs_qm_adjust_dqtimers(struct xfs_dquot *d);
+void xfs_qm_adjust_dqlimits(struct xfs_dquot *d);
+xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip,
+ xfs_dqtype_t type);
+int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id,
+ xfs_dqtype_t type, bool can_alloc,
+ struct xfs_dquot **dqpp);
+int xfs_qm_dqget_inode(struct xfs_inode *ip, xfs_dqtype_t type,
+ bool can_alloc, struct xfs_dquot **dqpp);
+int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
+ xfs_dqtype_t type, struct xfs_dquot **dqpp);
+int xfs_qm_dqget_uncached(struct xfs_mount *mp,
+ xfs_dqid_t id, xfs_dqtype_t type,
+ struct xfs_dquot **dqpp);
+void xfs_qm_dqput(struct xfs_dquot *dqp);
+
+void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
+
+void xfs_dquot_set_prealloc_limits(struct xfs_dquot *);
+
+static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
+{
+ xfs_dqlock(dqp);
+ dqp->q_nrefs++;
+ xfs_dqunlock(dqp);
+ return dqp;
+}
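A tiny sketch of the reference pairing implied by the helper above; the caller is hypothetical and only shows the hold/put symmetry.

/* Hypothetical illustration, not part of this patch. */
static void example_borrow_dquot(struct xfs_dquot *dqp)
{
	struct xfs_dquot	*ref = xfs_qm_dqhold(dqp);	/* bumps q_nrefs under q_qlock */

	/* ... use ref without needing to hold the dquot lock ... */

	xfs_qm_dqput(ref);	/* drop the reference when finished */
}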
+
+typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq,
+ xfs_dqtype_t type, void *priv);
+int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type,
+ xfs_qm_dqiterate_fn iter_fn, void *priv);
+
+time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout);
+time64_t xfs_dquot_set_grace_period(time64_t grace);
+
+#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
new file mode 100644
index 000000000..6a1aae799
--- /dev/null
+++ b/fs/xfs/xfs_dquot_item.c
@@ -0,0 +1,220 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_qm.h"
+#include "xfs_log.h"
+
+static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_dq_logitem, qli_item);
+}
+
+/*
+ * Report the number of iovecs and bytes needed to log the given dquot item.
+ */
+STATIC void
+xfs_qm_dquot_logitem_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 2;
+ *nbytes += sizeof(struct xfs_dq_logformat) +
+ sizeof(struct xfs_disk_dquot);
+}
+
+/*
+ * Fill in the vector of log iovecs for the given dquot log item.
+ */
+STATIC void
+xfs_qm_dquot_logitem_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_disk_dquot ddq;
+ struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+ struct xfs_dq_logformat *qlf;
+
+ qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT);
+ qlf->qlf_type = XFS_LI_DQUOT;
+ qlf->qlf_size = 2;
+ qlf->qlf_id = qlip->qli_dquot->q_id;
+ qlf->qlf_blkno = qlip->qli_dquot->q_blkno;
+ qlf->qlf_len = 1;
+ qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset;
+ xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat));
+
+ xfs_dquot_to_disk(&ddq, qlip->qli_dquot);
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, &ddq,
+ sizeof(struct xfs_disk_dquot));
+}
+
+/*
+ * Increment the pin count of the given dquot.
+ */
+STATIC void
+xfs_qm_dquot_logitem_pin(
+ struct xfs_log_item *lip)
+{
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ atomic_inc(&dqp->q_pincount);
+}
+
+/*
+ * Decrement the pin count of the given dquot, and wake up anyone waiting
+ * in xfs_qm_dqunpin_wait() if the count goes to 0. The
+ * dquot must have been previously pinned with a call to
+ * xfs_qm_dquot_logitem_pin().
+ */
+STATIC void
+xfs_qm_dquot_logitem_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+ ASSERT(atomic_read(&dqp->q_pincount) > 0);
+ if (atomic_dec_and_test(&dqp->q_pincount))
+ wake_up(&dqp->q_pinwait);
+}
+
+/*
+ * This is called to wait for the given dquot to be unpinned.
+ * Most of these pin/unpin routines are plagiarized from inode code.
+ */
+void
+xfs_qm_dqunpin_wait(
+ struct xfs_dquot *dqp)
+{
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ if (atomic_read(&dqp->q_pincount) == 0)
+ return;
+
+ /*
+ * Give the log a push so we don't wait here too long.
+ */
+ xfs_log_force(dqp->q_mount, 0);
+ wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
+}
+
+STATIC uint
+xfs_qm_dquot_logitem_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+ __releases(&lip->li_ailp->ail_lock)
+ __acquires(&lip->li_ailp->ail_lock)
+{
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+ struct xfs_buf *bp = lip->li_buf;
+ uint rval = XFS_ITEM_SUCCESS;
+ int error;
+
+ if (atomic_read(&dqp->q_pincount) > 0)
+ return XFS_ITEM_PINNED;
+
+ if (!xfs_dqlock_nowait(dqp))
+ return XFS_ITEM_LOCKED;
+
+ /*
+ * Re-check the pincount now that we stabilized the value by
+ * taking the quota lock.
+ */
+ if (atomic_read(&dqp->q_pincount) > 0) {
+ rval = XFS_ITEM_PINNED;
+ goto out_unlock;
+ }
+
+ /*
+ * Someone else is already flushing the dquot. Nothing we can do
+ * here but wait for the flush to finish and remove the item from
+ * the AIL.
+ */
+ if (!xfs_dqflock_nowait(dqp)) {
+ rval = XFS_ITEM_FLUSHING;
+ goto out_unlock;
+ }
+
+ spin_unlock(&lip->li_ailp->ail_lock);
+
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (!error) {
+ if (!xfs_buf_delwri_queue(bp, buffer_list))
+ rval = XFS_ITEM_FLUSHING;
+ xfs_buf_relse(bp);
+ } else if (error == -EAGAIN)
+ rval = XFS_ITEM_LOCKED;
+
+ spin_lock(&lip->li_ailp->ail_lock);
+out_unlock:
+ xfs_dqunlock(dqp);
+ return rval;
+}
+
+STATIC void
+xfs_qm_dquot_logitem_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+ /*
+ * dquots are never 'held' from getting unlocked at the end of
+ * a transaction. Their locking and unlocking is hidden inside the
+ * transaction layer, within trans_commit. Hence, no LI_HOLD flag
+ * for the logitem.
+ */
+ xfs_dqunlock(dqp);
+}
+
+STATIC void
+xfs_qm_dquot_logitem_committing(
+ struct xfs_log_item *lip,
+ xfs_csn_t seq)
+{
+ return xfs_qm_dquot_logitem_release(lip);
+}
+
+static const struct xfs_item_ops xfs_dquot_item_ops = {
+ .iop_size = xfs_qm_dquot_logitem_size,
+ .iop_format = xfs_qm_dquot_logitem_format,
+ .iop_pin = xfs_qm_dquot_logitem_pin,
+ .iop_unpin = xfs_qm_dquot_logitem_unpin,
+ .iop_release = xfs_qm_dquot_logitem_release,
+ .iop_committing = xfs_qm_dquot_logitem_committing,
+ .iop_push = xfs_qm_dquot_logitem_push,
+};
+
+/*
+ * Initialize the dquot log item for a newly allocated dquot.
+ * The dquot isn't locked at this point, but it isn't on any of the lists
+ * either, so we don't care.
+ */
+void
+xfs_qm_dquot_logitem_init(
+ struct xfs_dquot *dqp)
+{
+ struct xfs_dq_logitem *lp = &dqp->q_logitem;
+
+ xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
+ &xfs_dquot_item_ops);
+ lp->qli_dquot = dqp;
+}
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
new file mode 100644
index 000000000..794710c24
--- /dev/null
+++ b/fs/xfs/xfs_dquot_item.h
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_DQUOT_ITEM_H__
+#define __XFS_DQUOT_ITEM_H__
+
+struct xfs_dquot;
+struct xfs_trans;
+struct xfs_mount;
+
+struct xfs_dq_logitem {
+ struct xfs_log_item qli_item; /* common portion */
+ struct xfs_dquot *qli_dquot; /* dquot ptr */
+ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
+};
+
+void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp);
+
+#endif /* __XFS_DQUOT_ITEM_H__ */
diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c
new file mode 100644
index 000000000..8966ba842
--- /dev/null
+++ b/fs/xfs/xfs_dquot_item_recover.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_qm.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+
+STATIC void
+xlog_recover_dquot_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_disk_dquot *recddq;
+ struct xfs_dq_logformat *dq_f;
+ uint type;
+
+ if (mp->m_qflags == 0)
+ return;
+
+ recddq = item->ri_buf[1].i_addr;
+ if (recddq == NULL)
+ return;
+ if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
+ return;
+
+ type = recddq->d_type & XFS_DQTYPE_REC_MASK;
+ ASSERT(type);
+ if (log->l_quotaoffs_flag & type)
+ return;
+
+ dq_f = item->ri_buf[0].i_addr;
+ ASSERT(dq_f);
+ ASSERT(dq_f->qlf_len == 1);
+
+ xlog_buf_readahead(log, dq_f->qlf_blkno,
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len),
+ &xfs_dquot_buf_ra_ops);
+}
+
+/*
+ * Recover a dquot record
+ */
+STATIC int
+xlog_recover_dquot_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_buf *bp;
+ struct xfs_disk_dquot *ddq, *recddq;
+ struct xfs_dq_logformat *dq_f;
+ xfs_failaddr_t fa;
+ int error;
+ uint type;
+
+ /*
+ * Filesystems are required to send in quota flags at mount time.
+ */
+ if (mp->m_qflags == 0)
+ return 0;
+
+ recddq = item->ri_buf[1].i_addr;
+ if (recddq == NULL) {
+ xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
+ return -EFSCORRUPTED;
+ }
+ if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) {
+ xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
+ item->ri_buf[1].i_len, __func__);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * This type of quotas was turned off, so ignore this record.
+ */
+ type = recddq->d_type & XFS_DQTYPE_REC_MASK;
+ ASSERT(type);
+ if (log->l_quotaoffs_flag & type)
+ return 0;
+
+ /*
+ * At this point we know that quota was _not_ turned off.
+ * Since the mount flags are not indicating to us otherwise, this
+ * must mean that quota is on, and the dquot needs to be replayed.
+ * Remember that we may not have fully recovered the superblock yet,
+ * so we can't do the usual trick of looking at the SB quota bits.
+ *
+ * The other possibility, of course, is that the quota subsystem was
+ * removed since the last mount - ENOSYS.
+ */
+ dq_f = item->ri_buf[0].i_addr;
+ ASSERT(dq_f);
+ fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id);
+ if (fa) {
+ xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS",
+ dq_f->qlf_id, fa);
+ return -EFSCORRUPTED;
+ }
+ ASSERT(dq_f->qlf_len == 1);
+
+ /*
+ * At this point we are assuming that the dquots have been allocated
+ * and hence the buffer has valid dquots stamped in it. It should,
+ * therefore, pass verifier validation. If the dquot is bad, we'll return
+ * an error here, so we don't need to specifically check the dquot in the
+ * buffer after the verifier has run.
+ */
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
+ &xfs_dquot_buf_ops);
+ if (error)
+ return error;
+
+ ASSERT(bp);
+ ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
+
+ /*
+ * If the dquot has an LSN in it, recover the dquot only if it's less
+ * than the lsn of the transaction we are replaying.
+ */
+ if (xfs_has_crc(mp)) {
+ struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
+ xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
+
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ goto out_release;
+ }
+ }
+
+ memcpy(ddq, recddq, item->ri_buf[1].i_len);
+ if (xfs_has_crc(mp)) {
+ xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+
+ ASSERT(dq_f->qlf_size == 2);
+ ASSERT(bp->b_mount == mp);
+ bp->b_flags |= _XBF_LOGRECOVERY;
+ xfs_buf_delwri_queue(bp, buffer_list);
+
+out_release:
+ xfs_buf_relse(bp);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_dquot_item_ops = {
+ .item_type = XFS_LI_DQUOT,
+ .ra_pass2 = xlog_recover_dquot_ra_pass2,
+ .commit_pass2 = xlog_recover_dquot_commit_pass2,
+};
+
+/*
+ * Recover QUOTAOFF records. We simply make a note of them in the xlog
+ * structure, so that we know not to do any dquot item or dquot buffer
+ * recovery of that type.
+ */
+STATIC int
+xlog_recover_quotaoff_commit_pass1(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].i_addr;
+ ASSERT(qoff_f);
+
+ /*
+ * The logitem format's flag tells us if this was user quotaoff,
+ * group/project quotaoff or both.
+ */
+ if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
+ log->l_quotaoffs_flag |= XFS_DQTYPE_USER;
+ if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
+ log->l_quotaoffs_flag |= XFS_DQTYPE_PROJ;
+ if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
+ log->l_quotaoffs_flag |= XFS_DQTYPE_GROUP;
+
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_quotaoff_item_ops = {
+ .item_type = XFS_LI_QUOTAOFF,
+ .commit_pass1 = xlog_recover_quotaoff_commit_pass1,
+ /* nothing to commit in pass2 */
+};
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
new file mode 100644
index 000000000..c6b2aabd6
--- /dev/null
+++ b/fs/xfs/xfs_error.c
@@ -0,0 +1,483 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_fs.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_sysfs.h"
+#include "xfs_inode.h"
+
+#ifdef DEBUG
+
+static unsigned int xfs_errortag_random_default[] = {
+ XFS_RANDOM_DEFAULT,
+ XFS_RANDOM_IFLUSH_1,
+ XFS_RANDOM_IFLUSH_2,
+ XFS_RANDOM_IFLUSH_3,
+ XFS_RANDOM_IFLUSH_4,
+ XFS_RANDOM_IFLUSH_5,
+ XFS_RANDOM_IFLUSH_6,
+ XFS_RANDOM_DA_READ_BUF,
+ XFS_RANDOM_BTREE_CHECK_LBLOCK,
+ XFS_RANDOM_BTREE_CHECK_SBLOCK,
+ XFS_RANDOM_ALLOC_READ_AGF,
+ XFS_RANDOM_IALLOC_READ_AGI,
+ XFS_RANDOM_ITOBP_INOTOBP,
+ XFS_RANDOM_IUNLINK,
+ XFS_RANDOM_IUNLINK_REMOVE,
+ XFS_RANDOM_DIR_INO_VALIDATE,
+ XFS_RANDOM_BULKSTAT_READ_CHUNK,
+ XFS_RANDOM_IODONE_IOERR,
+ XFS_RANDOM_STRATREAD_IOERR,
+ XFS_RANDOM_STRATCMPL_IOERR,
+ XFS_RANDOM_DIOWRITE_IOERR,
+ XFS_RANDOM_BMAPIFORMAT,
+ XFS_RANDOM_FREE_EXTENT,
+ XFS_RANDOM_RMAP_FINISH_ONE,
+ XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE,
+ XFS_RANDOM_REFCOUNT_FINISH_ONE,
+ XFS_RANDOM_BMAP_FINISH_ONE,
+ XFS_RANDOM_AG_RESV_CRITICAL,
+ XFS_RANDOM_DROP_WRITES,
+ XFS_RANDOM_LOG_BAD_CRC,
+ XFS_RANDOM_LOG_ITEM_PIN,
+ XFS_RANDOM_BUF_LRU_REF,
+ XFS_RANDOM_FORCE_SCRUB_REPAIR,
+ XFS_RANDOM_FORCE_SUMMARY_RECALC,
+ XFS_RANDOM_IUNLINK_FALLBACK,
+ XFS_RANDOM_BUF_IOERROR,
+ XFS_RANDOM_REDUCE_MAX_IEXTENTS,
+ XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT,
+ XFS_RANDOM_AG_RESV_FAIL,
+ XFS_RANDOM_LARP,
+ XFS_RANDOM_DA_LEAF_SPLIT,
+ XFS_RANDOM_ATTR_LEAF_TO_NODE,
+};
+
+struct xfs_errortag_attr {
+ struct attribute attr;
+ unsigned int tag;
+};
+
+static inline struct xfs_errortag_attr *
+to_attr(struct attribute *attr)
+{
+ return container_of(attr, struct xfs_errortag_attr, attr);
+}
+
+static inline struct xfs_mount *
+to_mp(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+
+ return container_of(kobj, struct xfs_mount, m_errortag_kobj);
+}
+
+STATIC ssize_t
+xfs_errortag_attr_store(
+ struct kobject *kobject,
+ struct attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_mount *mp = to_mp(kobject);
+ struct xfs_errortag_attr *xfs_attr = to_attr(attr);
+ int ret;
+ unsigned int val;
+
+ if (strcmp(buf, "default") == 0) {
+ val = xfs_errortag_random_default[xfs_attr->tag];
+ } else {
+ ret = kstrtouint(buf, 0, &val);
+ if (ret)
+ return ret;
+ }
+
+ ret = xfs_errortag_set(mp, xfs_attr->tag, val);
+ if (ret)
+ return ret;
+ return count;
+}
+
+STATIC ssize_t
+xfs_errortag_attr_show(
+ struct kobject *kobject,
+ struct attribute *attr,
+ char *buf)
+{
+ struct xfs_mount *mp = to_mp(kobject);
+ struct xfs_errortag_attr *xfs_attr = to_attr(attr);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n",
+ xfs_errortag_get(mp, xfs_attr->tag));
+}
+
+static const struct sysfs_ops xfs_errortag_sysfs_ops = {
+ .show = xfs_errortag_attr_show,
+ .store = xfs_errortag_attr_store,
+};
+
+#define XFS_ERRORTAG_ATTR_RW(_name, _tag) \
+static struct xfs_errortag_attr xfs_errortag_attr_##_name = { \
+ .attr = {.name = __stringify(_name), \
+ .mode = VERIFY_OCTAL_PERMISSIONS(S_IWUSR | S_IRUGO) }, \
+ .tag = (_tag), \
+}
+
+#define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr
+
+XFS_ERRORTAG_ATTR_RW(noerror, XFS_ERRTAG_NOERROR);
+XFS_ERRORTAG_ATTR_RW(iflush1, XFS_ERRTAG_IFLUSH_1);
+XFS_ERRORTAG_ATTR_RW(iflush2, XFS_ERRTAG_IFLUSH_2);
+XFS_ERRORTAG_ATTR_RW(iflush3, XFS_ERRTAG_IFLUSH_3);
+XFS_ERRORTAG_ATTR_RW(iflush4, XFS_ERRTAG_IFLUSH_4);
+XFS_ERRORTAG_ATTR_RW(iflush5, XFS_ERRTAG_IFLUSH_5);
+XFS_ERRORTAG_ATTR_RW(iflush6, XFS_ERRTAG_IFLUSH_6);
+XFS_ERRORTAG_ATTR_RW(dareadbuf, XFS_ERRTAG_DA_READ_BUF);
+XFS_ERRORTAG_ATTR_RW(btree_chk_lblk, XFS_ERRTAG_BTREE_CHECK_LBLOCK);
+XFS_ERRORTAG_ATTR_RW(btree_chk_sblk, XFS_ERRTAG_BTREE_CHECK_SBLOCK);
+XFS_ERRORTAG_ATTR_RW(readagf, XFS_ERRTAG_ALLOC_READ_AGF);
+XFS_ERRORTAG_ATTR_RW(readagi, XFS_ERRTAG_IALLOC_READ_AGI);
+XFS_ERRORTAG_ATTR_RW(itobp, XFS_ERRTAG_ITOBP_INOTOBP);
+XFS_ERRORTAG_ATTR_RW(iunlink, XFS_ERRTAG_IUNLINK);
+XFS_ERRORTAG_ATTR_RW(iunlinkrm, XFS_ERRTAG_IUNLINK_REMOVE);
+XFS_ERRORTAG_ATTR_RW(dirinovalid, XFS_ERRTAG_DIR_INO_VALIDATE);
+XFS_ERRORTAG_ATTR_RW(bulkstat, XFS_ERRTAG_BULKSTAT_READ_CHUNK);
+XFS_ERRORTAG_ATTR_RW(logiodone, XFS_ERRTAG_IODONE_IOERR);
+XFS_ERRORTAG_ATTR_RW(stratread, XFS_ERRTAG_STRATREAD_IOERR);
+XFS_ERRORTAG_ATTR_RW(stratcmpl, XFS_ERRTAG_STRATCMPL_IOERR);
+XFS_ERRORTAG_ATTR_RW(diowrite, XFS_ERRTAG_DIOWRITE_IOERR);
+XFS_ERRORTAG_ATTR_RW(bmapifmt, XFS_ERRTAG_BMAPIFORMAT);
+XFS_ERRORTAG_ATTR_RW(free_extent, XFS_ERRTAG_FREE_EXTENT);
+XFS_ERRORTAG_ATTR_RW(rmap_finish_one, XFS_ERRTAG_RMAP_FINISH_ONE);
+XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE);
+XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE);
+XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE);
+XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL);
+XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES);
+XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC);
+XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
+XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
+XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
+XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
+XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
+XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR);
+XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS);
+XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT);
+XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL);
+XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP);
+XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT);
+XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
+
+static struct attribute *xfs_errortag_attrs[] = {
+ XFS_ERRORTAG_ATTR_LIST(noerror),
+ XFS_ERRORTAG_ATTR_LIST(iflush1),
+ XFS_ERRORTAG_ATTR_LIST(iflush2),
+ XFS_ERRORTAG_ATTR_LIST(iflush3),
+ XFS_ERRORTAG_ATTR_LIST(iflush4),
+ XFS_ERRORTAG_ATTR_LIST(iflush5),
+ XFS_ERRORTAG_ATTR_LIST(iflush6),
+ XFS_ERRORTAG_ATTR_LIST(dareadbuf),
+ XFS_ERRORTAG_ATTR_LIST(btree_chk_lblk),
+ XFS_ERRORTAG_ATTR_LIST(btree_chk_sblk),
+ XFS_ERRORTAG_ATTR_LIST(readagf),
+ XFS_ERRORTAG_ATTR_LIST(readagi),
+ XFS_ERRORTAG_ATTR_LIST(itobp),
+ XFS_ERRORTAG_ATTR_LIST(iunlink),
+ XFS_ERRORTAG_ATTR_LIST(iunlinkrm),
+ XFS_ERRORTAG_ATTR_LIST(dirinovalid),
+ XFS_ERRORTAG_ATTR_LIST(bulkstat),
+ XFS_ERRORTAG_ATTR_LIST(logiodone),
+ XFS_ERRORTAG_ATTR_LIST(stratread),
+ XFS_ERRORTAG_ATTR_LIST(stratcmpl),
+ XFS_ERRORTAG_ATTR_LIST(diowrite),
+ XFS_ERRORTAG_ATTR_LIST(bmapifmt),
+ XFS_ERRORTAG_ATTR_LIST(free_extent),
+ XFS_ERRORTAG_ATTR_LIST(rmap_finish_one),
+ XFS_ERRORTAG_ATTR_LIST(refcount_continue_update),
+ XFS_ERRORTAG_ATTR_LIST(refcount_finish_one),
+ XFS_ERRORTAG_ATTR_LIST(bmap_finish_one),
+ XFS_ERRORTAG_ATTR_LIST(ag_resv_critical),
+ XFS_ERRORTAG_ATTR_LIST(drop_writes),
+ XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
+ XFS_ERRORTAG_ATTR_LIST(log_item_pin),
+ XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
+ XFS_ERRORTAG_ATTR_LIST(force_repair),
+ XFS_ERRORTAG_ATTR_LIST(bad_summary),
+ XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
+ XFS_ERRORTAG_ATTR_LIST(buf_ioerror),
+ XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents),
+ XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent),
+ XFS_ERRORTAG_ATTR_LIST(ag_resv_fail),
+ XFS_ERRORTAG_ATTR_LIST(larp),
+ XFS_ERRORTAG_ATTR_LIST(da_leaf_split),
+ XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
+ NULL,
+};
+ATTRIBUTE_GROUPS(xfs_errortag);
+
+static struct kobj_type xfs_errortag_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_errortag_sysfs_ops,
+ .default_groups = xfs_errortag_groups,
+};
+
+int
+xfs_errortag_init(
+ struct xfs_mount *mp)
+{
+ int ret;
+
+ mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
+ KM_MAYFAIL);
+ if (!mp->m_errortag)
+ return -ENOMEM;
+
+ ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype,
+ &mp->m_kobj, "errortag");
+ if (ret)
+ kmem_free(mp->m_errortag);
+ return ret;
+}
+
+void
+xfs_errortag_del(
+ struct xfs_mount *mp)
+{
+ xfs_sysfs_del(&mp->m_errortag_kobj);
+ kmem_free(mp->m_errortag);
+}
+
+bool
+xfs_errortag_test(
+ struct xfs_mount *mp,
+ const char *expression,
+ const char *file,
+ int line,
+ unsigned int error_tag)
+{
+ unsigned int randfactor;
+
+ /*
+ * To be able to use error injection anywhere, we need to ensure that the
+ * error injection mechanism is already initialized.
+ *
+ * Code paths like I/O completion can be called before the
+ * initialization is complete, but being able to inject errors in such
+ * places is still useful.
+ */
+ if (!mp->m_errortag)
+ return false;
+
+ ASSERT(error_tag < XFS_ERRTAG_MAX);
+ randfactor = mp->m_errortag[error_tag];
+ if (!randfactor || prandom_u32_max(randfactor))
+ return false;
+
+ xfs_warn_ratelimited(mp,
+"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
+ expression, file, line, mp->m_super->s_id);
+ return true;
+}
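For context, a hedged sketch of how a DEBUG-only injection point is typically consumed; the function and the choice of tag are purely illustrative.

/* Hypothetical illustration, not part of this patch. */
static int example_maybe_fail_agf_read(struct xfs_mount *mp)
{
	/*
	 * XFS_TEST_ERROR() evaluates its expression normally; under DEBUG
	 * it may additionally fire at the configured random frequency for
	 * the given tag.
	 */
	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ALLOC_READ_AGF))
		return -EIO;
	return 0;
}

/*
 * The per-tag frequency is tuned through the sysfs files registered by
 * xfs_errortag_init() above, e.g.:
 *	echo 1 > /sys/fs/xfs/<dev>/errortag/readagf	(fire on every check)
 *	echo 0 > /sys/fs/xfs/<dev>/errortag/readagf	(disable injection)
 */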
+
+int
+xfs_errortag_get(
+ struct xfs_mount *mp,
+ unsigned int error_tag)
+{
+ if (error_tag >= XFS_ERRTAG_MAX)
+ return -EINVAL;
+
+ return mp->m_errortag[error_tag];
+}
+
+int
+xfs_errortag_set(
+ struct xfs_mount *mp,
+ unsigned int error_tag,
+ unsigned int tag_value)
+{
+ if (error_tag >= XFS_ERRTAG_MAX)
+ return -EINVAL;
+
+ mp->m_errortag[error_tag] = tag_value;
+ return 0;
+}
+
+int
+xfs_errortag_add(
+ struct xfs_mount *mp,
+ unsigned int error_tag)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX);
+
+ if (error_tag >= XFS_ERRTAG_MAX)
+ return -EINVAL;
+
+ return xfs_errortag_set(mp, error_tag,
+ xfs_errortag_random_default[error_tag]);
+}
+
+int
+xfs_errortag_clearall(
+ struct xfs_mount *mp)
+{
+ memset(mp->m_errortag, 0, sizeof(unsigned int) * XFS_ERRTAG_MAX);
+ return 0;
+}
+#endif /* DEBUG */
+
+void
+xfs_error_report(
+ const char *tag,
+ int level,
+ struct xfs_mount *mp,
+ const char *filename,
+ int linenum,
+ xfs_failaddr_t failaddr)
+{
+ if (level <= xfs_error_level) {
+ xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
+ "Internal error %s at line %d of file %s. Caller %pS",
+ tag, linenum, filename, failaddr);
+
+ xfs_stack_trace();
+ }
+}
+
+void
+xfs_corruption_error(
+ const char *tag,
+ int level,
+ struct xfs_mount *mp,
+ const void *buf,
+ size_t bufsize,
+ const char *filename,
+ int linenum,
+ xfs_failaddr_t failaddr)
+{
+ if (buf && level <= xfs_error_level)
+ xfs_hex_dump(buf, bufsize);
+ xfs_error_report(tag, level, mp, filename, linenum, failaddr);
+ xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
+}
+
+/*
+ * Complain about the kinds of metadata corruption that we can't detect from a
+ * verifier, such as incorrect inter-block relationship data. Does not set
+ * bp->b_error.
+ *
+ * Call xfs_buf_mark_corrupt, not this function.
+ */
+void
+xfs_buf_corruption_error(
+ struct xfs_buf *bp,
+ xfs_failaddr_t fa)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
+ "Metadata corruption detected at %pS, %s block 0x%llx",
+ fa, bp->b_ops->name, xfs_buf_daddr(bp));
+
+ xfs_alert(mp, "Unmount and run xfs_repair");
+
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+}
+
+/*
+ * Warnings specifically for verifier errors. Differentiate CRC vs. invalid
+ * values, and omit the stack trace unless the error level is tuned high.
+ */
+void
+xfs_buf_verifier_error(
+ struct xfs_buf *bp,
+ int error,
+ const char *name,
+ const void *buf,
+ size_t bufsz,
+ xfs_failaddr_t failaddr)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+ int sz;
+
+ fa = failaddr ? failaddr : __return_address;
+ __xfs_buf_ioerror(bp, error, fa);
+
+ xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
+ "Metadata %s detected at %pS, %s block 0x%llx %s",
+ bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
+ fa, bp->b_ops->name, xfs_buf_daddr(bp), name);
+
+ xfs_alert(mp, "Unmount and run xfs_repair");
+
+ if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+ sz = min_t(size_t, XFS_CORRUPTION_DUMP_LEN, bufsz);
+ xfs_alert(mp, "First %d bytes of corrupted metadata buffer:",
+ sz);
+ xfs_hex_dump(buf, sz);
+ }
+
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+}
+
+/*
+ * Warnings specifically for verifier errors. Differentiate CRC vs. invalid
+ * values, and omit the stack trace unless the error level is tuned high.
+ */
+void
+xfs_verifier_error(
+ struct xfs_buf *bp,
+ int error,
+ xfs_failaddr_t failaddr)
+{
+ return xfs_buf_verifier_error(bp, error, "", xfs_buf_offset(bp, 0),
+ XFS_CORRUPTION_DUMP_LEN, failaddr);
+}
+
+/*
+ * Warnings for inode corruption problems. Don't bother with the stack
+ * trace unless the error level is turned up high.
+ */
+void
+xfs_inode_verifier_error(
+ struct xfs_inode *ip,
+ int error,
+ const char *name,
+ const void *buf,
+ size_t bufsz,
+ xfs_failaddr_t failaddr)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_failaddr_t fa;
+ int sz;
+
+ fa = failaddr ? failaddr : __return_address;
+
+ xfs_alert(mp, "Metadata %s detected at %pS, inode 0x%llx %s",
+ error == -EFSBADCRC ? "CRC error" : "corruption",
+ fa, ip->i_ino, name);
+
+ xfs_alert(mp, "Unmount and run xfs_repair");
+
+ if (buf && xfs_error_level >= XFS_ERRLEVEL_LOW) {
+ sz = min_t(size_t, XFS_CORRUPTION_DUMP_LEN, bufsz);
+ xfs_alert(mp, "First %d bytes of corrupted metadata buffer:",
+ sz);
+ xfs_hex_dump(buf, sz);
+ }
+
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
new file mode 100644
index 000000000..5191e9145
--- /dev/null
+++ b/fs/xfs/xfs_error.h
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ERROR_H__
+#define __XFS_ERROR_H__
+
+struct xfs_mount;
+
+extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
+ const char *filename, int linenum,
+ xfs_failaddr_t failaddr);
+extern void xfs_corruption_error(const char *tag, int level,
+ struct xfs_mount *mp, const void *buf, size_t bufsize,
+ const char *filename, int linenum,
+ xfs_failaddr_t failaddr);
+void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa);
+extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error,
+ const char *name, const void *buf, size_t bufsz,
+ xfs_failaddr_t failaddr);
+extern void xfs_verifier_error(struct xfs_buf *bp, int error,
+ xfs_failaddr_t failaddr);
+extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error,
+ const char *name, const void *buf, size_t bufsz,
+ xfs_failaddr_t failaddr);
+
+#define XFS_ERROR_REPORT(e, lvl, mp) \
+ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
+#define XFS_CORRUPTION_ERROR(e, lvl, mp, buf, bufsize) \
+ xfs_corruption_error(e, lvl, mp, buf, bufsize, \
+ __FILE__, __LINE__, __return_address)
+
+#define XFS_ERRLEVEL_OFF 0
+#define XFS_ERRLEVEL_LOW 1
+#define XFS_ERRLEVEL_HIGH 5
+
+/* Dump 128 bytes of any corrupt buffer */
+#define XFS_CORRUPTION_DUMP_LEN (128)
+
+#ifdef DEBUG
+extern int xfs_errortag_init(struct xfs_mount *mp);
+extern void xfs_errortag_del(struct xfs_mount *mp);
+extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression,
+ const char *file, int line, unsigned int error_tag);
+#define XFS_TEST_ERROR(expr, mp, tag) \
+ ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag)))
+
+extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag);
+extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag,
+ unsigned int tag_value);
+extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag);
+extern int xfs_errortag_clearall(struct xfs_mount *mp);
+#else
+#define xfs_errortag_init(mp) (0)
+#define xfs_errortag_del(mp)
+#define XFS_TEST_ERROR(expr, mp, tag) (expr)
+#define xfs_errortag_set(mp, tag, val) (ENOSYS)
+#define xfs_errortag_add(mp, tag) (ENOSYS)
+#define xfs_errortag_clearall(mp) (ENOSYS)
+#endif /* DEBUG */
+
+/*
+ * XFS panic tags -- allow a call to xfs_alert_tag() be turned into
+ * a panic by setting xfs_panic_mask in a sysctl.
+ */
+#define XFS_NO_PTAG 0u
+#define XFS_PTAG_IFLUSH (1u << 0)
+#define XFS_PTAG_LOGRES (1u << 1)
+#define XFS_PTAG_AILDELETE (1u << 2)
+#define XFS_PTAG_ERROR_REPORT (1u << 3)
+#define XFS_PTAG_SHUTDOWN_CORRUPT (1u << 4)
+#define XFS_PTAG_SHUTDOWN_IOERROR (1u << 5)
+#define XFS_PTAG_SHUTDOWN_LOGERROR (1u << 6)
+#define XFS_PTAG_FSBLOCK_ZERO (1u << 7)
+#define XFS_PTAG_VERIFIER_ERROR (1u << 8)
+
+#define XFS_PTAG_STRINGS \
+ { XFS_NO_PTAG, "none" }, \
+ { XFS_PTAG_IFLUSH, "iflush" }, \
+ { XFS_PTAG_LOGRES, "logres" }, \
+ { XFS_PTAG_AILDELETE, "aildelete" }, \
+ { XFS_PTAG_ERROR_REPORT, "error_report" }, \
+ { XFS_PTAG_SHUTDOWN_CORRUPT, "corrupt" }, \
+ { XFS_PTAG_SHUTDOWN_IOERROR, "ioerror" }, \
+ { XFS_PTAG_SHUTDOWN_LOGERROR, "logerror" }, \
+ { XFS_PTAG_FSBLOCK_ZERO, "fsb_zero" }, \
+ { XFS_PTAG_VERIFIER_ERROR, "verifier" }
+
+#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
new file mode 100644
index 000000000..1064c2342
--- /dev/null
+++ b/fs/xfs/xfs_export.c
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_dir2.h"
+#include "xfs_export.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_icache.h"
+#include "xfs_pnfs.h"
+
+/*
+ * Note that we only accept fileids which are long enough rather than allowing
+ * the parent generation number to default to zero. XFS considers zero a
+ * valid generation number, not an invalid/wildcard value.
+ */
+static int xfs_fileid_length(int fileid_type)
+{
+ switch (fileid_type) {
+ case FILEID_INO32_GEN:
+ return 2;
+ case FILEID_INO32_GEN_PARENT:
+ return 4;
+ case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
+ return 3;
+ case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+ return 6;
+ }
+ return FILEID_INVALID;
+}
+
+STATIC int
+xfs_fs_encode_fh(
+ struct inode *inode,
+ __u32 *fh,
+ int *max_len,
+ struct inode *parent)
+{
+ struct xfs_mount *mp = XFS_M(inode->i_sb);
+ struct fid *fid = (struct fid *)fh;
+ struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh;
+ int fileid_type;
+ int len;
+
+ /* Directories don't need their parent encoded, they have ".." */
+ if (!parent)
+ fileid_type = FILEID_INO32_GEN;
+ else
+ fileid_type = FILEID_INO32_GEN_PARENT;
+
+ /*
+ * If the filesystem may contain 64bit inode numbers, we need
+ * to use larger file handles that can represent them.
+ *
+ * While we only allocate inodes that do not fit into 32 bits any
+ * large enough filesystem may contain them, thus the slightly
+ * confusing looking conditional below.
+ */
+ if (!xfs_has_small_inums(mp) || xfs_is_inode32(mp))
+ fileid_type |= XFS_FILEID_TYPE_64FLAG;
+
+ /*
+ * Only encode if there is enough space given. In practice
+ * this means we can't export a filesystem with 64bit inodes
+ * over NFSv2 with the subtree_check export option; the other
+ * seven combinations work. The real answer is "don't use v2".
+ */
+ len = xfs_fileid_length(fileid_type);
+ if (*max_len < len) {
+ *max_len = len;
+ return FILEID_INVALID;
+ }
+ *max_len = len;
+
+ switch (fileid_type) {
+ case FILEID_INO32_GEN_PARENT:
+ fid->i32.parent_ino = XFS_I(parent)->i_ino;
+ fid->i32.parent_gen = parent->i_generation;
+ fallthrough;
+ case FILEID_INO32_GEN:
+ fid->i32.ino = XFS_I(inode)->i_ino;
+ fid->i32.gen = inode->i_generation;
+ break;
+ case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+ fid64->parent_ino = XFS_I(parent)->i_ino;
+ fid64->parent_gen = parent->i_generation;
+ fallthrough;
+ case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
+ fid64->ino = XFS_I(inode)->i_ino;
+ fid64->gen = inode->i_generation;
+ break;
+ }
+
+ return fileid_type;
+}
+
+STATIC struct inode *
+xfs_nfs_get_inode(
+ struct super_block *sb,
+ u64 ino,
+ u32 generation)
+{
+ xfs_mount_t *mp = XFS_M(sb);
+ xfs_inode_t *ip;
+ int error;
+
+ /*
+ * NFS can sometimes send requests for ino 0. Fail them gracefully.
+ */
+ if (ino == 0)
+ return ERR_PTR(-ESTALE);
+
+ /*
+ * XFS_IGET_UNTRUSTED means that an invalid inode number is just fine and
+ * not an indication of a corrupted filesystem, as clients can send invalid
+ * file handles and we have to handle them gracefully.
+ */
+ error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip);
+ if (error) {
+
+ /*
+ * EINVAL means the inode cluster doesn't exist anymore.
+ * EFSCORRUPTED means the metadata pointing to the inode cluster
+ * or the inode cluster itself is corrupt. This implies the
+ * filehandle is stale, so we should translate it here.
+ * We don't use ESTALE directly down the chain to not
+ * confuse applications using bulkstat that expect EINVAL.
+ */
+ switch (error) {
+ case -EINVAL:
+ case -ENOENT:
+ case -EFSCORRUPTED:
+ error = -ESTALE;
+ break;
+ default:
+ break;
+ }
+ return ERR_PTR(error);
+ }
+
+ if (VFS_I(ip)->i_generation != generation) {
+ xfs_irele(ip);
+ return ERR_PTR(-ESTALE);
+ }
+
+ return VFS_I(ip);
+}
+
+STATIC struct dentry *
+xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fileid_type)
+{
+ struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid;
+ struct inode *inode = NULL;
+
+ if (fh_len < xfs_fileid_length(fileid_type))
+ return NULL;
+
+ switch (fileid_type) {
+ case FILEID_INO32_GEN_PARENT:
+ case FILEID_INO32_GEN:
+ inode = xfs_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen);
+ break;
+ case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+ case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
+ inode = xfs_nfs_get_inode(sb, fid64->ino, fid64->gen);
+ break;
+ }
+
+ return d_obtain_alias(inode);
+}
+
+STATIC struct dentry *
+xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
+ int fh_len, int fileid_type)
+{
+ struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid;
+ struct inode *inode = NULL;
+
+ if (fh_len < xfs_fileid_length(fileid_type))
+ return NULL;
+
+ switch (fileid_type) {
+ case FILEID_INO32_GEN_PARENT:
+ inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino,
+ fid->i32.parent_gen);
+ break;
+ case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+ inode = xfs_nfs_get_inode(sb, fid64->parent_ino,
+ fid64->parent_gen);
+ break;
+ }
+
+ return d_obtain_alias(inode);
+}
+
+STATIC struct dentry *
+xfs_fs_get_parent(
+ struct dentry *child)
+{
+ int error;
+ struct xfs_inode *cip;
+
+ error = xfs_lookup(XFS_I(d_inode(child)), &xfs_name_dotdot, &cip, NULL);
+ if (unlikely(error))
+ return ERR_PTR(error);
+
+ return d_obtain_alias(VFS_I(cip));
+}
+
+STATIC int
+xfs_fs_nfs_commit_metadata(
+ struct inode *inode)
+{
+ return xfs_log_force_inode(XFS_I(inode));
+}
+
+const struct export_operations xfs_export_operations = {
+ .encode_fh = xfs_fs_encode_fh,
+ .fh_to_dentry = xfs_fs_fh_to_dentry,
+ .fh_to_parent = xfs_fs_fh_to_parent,
+ .get_parent = xfs_fs_get_parent,
+ .commit_metadata = xfs_fs_nfs_commit_metadata,
+#ifdef CONFIG_EXPORTFS_BLOCK_OPS
+ .get_uuid = xfs_fs_get_uuid,
+ .map_blocks = xfs_fs_map_blocks,
+ .commit_blocks = xfs_fs_commit_blocks,
+#endif
+};
diff --git a/fs/xfs/xfs_export.h b/fs/xfs/xfs_export.h
new file mode 100644
index 000000000..64471a3dd
--- /dev/null
+++ b/fs/xfs/xfs_export.h
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_EXPORT_H__
+#define __XFS_EXPORT_H__
+
+/*
+ * Common defines for code related to exporting XFS filesystems over NFS.
+ *
+ * The NFS fileid goes out on the wire as an array of
+ * 32bit unsigned ints in host order. There are 5 possible
+ * formats.
+ *
+ * (1) fileid_type=0x00
+ * (no fileid data; handled by the generic code)
+ *
+ * (2) fileid_type=0x01
+ * inode-num
+ * generation
+ *
+ * (3) fileid_type=0x02
+ * inode-num
+ * generation
+ * parent-inode-num
+ * parent-generation
+ *
+ * (4) fileid_type=0x81
+ * inode-num-lo32
+ * inode-num-hi32
+ * generation
+ *
+ * (5) fileid_type=0x82
+ * inode-num-lo32
+ * inode-num-hi32
+ * generation
+ * parent-inode-num-lo32
+ * parent-inode-num-hi32
+ * parent-generation
+ *
+ * Note, the NFS filehandle also includes an fsid portion which
+ * may have an inode number in it. That number is hardcoded to
+ * 32bits and there is no way for XFS to intercept it. In
+ * practice this means when exporting an XFS filesystem with 64bit
+ * inodes you should either export the mountpoint (rather than
+ * a subdirectory) or use the "fsid" export option.
+ */
+
+struct xfs_fid64 {
+ u64 ino;
+ u32 gen;
+ u64 parent_ino;
+ u32 parent_gen;
+} __attribute__((packed));
+
+/* This flag goes on the wire. Don't play with it. */
+#define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */
+
+#endif /* __XFS_EXPORT_H__ */
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
new file mode 100644
index 000000000..ad22a003f
--- /dev/null
+++ b/fs/xfs/xfs_extent_busy.c
@@ -0,0 +1,630 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2010 David Chinner.
+ * Copyright (c) 2011 Christoph Hellwig.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+
+void
+xfs_extent_busy_insert(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ unsigned int flags)
+{
+ struct xfs_extent_busy *new;
+ struct xfs_extent_busy *busyp;
+ struct rb_node **rbp;
+ struct rb_node *parent = NULL;
+
+ new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
+ new->agno = pag->pag_agno;
+ new->bno = bno;
+ new->length = len;
+ INIT_LIST_HEAD(&new->list);
+ new->flags = flags;
+
+ /* trace before insert to be able to see failed inserts */
+ trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len);
+
+ spin_lock(&pag->pagb_lock);
+ rbp = &pag->pagb_tree.rb_node;
+ while (*rbp) {
+ parent = *rbp;
+ busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);
+
+ if (new->bno < busyp->bno) {
+ rbp = &(*rbp)->rb_left;
+ ASSERT(new->bno + new->length <= busyp->bno);
+ } else if (new->bno > busyp->bno) {
+ rbp = &(*rbp)->rb_right;
+ ASSERT(bno >= busyp->bno + busyp->length);
+ } else {
+ ASSERT(0);
+ }
+ }
+
+ rb_link_node(&new->rb_node, parent, rbp);
+ rb_insert_color(&new->rb_node, &pag->pagb_tree);
+
+ list_add(&new->list, &tp->t_busy);
+ spin_unlock(&pag->pagb_lock);
+}
+
+/*
+ * Search for a busy extent within the range of the extent we are about to
+ * allocate. You need to be holding the busy extent tree lock when calling
+ * xfs_extent_busy_search(). This function returns 0 for no overlapping busy
+ * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
+ * match. This is done so that a non-zero return indicates an overlap that
+ * will require a synchronous transaction, but the return value can still be
+ * used to distinguish between a partial and an exact match.
+ */
+int
+xfs_extent_busy_search(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ xfs_agblock_t bno,
+ xfs_extlen_t len)
+{
+ struct rb_node *rbp;
+ struct xfs_extent_busy *busyp;
+ int match = 0;
+
+ /* find closest start bno overlap */
+ spin_lock(&pag->pagb_lock);
+ rbp = pag->pagb_tree.rb_node;
+ while (rbp) {
+ busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
+ if (bno < busyp->bno) {
+ /* may overlap, but exact start block is lower */
+ if (bno + len > busyp->bno)
+ match = -1;
+ rbp = rbp->rb_left;
+ } else if (bno > busyp->bno) {
+ /* may overlap, but exact start block is higher */
+ if (bno < busyp->bno + busyp->length)
+ match = -1;
+ rbp = rbp->rb_right;
+ } else {
+ /* bno matches busyp, length determines exact match */
+ match = (busyp->length == len) ? 1 : -1;
+ break;
+ }
+ }
+ spin_unlock(&pag->pagb_lock);
+ return match;
+}
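A small sketch of how the tri-state return value documented above might be consumed; the wrapper function is hypothetical.

/* Hypothetical illustration, not part of this patch. */
static bool example_extent_needs_sync_txn(struct xfs_mount *mp,
					  struct xfs_perag *pag,
					  xfs_agblock_t bno, xfs_extlen_t len)
{
	switch (xfs_extent_busy_search(mp, pag, bno, len)) {
	case 0:
		return false;	/* no overlap with any busy extent */
	case 1:			/* exact match with a busy extent */
	case -1:		/* partial overlap */
	default:
		return true;	/* caller must use a synchronous transaction */
	}
}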
+
+/*
+ * The found free extent [fbno, fend] overlaps part or all of the given busy
+ * extent. If the overlap covers the beginning, the end, or all of the busy
+ * extent, the overlapping portion can be made unbusy and used for the
+ * allocation. We can't split a busy extent because we can't modify a
+ * transaction/CIL context busy list, but we can update an entry's block
+ * number or length.
+ *
+ * Returns true if the extent can safely be reused, or false if the search
+ * needs to be restarted.
+ */
+STATIC bool
+xfs_extent_busy_update_extent(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ struct xfs_extent_busy *busyp,
+ xfs_agblock_t fbno,
+ xfs_extlen_t flen,
+ bool userdata) __releases(&pag->pagb_lock)
+ __acquires(&pag->pagb_lock)
+{
+ xfs_agblock_t fend = fbno + flen;
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ /*
+ * This extent is currently being discarded. Give the thread
+ * performing the discard a chance to mark the extent unbusy
+ * and retry.
+ */
+ if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
+ spin_unlock(&pag->pagb_lock);
+ delay(1);
+ spin_lock(&pag->pagb_lock);
+ return false;
+ }
+
+ /*
+ * If there is a busy extent overlapping a user allocation, we have
+ * no choice but to force the log and retry the search.
+ *
+ * Fortunately this does not happen during normal operation, but
+ * only if the filesystem is very low on space and has to dip into
+ * the AGFL for normal allocations.
+ */
+ if (userdata)
+ goto out_force_log;
+
+ if (bbno < fbno && bend > fend) {
+ /*
+ * Case 1:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +---------+
+ * fbno fend
+ */
+
+ /*
+ * We would have to split the busy extent to be able to track
+ * it correctly, which we cannot do because we would have to
+ * modify the list of busy extents attached to the transaction
+ * or CIL context, which is immutable.
+ *
+ * Force out the log to clear the busy extent and retry the
+ * search.
+ */
+ goto out_force_log;
+ } else if (bbno >= fbno && bend <= fend) {
+ /*
+ * Case 2:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------+
+ * fbno fend
+ *
+ * Case 3:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Case 4:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Case 5:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------------------------+
+ * fbno fend
+ *
+ */
+
+ /*
+ * The busy extent is fully covered by the extent we are
+ * allocating, and can simply be removed from the rbtree.
+ * However we cannot remove it from the immutable list
+ * tracking busy extents in the transaction or CIL context,
+ * so set the length to zero to mark it invalid.
+ *
+ * We also need to restart the busy extent search from the
+ * tree root, because erasing the node can rearrange the
+ * tree topology.
+ */
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ busyp->length = 0;
+ return false;
+ } else if (fend < bend) {
+ /*
+ * Case 6:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +---------+
+ * fbno fend
+ *
+ * Case 7:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +------------------+
+ * fbno fend
+ *
+ */
+ busyp->bno = fend;
+ } else if (bbno < fbno) {
+ /*
+ * Case 8:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-------------+
+ * fbno fend
+ *
+ * Case 9:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +----------------------+
+ * fbno fend
+ */
+ busyp->length = fbno - busyp->bno;
+ } else {
+ ASSERT(0);
+ }
+
+ trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
+ return true;
+
+out_force_log:
+ spin_unlock(&pag->pagb_lock);
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
+ spin_lock(&pag->pagb_lock);
+ return false;
+}
+
+
+/*
+ * For a given extent [fbno, flen], make sure we can reuse it safely.
+ */
+void
+xfs_extent_busy_reuse(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ xfs_agblock_t fbno,
+ xfs_extlen_t flen,
+ bool userdata)
+{
+ struct rb_node *rbp;
+
+ ASSERT(flen > 0);
+ spin_lock(&pag->pagb_lock);
+restart:
+ rbp = pag->pagb_tree.rb_node;
+ while (rbp) {
+ struct xfs_extent_busy *busyp =
+ rb_entry(rbp, struct xfs_extent_busy, rb_node);
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ if (fbno + flen <= bbno) {
+ rbp = rbp->rb_left;
+ continue;
+ } else if (fbno >= bend) {
+ rbp = rbp->rb_right;
+ continue;
+ }
+
+ if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
+ userdata))
+ goto restart;
+ }
+ spin_unlock(&pag->pagb_lock);
+}
+
+/*
+ * For a given extent [fbno, flen], search the busy extent list to find a
+ * subset of the extent that is not busy. If *len is smaller than
+ * args->minlen on return, no suitable extent could be found, and the higher
+ * level code needs to force out the log and retry the allocation.
+ *
+ * Return true and set *busy_gen to the current busy generation for the AG if
+ * the extent overlaps a busy extent. This value can be used to wait for at
+ * least one of the currently busy extents to be cleared. Note that the busy
+ * list is not guaranteed to be empty after the generation is woken. The
+ * state of a specific extent must always be confirmed with another call to
+ * xfs_extent_busy_trim() before it can be used.
+ */
+bool
+xfs_extent_busy_trim(
+ struct xfs_alloc_arg *args,
+ xfs_agblock_t *bno,
+ xfs_extlen_t *len,
+ unsigned *busy_gen)
+{
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ struct rb_node *rbp;
+ bool ret = false;
+
+ ASSERT(*len > 0);
+
+ spin_lock(&args->pag->pagb_lock);
+ fbno = *bno;
+ flen = *len;
+ rbp = args->pag->pagb_tree.rb_node;
+ while (rbp && flen >= args->minlen) {
+ struct xfs_extent_busy *busyp =
+ rb_entry(rbp, struct xfs_extent_busy, rb_node);
+ xfs_agblock_t fend = fbno + flen;
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ if (fend <= bbno) {
+ rbp = rbp->rb_left;
+ continue;
+ } else if (fbno >= bend) {
+ rbp = rbp->rb_right;
+ continue;
+ }
+
+ if (bbno <= fbno) {
+ /* start overlap */
+
+ /*
+ * Case 1:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +---------+
+ * fbno fend
+ *
+ * Case 2:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-------------+
+ * fbno fend
+ *
+ * Case 3:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-------------+
+ * fbno fend
+ *
+ * Case 4:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------+
+ * fbno fend
+ *
+ * No unbusy region in extent, return failure.
+ */
+ if (fend <= bend)
+ goto fail;
+
+ /*
+ * Case 5:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +----------------------+
+ * fbno fend
+ *
+ * Case 6:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Needs to be trimmed to:
+ * +-------+
+ * fbno fend
+ */
+ fbno = bend;
+ } else if (bend >= fend) {
+ /* end overlap */
+
+ /*
+ * Case 7:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +------------------+
+ * fbno fend
+ *
+ * Case 8:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Needs to be trimmed to:
+ * +-------+
+ * fbno fend
+ */
+ fend = bbno;
+ } else {
+ /* middle overlap */
+
+ /*
+ * Case 9:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------------------------+
+ * fbno fend
+ *
+ * Can be trimmed to:
+ * +-------+ OR +-------+
+ * fbno fend fbno fend
+ *
+ * Backward allocation leads to significant
+ * fragmentation of directories, which degrades
+ * directory performance, therefore we always want to
+ * choose the option that produces forward allocation
+ * patterns.
+ * Preferring the lower bno extent will make the next
+ * request use "fend" as the start of the next
+ * allocation; if the segment is no longer busy at
+ * that point, we'll get a contiguous allocation, but
+ * even if it is still busy, we will get a forward
+ * allocation.
+ * We try to avoid choosing the segment at "bend",
+ * because that can lead to the next allocation
+ * taking the segment at "fbno", which would be a
+ * backward allocation. We only use the segment at
+ * "fbno" if it is much larger than the current
+ * requested size, because in that case there's a
+ * good chance subsequent allocations will be
+ * contiguous.
+ */
+ if (bbno - fbno >= args->maxlen) {
+ /* left candidate fits perfect */
+ fend = bbno;
+ } else if (fend - bend >= args->maxlen * 4) {
+ /* right candidate has enough free space */
+ fbno = bend;
+ } else if (bbno - fbno >= args->minlen) {
+ /* left candidate fits minimum requirement */
+ fend = bbno;
+ } else {
+ goto fail;
+ }
+ }
+
+ flen = fend - fbno;
+ }
+out:
+
+ if (fbno != *bno || flen != *len) {
+ trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
+ fbno, flen);
+ *bno = fbno;
+ *len = flen;
+ *busy_gen = args->pag->pagb_gen;
+ ret = true;
+ }
+ spin_unlock(&args->pag->pagb_lock);
+ return ret;
+fail:
+ /*
+ * Return a zero extent length as the failure indication. All callers
+ * re-check whether the trimmed extent satisfies the minlen requirement.
+ */
+ flen = 0;
+ goto out;
+}
+
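+/*
+ * Remove a single busy extent from the per-AG tree (if it is still tracked
+ * there), take it off the transaction's busy list and free it.
+ */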
+STATIC void
+xfs_extent_busy_clear_one(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ struct xfs_extent_busy *busyp)
+{
+ if (busyp->length) {
+ trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
+ busyp->length);
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ }
+
+ list_del_init(&busyp->list);
+ kmem_free(busyp);
+}
+
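+/*
+ * If any busy extents were removed, bump the busy generation and wake anybody
+ * waiting on pagb_wait, then drop the pagb_lock and the perag reference.
+ */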
+static void
+xfs_extent_busy_put_pag(
+ struct xfs_perag *pag,
+ bool wakeup)
+ __releases(pag->pagb_lock)
+{
+ if (wakeup) {
+ pag->pagb_gen++;
+ wake_up_all(&pag->pagb_wait);
+ }
+
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+}
+
+/*
+ * Remove all extents on the passed-in list from the busy extents tree.
+ * If do_discard is set, skip extents that need to be discarded and mark
+ * these as undergoing a discard operation instead.
+ */
+void
+xfs_extent_busy_clear(
+ struct xfs_mount *mp,
+ struct list_head *list,
+ bool do_discard)
+{
+ struct xfs_extent_busy *busyp, *n;
+ struct xfs_perag *pag = NULL;
+ xfs_agnumber_t agno = NULLAGNUMBER;
+ bool wakeup = false;
+
+ list_for_each_entry_safe(busyp, n, list, list) {
+ if (busyp->agno != agno) {
+ if (pag)
+ xfs_extent_busy_put_pag(pag, wakeup);
+ agno = busyp->agno;
+ pag = xfs_perag_get(mp, agno);
+ spin_lock(&pag->pagb_lock);
+ wakeup = false;
+ }
+
+ if (do_discard && busyp->length &&
+ !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
+ busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
+ } else {
+ xfs_extent_busy_clear_one(mp, pag, busyp);
+ wakeup = true;
+ }
+ }
+
+ if (pag)
+ xfs_extent_busy_put_pag(pag, wakeup);
+}
+
+/*
+ * Flush out all busy extents for this AG.
+ */
+void
+xfs_extent_busy_flush(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ unsigned busy_gen)
+{
+ DEFINE_WAIT (wait);
+ int error;
+
+ error = xfs_log_force(mp, XFS_LOG_SYNC);
+ if (error)
+ return;
+
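+ /*
+ * Wait for the busy generation to change, which indicates that at least
+ * one busy extent has been cleared since the caller sampled busy_gen.
+ */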
+ do {
+ prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
+ if (busy_gen != READ_ONCE(pag->pagb_gen))
+ break;
+ schedule();
+ } while (1);
+
+ finish_wait(&pag->pagb_wait, &wait);
+}
+
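+/*
+ * Wait until the busy extent tree of every AG in the filesystem is empty.
+ */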
+void
+xfs_extent_busy_wait_all(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ DEFINE_WAIT (wait);
+ xfs_agnumber_t agno;
+
+ for_each_perag(mp, agno, pag) {
+ do {
+ prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
+ if (RB_EMPTY_ROOT(&pag->pagb_tree))
+ break;
+ schedule();
+ } while (1);
+ finish_wait(&pag->pagb_wait, &wait);
+ }
+}
+
+/*
+ * Callback for list_sort to sort busy extents by the AG they reside in,
+ * and by block number within an AG.
+ */
+int
+xfs_extent_busy_ag_cmp(
+ void *priv,
+ const struct list_head *l1,
+ const struct list_head *l2)
+{
+ struct xfs_extent_busy *b1 =
+ container_of(l1, struct xfs_extent_busy, list);
+ struct xfs_extent_busy *b2 =
+ container_of(l2, struct xfs_extent_busy, list);
+ s32 diff;
+
+ diff = b1->agno - b2->agno;
+ if (!diff)
+ diff = b1->bno - b2->bno;
+ return diff;
+}
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
new file mode 100644
index 000000000..4a1181310
--- /dev/null
+++ b/fs/xfs/xfs_extent_busy.h
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2010 David Chinner.
+ * Copyright (c) 2011 Christoph Hellwig.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_EXTENT_BUSY_H__
+#define __XFS_EXTENT_BUSY_H__
+
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_trans;
+struct xfs_alloc_arg;
+
+/*
+ * Busy block/extent entry. Indexed by an rbtree in perag to mark blocks that
+ * have been freed but whose transactions aren't committed to disk yet.
+ *
+ * Note that we use the transaction ID to record the transaction, not the
+ * transaction structure itself. See xfs_extent_busy_insert() for details.
+ */
+struct xfs_extent_busy {
+ struct rb_node rb_node; /* ag by-bno indexed search tree */
+ struct list_head list; /* transaction busy extent list */
+ xfs_agnumber_t agno;
+ xfs_agblock_t bno;
+ xfs_extlen_t length;
+ unsigned int flags;
+#define XFS_EXTENT_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
+#define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */
+};
+
+void
+xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag,
+ xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
+
+void
+xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list,
+ bool do_discard);
+
+int
+xfs_extent_busy_search(struct xfs_mount *mp, struct xfs_perag *pag,
+ xfs_agblock_t bno, xfs_extlen_t len);
+
+void
+xfs_extent_busy_reuse(struct xfs_mount *mp, struct xfs_perag *pag,
+ xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
+
+bool
+xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno,
+ xfs_extlen_t *len, unsigned *busy_gen);
+
+void
+xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag,
+ unsigned busy_gen);
+
+void
+xfs_extent_busy_wait_all(struct xfs_mount *mp);
+
+int
+xfs_extent_busy_ag_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b);
+
+static inline void xfs_extent_busy_sort(struct list_head *list)
+{
+ list_sort(NULL, list, xfs_extent_busy_ag_cmp);
+}
+
+#endif /* __XFS_EXTENT_BUSY_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
new file mode 100644
index 000000000..d5130d1fc
--- /dev/null
+++ b/fs/xfs/xfs_extfree_item.c
@@ -0,0 +1,782 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_shared.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_extfree_item.h"
+#include "xfs_log.h"
+#include "xfs_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+
+struct kmem_cache *xfs_efi_cache;
+struct kmem_cache *xfs_efd_cache;
+
+static const struct xfs_item_ops xfs_efi_item_ops;
+
+static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_efi_log_item, efi_item);
+}
+
+STATIC void
+xfs_efi_item_free(
+ struct xfs_efi_log_item *efip)
+{
+ kmem_free(efip->efi_item.li_lv_shadow);
+ if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
+ kmem_free(efip);
+ else
+ kmem_cache_free(xfs_efi_cache, efip);
+}
+
+/*
+ * Freeing the efi requires that we remove it from the AIL if it has already
+ * been placed there. However, the EFI may not yet have been placed in the AIL
+ * when called by xfs_efi_release() from EFD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the EFI.
+ */
+STATIC void
+xfs_efi_release(
+ struct xfs_efi_log_item *efip)
+{
+ ASSERT(atomic_read(&efip->efi_refcount) > 0);
+ if (!atomic_dec_and_test(&efip->efi_refcount))
+ return;
+
+ xfs_trans_ail_delete(&efip->efi_item, 0);
+ xfs_efi_item_free(efip);
+}
+
+STATIC void
+xfs_efi_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+
+ *nvecs += 1;
+ *nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given efi log item. We use only 1 iovec, and we point that
+ * at the efi_log_format structure embedded in the efi item.
+ * It is at this point that we assert that all of the extent
+ * slots in the efi item have been filled.
+ */
+STATIC void
+xfs_efi_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ ASSERT(atomic_read(&efip->efi_next_extent) ==
+ efip->efi_format.efi_nextents);
+
+ efip->efi_format.efi_type = XFS_LI_EFI;
+ efip->efi_format.efi_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT,
+ &efip->efi_format,
+ xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents));
+}
+
+
+/*
+ * The unpin operation is the last place an EFI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the EFI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the EFI to either construct
+ * and commit the EFD or drop the EFD's reference in the event of error. Simply
+ * drop the log's EFI reference now that the log is done with it.
+ */
+STATIC void
+xfs_efi_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+ xfs_efi_release(efip);
+}
+
+/*
+ * The EFI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, an EFD isn't going to be
+ * constructed and thus we free the EFI here directly.
+ */
+STATIC void
+xfs_efi_item_release(
+ struct xfs_log_item *lip)
+{
+ xfs_efi_release(EFI_ITEM(lip));
+}
+
+/*
+ * Allocate and initialize an efi item with the given number of extents.
+ */
+STATIC struct xfs_efi_log_item *
+xfs_efi_init(
+ struct xfs_mount *mp,
+ uint nextents)
+
+{
+ struct xfs_efi_log_item *efip;
+
+ ASSERT(nextents > 0);
+ if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
+ efip = kzalloc(xfs_efi_log_item_sizeof(nextents),
+ GFP_KERNEL | __GFP_NOFAIL);
+ } else {
+ efip = kmem_cache_zalloc(xfs_efi_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ }
+
+ xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
+ efip->efi_format.efi_nextents = nextents;
+ efip->efi_format.efi_id = (uintptr_t)(void *)efip;
+ atomic_set(&efip->efi_next_extent, 0);
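+ /* An EFI is born with two references; see xfs_extfree_item.h for details. */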
+ atomic_set(&efip->efi_refcount, 2);
+
+ return efip;
+}
+
+/*
+ * Copy an EFI format buffer from the given buf into the destination
+ * EFI format structure.
+ * The given buffer can be in 32 bit or 64 bit form (which have different
+ * padding), one of which will be the native format for this kernel.
+ * This function handles the conversion between formats if necessary.
+ */
+STATIC int
+xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
+{
+ xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
+ uint i;
+ uint len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents);
+ uint len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents);
+ uint len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents);
+
+ if (buf->i_len == len) {
+ memcpy(dst_efi_fmt, src_efi_fmt,
+ offsetof(struct xfs_efi_log_format, efi_extents));
+ for (i = 0; i < src_efi_fmt->efi_nextents; i++)
+ memcpy(&dst_efi_fmt->efi_extents[i],
+ &src_efi_fmt->efi_extents[i],
+ sizeof(struct xfs_extent));
+ return 0;
+ } else if (buf->i_len == len32) {
+ xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
+
+ dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type;
+ dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size;
+ dst_efi_fmt->efi_nextents = src_efi_fmt_32->efi_nextents;
+ dst_efi_fmt->efi_id = src_efi_fmt_32->efi_id;
+ for (i = 0; i < dst_efi_fmt->efi_nextents; i++) {
+ dst_efi_fmt->efi_extents[i].ext_start =
+ src_efi_fmt_32->efi_extents[i].ext_start;
+ dst_efi_fmt->efi_extents[i].ext_len =
+ src_efi_fmt_32->efi_extents[i].ext_len;
+ }
+ return 0;
+ } else if (buf->i_len == len64) {
+ xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr;
+
+ dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type;
+ dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size;
+ dst_efi_fmt->efi_nextents = src_efi_fmt_64->efi_nextents;
+ dst_efi_fmt->efi_id = src_efi_fmt_64->efi_id;
+ for (i = 0; i < dst_efi_fmt->efi_nextents; i++) {
+ dst_efi_fmt->efi_extents[i].ext_start =
+ src_efi_fmt_64->efi_extents[i].ext_start;
+ dst_efi_fmt->efi_extents[i].ext_len =
+ src_efi_fmt_64->efi_extents[i].ext_len;
+ }
+ return 0;
+ }
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->i_addr,
+ buf->i_len);
+ return -EFSCORRUPTED;
+}
+
+static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_efd_log_item, efd_item);
+}
+
+STATIC void
+xfs_efd_item_free(struct xfs_efd_log_item *efdp)
+{
+ kmem_free(efdp->efd_item.li_lv_shadow);
+ if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
+ kmem_free(efdp);
+ else
+ kmem_cache_free(xfs_efd_cache, efdp);
+}
+
+STATIC void
+xfs_efd_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+
+ *nvecs += 1;
+ *nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given efd log item. We use only 1 iovec, and we point that
+ * at the efd_log_format structure embedded in the efd item.
+ * It is at this point that we assert that all of the extent
+ * slots in the efd item have been filled.
+ */
+STATIC void
+xfs_efd_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
+
+ efdp->efd_format.efd_type = XFS_LI_EFD;
+ efdp->efd_format.efd_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT,
+ &efdp->efd_format,
+ xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents));
+}
+
+/*
+ * The EFD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the EFI and free the EFD.
+ */
+STATIC void
+xfs_efd_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+
+ xfs_efi_release(efdp->efd_efip);
+ xfs_efd_item_free(efdp);
+}
+
+static struct xfs_log_item *
+xfs_efd_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &EFD_ITEM(lip)->efd_efip->efi_item;
+}
+
+static const struct xfs_item_ops xfs_efd_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
+ .iop_size = xfs_efd_item_size,
+ .iop_format = xfs_efd_item_format,
+ .iop_release = xfs_efd_item_release,
+ .iop_intent = xfs_efd_item_intent,
+};
+
+/*
+ * Allocate an "extent free done" log item that will hold nextents worth of
+ * extents. The caller must use all nextents extents, because we are not
+ * flexible about this at all.
+ */
+static struct xfs_efd_log_item *
+xfs_trans_get_efd(
+ struct xfs_trans *tp,
+ struct xfs_efi_log_item *efip,
+ unsigned int nextents)
+{
+ struct xfs_efd_log_item *efdp;
+
+ ASSERT(nextents > 0);
+
+ if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
+ efdp = kzalloc(xfs_efd_log_item_sizeof(nextents),
+ GFP_KERNEL | __GFP_NOFAIL);
+ } else {
+ efdp = kmem_cache_zalloc(xfs_efd_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ }
+
+ xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
+ &xfs_efd_item_ops);
+ efdp->efd_efip = efip;
+ efdp->efd_format.efd_nextents = nextents;
+ efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
+
+ xfs_trans_add_item(tp, &efdp->efd_item);
+ return efdp;
+}
+
+/*
+ * Free an extent and log it to the EFD. Note that the transaction is marked
+ * dirty regardless of whether the extent free succeeds or fails to support the
+ * EFI/EFD lifecycle rules.
+ */
+static int
+xfs_trans_free_extent(
+ struct xfs_trans *tp,
+ struct xfs_efd_log_item *efdp,
+ xfs_fsblock_t start_block,
+ xfs_extlen_t ext_len,
+ const struct xfs_owner_info *oinfo,
+ bool skip_discard)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_extent *extp;
+ uint next_extent;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
+ start_block);
+ int error;
+
+ trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
+
+ error = __xfs_free_extent(tp, start_block, ext_len,
+ oinfo, XFS_AG_RESV_NONE, skip_discard);
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the EFI and frees the EFD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+ next_extent = efdp->efd_next_extent;
+ ASSERT(next_extent < efdp->efd_format.efd_nextents);
+ extp = &(efdp->efd_format.efd_extents[next_extent]);
+ extp->ext_start = start_block;
+ extp->ext_len = ext_len;
+ efdp->efd_next_extent++;
+
+ return error;
+}
+
+/* Sort extent free items by AG. */
+static int
+xfs_extent_free_diff_items(
+ void *priv,
+ const struct list_head *a,
+ const struct list_head *b)
+{
+ struct xfs_mount *mp = priv;
+ struct xfs_extent_free_item *ra;
+ struct xfs_extent_free_item *rb;
+
+ ra = container_of(a, struct xfs_extent_free_item, xefi_list);
+ rb = container_of(b, struct xfs_extent_free_item, xefi_list);
+ return XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) -
+ XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
+}
+
+/* Log a free extent to the intent item. */
+STATIC void
+xfs_extent_free_log_item(
+ struct xfs_trans *tp,
+ struct xfs_efi_log_item *efip,
+ struct xfs_extent_free_item *free)
+{
+ uint next_extent;
+ struct xfs_extent *extp;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
+
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
+ ASSERT(next_extent < efip->efi_format.efi_nextents);
+ extp = &efip->efi_format.efi_extents[next_extent];
+ extp->ext_start = free->xefi_startblock;
+ extp->ext_len = free->xefi_blockcount;
+}
+
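+/*
+ * Create an EFI intent item for the deferred extent free work items, sorting
+ * them by AG first if requested, and log each extent into the intent.
+ */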
+static struct xfs_log_item *
+xfs_extent_free_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_efi_log_item *efip = xfs_efi_init(mp, count);
+ struct xfs_extent_free_item *free;
+
+ ASSERT(count > 0);
+
+ xfs_trans_add_item(tp, &efip->efi_item);
+ if (sort)
+ list_sort(mp, items, xfs_extent_free_diff_items);
+ list_for_each_entry(free, items, xefi_list)
+ xfs_extent_free_log_item(tp, efip, free);
+ return &efip->efi_item;
+}
+
+/* Get an EFD so we can process all the free extents. */
+static struct xfs_log_item *
+xfs_extent_free_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item;
+}
+
+/* Process a free extent. */
+STATIC int
+xfs_extent_free_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_owner_info oinfo = { };
+ struct xfs_extent_free_item *free;
+ int error;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ oinfo.oi_owner = free->xefi_owner;
+ if (free->xefi_flags & XFS_EFI_ATTR_FORK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (free->xefi_flags & XFS_EFI_BMBT_BLOCK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+ error = xfs_trans_free_extent(tp, EFD_ITEM(done),
+ free->xefi_startblock,
+ free->xefi_blockcount,
+ &oinfo, free->xefi_flags & XFS_EFI_SKIP_DISCARD);
+ kmem_cache_free(xfs_extfree_item_cache, free);
+ return error;
+}
+
+/* Abort all pending EFIs. */
+STATIC void
+xfs_extent_free_abort_intent(
+ struct xfs_log_item *intent)
+{
+ xfs_efi_release(EFI_ITEM(intent));
+}
+
+/* Cancel a free extent. */
+STATIC void
+xfs_extent_free_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_extent_free_item *free;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ kmem_cache_free(xfs_extfree_item_cache, free);
+}
+
+const struct xfs_defer_op_type xfs_extent_free_defer_type = {
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_extent_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+};
+
+/*
+ * AGFL blocks are accounted differently in the reserve pools and are not
+ * inserted into the busy extent list.
+ */
+STATIC int
+xfs_agfl_free_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_owner_info oinfo = { };
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_efd_log_item *efdp = EFD_ITEM(done);
+ struct xfs_extent_free_item *free;
+ struct xfs_extent *extp;
+ struct xfs_buf *agbp;
+ int error;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ uint next_extent;
+ struct xfs_perag *pag;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ ASSERT(free->xefi_blockcount == 1);
+ agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
+ agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
+ oinfo.oi_owner = free->xefi_owner;
+
+ trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
+
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+ if (!error)
+ error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo);
+ xfs_perag_put(pag);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the EFI and frees the EFD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+ next_extent = efdp->efd_next_extent;
+ ASSERT(next_extent < efdp->efd_format.efd_nextents);
+ extp = &(efdp->efd_format.efd_extents[next_extent]);
+ extp->ext_start = free->xefi_startblock;
+ extp->ext_len = free->xefi_blockcount;
+ efdp->efd_next_extent++;
+
+ kmem_cache_free(xfs_extfree_item_cache, free);
+ return error;
+}
+
+/* sub-type with special handling for AGFL deferred frees */
+const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_agfl_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+};
+
+/* Is this recovered EFI ok? */
+static inline bool
+xfs_efi_validate_ext(
+ struct xfs_mount *mp,
+ struct xfs_extent *extp)
+{
+ return xfs_verify_fsbext(mp, extp->ext_start, extp->ext_len);
+}
+
+/*
+ * Process an extent free intent item that was recovered from
+ * the log. We need to free the extents that it describes.
+ */
+STATIC int
+xfs_efi_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
+{
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ struct xfs_efd_log_item *efdp;
+ struct xfs_trans *tp;
+ struct xfs_extent *extp;
+ int i;
+ int error = 0;
+
+ /*
+ * First check the validity of the extents described by the
+ * EFI. If any are bad, then assume that all are bad and
+ * just toss the EFI.
+ */
+ for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+ if (!xfs_efi_validate_ext(mp,
+ &efip->efi_format.efi_extents[i])) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &efip->efi_format,
+ sizeof(efip->efi_format));
+ return -EFSCORRUPTED;
+ }
+ }
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ if (error)
+ return error;
+ efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
+
+ for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+ extp = &efip->efi_format.efi_extents[i];
+ error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
+ extp->ext_len,
+ &XFS_RMAP_OINFO_ANY_OWNER, false);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ extp, sizeof(*extp));
+ if (error)
+ goto abort_error;
+
+ }
+
+ return xfs_defer_ops_capture_and_commit(tp, capture_list);
+
+abort_error:
+ xfs_trans_cancel(tp);
+ return error;
+}
+
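+/* Match an EFI log item against a given intent id. */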
+STATIC bool
+xfs_efi_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return EFI_ITEM(lip)->efi_format.efi_id == intent_id;
+}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_efi_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_efd_log_item *efdp;
+ struct xfs_efi_log_item *efip;
+ struct xfs_extent *extp;
+ unsigned int count;
+
+ count = EFI_ITEM(intent)->efi_format.efi_nextents;
+ extp = EFI_ITEM(intent)->efi_format.efi_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count);
+ efdp->efd_next_extent = count;
+ memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp));
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+ efip = xfs_efi_init(tp->t_mountp, count);
+ memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp));
+ atomic_set(&efip->efi_next_extent, count);
+ xfs_trans_add_item(tp, &efip->efi_item);
+ set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
+ return &efip->efi_item;
+}
+
+static const struct xfs_item_ops xfs_efi_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_efi_item_size,
+ .iop_format = xfs_efi_item_format,
+ .iop_unpin = xfs_efi_item_unpin,
+ .iop_release = xfs_efi_item_release,
+ .iop_recover = xfs_efi_item_recover,
+ .iop_match = xfs_efi_item_match,
+ .iop_relog = xfs_efi_item_relog,
+};
+
+/*
+ * This routine is called to create an in-core extent free intent
+ * item from the efi format structure which was logged on disk.
+ * It allocates an in-core efi, copies the extents from the format
+ * structure into it, and adds the efi to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_efi_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_efi_log_item *efip;
+ struct xfs_efi_log_format *efi_formatp;
+ int error;
+
+ efi_formatp = item->ri_buf[0].i_addr;
+
+ if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
+ error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
+ if (error) {
+ xfs_efi_item_free(efip);
+ return error;
+ }
+ atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
+ /*
+ * Insert the intent into the AIL directly and drop one reference so
+ * that finishing or canceling the work will drop the other.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &efip->efi_item, lsn);
+ xfs_efi_release(efip);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_efi_item_ops = {
+ .item_type = XFS_LI_EFI,
+ .commit_pass2 = xlog_recover_efi_commit_pass2,
+};
+
+/*
+ * This routine is called when an EFD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding EFI if it
+ * was still in the log. To do this it searches the AIL for the EFI with an id
+ * equal to that in the EFD format structure. If we find it we drop the EFD
+ * reference, which removes the EFI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_efd_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_efd_log_format *efd_formatp;
+ int buflen = item->ri_buf[0].i_len;
+
+ efd_formatp = item->ri_buf[0].i_addr;
+
+ if (buflen < sizeof(struct xfs_efd_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ efd_formatp, buflen);
+ return -EFSCORRUPTED;
+ }
+
+ if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof(
+ efd_formatp->efd_nextents) &&
+ item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof(
+ efd_formatp->efd_nextents)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ efd_formatp, buflen);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_EFI, efd_formatp->efd_efi_id);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_efd_item_ops = {
+ .item_type = XFS_LI_EFD,
+ .commit_pass2 = xlog_recover_efd_commit_pass2,
+};
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
new file mode 100644
index 000000000..da6a5afa6
--- /dev/null
+++ b/fs/xfs/xfs_extfree_item.h
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_EXTFREE_ITEM_H__
+#define __XFS_EXTFREE_ITEM_H__
+
+/* kernel only EFI/EFD definitions */
+
+struct xfs_mount;
+struct kmem_cache;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define XFS_EFI_MAX_FAST_EXTENTS 16
+
+/*
+ * This is the "extent free intention" log item. It is used to log the fact
+ * that some extents need to be freed. It is used in conjunction with the
+ * "extent free done" log item described below.
+ *
+ * The EFI is reference counted so that it is not freed prior to both the EFI
+ * and EFD being committed and unpinned. This ensures the EFI is inserted into
+ * the AIL even in the event of out of order EFI/EFD processing. In other words,
+ * an EFI is born with two references:
+ *
+ * 1.) an EFI held reference to track EFI AIL insertion
+ * 2.) an EFD held reference to track EFD commit
+ *
+ * On allocation, both references are the responsibility of the caller. Once the
+ * EFI is added to and dirtied in a transaction, ownership of reference one
+ * transfers to the transaction. The reference is dropped once the EFI is
+ * inserted to the AIL or in the event of failure along the way (e.g., commit
+ * failure, log I/O error, etc.). Note that the caller remains responsible for
+ * the EFD reference under all circumstances to this point. The caller has no
+ * means to detect failure once the transaction is committed, however.
+ * Therefore, an EFD is required after this point, even in the event of
+ * unrelated failure.
+ *
+ * Once an EFD is allocated and dirtied in a transaction, reference two
+ * transfers to the transaction. The EFD reference is dropped once it reaches
+ * the unpin handler. Similar to the EFI, the reference also drops in the event
+ * of commit failure or log I/O errors. Note that the EFD is not inserted in the
+ * AIL, so at this point both the EFI and EFD are freed.
+ */
+struct xfs_efi_log_item {
+ struct xfs_log_item efi_item;
+ atomic_t efi_refcount;
+ atomic_t efi_next_extent;
+ xfs_efi_log_format_t efi_format;
+};
+
+static inline size_t
+xfs_efi_log_item_sizeof(
+ unsigned int nr)
+{
+ return offsetof(struct xfs_efi_log_item, efi_format) +
+ xfs_efi_log_format_sizeof(nr);
+}
+
+/*
+ * This is the "extent free done" log item. It is used to log
+ * the fact that some extents earlier mentioned in an efi item
+ * have been freed.
+ */
+struct xfs_efd_log_item {
+ struct xfs_log_item efd_item;
+ struct xfs_efi_log_item *efd_efip;
+ uint efd_next_extent;
+ xfs_efd_log_format_t efd_format;
+};
+
+static inline size_t
+xfs_efd_log_item_sizeof(
+ unsigned int nr)
+{
+ return offsetof(struct xfs_efd_log_item, efd_format) +
+ xfs_efd_log_format_sizeof(nr);
+}
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define XFS_EFD_MAX_FAST_EXTENTS 16
+
+extern struct kmem_cache *xfs_efi_cache;
+extern struct kmem_cache *xfs_efd_cache;
+
+#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
new file mode 100644
index 000000000..e462d39c8
--- /dev/null
+++ b/fs/xfs/xfs_file.c
@@ -0,0 +1,1468 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ioctl.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_icache.h"
+#include "xfs_pnfs.h"
+#include "xfs_iomap.h"
+#include "xfs_reflink.h"
+
+#include <linux/dax.h>
+#include <linux/falloc.h>
+#include <linux/backing-dev.h>
+#include <linux/mman.h>
+#include <linux/fadvise.h>
+#include <linux/mount.h>
+
+static const struct vm_operations_struct xfs_file_vm_ops;
+
+/*
+ * Decide if the given file range is aligned to the size of the fundamental
+ * allocation unit for the file.
+ */
+static bool
+xfs_is_falloc_aligned(
+ struct xfs_inode *ip,
+ loff_t pos,
+ long long int len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ uint64_t mask;
+
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
+ u64 rextbytes;
+ u32 mod;
+
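+ /*
+ * A non-power-of-two rt extent size cannot be checked with a
+ * simple bitmask, so test the remainders explicitly.
+ */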
+ rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+ div_u64_rem(pos, rextbytes, &mod);
+ if (mod)
+ return false;
+ div_u64_rem(len, rextbytes, &mod);
+ return mod == 0;
+ }
+ mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
+ } else {
+ mask = mp->m_sb.sb_blocksize - 1;
+ }
+
+ return !((pos | len) & mask);
+}
+
+/*
+ * Fsync operations on directories are much simpler than on regular files,
+ * as there is no file data to flush and thus no need for explicit cache
+ * flush operations; there are also no non-transaction metadata updates
+ * on directories.
+ */
+STATIC int
+xfs_dir_fsync(
+ struct file *file,
+ loff_t start,
+ loff_t end,
+ int datasync)
+{
+ struct xfs_inode *ip = XFS_I(file->f_mapping->host);
+
+ trace_xfs_dir_fsync(ip);
+ return xfs_log_force_inode(ip);
+}
+
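+/*
+ * Return the commit sequence the log must be forced to in order to cover the
+ * inode's dirty metadata, or 0 if no log force is needed (the inode is not
+ * pinned, or fdatasync finds only timestamp updates).
+ */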
+static xfs_csn_t
+xfs_fsync_seq(
+ struct xfs_inode *ip,
+ bool datasync)
+{
+ if (!xfs_ipincount(ip))
+ return 0;
+ if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ return 0;
+ return ip->i_itemp->ili_commit_seq;
+}
+
+/*
+ * All metadata updates are logged, which means that we just have to flush the
+ * log up to the latest LSN that touched the inode.
+ *
+ * If we have concurrent fsync/fdatasync() calls, we need them to all block on
+ * the log force before we clear the ili_fsync_fields field. This ensures that
+ * we don't get a racing sync operation that does not wait for the metadata to
+ * hit the journal before returning. If we race with clearing ili_fsync_fields,
+ * then all that will happen is the log force will do nothing as the lsn will
+ * already be on disk. We can't race with setting ili_fsync_fields because that
+ * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
+ * shared until after the ili_fsync_fields is cleared.
+ */
+static int
+xfs_fsync_flush_log(
+ struct xfs_inode *ip,
+ bool datasync,
+ int *log_flushed)
+{
+ int error = 0;
+ xfs_csn_t seq;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ seq = xfs_fsync_seq(ip, datasync);
+ if (seq) {
+ error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
+ log_flushed);
+
+ spin_lock(&ip->i_itemp->ili_lock);
+ ip->i_itemp->ili_fsync_fields = 0;
+ spin_unlock(&ip->i_itemp->ili_lock);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return error;
+}
+
+STATIC int
+xfs_file_fsync(
+ struct file *file,
+ loff_t start,
+ loff_t end,
+ int datasync)
+{
+ struct xfs_inode *ip = XFS_I(file->f_mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
+ int error, err2;
+ int log_flushed = 0;
+
+ trace_xfs_file_fsync(ip);
+
+ error = file_write_and_wait_range(file, start, end);
+ if (error)
+ return error;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+
+ /*
+ * If we have an RT and/or log subvolume we need to make sure to flush
+ * the write cache of the device used for file data first. This is to
+ * ensure newly written file data makes it to disk before logging the new
+ * inode size in case of an extending write.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
+ else if (mp->m_logdev_targp != mp->m_ddev_targp)
+ error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
+
+ /*
+ * Any inode that has dirty modifications in the log is pinned. The
+ * racy check here for a pinned inode will not catch modifications
+ * that happen concurrently to the fsync call, but fsync semantics
+ * only require to sync previously completed I/O.
+ */
+ if (xfs_ipincount(ip)) {
+ err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
+ if (err2 && !error)
+ error = err2;
+ }
+
+ /*
+ * If we only have a single device, and the log force above was
+ * a no-op, we might have to flush the data device cache here.
+ * This can only happen for fdatasync/O_DSYNC if we were overwriting
+ * an already allocated file and thus do not have any metadata to
+ * commit.
+ */
+ if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
+ mp->m_logdev_targp == mp->m_ddev_targp) {
+ err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
+ if (err2 && !error)
+ error = err2;
+ }
+
+ return error;
+}
+
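+/*
+ * Take the inode lock for an iocb. For IOCB_NOWAIT requests only trylock and
+ * return -EAGAIN instead of sleeping on the lock.
+ */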
+static int
+xfs_ilock_iocb(
+ struct kiocb *iocb,
+ unsigned int lock_mode)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, lock_mode))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, lock_mode);
+ }
+
+ return 0;
+}
+
+STATIC ssize_t
+xfs_file_dio_read(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+ ssize_t ret;
+
+ trace_xfs_file_direct_read(iocb, to);
+
+ if (!iov_iter_count(to))
+ return 0; /* skip atime */
+
+ file_accessed(iocb->ki_filp);
+
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ return ret;
+}
+
+static noinline ssize_t
+xfs_file_dax_read(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+ ssize_t ret = 0;
+
+ trace_xfs_file_dax_read(iocb, to);
+
+ if (!iov_iter_count(to))
+ return 0; /* skip atime */
+
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
+ ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_buffered_read(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+ ssize_t ret;
+
+ trace_xfs_file_buffered_read(iocb, to);
+
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
+ ret = generic_file_read_iter(iocb, to);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_read_iter(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_mount *mp = XFS_I(inode)->i_mount;
+ ssize_t ret = 0;
+
+ XFS_STATS_INC(mp, xs_read_calls);
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ if (IS_DAX(inode))
+ ret = xfs_file_dax_read(iocb, to);
+ else if (iocb->ki_flags & IOCB_DIRECT)
+ ret = xfs_file_dio_read(iocb, to);
+ else
+ ret = xfs_file_buffered_read(iocb, to);
+
+ if (ret > 0)
+ XFS_STATS_ADD(mp, xs_read_bytes, ret);
+ return ret;
+}
+
+/*
+ * Common pre-write limit and setup checks.
+ *
+ * Called with the iolock held either shared or exclusive according to
+ * @iolock, and returns with it held. Might upgrade the iolock to exclusive
+ * if called for a direct write beyond i_size.
+ */
+STATIC ssize_t
+xfs_file_write_checks(
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ unsigned int *iolock)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ ssize_t error = 0;
+ size_t count = iov_iter_count(from);
+ bool drained_dio = false;
+ loff_t isize;
+
+restart:
+ error = generic_write_checks(iocb, from);
+ if (error <= 0)
+ return error;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ error = break_layout(inode, false);
+ if (error == -EWOULDBLOCK)
+ error = -EAGAIN;
+ } else {
+ error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+ }
+
+ if (error)
+ return error;
+
+ /*
+ * For changing security info in file_remove_privs() we need i_rwsem
+ * exclusively.
+ */
+ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
+ xfs_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ error = xfs_ilock_iocb(iocb, *iolock);
+ if (error) {
+ *iolock = 0;
+ return error;
+ }
+ goto restart;
+ }
+
+ /*
+ * If the offset is beyond the size of the file, we need to zero any
+ * blocks that fall between the existing EOF and the start of this
+ * write. If zeroing is needed and we are currently holding the iolock
+ * shared, we need to update it to exclusive which implies having to
+ * redo all checks before.
+ *
+ * We need to serialise against EOF updates that occur in IO completions
+ * here. We want to make sure that nobody is changing the size while we
+ * do this check until we have placed an IO barrier (i.e. hold the
+ * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
+ * spinlock effectively forms a memory barrier once we have the
+ * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
+ * hence be able to correctly determine if we need to run zeroing.
+ *
+ * We can do an unlocked check here safely as IO completion can only
+ * extend EOF. Truncate is locked out at this point, so the EOF can
+ * not move backwards, only forwards. Hence we only need to take the
+ * slow path and spin locks when we are at or beyond the current EOF.
+ */
+ if (iocb->ki_pos <= i_size_read(inode))
+ goto out;
+
+ spin_lock(&ip->i_flags_lock);
+ isize = i_size_read(inode);
+ if (iocb->ki_pos > isize) {
+ spin_unlock(&ip->i_flags_lock);
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
+ if (!drained_dio) {
+ if (*iolock == XFS_IOLOCK_SHARED) {
+ xfs_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_ilock(ip, *iolock);
+ iov_iter_reexpand(from, count);
+ }
+ /*
+ * We now have an IO submission barrier in place, but
+ * AIO can do EOF updates during IO completion and hence
+ * we now need to wait for all of them to drain. Non-AIO
+ * DIO will have drained before we are given the
+ * XFS_IOLOCK_EXCL, and so for most cases this wait is a
+ * no-op.
+ */
+ inode_dio_wait(inode);
+ drained_dio = true;
+ goto restart;
+ }
+
+ trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
+ if (error)
+ return error;
+ } else
+ spin_unlock(&ip->i_flags_lock);
+
+out:
+ return kiocb_modified(iocb);
+}
+
+static int
+xfs_dio_write_end_io(
+ struct kiocb *iocb,
+ ssize_t size,
+ int error,
+ unsigned flags)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_inode *ip = XFS_I(inode);
+ loff_t offset = iocb->ki_pos;
+ unsigned int nofs_flag;
+
+ trace_xfs_end_io_direct_write(ip, offset, size);
+
+ if (xfs_is_shutdown(ip->i_mount))
+ return -EIO;
+
+ if (error)
+ return error;
+ if (!size)
+ return 0;
+
+ /*
+ * Capture amount written on completion as we can't reliably account
+ * for it on submission.
+ */
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
+
+ /*
+ * We can allocate memory here while doing writeback on behalf of
+ * memory reclaim. To avoid memory allocation deadlocks set the
+ * task-wide nofs context for the following operations.
+ */
+ nofs_flag = memalloc_nofs_save();
+
+ if (flags & IOMAP_DIO_COW) {
+ error = xfs_reflink_end_cow(ip, offset, size);
+ if (error)
+ goto out;
+ }
+
+ /*
+ * Unwritten conversion updates the in-core isize after extent
+ * conversion but before updating the on-disk size. Updating isize any
+ * earlier allows a racing dio read to find unwritten extents before
+ * they are converted.
+ */
+ if (flags & IOMAP_DIO_UNWRITTEN) {
+ error = xfs_iomap_write_unwritten(ip, offset, size, true);
+ goto out;
+ }
+
+ /*
+ * We need to update the in-core inode size here so that we don't end up
+ * with the on-disk inode size being outside the in-core inode size. We
+ * have no other method of updating EOF for AIO, so always do it here
+ * if necessary.
+ *
+ * We need to lock the test/set EOF update as we can be racing with
+ * other IO completions here to update the EOF. Failing to serialise
+ * here can result in EOF moving backwards and Bad Things Happen when
+ * that occurs.
+ *
+ * As IO completion only ever extends EOF, we can do an unlocked check
+ * here to avoid taking the spinlock. If we land within the current EOF,
+ * then we do not need to do an extending update at all, and we don't
+ * need to take the lock to check this. If we race with an update moving
+ * EOF, then we'll either still be beyond EOF and need to take the lock,
+ * or we'll be within EOF and we don't need to take it at all.
+ */
+ if (offset + size <= i_size_read(inode))
+ goto out;
+
+ spin_lock(&ip->i_flags_lock);
+ if (offset + size > i_size_read(inode)) {
+ i_size_write(inode, offset + size);
+ spin_unlock(&ip->i_flags_lock);
+ error = xfs_setfilesize(ip, offset, size);
+ } else {
+ spin_unlock(&ip->i_flags_lock);
+ }
+
+out:
+ memalloc_nofs_restore(nofs_flag);
+ return error;
+}
+
+static const struct iomap_dio_ops xfs_dio_write_ops = {
+ .end_io = xfs_dio_write_end_io,
+};
+
+/*
+ * Handle block aligned direct I/O writes
+ */
+static noinline ssize_t
+xfs_file_dio_write_aligned(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ unsigned int iolock = XFS_IOLOCK_SHARED;
+ ssize_t ret;
+
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
+ ret = xfs_file_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * We don't need to hold the IOLOCK exclusively across the IO, so demote
+ * the iolock back to shared if we had to take the exclusive lock in
+ * xfs_file_write_checks() for other reasons.
+ */
+ if (iolock == XFS_IOLOCK_EXCL) {
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ iolock = XFS_IOLOCK_SHARED;
+ }
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+ &xfs_dio_write_ops, 0, NULL, 0);
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
+
+/*
+ * Handle block unaligned direct I/O writes
+ *
+ * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
+ * them to be done in parallel with reads and other direct I/O writes. However,
+ * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
+ * to do sub-block zeroing and that requires serialisation against other direct
+ * I/O to the same block. In this case we need to serialise the submission of
+ * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
+ * In the case where sub-block zeroing is not required, we can do concurrent
+ * sub-block dios to the same block successfully.
+ *
+ * Optimistically submit the I/O using the shared lock first, but use the
+ * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
+ * if block allocation or partial block zeroing would be required. In that case
+ * we try again with the exclusive lock.
+ */
+static noinline ssize_t
+xfs_file_dio_write_unaligned(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ size_t isize = i_size_read(VFS_I(ip));
+ size_t count = iov_iter_count(from);
+ unsigned int iolock = XFS_IOLOCK_SHARED;
+ unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
+ ssize_t ret;
+
+ /*
+ * Extending writes need exclusivity because of the sub-block zeroing
+ * that the DIO code always does for partial tail blocks beyond EOF, so
+ * don't even bother trying the fast path in this case.
+ */
+ if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+retry_exclusive:
+ iolock = XFS_IOLOCK_EXCL;
+ flags = IOMAP_DIO_FORCE_WAIT;
+ }
+
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
+
+ /*
+ * We can't properly handle unaligned direct I/O to reflink files yet,
+ * as we can't unshare a partial block.
+ */
+ if (xfs_is_cow_inode(ip)) {
+ trace_xfs_reflink_bounce_dio_write(iocb, from);
+ ret = -ENOTBLK;
+ goto out_unlock;
+ }
+
+ ret = xfs_file_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * If we are doing exclusive unaligned I/O, this must be the only I/O
+ * in-flight. Otherwise we risk data corruption due to unwritten extent
+ * conversions from the AIO end_io handler. Wait for all other I/O to
+ * drain first.
+ */
+ if (flags & IOMAP_DIO_FORCE_WAIT)
+ inode_dio_wait(VFS_I(ip));
+
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+ &xfs_dio_write_ops, flags, NULL, 0);
+
+ /*
+ * Retry unaligned I/O with exclusive blocking semantics if the DIO
+ * layer rejected it for mapping or locking reasons. If we are doing
+ * nonblocking user I/O, propagate the error.
+ */
+ if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
+ ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
+ xfs_iunlock(ip, iolock);
+ goto retry_exclusive;
+ }
+
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
+
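+/*
+ * Direct I/O writes must be aligned to the device logical sector size. Writes
+ * that are not aligned to the filesystem block size take the unaligned path,
+ * everything else goes through the aligned path.
+ */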
+static ssize_t
+xfs_file_dio_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ size_t count = iov_iter_count(from);
+
+ /* direct I/O must be aligned to device logical sector size */
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
+ return -EINVAL;
+ if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+ return xfs_file_dio_write_unaligned(ip, iocb, from);
+ return xfs_file_dio_write_aligned(ip, iocb, from);
+}
+
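+/*
+ * DAX writes always take the iolock exclusively; if the write extended the
+ * in-core file size, update the on-disk size as well before returning.
+ */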
+static noinline ssize_t
+xfs_file_dax_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ unsigned int iolock = XFS_IOLOCK_EXCL;
+ ssize_t ret, error = 0;
+ loff_t pos;
+
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
+ ret = xfs_file_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out;
+
+ pos = iocb->ki_pos;
+
+ trace_xfs_file_dax_write(iocb, from);
+ ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
+ if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+ i_size_write(inode, iocb->ki_pos);
+ error = xfs_setfilesize(ip, pos, ret);
+ }
+out:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+ if (error)
+ return error;
+
+ if (ret > 0) {
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
+
+ /* Handle various SYNC-type writes */
+ ret = generic_write_sync(iocb, ret);
+ }
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_buffered_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ ssize_t ret;
+ bool cleared_space = false;
+ unsigned int iolock;
+
+write_retry:
+ iolock = XFS_IOLOCK_EXCL;
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
+
+ ret = xfs_file_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out;
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = inode_to_bdi(inode);
+
+ trace_xfs_file_buffered_write(iocb, from);
+ ret = iomap_file_buffered_write(iocb, from,
+ &xfs_buffered_write_iomap_ops);
+ if (likely(ret >= 0))
+ iocb->ki_pos += ret;
+
+ /*
+ * If we hit a space limit, try to free up some lingering preallocated
+ * space before returning an error. In the case of ENOSPC, first try to
+ * write back all dirty inodes to free up some of the excess reserved
+ * metadata space. This reduces the chances that the eofblocks scan
+ * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
+ * also behaves as a filter to prevent too many eofblocks scans from
+ * running at the same time. Use a synchronous scan to increase the
+ * effectiveness of the scan.
+ */
+ if (ret == -EDQUOT && !cleared_space) {
+ xfs_iunlock(ip, iolock);
+ xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
+ cleared_space = true;
+ goto write_retry;
+ } else if (ret == -ENOSPC && !cleared_space) {
+ struct xfs_icwalk icw = {0};
+
+ cleared_space = true;
+ xfs_flush_inodes(ip->i_mount);
+
+ xfs_iunlock(ip, iolock);
+ icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
+ xfs_blockgc_free_space(ip->i_mount, &icw);
+ goto write_retry;
+ }
+
+ current->backing_dev_info = NULL;
+out:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+
+ if (ret > 0) {
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
+ /* Handle various SYNC-type writes */
+ ret = generic_write_sync(iocb, ret);
+ }
+ return ret;
+}
+
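+/*
+ * Top-level write path: dispatch to the DAX, direct or buffered write
+ * implementation. Direct I/O falls back to buffered I/O only for the
+ * reflink CoW case signalled by -ENOTBLK.
+ */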
+STATIC ssize_t
+xfs_file_write_iter(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ ssize_t ret;
+ size_t ocount = iov_iter_count(from);
+
+ XFS_STATS_INC(ip->i_mount, xs_write_calls);
+
+ if (ocount == 0)
+ return 0;
+
+ if (xfs_is_shutdown(ip->i_mount))
+ return -EIO;
+
+ if (IS_DAX(inode))
+ return xfs_file_dax_write(iocb, from);
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ /*
+ * Allow a directio write to fall back to a buffered
+ * write *only* in the case that we're doing a reflink
+ * CoW. In all other directio scenarios we do not
+ * allow an operation to fall back to buffered mode.
+ */
+ ret = xfs_file_dio_write(iocb, from);
+ if (ret != -ENOTBLK)
+ return ret;
+ }
+
+ return xfs_file_buffered_write(iocb, from);
+}
+
+static void
+xfs_wait_dax_page(
+ struct inode *inode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+ schedule();
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+}
+
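+/*
+ * Wait for any page in the DAX mapping that still has an elevated refcount
+ * before allowing the layout to change. The MMAPLOCK is dropped and retaken
+ * around the wait, so the caller must set up to retry the operation.
+ */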
+int
+xfs_break_dax_layouts(
+ struct inode *inode,
+ bool *retry)
+{
+ struct page *page;
+
+ ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
+
+ page = dax_layout_busy_page(inode->i_mapping);
+ if (!page)
+ return 0;
+
+ *retry = true;
+ return ___wait_var_event(&page->_refcount,
+ atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
+ 0, 0, xfs_wait_dax_page(inode));
+}
+
+int
+xfs_break_layouts(
+ struct inode *inode,
+ uint *iolock,
+ enum layout_break_reason reason)
+{
+ bool retry;
+ int error;
+
+ ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
+
+ do {
+ retry = false;
+ switch (reason) {
+ case BREAK_UNMAP:
+ error = xfs_break_dax_layouts(inode, &retry);
+ if (error || retry)
+ break;
+ fallthrough;
+ case BREAK_WRITE:
+ error = xfs_break_leased_layouts(inode, iolock, &retry);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ error = -EINVAL;
+ }
+ } while (error == 0 && retry);
+
+ return error;
+}
+
+/* Does this file, inode, or mount want synchronous writes? */
+static inline bool xfs_file_sync_writes(struct file *filp)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(filp));
+
+ if (xfs_has_wsync(ip->i_mount))
+ return true;
+ if (filp->f_flags & (__O_SYNC | O_DSYNC))
+ return true;
+ if (IS_SYNC(file_inode(filp)))
+ return true;
+
+ return false;
+}
+
+#define XFS_FALLOC_FL_SUPPORTED \
+ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
+ FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
+
+STATIC long
+xfs_file_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct xfs_inode *ip = XFS_I(inode);
+ long error;
+ uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ loff_t new_size = 0;
+ bool do_file_insert = false;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+ if (mode & ~XFS_FALLOC_FL_SUPPORTED)
+ return -EOPNOTSUPP;
+
+ xfs_ilock(ip, iolock);
+ error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Must wait for all AIO to complete before we continue as AIO can
+ * change the file size on completion without holding any locks we
+ * currently hold. We must do this first because AIO can update both
+ * the on disk and in memory inode sizes, and the operations that follow
+ * require the in-memory size to be fully up-to-date.
+ */
+ inode_dio_wait(inode);
+
+ /*
+ * Now AIO and DIO has drained we flush and (if necessary) invalidate
+ * the cached range over the first operation we are about to run.
+ *
+ * We care about zero and collapse here because they both run a hole
+ * punch over the range first. Because that can zero data, and the range
+ * of invalidation for the shift operations is much larger, we still do
+ * the required flush for collapse in xfs_prepare_shift().
+ *
+ * Insert has the same range requirements as collapse, and we extend the
+ * file first which can zero data. Hence insert has the same
+ * flush/invalidate requirements as collapse and so they are both
+ * handled at the right time by xfs_prepare_shift().
+ */
+ if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
+ FALLOC_FL_COLLAPSE_RANGE)) {
+ error = xfs_flush_unmap_range(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ }
+
+ error = file_modified(file);
+ if (error)
+ goto out_unlock;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+ if (!xfs_is_falloc_aligned(ip, offset, len)) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * A collapse range must not reach or extend past EOF; such an
+ * operation would effectively be a truncate.
+ */
+ if (offset + len >= i_size_read(inode)) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
+ new_size = i_size_read(inode) - len;
+
+ error = xfs_collapse_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ } else if (mode & FALLOC_FL_INSERT_RANGE) {
+ loff_t isize = i_size_read(inode);
+
+ if (!xfs_is_falloc_aligned(ip, offset, len)) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * New inode size must not exceed ->s_maxbytes, accounting for
+ * possible signed overflow.
+ */
+ if (inode->i_sb->s_maxbytes - isize < len) {
+ error = -EFBIG;
+ goto out_unlock;
+ }
+ new_size = isize + len;
+
+ /* Offset should be less than i_size */
+ if (offset >= isize) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+ do_file_insert = true;
+ } else {
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ error = inode_newsize_ok(inode, new_size);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (mode & FALLOC_FL_ZERO_RANGE) {
+ /*
+ * Punch a hole and prealloc the range. We use a hole
+ * punch rather than unwritten extent conversion for two
+ * reasons:
+ *
+ * 1.) Hole punch handles partial block zeroing for us.
+ * 2.) If prealloc returns ENOSPC, the file range is
+ * still zero-valued by virtue of the hole punch.
+ */
+ unsigned int blksize = i_blocksize(inode);
+
+ trace_xfs_zero_file_space(ip);
+
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+
+ len = round_up(offset + len, blksize) -
+ round_down(offset, blksize);
+ offset = round_down(offset, blksize);
+ } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+ error = xfs_reflink_unshare(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ } else {
+ /*
+ * If always_cow mode we can't use preallocations and
+ * thus should not create them.
+ */
+ if (xfs_is_always_cow_inode(ip)) {
+ error = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+ }
+
+ if (!xfs_is_always_cow_inode(ip)) {
+ error = xfs_alloc_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ }
+ }
+
+ /* Change file size if needed */
+ if (new_size) {
+ struct iattr iattr;
+
+ iattr.ia_valid = ATTR_SIZE;
+ iattr.ia_size = new_size;
+ error = xfs_vn_setattr_size(file_mnt_user_ns(file),
+ file_dentry(file), &iattr);
+ if (error)
+ goto out_unlock;
+ }
+
+ /*
+ * Perform hole insertion now that the file size has been
+ * updated so that if we crash during the operation we don't
+ * leave shifted extents past EOF and hence lose access to
+ * the data contained within them.
+ */
+ if (do_file_insert) {
+ error = xfs_insert_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (xfs_file_sync_writes(file))
+ error = xfs_log_force_inode(ip);
+
+out_unlock:
+ xfs_iunlock(ip, iolock);
+ return error;
+}
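Editorial aside, not part of the patch above: the xfs_is_falloc_aligned() checks in xfs_file_fallocate() are visible from userspace; FALLOC_FL_COLLAPSE_RANGE and FALLOC_FL_INSERT_RANGE fail with EINVAL unless offset and length are multiples of the filesystem block size and the collapsed range ends before EOF. A minimal sketch of a caller, assuming st_blksize reflects the filesystem block size:

/* Illustrative userspace sketch, not part of the patch above. */
#define _GNU_SOURCE
#include <fcntl.h>		/* fallocate(), FALLOC_FL_* */
#include <sys/stat.h>

int collapse_one_block(int fd)
{
	struct stat st;

	if (fstat(fd, &st) < 0)
		return -1;

	/*
	 * Offset and length must be block aligned, and the range must end
	 * before EOF, or xfs_file_fallocate() rejects it with -EINVAL.
	 */
	return fallocate(fd, FALLOC_FL_COLLAPSE_RANGE,
			 st.st_blksize, st.st_blksize);
}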
+
+STATIC int
+xfs_file_fadvise(
+ struct file *file,
+ loff_t start,
+ loff_t end,
+ int advice)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ int ret;
+ int lockflags = 0;
+
+ /*
+ * Operations creating pages in page cache need protection from hole
+ * punching and similar ops
+ */
+ if (advice == POSIX_FADV_WILLNEED) {
+ lockflags = XFS_IOLOCK_SHARED;
+ xfs_ilock(ip, lockflags);
+ }
+ ret = generic_fadvise(file, start, end, advice);
+ if (lockflags)
+ xfs_iunlock(ip, lockflags);
+ return ret;
+}
+
+STATIC loff_t
+xfs_file_remap_range(
+ struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ loff_t len,
+ unsigned int remap_flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct xfs_inode *src = XFS_I(inode_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct xfs_inode *dest = XFS_I(inode_out);
+ struct xfs_mount *mp = src->i_mount;
+ loff_t remapped = 0;
+ xfs_extlen_t cowextsize;
+ int ret;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (!xfs_has_reflink(mp))
+ return -EOPNOTSUPP;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ /* Prepare and then clone file data. */
+ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
+ &len, remap_flags);
+ if (ret || len == 0)
+ return ret;
+
+ trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
+
+ ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
+ &remapped);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Carry the cowextsize hint from src to dest if we're sharing the
+ * entire source file to the entire destination file, the source file
+ * has a cowextsize hint, and the destination file does not.
+ */
+ cowextsize = 0;
+ if (pos_in == 0 && len == i_size_read(inode_in) &&
+ (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ pos_out == 0 && len >= i_size_read(inode_out) &&
+ !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
+ cowextsize = src->i_cowextsize;
+
+ ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
+ remap_flags);
+ if (ret)
+ goto out_unlock;
+
+ if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
+ xfs_log_force_inode(dest);
+out_unlock:
+ xfs_iunlock2_io_mmap(src, dest);
+ if (ret)
+ trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
+ return remapped > 0 ? remapped : ret;
+}
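For context only, not code from this patch: ->remap_file_range is what services reflink-style clones, for example the FICLONERANGE ioctl. A hedged sketch of a caller, assuming both descriptors refer to the same reflink-capable XFS filesystem:

/* Illustrative userspace sketch, not part of the patch above. */
#include <linux/fs.h>		/* struct file_clone_range, FICLONERANGE */
#include <sys/ioctl.h>

int clone_whole_file(int src_fd, int dest_fd)
{
	struct file_clone_range fcr = {
		.src_fd		= src_fd,
		.src_offset	= 0,
		.src_length	= 0,	/* zero length means "clone to EOF" */
		.dest_offset	= 0,
	};

	/* The VFS routes this to xfs_file_remap_range() above. */
	return ioctl(dest_fd, FICLONERANGE, &fcr);
}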
+
+STATIC int
+xfs_file_open(
+ struct inode *inode,
+ struct file *file)
+{
+ if (xfs_is_shutdown(XFS_M(inode->i_sb)))
+ return -EIO;
+ file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
+ return generic_file_open(inode, file);
+}
+
+STATIC int
+xfs_dir_open(
+ struct inode *inode,
+ struct file *file)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ unsigned int mode;
+ int error;
+
+ error = xfs_file_open(inode, file);
+ if (error)
+ return error;
+
+ /*
+ * If there are any blocks, read-ahead block 0 as we're almost
+ * certain to have the next operation be a read there.
+ */
+ mode = xfs_ilock_data_map_shared(ip);
+ if (ip->i_df.if_nextents > 0)
+ error = xfs_dir3_data_readahead(ip, 0, 0);
+ xfs_iunlock(ip, mode);
+ return error;
+}
+
+STATIC int
+xfs_file_release(
+ struct inode *inode,
+ struct file *filp)
+{
+ return xfs_release(XFS_I(inode));
+}
+
+STATIC int
+xfs_file_readdir(
+ struct file *file,
+ struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+ xfs_inode_t *ip = XFS_I(inode);
+ size_t bufsize;
+
+ /*
+ * The Linux API doesn't pass the total size of the buffer we read
+ * into down to the filesystem. With the filldir concept it's not
+ * needed for correctness, but the XFS dir2 leaf code wants an
+ * estimate of the buffer size to calculate its readahead window
+ * and size the buffers used for mapping to physical blocks.
+ *
+ * Try to give it an estimate that's good enough; maybe at some
+ * point we can change the ->readdir prototype to include the
+ * buffer size. For now we use the current glibc buffer size.
+ */
+ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
+
+ return xfs_readdir(NULL, ip, ctx, bufsize);
+}
+
+STATIC loff_t
+xfs_file_llseek(
+ struct file *file,
+ loff_t offset,
+ int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+
+ if (xfs_is_shutdown(XFS_I(inode)->i_mount))
+ return -EIO;
+
+ switch (whence) {
+ default:
+ return generic_file_llseek(file, offset, whence);
+ case SEEK_HOLE:
+ offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
+ break;
+ case SEEK_DATA:
+ offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
+ break;
+ }
+
+ if (offset < 0)
+ return offset;
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+}
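For reference, not part of the patch: the SEEK_HOLE/SEEK_DATA cases above are driven by ordinary lseek() calls. A small sketch that walks the data segments of a sparse file:

/* Illustrative userspace sketch, not part of the patch above. */
#define _GNU_SOURCE		/* SEEK_DATA, SEEK_HOLE */
#include <stdio.h>
#include <unistd.h>

void dump_data_segments(int fd)
{
	off_t data = 0, hole;

	while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			break;
		printf("data: [%lld, %lld)\n", (long long)data, (long long)hole);
		data = hole;
	}
	/* lseek() fails with ENXIO once the offset is at or past EOF. */
}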
+
+#ifdef CONFIG_FS_DAX
+static inline vm_fault_t
+xfs_dax_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault,
+ pfn_t *pfn)
+{
+ return dax_iomap_fault(vmf, pe_size, pfn, NULL,
+ (write_fault && !vmf->cow_page) ?
+ &xfs_dax_write_iomap_ops :
+ &xfs_read_iomap_ops);
+}
+#else
+static inline vm_fault_t
+xfs_dax_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault,
+ pfn_t *pfn)
+{
+ ASSERT(0);
+ return VM_FAULT_SIGBUS;
+}
+#endif
+
+/*
+ * Locking for serialisation of IO during page faults. This results in a lock
+ * ordering of:
+ *
+ * mmap_lock (MM)
+ * sb_start_pagefault(vfs, freeze)
+ * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
+ * page_lock (MM)
+ * i_lock (XFS - extent map serialisation)
+ */
+static vm_fault_t
+__xfs_filemap_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
+ vm_fault_t ret;
+
+ trace_xfs_filemap_fault(ip, pe_size, write_fault);
+
+ if (write_fault) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vmf->vma->vm_file);
+ }
+
+ if (IS_DAX(inode)) {
+ pfn_t pfn;
+
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
+ if (ret & VM_FAULT_NEEDDSYNC)
+ ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ } else {
+ if (write_fault) {
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = iomap_page_mkwrite(vmf,
+ &xfs_buffered_write_iomap_ops);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ } else {
+ ret = filemap_fault(vmf);
+ }
+ }
+
+ if (write_fault)
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+
+static inline bool
+xfs_is_write_fault(
+ struct vm_fault *vmf)
+{
+ return (vmf->flags & FAULT_FLAG_WRITE) &&
+ (vmf->vma->vm_flags & VM_SHARED);
+}
+
+static vm_fault_t
+xfs_filemap_fault(
+ struct vm_fault *vmf)
+{
+ /* DAX can shortcut the normal fault path on write faults! */
+ return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
+ IS_DAX(file_inode(vmf->vma->vm_file)) &&
+ xfs_is_write_fault(vmf));
+}
+
+static vm_fault_t
+xfs_filemap_huge_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size)
+{
+ if (!IS_DAX(file_inode(vmf->vma->vm_file)))
+ return VM_FAULT_FALLBACK;
+
+ /* DAX can shortcut the normal fault path on write faults! */
+ return __xfs_filemap_fault(vmf, pe_size,
+ xfs_is_write_fault(vmf));
+}
+
+static vm_fault_t
+xfs_filemap_page_mkwrite(
+ struct vm_fault *vmf)
+{
+ return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
+}
+
+/*
+ * pfn_mkwrite was originally intended to ensure we capture time stamp updates
+ * on write faults. In reality, it needs to serialise against truncate and
+ * prepare memory for writing, so handle it as a standard write fault.
+ */
+static vm_fault_t
+xfs_filemap_pfn_mkwrite(
+ struct vm_fault *vmf)
+{
+
+ return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
+}
+
+static vm_fault_t
+xfs_filemap_map_pages(
+ struct vm_fault *vmf,
+ pgoff_t start_pgoff,
+ pgoff_t end_pgoff)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ vm_fault_t ret;
+
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ return ret;
+}
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+ .fault = xfs_filemap_fault,
+ .huge_fault = xfs_filemap_huge_fault,
+ .map_pages = xfs_filemap_map_pages,
+ .page_mkwrite = xfs_filemap_page_mkwrite,
+ .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
+ struct file *file,
+ struct vm_area_struct *vma)
+{
+ struct inode *inode = file_inode(file);
+ struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
+
+ /*
+ * We don't support synchronous mappings for non-DAX files and
+ * for DAX files if the underlying dax_device is not synchronous.
+ */
+ if (!daxdev_mapping_supported(vma, target->bt_daxdev))
+ return -EOPNOTSUPP;
+
+ file_accessed(file);
+ vma->vm_ops = &xfs_file_vm_ops;
+ if (IS_DAX(inode))
+ vma->vm_flags |= VM_HUGEPAGE;
+ return 0;
+}
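A contextual note, not code from this patch: the daxdev_mapping_supported() check above is what makes MAP_SYNC mappings fail cleanly when the backing device cannot honour them. A hedged userspace sketch:

/* Illustrative userspace sketch, not part of the patch above. */
#define _GNU_SOURCE
#include <sys/mman.h>		/* older libcs may need <linux/mman.h> for MAP_SYNC */
#include <stddef.h>

void *map_sync_region(int fd, size_t len)
{
	/*
	 * MAP_SYNC must be paired with MAP_SHARED_VALIDATE; on a non-DAX
	 * file, or a DAX device that is not synchronous, the mmap() fails
	 * with EOPNOTSUPP rather than silently dropping the flag.
	 */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);

	return p == MAP_FAILED ? NULL : p;
}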
+
+const struct file_operations xfs_file_operations = {
+ .llseek = xfs_file_llseek,
+ .read_iter = xfs_file_read_iter,
+ .write_iter = xfs_file_write_iter,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .iopoll = iocb_bio_iopoll,
+ .unlocked_ioctl = xfs_file_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = xfs_file_compat_ioctl,
+#endif
+ .mmap = xfs_file_mmap,
+ .mmap_supported_flags = MAP_SYNC,
+ .open = xfs_file_open,
+ .release = xfs_file_release,
+ .fsync = xfs_file_fsync,
+ .get_unmapped_area = thp_get_unmapped_area,
+ .fallocate = xfs_file_fallocate,
+ .fadvise = xfs_file_fadvise,
+ .remap_file_range = xfs_file_remap_range,
+};
+
+const struct file_operations xfs_dir_file_operations = {
+ .open = xfs_dir_open,
+ .read = generic_read_dir,
+ .iterate_shared = xfs_file_readdir,
+ .llseek = generic_file_llseek,
+ .unlocked_ioctl = xfs_file_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = xfs_file_compat_ioctl,
+#endif
+ .fsync = xfs_dir_fsync,
+};
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
new file mode 100644
index 000000000..34b21a29c
--- /dev/null
+++ b/fs/xfs/xfs_filestream.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * Copyright (c) 2014 Christoph Hellwig.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_alloc.h"
+#include "xfs_mru_cache.h"
+#include "xfs_trace.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_trans.h"
+#include "xfs_filestream.h"
+
+struct xfs_fstrm_item {
+ struct xfs_mru_cache_elem mru;
+ xfs_agnumber_t ag; /* AG in use for this directory */
+};
+
+enum xfs_fstrm_alloc {
+ XFS_PICK_USERDATA = 1,
+ XFS_PICK_LOWSPACE = 2,
+};
+
+/*
+ * Allocation group filestream associations are tracked with per-ag atomic
+ * counters. These counters allow xfs_filestream_pick_ag() to tell whether a
+ * particular AG already has active filestreams associated with it.
+ */
+int
+xfs_filestream_peek_ag(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int ret;
+
+ pag = xfs_perag_get(mp, agno);
+ ret = atomic_read(&pag->pagf_fstrms);
+ xfs_perag_put(pag);
+ return ret;
+}
+
+static int
+xfs_filestream_get_ag(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int ret;
+
+ pag = xfs_perag_get(mp, agno);
+ ret = atomic_inc_return(&pag->pagf_fstrms);
+ xfs_perag_put(pag);
+ return ret;
+}
+
+static void
+xfs_filestream_put_ag(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, agno);
+ atomic_dec(&pag->pagf_fstrms);
+ xfs_perag_put(pag);
+}
+
+static void
+xfs_fstrm_free_func(
+ void *data,
+ struct xfs_mru_cache_elem *mru)
+{
+ struct xfs_mount *mp = data;
+ struct xfs_fstrm_item *item =
+ container_of(mru, struct xfs_fstrm_item, mru);
+
+ xfs_filestream_put_ag(mp, item->ag);
+ trace_xfs_filestream_free(mp, mru->key, item->ag);
+
+ kmem_free(item);
+}
+
+/*
+ * Scan the AGs starting at startag looking for an AG that isn't in use and has
+ * at least minlen blocks free.
+ */
+static int
+xfs_filestream_pick_ag(
+ struct xfs_inode *ip,
+ xfs_agnumber_t startag,
+ xfs_agnumber_t *agp,
+ int flags,
+ xfs_extlen_t minlen)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_fstrm_item *item;
+ struct xfs_perag *pag;
+ xfs_extlen_t longest, free = 0, minfree, maxfree = 0;
+ xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
+ int err, trylock, nscan;
+
+ ASSERT(S_ISDIR(VFS_I(ip)->i_mode));
+
+ /* 2% of an AG's blocks must be free for it to be chosen. */
+ minfree = mp->m_sb.sb_agblocks / 50;
+
+ ag = startag;
+ *agp = NULLAGNUMBER;
+
+ /* For the first pass, don't sleep trying to init the per-AG. */
+ trylock = XFS_ALLOC_FLAG_TRYLOCK;
+
+ for (nscan = 0; 1; nscan++) {
+ trace_xfs_filestream_scan(mp, ip->i_ino, ag);
+
+ pag = xfs_perag_get(mp, ag);
+
+ if (!pag->pagf_init) {
+ err = xfs_alloc_read_agf(pag, NULL, trylock, NULL);
+ if (err) {
+ if (err != -EAGAIN) {
+ xfs_perag_put(pag);
+ return err;
+ }
+ /* Couldn't lock the AGF, skip this AG. */
+ goto next_ag;
+ }
+ }
+
+ /* Keep track of the AG with the most free blocks. */
+ if (pag->pagf_freeblks > maxfree) {
+ maxfree = pag->pagf_freeblks;
+ max_ag = ag;
+ }
+
+ /*
+ * The AG reference count does two things: it enforces mutual
+ * exclusion when examining the suitability of an AG in this
+ * loop, and it guards against two filestreams being established
+ * in the same AG as each other.
+ */
+ if (xfs_filestream_get_ag(mp, ag) > 1) {
+ xfs_filestream_put_ag(mp, ag);
+ goto next_ag;
+ }
+
+ longest = xfs_alloc_longest_free_extent(pag,
+ xfs_alloc_min_freelist(mp, pag),
+ xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
+ if (((minlen && longest >= minlen) ||
+ (!minlen && pag->pagf_freeblks >= minfree)) &&
+ (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
+ (flags & XFS_PICK_LOWSPACE))) {
+
+ /* Break out, retaining the reference on the AG. */
+ free = pag->pagf_freeblks;
+ xfs_perag_put(pag);
+ *agp = ag;
+ break;
+ }
+
+ /* Drop the reference on this AG, it's not usable. */
+ xfs_filestream_put_ag(mp, ag);
+next_ag:
+ xfs_perag_put(pag);
+ /* Move to the next AG, wrapping to AG 0 if necessary. */
+ if (++ag >= mp->m_sb.sb_agcount)
+ ag = 0;
+
+ /* If a full pass of the AGs hasn't been done yet, continue. */
+ if (ag != startag)
+ continue;
+
+ /* Allow sleeping in xfs_alloc_read_agf() on the 2nd pass. */
+ if (trylock != 0) {
+ trylock = 0;
+ continue;
+ }
+
+ /* Finally, if lowspace wasn't set, set it for the 3rd pass. */
+ if (!(flags & XFS_PICK_LOWSPACE)) {
+ flags |= XFS_PICK_LOWSPACE;
+ continue;
+ }
+
+ /*
+ * Take the AG with the most free space, regardless of whether
+ * it's already in use by another filestream.
+ */
+ if (max_ag != NULLAGNUMBER) {
+ xfs_filestream_get_ag(mp, max_ag);
+ free = maxfree;
+ *agp = max_ag;
+ break;
+ }
+
+ /* take AG 0 if none matched */
+ trace_xfs_filestream_pick(ip, *agp, free, nscan);
+ *agp = 0;
+ return 0;
+ }
+
+ trace_xfs_filestream_pick(ip, *agp, free, nscan);
+
+ if (*agp == NULLAGNUMBER)
+ return 0;
+
+ err = -ENOMEM;
+ item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
+ if (!item)
+ goto out_put_ag;
+
+ item->ag = *agp;
+
+ err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru);
+ if (err) {
+ if (err == -EEXIST)
+ err = 0;
+ goto out_free_item;
+ }
+
+ return 0;
+
+out_free_item:
+ kmem_free(item);
+out_put_ag:
+ xfs_filestream_put_ag(mp, *agp);
+ return err;
+}
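An editorial aside to make the 2% threshold used in xfs_filestream_pick_ag() concrete; the numbers below are only an illustration:

/* Illustrative arithmetic only, not part of the patch above. */
static unsigned int filestream_minfree(unsigned int agblocks)
{
	return agblocks / 50;	/* the "2% must be free" rule used above */
}
/*
 * e.g. agblocks = 1048576 (a 4 GiB AG with 4 KiB blocks) gives
 * minfree = 20971 blocks, roughly 82 MiB of free space per candidate AG.
 */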
+
+static struct xfs_inode *
+xfs_filestream_get_parent(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = VFS_I(ip), *dir = NULL;
+ struct dentry *dentry, *parent;
+
+ dentry = d_find_alias(inode);
+ if (!dentry)
+ goto out;
+
+ parent = dget_parent(dentry);
+ if (!parent)
+ goto out_dput;
+
+ dir = igrab(d_inode(parent));
+ dput(parent);
+
+out_dput:
+ dput(dentry);
+out:
+ return dir ? XFS_I(dir) : NULL;
+}
+
+/*
+ * Find the right allocation group for a file, either by finding an
+ * existing file stream or creating a new one.
+ *
+ * Returns NULLAGNUMBER in case of an error.
+ */
+xfs_agnumber_t
+xfs_filestream_lookup_ag(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_inode *pip = NULL;
+ xfs_agnumber_t startag, ag = NULLAGNUMBER;
+ struct xfs_mru_cache_elem *mru;
+
+ ASSERT(S_ISREG(VFS_I(ip)->i_mode));
+
+ pip = xfs_filestream_get_parent(ip);
+ if (!pip)
+ return NULLAGNUMBER;
+
+ mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
+ if (mru) {
+ ag = container_of(mru, struct xfs_fstrm_item, mru)->ag;
+ xfs_mru_cache_done(mp->m_filestream);
+
+ trace_xfs_filestream_lookup(mp, ip->i_ino, ag);
+ goto out;
+ }
+
+ /*
+ * Set the starting AG using the rotor for inode32, otherwise
+ * use the directory inode's AG.
+ */
+ if (xfs_is_inode32(mp)) {
+ xfs_agnumber_t rotorstep = xfs_rotorstep;
+ startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
+ mp->m_agfrotor = (mp->m_agfrotor + 1) %
+ (mp->m_sb.sb_agcount * rotorstep);
+ } else
+ startag = XFS_INO_TO_AGNO(mp, pip->i_ino);
+
+ if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0))
+ ag = NULLAGNUMBER;
+out:
+ xfs_irele(pip);
+ return ag;
+}
+
+/*
+ * Pick a new allocation group for the current file and its file stream.
+ *
+ * This is called when the allocator can't find a suitable extent in the
+ * current AG, and we have to move the stream into a new AG with more space.
+ */
+int
+xfs_filestream_new_ag(
+ struct xfs_bmalloca *ap,
+ xfs_agnumber_t *agp)
+{
+ struct xfs_inode *ip = ap->ip, *pip;
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_extlen_t minlen = ap->length;
+ xfs_agnumber_t startag = 0;
+ int flags = 0;
+ int err = 0;
+ struct xfs_mru_cache_elem *mru;
+
+ *agp = NULLAGNUMBER;
+
+ pip = xfs_filestream_get_parent(ip);
+ if (!pip)
+ goto exit;
+
+ mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino);
+ if (mru) {
+ struct xfs_fstrm_item *item =
+ container_of(mru, struct xfs_fstrm_item, mru);
+ startag = (item->ag + 1) % mp->m_sb.sb_agcount;
+ }
+
+ if (ap->datatype & XFS_ALLOC_USERDATA)
+ flags |= XFS_PICK_USERDATA;
+ if (ap->tp->t_flags & XFS_TRANS_LOWMODE)
+ flags |= XFS_PICK_LOWSPACE;
+
+ err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
+
+ /*
+ * Only free the item here, after the pick, so that its reference on
+ * the old AG made xfs_filestream_pick_ag() skip over that AG earlier.
+ */
+ if (mru)
+ xfs_fstrm_free_func(mp, mru);
+
+ xfs_irele(pip);
+exit:
+ if (*agp == NULLAGNUMBER)
+ *agp = 0;
+ return err;
+}
+
+void
+xfs_filestream_deassociate(
+ struct xfs_inode *ip)
+{
+ xfs_mru_cache_delete(ip->i_mount->m_filestream, ip->i_ino);
+}
+
+int
+xfs_filestream_mount(
+ xfs_mount_t *mp)
+{
+ /*
+ * The filestream timer tunable is currently fixed within the range of
+ * one second to four minutes, with five seconds being the default. The
+ * group count is somewhat arbitrary, but it'd be nice to adhere to the
+ * timer tunable to within about 10 percent. This requires at least 10
+ * groups.
+ */
+ return xfs_mru_cache_create(&mp->m_filestream, mp,
+ xfs_fstrm_centisecs * 10, 10, xfs_fstrm_free_func);
+}
+
+void
+xfs_filestream_unmount(
+ xfs_mount_t *mp)
+{
+ xfs_mru_cache_destroy(mp->m_filestream);
+}
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
new file mode 100644
index 000000000..403226ebb
--- /dev/null
+++ b/fs/xfs/xfs_filestream.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_FILESTREAM_H__
+#define __XFS_FILESTREAM_H__
+
+struct xfs_mount;
+struct xfs_inode;
+struct xfs_bmalloca;
+
+int xfs_filestream_mount(struct xfs_mount *mp);
+void xfs_filestream_unmount(struct xfs_mount *mp);
+void xfs_filestream_deassociate(struct xfs_inode *ip);
+xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
+int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp);
+int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno);
+
+static inline int
+xfs_inode_is_filestream(
+ struct xfs_inode *ip)
+{
+ return xfs_has_filestreams(ip->i_mount) ||
+ (ip->i_diflags & XFS_DIFLAG_FILESTREAM);
+}
+
+#endif /* __XFS_FILESTREAM_H__ */
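A hedged sketch of how this interface is intended to be consumed; the caller below is hypothetical and only the functions declared in this header are real:

/* Hypothetical caller, shown for illustration only. */
static xfs_agnumber_t
pick_filestream_ag(
	struct xfs_inode	*ip,
	struct xfs_bmalloca	*ap)
{
	xfs_agnumber_t		agno;

	if (!xfs_inode_is_filestream(ip))
		return NULLAGNUMBER;

	/* Reuse the AG already associated with the parent directory... */
	agno = xfs_filestream_lookup_ag(ip);
	if (agno != NULLAGNUMBER)
		return agno;

	/* ...or move the stream to a new AG when that one has no space. */
	if (xfs_filestream_new_ag(ap, &agno))
		return NULLAGNUMBER;
	return agno;
}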
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
new file mode 100644
index 000000000..d8337274c
--- /dev/null
+++ b/fs/xfs/xfs_fsmap.c
@@ -0,0 +1,974 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_rmap.h"
+#include "xfs_alloc.h"
+#include "xfs_bit.h"
+#include <linux/fsmap.h>
+#include "xfs_fsmap.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_ag.h"
+
+/* Convert an xfs_fsmap to an fsmap. */
+static void
+xfs_fsmap_from_internal(
+ struct fsmap *dest,
+ struct xfs_fsmap *src)
+{
+ dest->fmr_device = src->fmr_device;
+ dest->fmr_flags = src->fmr_flags;
+ dest->fmr_physical = BBTOB(src->fmr_physical);
+ dest->fmr_owner = src->fmr_owner;
+ dest->fmr_offset = BBTOB(src->fmr_offset);
+ dest->fmr_length = BBTOB(src->fmr_length);
+ dest->fmr_reserved[0] = 0;
+ dest->fmr_reserved[1] = 0;
+ dest->fmr_reserved[2] = 0;
+}
+
+/* Convert an fsmap to an xfs_fsmap. */
+void
+xfs_fsmap_to_internal(
+ struct xfs_fsmap *dest,
+ struct fsmap *src)
+{
+ dest->fmr_device = src->fmr_device;
+ dest->fmr_flags = src->fmr_flags;
+ dest->fmr_physical = BTOBBT(src->fmr_physical);
+ dest->fmr_owner = src->fmr_owner;
+ dest->fmr_offset = BTOBBT(src->fmr_offset);
+ dest->fmr_length = BTOBBT(src->fmr_length);
+}
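An editorial note on units: struct fsmap speaks bytes while the internal struct xfs_fsmap uses 512-byte basic blocks, hence the BBTOB()/BTOBBT() calls above. A tiny illustration, assuming the conventional BBSHIFT of 9:

/* Illustrative arithmetic only, not part of the patch above (assumes BBSHIFT == 9). */
#include <stdint.h>

static inline uint64_t bbs_to_bytes(uint64_t bbs)   { return bbs << 9; }
static inline uint64_t bytes_to_bbs(uint64_t bytes) { return bytes >> 9; }
/* e.g. an fmr_physical of 4096 bytes maps to 8 basic blocks, and back. */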
+
+/* Convert an fsmap owner into an rmapbt owner. */
+static int
+xfs_fsmap_owner_to_rmap(
+ struct xfs_rmap_irec *dest,
+ const struct xfs_fsmap *src)
+{
+ if (!(src->fmr_flags & FMR_OF_SPECIAL_OWNER)) {
+ dest->rm_owner = src->fmr_owner;
+ return 0;
+ }
+
+ switch (src->fmr_owner) {
+ case 0: /* "lowest owner id possible" */
+ case -1ULL: /* "highest owner id possible" */
+ dest->rm_owner = 0;
+ break;
+ case XFS_FMR_OWN_FREE:
+ dest->rm_owner = XFS_RMAP_OWN_NULL;
+ break;
+ case XFS_FMR_OWN_UNKNOWN:
+ dest->rm_owner = XFS_RMAP_OWN_UNKNOWN;
+ break;
+ case XFS_FMR_OWN_FS:
+ dest->rm_owner = XFS_RMAP_OWN_FS;
+ break;
+ case XFS_FMR_OWN_LOG:
+ dest->rm_owner = XFS_RMAP_OWN_LOG;
+ break;
+ case XFS_FMR_OWN_AG:
+ dest->rm_owner = XFS_RMAP_OWN_AG;
+ break;
+ case XFS_FMR_OWN_INOBT:
+ dest->rm_owner = XFS_RMAP_OWN_INOBT;
+ break;
+ case XFS_FMR_OWN_INODES:
+ dest->rm_owner = XFS_RMAP_OWN_INODES;
+ break;
+ case XFS_FMR_OWN_REFC:
+ dest->rm_owner = XFS_RMAP_OWN_REFC;
+ break;
+ case XFS_FMR_OWN_COW:
+ dest->rm_owner = XFS_RMAP_OWN_COW;
+ break;
+ case XFS_FMR_OWN_DEFECTIVE: /* not implemented */
+ /* fall through */
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* Convert an rmapbt owner into an fsmap owner. */
+static int
+xfs_fsmap_owner_from_rmap(
+ struct xfs_fsmap *dest,
+ const struct xfs_rmap_irec *src)
+{
+ dest->fmr_flags = 0;
+ if (!XFS_RMAP_NON_INODE_OWNER(src->rm_owner)) {
+ dest->fmr_owner = src->rm_owner;
+ return 0;
+ }
+ dest->fmr_flags |= FMR_OF_SPECIAL_OWNER;
+
+ switch (src->rm_owner) {
+ case XFS_RMAP_OWN_FS:
+ dest->fmr_owner = XFS_FMR_OWN_FS;
+ break;
+ case XFS_RMAP_OWN_LOG:
+ dest->fmr_owner = XFS_FMR_OWN_LOG;
+ break;
+ case XFS_RMAP_OWN_AG:
+ dest->fmr_owner = XFS_FMR_OWN_AG;
+ break;
+ case XFS_RMAP_OWN_INOBT:
+ dest->fmr_owner = XFS_FMR_OWN_INOBT;
+ break;
+ case XFS_RMAP_OWN_INODES:
+ dest->fmr_owner = XFS_FMR_OWN_INODES;
+ break;
+ case XFS_RMAP_OWN_REFC:
+ dest->fmr_owner = XFS_FMR_OWN_REFC;
+ break;
+ case XFS_RMAP_OWN_COW:
+ dest->fmr_owner = XFS_FMR_OWN_COW;
+ break;
+ case XFS_RMAP_OWN_NULL: /* "free" */
+ dest->fmr_owner = XFS_FMR_OWN_FREE;
+ break;
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+/* getfsmap query state */
+struct xfs_getfsmap_info {
+ struct xfs_fsmap_head *head;
+ struct fsmap *fsmap_recs; /* mapping records */
+ struct xfs_buf *agf_bp; /* AGF, for refcount queries */
+ struct xfs_perag *pag; /* AG info, if applicable */
+ xfs_daddr_t next_daddr; /* next daddr we expect */
+ u64 missing_owner; /* owner of holes */
+ u32 dev; /* device id */
+ struct xfs_rmap_irec low; /* low rmap key */
+ struct xfs_rmap_irec high; /* high rmap key */
+ bool last; /* last extent? */
+};
+
+/* Associate a device with a getfsmap handler. */
+struct xfs_getfsmap_dev {
+ u32 dev;
+ int (*fn)(struct xfs_trans *tp,
+ const struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info);
+};
+
+/* Compare two getfsmap device handlers. */
+static int
+xfs_getfsmap_dev_compare(
+ const void *p1,
+ const void *p2)
+{
+ const struct xfs_getfsmap_dev *d1 = p1;
+ const struct xfs_getfsmap_dev *d2 = p2;
+
+ return d1->dev - d2->dev;
+}
+
+/* Decide if this mapping is shared. */
+STATIC int
+xfs_getfsmap_is_shared(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info,
+ const struct xfs_rmap_irec *rec,
+ bool *stat)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *cur;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int error;
+
+ *stat = false;
+ if (!xfs_has_reflink(mp))
+ return 0;
+ /* rt files will have no perag structure */
+ if (!info->pag)
+ return 0;
+
+ /* Are there any shared blocks here? */
+ flen = 0;
+ cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, info->pag);
+
+ error = xfs_refcount_find_shared(cur, rec->rm_startblock,
+ rec->rm_blockcount, &fbno, &flen, false);
+
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ return error;
+
+ *stat = flen > 0;
+ return 0;
+}
+
+static inline void
+xfs_getfsmap_format(
+ struct xfs_mount *mp,
+ struct xfs_fsmap *xfm,
+ struct xfs_getfsmap_info *info)
+{
+ struct fsmap *rec;
+
+ trace_xfs_getfsmap_mapping(mp, xfm);
+
+ rec = &info->fsmap_recs[info->head->fmh_entries++];
+ xfs_fsmap_from_internal(rec, xfm);
+}
+
+/*
+ * Format a reverse mapping for getfsmap, having translated rm_startblock
+ * into the appropriate daddr units.
+ */
+STATIC int
+xfs_getfsmap_helper(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info,
+ const struct xfs_rmap_irec *rec,
+ xfs_daddr_t rec_daddr)
+{
+ struct xfs_fsmap fmr;
+ struct xfs_mount *mp = tp->t_mountp;
+ bool shared;
+ int error;
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+ /*
+ * Filter out records that start before our startpoint, if the
+ * caller requested that.
+ */
+ if (xfs_rmap_compare(rec, &info->low) < 0) {
+ rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (info->next_daddr < rec_daddr)
+ info->next_daddr = rec_daddr;
+ return 0;
+ }
+
+ /* Are we just counting mappings? */
+ if (info->head->fmh_count == 0) {
+ if (info->head->fmh_entries == UINT_MAX)
+ return -ECANCELED;
+
+ if (rec_daddr > info->next_daddr)
+ info->head->fmh_entries++;
+
+ if (info->last)
+ return 0;
+
+ info->head->fmh_entries++;
+
+ rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (info->next_daddr < rec_daddr)
+ info->next_daddr = rec_daddr;
+ return 0;
+ }
+
+ /*
+ * If the record starts past the last physical block we saw,
+ * then we've found a gap. Report the gap as being owned by
+ * whatever the caller specified as the missing owner.
+ */
+ if (rec_daddr > info->next_daddr) {
+ if (info->head->fmh_entries >= info->head->fmh_count)
+ return -ECANCELED;
+
+ fmr.fmr_device = info->dev;
+ fmr.fmr_physical = info->next_daddr;
+ fmr.fmr_owner = info->missing_owner;
+ fmr.fmr_offset = 0;
+ fmr.fmr_length = rec_daddr - info->next_daddr;
+ fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
+ xfs_getfsmap_format(mp, &fmr, info);
+ }
+
+ if (info->last)
+ goto out;
+
+ /* Fill out the extent we found */
+ if (info->head->fmh_entries >= info->head->fmh_count)
+ return -ECANCELED;
+
+ trace_xfs_fsmap_mapping(mp, info->dev,
+ info->pag ? info->pag->pag_agno : NULLAGNUMBER, rec);
+
+ fmr.fmr_device = info->dev;
+ fmr.fmr_physical = rec_daddr;
+ error = xfs_fsmap_owner_from_rmap(&fmr, rec);
+ if (error)
+ return error;
+ fmr.fmr_offset = XFS_FSB_TO_BB(mp, rec->rm_offset);
+ fmr.fmr_length = XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (rec->rm_flags & XFS_RMAP_UNWRITTEN)
+ fmr.fmr_flags |= FMR_OF_PREALLOC;
+ if (rec->rm_flags & XFS_RMAP_ATTR_FORK)
+ fmr.fmr_flags |= FMR_OF_ATTR_FORK;
+ if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ fmr.fmr_flags |= FMR_OF_EXTENT_MAP;
+ if (fmr.fmr_flags == 0) {
+ error = xfs_getfsmap_is_shared(tp, info, rec, &shared);
+ if (error)
+ return error;
+ if (shared)
+ fmr.fmr_flags |= FMR_OF_SHARED;
+ }
+
+ xfs_getfsmap_format(mp, &fmr, info);
+out:
+ rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (info->next_daddr < rec_daddr)
+ info->next_daddr = rec_daddr;
+ return 0;
+}
+
+/* Transform a rmapbt irec into a fsmap */
+STATIC int
+xfs_getfsmap_datadev_helper(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_getfsmap_info *info = priv;
+ xfs_fsblock_t fsb;
+ xfs_daddr_t rec_daddr;
+
+ fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock);
+ rec_daddr = XFS_FSB_TO_DADDR(mp, fsb);
+
+ return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr);
+}
+
+/* Transform a bnobt irec into a fsmap */
+STATIC int
+xfs_getfsmap_datadev_bnobt_helper(
+ struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *rec,
+ void *priv)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_getfsmap_info *info = priv;
+ struct xfs_rmap_irec irec;
+ xfs_daddr_t rec_daddr;
+
+ rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.pag->pag_agno,
+ rec->ar_startblock);
+
+ irec.rm_startblock = rec->ar_startblock;
+ irec.rm_blockcount = rec->ar_blockcount;
+ irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
+ irec.rm_offset = 0;
+ irec.rm_flags = 0;
+
+ return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr);
+}
+
+/* Set rmap flags based on the getfsmap flags */
+static void
+xfs_getfsmap_set_irec_flags(
+ struct xfs_rmap_irec *irec,
+ const struct xfs_fsmap *fmr)
+{
+ irec->rm_flags = 0;
+ if (fmr->fmr_flags & FMR_OF_ATTR_FORK)
+ irec->rm_flags |= XFS_RMAP_ATTR_FORK;
+ if (fmr->fmr_flags & FMR_OF_EXTENT_MAP)
+ irec->rm_flags |= XFS_RMAP_BMBT_BLOCK;
+ if (fmr->fmr_flags & FMR_OF_PREALLOC)
+ irec->rm_flags |= XFS_RMAP_UNWRITTEN;
+}
+
+/* Execute a getfsmap query against the log device. */
+STATIC int
+xfs_getfsmap_logdev(
+ struct xfs_trans *tp,
+ const struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rmap_irec rmap;
+ int error;
+
+ /* Set up search keys */
+ info->low.rm_startblock = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical);
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->low, keys);
+ if (error)
+ return error;
+ info->low.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+ error = xfs_fsmap_owner_to_rmap(&info->high, keys + 1);
+ if (error)
+ return error;
+ info->high.rm_startblock = -1U;
+ info->high.rm_owner = ULLONG_MAX;
+ info->high.rm_offset = ULLONG_MAX;
+ info->high.rm_blockcount = 0;
+ info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
+ info->missing_owner = XFS_FMR_OWN_FREE;
+
+ trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high);
+
+ if (keys[0].fmr_physical > 0)
+ return 0;
+
+ /* Fabricate an rmap entry for the external log device. */
+ rmap.rm_startblock = 0;
+ rmap.rm_blockcount = mp->m_sb.sb_logblocks;
+ rmap.rm_owner = XFS_RMAP_OWN_LOG;
+ rmap.rm_offset = 0;
+ rmap.rm_flags = 0;
+
+ return xfs_getfsmap_helper(tp, info, &rmap, 0);
+}
+
+#ifdef CONFIG_XFS_RT
+/* Transform a rtbitmap "record" into a fsmap */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_helper(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xfs_getfsmap_info *info = priv;
+ struct xfs_rmap_irec irec;
+ xfs_daddr_t rec_daddr;
+
+ irec.rm_startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
+ rec_daddr = XFS_FSB_TO_BB(mp, irec.rm_startblock);
+ irec.rm_blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
+ irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
+ irec.rm_offset = 0;
+ irec.rm_flags = 0;
+
+ return xfs_getfsmap_helper(tp, info, &irec, rec_daddr);
+}
+
+/* Execute a getfsmap query against the realtime device. */
+STATIC int
+__xfs_getfsmap_rtdev(
+ struct xfs_trans *tp,
+ const struct xfs_fsmap *keys,
+ int (*query_fn)(struct xfs_trans *,
+ struct xfs_getfsmap_info *),
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_fsblock_t start_fsb;
+ xfs_fsblock_t end_fsb;
+ uint64_t eofs;
+ int error = 0;
+
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+ if (keys[0].fmr_physical >= eofs)
+ return 0;
+ start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical);
+ end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
+
+ /* Set up search keys */
+ info->low.rm_startblock = start_fsb;
+ error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+ if (error)
+ return error;
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+ info->low.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+ info->high.rm_startblock = end_fsb;
+ error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]);
+ if (error)
+ return error;
+ info->high.rm_offset = XFS_BB_TO_FSBT(mp, keys[1].fmr_offset);
+ info->high.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
+
+ trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high);
+
+ return query_fn(tp, info);
+}
+
+/* Actually query the realtime bitmap. */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_query(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_rtalloc_rec alow = { 0 };
+ struct xfs_rtalloc_rec ahigh = { 0 };
+ struct xfs_mount *mp = tp->t_mountp;
+ int error;
+
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED);
+
+ /*
+ * Set up query parameters to return free rtextents covering the range
+ * we want.
+ */
+ alow.ar_startext = info->low.rm_startblock;
+ ahigh.ar_startext = info->high.rm_startblock;
+ do_div(alow.ar_startext, mp->m_sb.sb_rextsize);
+ if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize))
+ ahigh.ar_startext++;
+ error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh,
+ xfs_getfsmap_rtdev_rtbitmap_helper, info);
+ if (error)
+ goto err;
+
+ /*
+ * Report any gaps at the end of the rtbitmap by simulating a null
+ * rmap starting at the block after the end of the query range.
+ */
+ info->last = true;
+ ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext);
+
+ error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info);
+ if (error)
+ goto err;
+err:
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED);
+ return error;
+}
+
+/* Execute a getfsmap query against the realtime device rtbitmap. */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap(
+ struct xfs_trans *tp,
+ const struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ info->missing_owner = XFS_FMR_OWN_UNKNOWN;
+ return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query,
+ info);
+}
+#endif /* CONFIG_XFS_RT */
+
+/* Execute a getfsmap query against the regular data device. */
+STATIC int
+__xfs_getfsmap_datadev(
+ struct xfs_trans *tp,
+ const struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info,
+ int (*query_fn)(struct xfs_trans *,
+ struct xfs_getfsmap_info *,
+ struct xfs_btree_cur **,
+ void *),
+ void *priv)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *bt_cur = NULL;
+ xfs_fsblock_t start_fsb;
+ xfs_fsblock_t end_fsb;
+ xfs_agnumber_t start_ag;
+ xfs_agnumber_t end_ag;
+ uint64_t eofs;
+ int error = 0;
+
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+ if (keys[0].fmr_physical >= eofs)
+ return 0;
+ start_fsb = XFS_DADDR_TO_FSB(mp, keys[0].fmr_physical);
+ end_fsb = XFS_DADDR_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
+
+ /*
+ * Convert the fsmap low/high keys to AG based keys. Initialize
+ * low to the fsmap low key and max out the high key to the end
+ * of the AG.
+ */
+ info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb);
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+ if (error)
+ return error;
+ info->low.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+ info->high.rm_startblock = -1U;
+ info->high.rm_owner = ULLONG_MAX;
+ info->high.rm_offset = ULLONG_MAX;
+ info->high.rm_blockcount = 0;
+ info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
+
+ start_ag = XFS_FSB_TO_AGNO(mp, start_fsb);
+ end_ag = XFS_FSB_TO_AGNO(mp, end_fsb);
+
+ for_each_perag_range(mp, start_ag, end_ag, pag) {
+ /*
+ * Set the AG high key from the fsmap high key if this
+ * is the last AG that we're querying.
+ */
+ info->pag = pag;
+ if (pag->pag_agno == end_ag) {
+ info->high.rm_startblock = XFS_FSB_TO_AGBNO(mp,
+ end_fsb);
+ info->high.rm_offset = XFS_BB_TO_FSBT(mp,
+ keys[1].fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]);
+ if (error)
+ break;
+ xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
+ }
+
+ if (bt_cur) {
+ xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR);
+ bt_cur = NULL;
+ xfs_trans_brelse(tp, info->agf_bp);
+ info->agf_bp = NULL;
+ }
+
+ error = xfs_alloc_read_agf(pag, tp, 0, &info->agf_bp);
+ if (error)
+ break;
+
+ trace_xfs_fsmap_low_key(mp, info->dev, pag->pag_agno,
+ &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, pag->pag_agno,
+ &info->high);
+
+ error = query_fn(tp, info, &bt_cur, priv);
+ if (error)
+ break;
+
+ /*
+ * Set the AG low key to the start of the AG prior to
+ * moving on to the next AG.
+ */
+ if (pag->pag_agno == start_ag) {
+ info->low.rm_startblock = 0;
+ info->low.rm_owner = 0;
+ info->low.rm_offset = 0;
+ info->low.rm_flags = 0;
+ }
+
+ /*
+ * If this is the last AG, report any gap at the end of it
+ * before we drop the reference to the perag when the loop
+ * terminates.
+ */
+ if (pag->pag_agno == end_ag) {
+ info->last = true;
+ error = query_fn(tp, info, &bt_cur, priv);
+ if (error)
+ break;
+ }
+ info->pag = NULL;
+ }
+
+ if (bt_cur)
+ xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR :
+ XFS_BTREE_NOERROR);
+ if (info->agf_bp) {
+ xfs_trans_brelse(tp, info->agf_bp);
+ info->agf_bp = NULL;
+ }
+ if (info->pag) {
+ xfs_perag_put(info->pag);
+ info->pag = NULL;
+ } else if (pag) {
+ /* loop termination case */
+ xfs_perag_put(pag);
+ }
+
+ return error;
+}
+
+/* Actually query the rmap btree. */
+STATIC int
+xfs_getfsmap_datadev_rmapbt_query(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info,
+ struct xfs_btree_cur **curpp,
+ void *priv)
+{
+ /* Report any gap at the end of the last AG. */
+ if (info->last)
+ return xfs_getfsmap_datadev_helper(*curpp, &info->high, info);
+
+ /* Allocate cursor for this AG and query_range it. */
+ *curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
+ info->pag);
+ return xfs_rmap_query_range(*curpp, &info->low, &info->high,
+ xfs_getfsmap_datadev_helper, info);
+}
+
+/* Execute a getfsmap query against the regular data device rmapbt. */
+STATIC int
+xfs_getfsmap_datadev_rmapbt(
+ struct xfs_trans *tp,
+ const struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ info->missing_owner = XFS_FMR_OWN_FREE;
+ return __xfs_getfsmap_datadev(tp, keys, info,
+ xfs_getfsmap_datadev_rmapbt_query, NULL);
+}
+
+/* Actually query the bno btree. */
+STATIC int
+xfs_getfsmap_datadev_bnobt_query(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info,
+ struct xfs_btree_cur **curpp,
+ void *priv)
+{
+ struct xfs_alloc_rec_incore *key = priv;
+
+ /* Report any gap at the end of the last AG. */
+ if (info->last)
+ return xfs_getfsmap_datadev_bnobt_helper(*curpp, &key[1], info);
+
+ /* Allocate cursor for this AG and query_range it. */
+ *curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
+ info->pag, XFS_BTNUM_BNO);
+ key->ar_startblock = info->low.rm_startblock;
+ key[1].ar_startblock = info->high.rm_startblock;
+ return xfs_alloc_query_range(*curpp, key, &key[1],
+ xfs_getfsmap_datadev_bnobt_helper, info);
+}
+
+/* Execute a getfsmap query against the regular data device's bnobt. */
+STATIC int
+xfs_getfsmap_datadev_bnobt(
+ struct xfs_trans *tp,
+ const struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_alloc_rec_incore akeys[2];
+
+ info->missing_owner = XFS_FMR_OWN_UNKNOWN;
+ return __xfs_getfsmap_datadev(tp, keys, info,
+ xfs_getfsmap_datadev_bnobt_query, &akeys[0]);
+}
+
+/* Do we recognize the device? */
+STATIC bool
+xfs_getfsmap_is_valid_device(
+ struct xfs_mount *mp,
+ struct xfs_fsmap *fm)
+{
+ if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
+ fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev))
+ return true;
+ if (mp->m_logdev_targp &&
+ fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev))
+ return true;
+ if (mp->m_rtdev_targp &&
+ fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev))
+ return true;
+ return false;
+}
+
+/* Ensure that the low key is less than the high key. */
+STATIC bool
+xfs_getfsmap_check_keys(
+ struct xfs_fsmap *low_key,
+ struct xfs_fsmap *high_key)
+{
+ if (low_key->fmr_device > high_key->fmr_device)
+ return false;
+ if (low_key->fmr_device < high_key->fmr_device)
+ return true;
+
+ if (low_key->fmr_physical > high_key->fmr_physical)
+ return false;
+ if (low_key->fmr_physical < high_key->fmr_physical)
+ return true;
+
+ if (low_key->fmr_owner > high_key->fmr_owner)
+ return false;
+ if (low_key->fmr_owner < high_key->fmr_owner)
+ return true;
+
+ if (low_key->fmr_offset > high_key->fmr_offset)
+ return false;
+ if (low_key->fmr_offset < high_key->fmr_offset)
+ return true;
+
+ return false;
+}
+
+/*
+ * There are only two devices if we didn't configure RT devices at build time.
+ */
+#ifdef CONFIG_XFS_RT
+#define XFS_GETFSMAP_DEVS 3
+#else
+#define XFS_GETFSMAP_DEVS 2
+#endif /* CONFIG_XFS_RT */
+
+/*
+ * Get filesystem's extents as described in head, and format for output. Fills
+ * in the supplied records array until there are no more reverse mappings to
+ * return or head.fmh_entries == head.fmh_count. In the second case, this
+ * function returns -ECANCELED to indicate that more records would have been
+ * returned.
+ *
+ * Key to Confusion
+ * ----------------
+ * There are multiple levels of keys and counters at work here:
+ * xfs_fsmap_head.fmh_keys -- low and high fsmap keys passed in;
+ * these reflect fs-wide sector addrs.
+ * dkeys -- fmh_keys used to query each device;
+ * these are fmh_keys but w/ the low key
+ * bumped up by fmr_length.
+ * xfs_getfsmap_info.next_daddr -- next disk addr we expect to see; this
+ * is how we detect gaps in the fsmap
+ * records and report them.
+ * xfs_getfsmap_info.low/high -- per-AG low/high keys computed from
+ * dkeys; used to query the metadata.
+ */
+int
+xfs_getfsmap(
+ struct xfs_mount *mp,
+ struct xfs_fsmap_head *head,
+ struct fsmap *fsmap_recs)
+{
+ struct xfs_trans *tp = NULL;
+ struct xfs_fsmap dkeys[2]; /* per-dev keys */
+ struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS];
+ struct xfs_getfsmap_info info = { NULL };
+ bool use_rmap;
+ int i;
+ int error = 0;
+
+ if (head->fmh_iflags & ~FMH_IF_VALID)
+ return -EINVAL;
+ if (!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[0]) ||
+ !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1]))
+ return -EINVAL;
+
+ use_rmap = xfs_has_rmapbt(mp) &&
+ has_capability_noaudit(current, CAP_SYS_ADMIN);
+ head->fmh_entries = 0;
+
+ /* Set up our device handlers. */
+ memset(handlers, 0, sizeof(handlers));
+ handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
+ if (use_rmap)
+ handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
+ else
+ handlers[0].fn = xfs_getfsmap_datadev_bnobt;
+ if (mp->m_logdev_targp != mp->m_ddev_targp) {
+ handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
+ handlers[1].fn = xfs_getfsmap_logdev;
+ }
+#ifdef CONFIG_XFS_RT
+ if (mp->m_rtdev_targp) {
+ handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
+ handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap;
+ }
+#endif /* CONFIG_XFS_RT */
+
+ xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev),
+ xfs_getfsmap_dev_compare);
+
+ /*
+ * To continue where we left off, we allow userspace to use the
+ * last mapping from a previous call as the low key of the next.
+ * This is identified by a non-zero length in the low key. We
+ * have to increment the low key in this scenario to ensure we
+ * don't return the same mapping again, and instead return the
+ * very next mapping.
+ *
+ * If the low key mapping refers to file data, the same physical
+ * blocks could be mapped to several other files/offsets.
+ * According to rmapbt record ordering, the minimal next
+ * possible record for the block range is the next starting
+ * offset in the same inode. Therefore, bump the file offset to
+ * continue the search appropriately. For all other low key
+ * mapping types (attr blocks, metadata), bump the physical
+ * offset as there can be no other mapping for the same physical
+ * block range.
+ */
+ dkeys[0] = head->fmh_keys[0];
+ if (dkeys[0].fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) {
+ dkeys[0].fmr_physical += dkeys[0].fmr_length;
+ dkeys[0].fmr_owner = 0;
+ if (dkeys[0].fmr_offset)
+ return -EINVAL;
+ } else
+ dkeys[0].fmr_offset += dkeys[0].fmr_length;
+ dkeys[0].fmr_length = 0;
+ memset(&dkeys[1], 0xFF, sizeof(struct xfs_fsmap));
+
+ if (!xfs_getfsmap_check_keys(dkeys, &head->fmh_keys[1]))
+ return -EINVAL;
+
+ info.next_daddr = head->fmh_keys[0].fmr_physical +
+ head->fmh_keys[0].fmr_length;
+ info.fsmap_recs = fsmap_recs;
+ info.head = head;
+
+ /* For each device we support... */
+ for (i = 0; i < XFS_GETFSMAP_DEVS; i++) {
+ /* Is this device within the range the user asked for? */
+ if (!handlers[i].fn)
+ continue;
+ if (head->fmh_keys[0].fmr_device > handlers[i].dev)
+ continue;
+ if (head->fmh_keys[1].fmr_device < handlers[i].dev)
+ break;
+
+ /*
+ * If this device number matches the high key, we have
+ * to pass the high key to the handler to limit the
+ * query results. If the device number exceeds the
+ * low key, zero out the low key so that we get
+ * everything from the beginning.
+ */
+ if (handlers[i].dev == head->fmh_keys[1].fmr_device)
+ dkeys[1] = head->fmh_keys[1];
+ if (handlers[i].dev > head->fmh_keys[0].fmr_device)
+ memset(&dkeys[0], 0, sizeof(struct xfs_fsmap));
+
+ /*
+ * Grab an empty transaction so that we can use its recursive
+ * buffer locking abilities to detect cycles in the rmapbt
+ * without deadlocking.
+ */
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ break;
+
+ info.dev = handlers[i].dev;
+ info.last = false;
+ info.pag = NULL;
+ error = handlers[i].fn(tp, dkeys, &info);
+ if (error)
+ break;
+ xfs_trans_cancel(tp);
+ tp = NULL;
+ info.next_daddr = 0;
+ }
+
+ if (tp)
+ xfs_trans_cancel(tp);
+ head->fmh_oflags = FMH_OF_DEV_T;
+ return error;
+}
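For context, not part of the patch: the continuation convention described in the comment above, where the last returned mapping becomes the next low key, is what the uapi helper fsmap_advance() implements for userspace. A hedged sketch of a GETFSMAP loop, assuming the usual <linux/fsmap.h> definitions:

/* Illustrative userspace sketch, not part of the patch above. */
#include <linux/fsmap.h>	/* struct fsmap_head, FS_IOC_GETFSMAP, fsmap_advance() */
#include <sys/ioctl.h>
#include <stdint.h>
#include <stdlib.h>

int walk_fsmap(int fd)
{
	const unsigned int nr = 128;
	struct fsmap_head *head = calloc(1, fsmap_sizeof(nr));
	int ret;

	if (!head)
		return -1;
	head->fmh_count = nr;
	head->fmh_keys[1].fmr_device = UINT32_MAX;	/* query everything */
	head->fmh_keys[1].fmr_physical = UINT64_MAX;
	head->fmh_keys[1].fmr_owner = UINT64_MAX;
	head->fmh_keys[1].fmr_offset = UINT64_MAX;
	head->fmh_keys[1].fmr_flags = UINT32_MAX;

	do {
		ret = ioctl(fd, FS_IOC_GETFSMAP, head);
		if (ret || head->fmh_entries == 0)
			break;
		/* ... consume head->fmh_recs[0 .. fmh_entries - 1] ... */
		fsmap_advance(head);	/* last record becomes the new low key */
	} while (head->fmh_entries == head->fmh_count);

	free(head);
	return ret;
}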
diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h
new file mode 100644
index 000000000..a0775788e
--- /dev/null
+++ b/fs/xfs/xfs_fsmap.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_FSMAP_H__
+#define __XFS_FSMAP_H__
+
+struct fsmap;
+
+/* internal fsmap representation */
+struct xfs_fsmap {
+ dev_t fmr_device; /* device id */
+ uint32_t fmr_flags; /* mapping flags */
+ uint64_t fmr_physical; /* device offset of segment */
+ uint64_t fmr_owner; /* owner id */
+ xfs_fileoff_t fmr_offset; /* file offset of segment */
+ xfs_filblks_t fmr_length; /* length of segment, blocks */
+};
+
+struct xfs_fsmap_head {
+ uint32_t fmh_iflags; /* control flags */
+ uint32_t fmh_oflags; /* output flags */
+ unsigned int fmh_count; /* # of entries in array incl. input */
+ unsigned int fmh_entries; /* # of entries filled in (output). */
+
+ struct xfs_fsmap fmh_keys[2]; /* low and high keys */
+};
+
+void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src);
+
+int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head,
+ struct fsmap *out_recs);
+
+#endif /* __XFS_FSMAP_H__ */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
new file mode 100644
index 000000000..13851c0d6
--- /dev/null
+++ b/fs/xfs/xfs_fsops.c
@@ -0,0 +1,604 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_error.h"
+#include "xfs_alloc.h"
+#include "xfs_fsops.h"
+#include "xfs_trans_space.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_trace.h"
+
+/*
+ * Write new AG headers to disk. Non-transactional, but need to be
+ * written and completed prior to the growfs transaction being logged.
+ * To do this, we use a delayed write buffer list and wait for
+ * submission and IO completion of the list as a whole. This allows the
+ * IO subsystem to merge all the AG headers in a single AG into a single
+ * IO and hide most of the latency of the IO from us.
+ *
+ * This also means that if we get an error whilst building the buffer
+ * list to write, we can cancel the entire list without having written
+ * anything.
+ */
+static int
+xfs_resizefs_init_new_ags(
+ struct xfs_trans *tp,
+ struct aghdr_init_data *id,
+ xfs_agnumber_t oagcount,
+ xfs_agnumber_t nagcount,
+ xfs_rfsblock_t delta,
+ struct xfs_perag *last_pag,
+ bool *lastag_extended)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_rfsblock_t nb = mp->m_sb.sb_dblocks + delta;
+ int error;
+
+ *lastag_extended = false;
+
+ INIT_LIST_HEAD(&id->buffer_list);
+ for (id->agno = nagcount - 1;
+ id->agno >= oagcount;
+ id->agno--, delta -= id->agsize) {
+
+ if (id->agno == nagcount - 1)
+ id->agsize = nb - (id->agno *
+ (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
+ else
+ id->agsize = mp->m_sb.sb_agblocks;
+
+ error = xfs_ag_init_headers(mp, id);
+ if (error) {
+ xfs_buf_delwri_cancel(&id->buffer_list);
+ return error;
+ }
+ }
+
+ error = xfs_buf_delwri_submit(&id->buffer_list);
+ if (error)
+ return error;
+
+ if (delta) {
+ *lastag_extended = true;
+ error = xfs_ag_extend_space(last_pag, tp, delta);
+ }
+ return error;
+}
+
+/*
+ * growfs operations
+ */
+static int
+xfs_growfs_data_private(
+ struct xfs_mount *mp, /* mount point for filesystem */
+ struct xfs_growfs_data *in) /* growfs data input struct */
+{
+ struct xfs_buf *bp;
+ int error;
+ xfs_agnumber_t nagcount;
+ xfs_agnumber_t nagimax = 0;
+ xfs_rfsblock_t nb, nb_div, nb_mod;
+ int64_t delta;
+ bool lastag_extended;
+ xfs_agnumber_t oagcount;
+ struct xfs_trans *tp;
+ struct aghdr_init_data id = {};
+ struct xfs_perag *last_pag;
+
+ nb = in->newblocks;
+ error = xfs_sb_validate_fsb_count(&mp->m_sb, nb);
+ if (error)
+ return error;
+
+ if (nb > mp->m_sb.sb_dblocks) {
+ error = xfs_buf_read_uncached(mp->m_ddev_targp,
+ XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+ if (error)
+ return error;
+ xfs_buf_relse(bp);
+ }
+
+ nb_div = nb;
+ nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks);
+ nagcount = nb_div + (nb_mod != 0);
+ if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
+ nagcount--;
+ nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
+ }
+ delta = nb - mp->m_sb.sb_dblocks;
+ /*
+ * Reject filesystems with a single AG because they are not
+ * supported, and reject a shrink operation that would cause a
+ * filesystem to become unsupported.
+ */
+ if (delta < 0 && nagcount < 2)
+ return -EINVAL;
+
+ oagcount = mp->m_sb.sb_agcount;
+ /* allocate the new per-ag structures */
+ if (nagcount > oagcount) {
+ error = xfs_initialize_perag(mp, nagcount, nb, &nagimax);
+ if (error)
+ return error;
+ } else if (nagcount < oagcount) {
+ /* TODO: shrinking the filesystem by removing whole AGs is not yet supported */
+ return -EINVAL;
+ }
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
+ (delta > 0 ? XFS_GROWFS_SPACE_RES(mp) : -delta), 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ last_pag = xfs_perag_get(mp, oagcount - 1);
+ if (delta > 0) {
+ error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount,
+ delta, last_pag, &lastag_extended);
+ } else {
+ xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK,
+ "EXPERIMENTAL online shrink feature in use. Use at your own risk!");
+
+ error = xfs_ag_shrink_space(last_pag, &tp, -delta);
+ }
+ xfs_perag_put(last_pag);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * Update changed superblock fields transactionally. These are not
+ * seen by the rest of the world until the transaction commit applies
+ * them atomically to the superblock.
+ */
+ if (nagcount > oagcount)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
+ if (delta)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, delta);
+ if (id.nfree)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);
+
+ /*
+ * Sync sb counters now to reflect the updated values. This is
+ * particularly important for shrink because the write verifier
+ * will fail if sb_fdblocks is ever larger than sb_dblocks.
+ */
+ if (xfs_has_lazysbcount(mp))
+ xfs_log_sb(tp);
+
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp);
+ if (error)
+ return error;
+
+ /* New allocation groups fully initialized, so update mount struct */
+ if (nagimax)
+ mp->m_maxagi = nagimax;
+ xfs_set_low_space_thresholds(mp);
+ mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
+
+ if (delta > 0) {
+ /*
+ * If we expanded the last AG, free the per-AG reservation
+ * so we can reinitialize it with the new size.
+ */
+ if (lastag_extended) {
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, id.agno);
+ error = xfs_ag_resv_free(pag);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+ }
+ /*
+ * Reserve AG metadata blocks. ENOSPC here does not mean there
+ * was a growfs failure, just that there still isn't space for
+ * new user data after the grow has been run.
+ */
+ error = xfs_fs_reserve_ag_blocks(mp);
+ if (error == -ENOSPC)
+ error = 0;
+ }
+ return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+static int
+xfs_growfs_log_private(
+ struct xfs_mount *mp, /* mount point for filesystem */
+ struct xfs_growfs_log *in) /* growfs log input struct */
+{
+ xfs_extlen_t nb;
+
+ nb = in->newblocks;
+ if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
+ return -EINVAL;
+ if (nb == mp->m_sb.sb_logblocks &&
+ in->isint == (mp->m_sb.sb_logstart != 0))
+ return -EINVAL;
+ /*
+ * Moving the log is hard: we would need new interfaces to sync the
+ * log first and to hold off all activity while moving it. We could
+ * end up with a shorter or longer log in the same space, or transform
+ * an internal log into an external one (or vice versa).
+ */
+ return -ENOSYS;
+}
+
+static int
+xfs_growfs_imaxpct(
+ struct xfs_mount *mp,
+ __u32 imaxpct)
+{
+ struct xfs_trans *tp;
+ int dpct;
+ int error;
+
+ if (imaxpct > 100)
+ return -EINVAL;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
+ XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ dpct = imaxpct - mp->m_sb.sb_imax_pct;
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+ xfs_trans_set_sync(tp);
+ return xfs_trans_commit(tp);
+}
+
+/*
+ * Protected versions of the growfs functions that acquire and release locks
+ * on the mount point. These are exported through the XFS_IOC_FSGROWFSDATA,
+ * XFS_IOC_FSGROWFSLOG and XFS_IOC_FSGROWFSRT ioctls.
+ */
+int
+xfs_growfs_data(
+ struct xfs_mount *mp,
+ struct xfs_growfs_data *in)
+{
+ int error = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (!mutex_trylock(&mp->m_growlock))
+ return -EWOULDBLOCK;
+
+ /* update imaxpct separately from the physical grow of the filesystem */
+ if (in->imaxpct != mp->m_sb.sb_imax_pct) {
+ error = xfs_growfs_imaxpct(mp, in->imaxpct);
+ if (error)
+ goto out_error;
+ }
+
+ if (in->newblocks != mp->m_sb.sb_dblocks) {
+ error = xfs_growfs_data_private(mp, in);
+ if (error)
+ goto out_error;
+ }
+
+ /* Post growfs calculations needed to reflect new state in operations */
+ if (mp->m_sb.sb_imax_pct) {
+ uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
+ do_div(icount, 100);
+ M_IGEO(mp)->maxicount = XFS_FSB_TO_INO(mp, icount);
+ } else
+ M_IGEO(mp)->maxicount = 0;
+
+ /* Update secondary superblocks now the physical grow has completed */
+ error = xfs_update_secondary_sbs(mp);
+
+out_error:
+ /*
+ * Increment the generation unconditionally; the error could be from
+ * updating the secondary superblocks, in which case the new size
+ * is live already.
+ */
+ mp->m_generation++;
+ mutex_unlock(&mp->m_growlock);
+ return error;
+}
+
+int
+xfs_growfs_log(
+ xfs_mount_t *mp,
+ struct xfs_growfs_log *in)
+{
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (!mutex_trylock(&mp->m_growlock))
+ return -EWOULDBLOCK;
+ error = xfs_growfs_log_private(mp, in);
+ mutex_unlock(&mp->m_growlock);
+ return error;
+}
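For context, the data-section grow above is reached from userspace through XFS_IOC_FSGROWFSDATA; xfs_growfs(8) is the usual front end. Below is a minimal, hypothetical sketch of such a caller, assuming the xfsprogs development headers (<xfs/xfs.h>) supply the ioctl numbers and the struct xfs_growfs_data / struct xfs_fsop_geom layouts shown in xfs_fsops.h.

/* Hypothetical userspace sketch -- not part of this patch. */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>	/* assumed to supply the XFS_IOC_* definitions */

int main(int argc, char **argv)
{
	struct xfs_fsop_geom geo = { 0 };
	struct xfs_growfs_data in = { 0 };
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <mountpoint> <newblocks>\n", argv[0]);
		return 1;
	}

	/* Any fd on the filesystem works; the mount point is customary. */
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/*
	 * Preserve the current max inode percentage: xfs_growfs_data()
	 * treats a differing imaxpct as a separate change request.
	 */
	if (ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) < 0) {
		perror("XFS_IOC_FSGEOMETRY");
		return 1;
	}

	in.newblocks = strtoull(argv[2], NULL, 0);	/* filesystem blocks */
	in.imaxpct = geo.imaxpct;

	/* Needs CAP_SYS_ADMIN; serialised against other growfs calls. */
	if (ioctl(fd, XFS_IOC_FSGROWFSDATA, &in) < 0) {
		perror("XFS_IOC_FSGROWFSDATA");
		return 1;
	}
	close(fd);
	return 0;
}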
+
+/*
+ * exported through ioctl XFS_IOC_FSCOUNTS
+ */
+
+void
+xfs_fs_counts(
+ xfs_mount_t *mp,
+ xfs_fsop_counts_t *cnt)
+{
+ cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
+ cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
+ cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
+ xfs_fdblocks_unavailable(mp);
+ cnt->freertx = percpu_counter_read_positive(&mp->m_frextents);
+}
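For completeness, a hypothetical userspace reader of these counters; it assumes the xfsprogs headers export XFS_IOC_FSCOUNTS and struct xfs_fsop_counts with the fields filled in by xfs_fs_counts() above.

/* Hypothetical userspace sketch -- not part of this patch. */
#include <stdio.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>

static int print_fs_counts(int fd)
{
	struct xfs_fsop_counts cnt;

	if (ioctl(fd, XFS_IOC_FSCOUNTS, &cnt) < 0) {
		perror("XFS_IOC_FSCOUNTS");
		return -1;
	}
	/* freedata is in filesystem blocks, freertx in realtime extents. */
	printf("free data blocks: %llu\n", (unsigned long long)cnt.freedata);
	printf("free rt extents:  %llu\n", (unsigned long long)cnt.freertx);
	printf("allocated inodes: %llu\n", (unsigned long long)cnt.allocino);
	printf("free inodes:      %llu\n", (unsigned long long)cnt.freeino);
	return 0;
}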
+
+/*
+ * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
+ *
+ * xfs_reserve_blocks is called to set m_resblks
+ * in the in-core mount table. The number of unused reserved blocks
+ * is kept in m_resblks_avail.
+ *
+ * Reserve the requested number of blocks if available. Otherwise reserve
+ * as many as possible to satisfy the request. The actual number
+ * reserved is returned in outval.
+ *
+ * A null inval pointer indicates that only the currently available
+ * reserved blocks should be reported; no settings are changed.
+ */
+
+int
+xfs_reserve_blocks(
+ xfs_mount_t *mp,
+ uint64_t *inval,
+ xfs_fsop_resblks_t *outval)
+{
+ int64_t lcounter, delta;
+ int64_t fdblks_delta = 0;
+ uint64_t request;
+ int64_t free;
+ int error = 0;
+
+ /* If inval is null, report current values and return */
+ if (inval == (uint64_t *)NULL) {
+ if (!outval)
+ return -EINVAL;
+ outval->resblks = mp->m_resblks;
+ outval->resblks_avail = mp->m_resblks_avail;
+ return 0;
+ }
+
+ request = *inval;
+
+ /*
+ * With per-cpu counters, this becomes an interesting problem. We need
+ * to work out whether we are freeing or allocating blocks first, then we
+ * can do the modification as necessary.
+ *
+ * We do this under the m_sb_lock so that if we are near ENOSPC, we will
+ * hold out any changes while we work out what to do. This means that
+ * the amount of free space can change while we do this, so we need to
+ * retry if we end up trying to reserve more space than is available.
+ */
+ spin_lock(&mp->m_sb_lock);
+
+ /*
+ * If our previous reservation was larger than the current value,
+ * then move any unused blocks back to the free pool. Modify the resblks
+ * counters directly since we shouldn't have any problems unreserving
+ * space.
+ */
+ if (mp->m_resblks > request) {
+ lcounter = mp->m_resblks_avail - request;
+ if (lcounter > 0) { /* release unused blocks */
+ fdblks_delta = lcounter;
+ mp->m_resblks_avail -= lcounter;
+ }
+ mp->m_resblks = request;
+ if (fdblks_delta) {
+ spin_unlock(&mp->m_sb_lock);
+ error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
+ spin_lock(&mp->m_sb_lock);
+ }
+
+ goto out;
+ }
+
+ /*
+ * If the request is larger than the current reservation, reserve the
+ * blocks before we update the reserve counters. Sample m_fdblocks and
+ * perform a partial reservation if the request exceeds free space.
+ *
+ * The code below estimates how many blocks it can request from
+ * fdblocks to stash in the reserve pool. This is a classic TOCTOU
+ * race since fdblocks updates are not always coordinated via
+ * m_sb_lock. Set the reserve size even if there's not enough free
+ * space to fill it because mod_fdblocks will refill an undersized
+ * reserve when it can.
+ */
+ free = percpu_counter_sum(&mp->m_fdblocks) -
+ xfs_fdblocks_unavailable(mp);
+ delta = request - mp->m_resblks;
+ mp->m_resblks = request;
+ if (delta > 0 && free > 0) {
+ /*
+ * We'll either succeed in getting space from the free block
+ * count or we'll get an ENOSPC. Don't set the reserved flag
+ * here - we don't want to reserve the extra reserve blocks
+ * from the reserve.
+ *
+ * The desired reserve size can change after we drop the lock.
+ * Use mod_fdblocks to put the space into the reserve or into
+ * fdblocks as appropriate.
+ */
+ fdblks_delta = min(free, delta);
+ spin_unlock(&mp->m_sb_lock);
+ error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
+ if (!error)
+ xfs_mod_fdblocks(mp, fdblks_delta, 0);
+ spin_lock(&mp->m_sb_lock);
+ }
+out:
+ if (outval) {
+ outval->resblks = mp->m_resblks;
+ outval->resblks_avail = mp->m_resblks_avail;
+ }
+
+ spin_unlock(&mp->m_sb_lock);
+ return error;
+}
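The reserve pool is driven from userspace with the two ioctls named above; a hedged sketch follows, again assuming the xfsprogs headers provide XFS_IOC_GET_RESBLKS, XFS_IOC_SET_RESBLKS and struct xfs_fsop_resblks.

/* Hypothetical userspace sketch -- not part of this patch. */
#include <stdio.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>

static int resize_reserve_pool(int fd, unsigned long long nblocks)
{
	struct xfs_fsop_resblks res = { 0 };

	/* Query only -- the "null inval" path in xfs_reserve_blocks(). */
	if (ioctl(fd, XFS_IOC_GET_RESBLKS, &res) < 0)
		return -1;
	printf("reserved %llu, unused %llu\n",
	       (unsigned long long)res.resblks,
	       (unsigned long long)res.resblks_avail);

	/* Request a new pool size; the kernel reports what it granted. */
	res.resblks = nblocks;
	if (ioctl(fd, XFS_IOC_SET_RESBLKS, &res) < 0)	/* needs CAP_SYS_ADMIN */
		return -1;
	printf("now reserved %llu, unused %llu\n",
	       (unsigned long long)res.resblks,
	       (unsigned long long)res.resblks_avail);
	return 0;
}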
+
+int
+xfs_fs_goingdown(
+ xfs_mount_t *mp,
+ uint32_t inflags)
+{
+ switch (inflags) {
+ case XFS_FSOP_GOING_FLAGS_DEFAULT: {
+ if (!freeze_bdev(mp->m_super->s_bdev)) {
+ xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
+ thaw_bdev(mp->m_super->s_bdev);
+ }
+ break;
+ }
+ case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
+ xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
+ break;
+ case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
+ xfs_force_shutdown(mp,
+ SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
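A hedged sketch of how this is reached from userspace (xfs_io's shutdown command does essentially this), assuming XFS_IOC_GOINGDOWN and the flag values used in the switch above are available from <xfs/xfs.h>.

/* Hypothetical userspace sketch -- not part of this patch. */
#include <stdint.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>

static int shutdown_fs(int fd, int flush_log)
{
	uint32_t flags = flush_log ? XFS_FSOP_GOING_FLAGS_LOGFLUSH :
				     XFS_FSOP_GOING_FLAGS_NOLOGFLUSH;

	/* The filesystem stays shut down until it is unmounted. */
	return ioctl(fd, XFS_IOC_GOINGDOWN, &flags);
}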
+
+/*
+ * Force a shutdown of the filesystem instantly while keeping the filesystem
+ * consistent. We don't do an unmount here; just shutdown the shop, make sure
+ * that absolutely nothing persistent happens to this filesystem after this
+ * point.
+ *
+ * The shutdown state change is atomic, so only the first shutdown call
+ * actually processes the shutdown. This means we shut down the log exactly
+ * once, as required, and we don't spam the logs when multiple concurrent
+ * shutdowns race to set the shutdown flags.
+ */
+void
+xfs_do_force_shutdown(
+ struct xfs_mount *mp,
+ uint32_t flags,
+ char *fname,
+ int lnnum)
+{
+ int tag;
+ const char *why;
+
+ if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) {
+ xlog_shutdown_wait(mp->m_log);
+ return;
+ }
+ if (mp->m_sb_bp)
+ mp->m_sb_bp->b_flags |= XBF_DONE;
+
+ if (flags & SHUTDOWN_FORCE_UMOUNT)
+ xfs_alert(mp, "User initiated shutdown received.");
+
+ if (xlog_force_shutdown(mp->m_log, flags)) {
+ tag = XFS_PTAG_SHUTDOWN_LOGERROR;
+ why = "Log I/O Error";
+ } else if (flags & SHUTDOWN_CORRUPT_INCORE) {
+ tag = XFS_PTAG_SHUTDOWN_CORRUPT;
+ why = "Corruption of in-memory data";
+ } else if (flags & SHUTDOWN_CORRUPT_ONDISK) {
+ tag = XFS_PTAG_SHUTDOWN_CORRUPT;
+ why = "Corruption of on-disk metadata";
+ } else {
+ tag = XFS_PTAG_SHUTDOWN_IOERROR;
+ why = "Metadata I/O Error";
+ }
+
+ trace_xfs_force_shutdown(mp, tag, flags, fname, lnnum);
+
+ xfs_alert_tag(mp, tag,
+"%s (0x%x) detected at %pS (%s:%d). Shutting down filesystem.",
+ why, flags, __return_address, fname, lnnum);
+ xfs_alert(mp,
+ "Please unmount the filesystem and rectify the problem(s)");
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+}
+
+/*
+ * Reserve free space for per-AG metadata.
+ */
+int
+xfs_fs_reserve_ag_blocks(
+ struct xfs_mount *mp)
+{
+ xfs_agnumber_t agno;
+ struct xfs_perag *pag;
+ int error = 0;
+ int err2;
+
+ mp->m_finobt_nores = false;
+ for_each_perag(mp, agno, pag) {
+ err2 = xfs_ag_resv_init(pag, NULL);
+ if (err2 && !error)
+ error = err2;
+ }
+
+ if (error && error != -ENOSPC) {
+ xfs_warn(mp,
+ "Error %d reserving per-AG metadata reserve pool.", error);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+
+ return error;
+}
+
+/*
+ * Free space reserved for per-AG metadata.
+ */
+int
+xfs_fs_unreserve_ag_blocks(
+ struct xfs_mount *mp)
+{
+ xfs_agnumber_t agno;
+ struct xfs_perag *pag;
+ int error = 0;
+ int err2;
+
+ for_each_perag(mp, agno, pag) {
+ err2 = xfs_ag_resv_free(pag);
+ if (err2 && !error)
+ error = err2;
+ }
+
+ if (error)
+ xfs_warn(mp,
+ "Error %d freeing per-AG metadata reserve pool.", error);
+
+ return error;
+}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
new file mode 100644
index 000000000..2cffe51a3
--- /dev/null
+++ b/fs/xfs/xfs_fsops.h
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_FSOPS_H__
+#define __XFS_FSOPS_H__
+
+extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
+extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
+extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
+extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval,
+ xfs_fsop_resblks_t *outval);
+extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags);
+
+extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
+extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
+
+#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
new file mode 100644
index 000000000..4d0a98f92
--- /dev/null
+++ b/fs/xfs/xfs_globals.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+
+/*
+ * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n
+ * because other XFS code uses these values. Times are measured in centisecs
+ * (i.e. 100ths of a second) with the exception of blockgc_timer, which is
+ * measured in seconds.
+ */
+xfs_param_t xfs_params = {
+ /* MIN DFLT MAX */
+ .sgid_inherit = { 0, 0, 1 },
+ .symlink_mode = { 0, 0, 1 },
+ .panic_mask = { 0, 0, 256 },
+ .error_level = { 0, 3, 11 },
+ .syncd_timer = { 1*100, 30*100, 7200*100},
+ .stats_clear = { 0, 0, 1 },
+ .inherit_sync = { 0, 1, 1 },
+ .inherit_nodump = { 0, 1, 1 },
+ .inherit_noatim = { 0, 1, 1 },
+ .xfs_buf_timer = { 100/2, 1*100, 30*100 },
+ .xfs_buf_age = { 1*100, 15*100, 7200*100},
+ .inherit_nosym = { 0, 0, 1 },
+ .rotorstep = { 1, 1, 255 },
+ .inherit_nodfrg = { 0, 1, 1 },
+ .fstrm_timer = { 1, 30*100, 3600*100},
+ .blockgc_timer = { 1, 300, 3600*24},
+};
+
+struct xfs_globals xfs_globals = {
+ .log_recovery_delay = 0, /* no delay by default */
+ .mount_delay = 0, /* no delay by default */
+#ifdef XFS_ASSERT_FATAL
+ .bug_on_assert = true, /* assert failures BUG() */
+#else
+ .bug_on_assert = false, /* assert failures WARN() */
+#endif
+#ifdef DEBUG
+ .pwork_threads = -1, /* automatic thread detection */
+ .larp = false, /* log attribute replay */
+#endif
+};
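These tunables are normally exposed as sysctls under /proc/sys/fs/xfs/ by xfs_sysctl.c, which is not part of this hunk, so the names used below (error_level, xfssyncd_centisecs) are assumptions based on the conventional sysctl names. A minimal sketch of reading one:

/* Hypothetical userspace sketch -- not part of this patch. */
#include <stdio.h>

static long read_xfs_sysctl(const char *name)
{
	char path[128];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/fs/xfs/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

With the defaults in xfs_params above, read_xfs_sysctl("error_level") would be expected to return 3, and xfssyncd_centisecs 3000, i.e. 30 seconds given the centisecond units described in the comment.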
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
new file mode 100644
index 000000000..72a075bb2
--- /dev/null
+++ b/fs/xfs/xfs_health.c
@@ -0,0 +1,395 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trace.h"
+#include "xfs_health.h"
+#include "xfs_ag.h"
+
+/*
+ * Warn about metadata corruption that we detected but haven't fixed, and
+ * make sure we're not sitting on anything that would get in the way of
+ * recovery.
+ */
+void
+xfs_health_unmount(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ unsigned int sick = 0;
+ unsigned int checked = 0;
+ bool warn = false;
+
+ if (xfs_is_shutdown(mp))
+ return;
+
+ /* Measure AG corruption levels. */
+ for_each_perag(mp, agno, pag) {
+ xfs_ag_measure_sickness(pag, &sick, &checked);
+ if (sick) {
+ trace_xfs_ag_unfixed_corruption(mp, agno, sick);
+ warn = true;
+ }
+ }
+
+ /* Measure realtime volume corruption levels. */
+ xfs_rt_measure_sickness(mp, &sick, &checked);
+ if (sick) {
+ trace_xfs_rt_unfixed_corruption(mp, sick);
+ warn = true;
+ }
+
+ /*
+ * Measure fs corruption and keep the sample around for the warning.
+ * See the note below for why we exempt FS_COUNTERS.
+ */
+ xfs_fs_measure_sickness(mp, &sick, &checked);
+ if (sick & ~XFS_SICK_FS_COUNTERS) {
+ trace_xfs_fs_unfixed_corruption(mp, sick);
+ warn = true;
+ }
+
+ if (warn) {
+ xfs_warn(mp,
+"Uncorrected metadata errors detected; please run xfs_repair.");
+
+ /*
+ * We discovered uncorrected metadata problems at some point
+ * during this filesystem mount and have advised the
+ * administrator to run repair once the unmount completes.
+ *
+ * However, we must be careful -- when FSCOUNTERS are flagged
+ * unhealthy, the unmount procedure omits writing the clean
+ * unmount record to the log so that the next mount will run
+ * recovery and recompute the summary counters. In other
+ * words, we leave a dirty log to get the counters fixed.
+ *
+ * Unfortunately, xfs_repair cannot recover dirty logs, so if
+ * there were filesystem problems, FSCOUNTERS was flagged, and
+ * the administrator takes our advice to run xfs_repair,
+ * they'll have to zap the log before repairing structures.
+ * We don't really want to encourage this, so we mark the
+ * FSCOUNTERS healthy so that a subsequent repair run won't see
+ * a dirty log.
+ */
+ if (sick & XFS_SICK_FS_COUNTERS)
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
+ }
+}
+
+/* Mark unhealthy per-fs metadata. */
+void
+xfs_fs_mark_sick(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+ trace_xfs_fs_mark_sick(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_fs_sick |= mask;
+ mp->m_fs_checked |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark per-fs metadata healed. */
+void
+xfs_fs_mark_healthy(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+ trace_xfs_fs_mark_healthy(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_fs_sick &= ~mask;
+ mp->m_fs_checked |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Sample which per-fs metadata are unhealthy. */
+void
+xfs_fs_measure_sickness(
+ struct xfs_mount *mp,
+ unsigned int *sick,
+ unsigned int *checked)
+{
+ spin_lock(&mp->m_sb_lock);
+ *sick = mp->m_fs_sick;
+ *checked = mp->m_fs_checked;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark unhealthy realtime metadata. */
+void
+xfs_rt_mark_sick(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+ trace_xfs_rt_mark_sick(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_rt_sick |= mask;
+ mp->m_rt_checked |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark realtime metadata healed. */
+void
+xfs_rt_mark_healthy(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+ trace_xfs_rt_mark_healthy(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_rt_sick &= ~mask;
+ mp->m_rt_checked |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Sample which realtime metadata are unhealthy. */
+void
+xfs_rt_measure_sickness(
+ struct xfs_mount *mp,
+ unsigned int *sick,
+ unsigned int *checked)
+{
+ spin_lock(&mp->m_sb_lock);
+ *sick = mp->m_rt_sick;
+ *checked = mp->m_rt_checked;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark unhealthy per-ag metadata. */
+void
+xfs_ag_mark_sick(
+ struct xfs_perag *pag,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+ trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask);
+
+ spin_lock(&pag->pag_state_lock);
+ pag->pag_sick |= mask;
+ pag->pag_checked |= mask;
+ spin_unlock(&pag->pag_state_lock);
+}
+
+/* Mark per-ag metadata ok. */
+void
+xfs_ag_mark_healthy(
+ struct xfs_perag *pag,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+ trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask);
+
+ spin_lock(&pag->pag_state_lock);
+ pag->pag_sick &= ~mask;
+ pag->pag_checked |= mask;
+ spin_unlock(&pag->pag_state_lock);
+}
+
+/* Sample which per-ag metadata are unhealthy. */
+void
+xfs_ag_measure_sickness(
+ struct xfs_perag *pag,
+ unsigned int *sick,
+ unsigned int *checked)
+{
+ spin_lock(&pag->pag_state_lock);
+ *sick = pag->pag_sick;
+ *checked = pag->pag_checked;
+ spin_unlock(&pag->pag_state_lock);
+}
+
+/* Mark the unhealthy parts of an inode. */
+void
+xfs_inode_mark_sick(
+ struct xfs_inode *ip,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+ trace_xfs_inode_mark_sick(ip, mask);
+
+ spin_lock(&ip->i_flags_lock);
+ ip->i_sick |= mask;
+ ip->i_checked |= mask;
+ spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Keep this inode around so we don't lose the sickness report. Scrub
+ * grabs inodes with DONTCACHE assuming that most inodes are ok, which
+ * is not the case here.
+ */
+ spin_lock(&VFS_I(ip)->i_lock);
+ VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ spin_unlock(&VFS_I(ip)->i_lock);
+}
+
+/* Mark parts of an inode healed. */
+void
+xfs_inode_mark_healthy(
+ struct xfs_inode *ip,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+ trace_xfs_inode_mark_healthy(ip, mask);
+
+ spin_lock(&ip->i_flags_lock);
+ ip->i_sick &= ~mask;
+ ip->i_checked |= mask;
+ spin_unlock(&ip->i_flags_lock);
+}
+
+/* Sample which parts of an inode are unhealthy. */
+void
+xfs_inode_measure_sickness(
+ struct xfs_inode *ip,
+ unsigned int *sick,
+ unsigned int *checked)
+{
+ spin_lock(&ip->i_flags_lock);
+ *sick = ip->i_sick;
+ *checked = ip->i_checked;
+ spin_unlock(&ip->i_flags_lock);
+}
+
+/* Mappings between internal sick masks and ioctl sick masks. */
+
+struct ioctl_sick_map {
+ unsigned int sick_mask;
+ unsigned int ioctl_mask;
+};
+
+static const struct ioctl_sick_map fs_map[] = {
+ { XFS_SICK_FS_COUNTERS, XFS_FSOP_GEOM_SICK_COUNTERS},
+ { XFS_SICK_FS_UQUOTA, XFS_FSOP_GEOM_SICK_UQUOTA },
+ { XFS_SICK_FS_GQUOTA, XFS_FSOP_GEOM_SICK_GQUOTA },
+ { XFS_SICK_FS_PQUOTA, XFS_FSOP_GEOM_SICK_PQUOTA },
+ { 0, 0 },
+};
+
+static const struct ioctl_sick_map rt_map[] = {
+ { XFS_SICK_RT_BITMAP, XFS_FSOP_GEOM_SICK_RT_BITMAP },
+ { XFS_SICK_RT_SUMMARY, XFS_FSOP_GEOM_SICK_RT_SUMMARY },
+ { 0, 0 },
+};
+
+static inline void
+xfgeo_health_tick(
+ struct xfs_fsop_geom *geo,
+ unsigned int sick,
+ unsigned int checked,
+ const struct ioctl_sick_map *m)
+{
+ if (checked & m->sick_mask)
+ geo->checked |= m->ioctl_mask;
+ if (sick & m->sick_mask)
+ geo->sick |= m->ioctl_mask;
+}
+
+/* Fill out fs geometry health info. */
+void
+xfs_fsop_geom_health(
+ struct xfs_mount *mp,
+ struct xfs_fsop_geom *geo)
+{
+ const struct ioctl_sick_map *m;
+ unsigned int sick;
+ unsigned int checked;
+
+ geo->sick = 0;
+ geo->checked = 0;
+
+ xfs_fs_measure_sickness(mp, &sick, &checked);
+ for (m = fs_map; m->sick_mask; m++)
+ xfgeo_health_tick(geo, sick, checked, m);
+
+ xfs_rt_measure_sickness(mp, &sick, &checked);
+ for (m = rt_map; m->sick_mask; m++)
+ xfgeo_health_tick(geo, sick, checked, m);
+}
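A hypothetical consumer of the health bits filled in by xfs_fsop_geom_health(): userspace fetches the geometry with XFS_IOC_FSGEOMETRY and tests the sick/checked masks. The ioctl and flag names are assumed to come from <xfs/xfs.h>.

/* Hypothetical userspace sketch -- not part of this patch. */
#include <stdio.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>

static void report_uquota_health(int fd)
{
	struct xfs_fsop_geom geo = { 0 };

	if (ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) < 0) {
		perror("XFS_IOC_FSGEOMETRY");
		return;
	}
	/* A sick bit is only meaningful if the same bit is set in checked. */
	if (!(geo.checked & XFS_FSOP_GEOM_SICK_UQUOTA))
		printf("user quota: not checked\n");
	else if (geo.sick & XFS_FSOP_GEOM_SICK_UQUOTA)
		printf("user quota: unhealthy\n");
	else
		printf("user quota: ok\n");
}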
+
+static const struct ioctl_sick_map ag_map[] = {
+ { XFS_SICK_AG_SB, XFS_AG_GEOM_SICK_SB },
+ { XFS_SICK_AG_AGF, XFS_AG_GEOM_SICK_AGF },
+ { XFS_SICK_AG_AGFL, XFS_AG_GEOM_SICK_AGFL },
+ { XFS_SICK_AG_AGI, XFS_AG_GEOM_SICK_AGI },
+ { XFS_SICK_AG_BNOBT, XFS_AG_GEOM_SICK_BNOBT },
+ { XFS_SICK_AG_CNTBT, XFS_AG_GEOM_SICK_CNTBT },
+ { XFS_SICK_AG_INOBT, XFS_AG_GEOM_SICK_INOBT },
+ { XFS_SICK_AG_FINOBT, XFS_AG_GEOM_SICK_FINOBT },
+ { XFS_SICK_AG_RMAPBT, XFS_AG_GEOM_SICK_RMAPBT },
+ { XFS_SICK_AG_REFCNTBT, XFS_AG_GEOM_SICK_REFCNTBT },
+ { 0, 0 },
+};
+
+/* Fill out ag geometry health info. */
+void
+xfs_ag_geom_health(
+ struct xfs_perag *pag,
+ struct xfs_ag_geometry *ageo)
+{
+ const struct ioctl_sick_map *m;
+ unsigned int sick;
+ unsigned int checked;
+
+ ageo->ag_sick = 0;
+ ageo->ag_checked = 0;
+
+ xfs_ag_measure_sickness(pag, &sick, &checked);
+ for (m = ag_map; m->sick_mask; m++) {
+ if (checked & m->sick_mask)
+ ageo->ag_checked |= m->ioctl_mask;
+ if (sick & m->sick_mask)
+ ageo->ag_sick |= m->ioctl_mask;
+ }
+}
+
+static const struct ioctl_sick_map ino_map[] = {
+ { XFS_SICK_INO_CORE, XFS_BS_SICK_INODE },
+ { XFS_SICK_INO_BMBTD, XFS_BS_SICK_BMBTD },
+ { XFS_SICK_INO_BMBTA, XFS_BS_SICK_BMBTA },
+ { XFS_SICK_INO_BMBTC, XFS_BS_SICK_BMBTC },
+ { XFS_SICK_INO_DIR, XFS_BS_SICK_DIR },
+ { XFS_SICK_INO_XATTR, XFS_BS_SICK_XATTR },
+ { XFS_SICK_INO_SYMLINK, XFS_BS_SICK_SYMLINK },
+ { XFS_SICK_INO_PARENT, XFS_BS_SICK_PARENT },
+ { 0, 0 },
+};
+
+/* Fill out bulkstat health info. */
+void
+xfs_bulkstat_health(
+ struct xfs_inode *ip,
+ struct xfs_bulkstat *bs)
+{
+ const struct ioctl_sick_map *m;
+ unsigned int sick;
+ unsigned int checked;
+
+ bs->bs_sick = 0;
+ bs->bs_checked = 0;
+
+ xfs_inode_measure_sickness(ip, &sick, &checked);
+ for (m = ino_map; m->sick_mask; m++) {
+ if (checked & m->sick_mask)
+ bs->bs_checked |= m->ioctl_mask;
+ if (sick & m->sick_mask)
+ bs->bs_sick |= m->ioctl_mask;
+ }
+}
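Similarly, a small hypothetical helper showing how the bulkstat health bits above are meant to be read; obtaining the struct xfs_bulkstat itself (via the bulkstat ioctls) is left out here.

/* Hypothetical userspace sketch -- not part of this patch. */
#include <stdbool.h>
#include <xfs/xfs.h>

/* True only if the named structure was scanned and found to be sick. */
static bool bulkstat_struct_is_sick(const struct xfs_bulkstat *bs,
				    unsigned int mask)
{
	return (bs->bs_checked & mask) && (bs->bs_sick & mask);
}

/* e.g. bulkstat_struct_is_sick(bs, XFS_BS_SICK_XATTR) */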
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
new file mode 100644
index 000000000..d884cba1d
--- /dev/null
+++ b/fs/xfs/xfs_icache.c
@@ -0,0 +1,2251 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_inode_item.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+#include "xfs_reflink.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "xfs_log_priv.h"
+
+#include <linux/iversion.h>
+
+/* Radix tree tags for incore inode tree. */
+
+/* inode is to be reclaimed */
+#define XFS_ICI_RECLAIM_TAG 0
+/* Inode has speculative preallocations (posteof or cow) to clean. */
+#define XFS_ICI_BLOCKGC_TAG 1
+
+/*
+ * The goal for walking incore inodes. These can correspond with incore inode
+ * radix tree tags when convenient. Avoid existing XFS_IWALK namespace.
+ */
+enum xfs_icwalk_goal {
+ /* Goals directly associated with tagged inodes. */
+ XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG,
+ XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG,
+};
+
+static int xfs_icwalk(struct xfs_mount *mp,
+ enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
+static int xfs_icwalk_ag(struct xfs_perag *pag,
+ enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
+
+/*
+ * Private inode cache walk flags for struct xfs_icwalk. Must not
+ * coincide with XFS_ICWALK_FLAGS_VALID.
+ */
+
+/* Stop scanning after icw_scan_limit inodes. */
+#define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28)
+
+#define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27)
+#define XFS_ICWALK_FLAG_UNION (1U << 26) /* union filter algorithm */
+
+#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_SCAN_LIMIT | \
+ XFS_ICWALK_FLAG_RECLAIM_SICK | \
+ XFS_ICWALK_FLAG_UNION)
+
+/*
+ * Allocate and initialise an xfs_inode.
+ */
+struct xfs_inode *
+xfs_inode_alloc(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ struct xfs_inode *ip;
+
+ /*
+ * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
+ * and return NULL here on ENOMEM.
+ */
+ ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
+
+ if (inode_init_always(mp->m_super, VFS_I(ip))) {
+ kmem_cache_free(xfs_inode_cache, ip);
+ return NULL;
+ }
+
+ /* VFS doesn't initialise i_mode or i_state! */
+ VFS_I(ip)->i_mode = 0;
+ VFS_I(ip)->i_state = 0;
+ mapping_set_large_folios(VFS_I(ip)->i_mapping);
+
+ XFS_STATS_INC(mp, vn_active);
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(ip->i_ino == 0);
+
+ /* initialise the xfs inode */
+ ip->i_ino = ino;
+ ip->i_mount = mp;
+ memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
+ ip->i_cowfp = NULL;
+ memset(&ip->i_af, 0, sizeof(ip->i_af));
+ ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
+ memset(&ip->i_df, 0, sizeof(ip->i_df));
+ ip->i_flags = 0;
+ ip->i_delayed_blks = 0;
+ ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
+ ip->i_nblocks = 0;
+ ip->i_forkoff = 0;
+ ip->i_sick = 0;
+ ip->i_checked = 0;
+ INIT_WORK(&ip->i_ioend_work, xfs_end_io);
+ INIT_LIST_HEAD(&ip->i_ioend_list);
+ spin_lock_init(&ip->i_ioend_lock);
+ ip->i_next_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = NULLAGINO;
+
+ return ip;
+}
+
+STATIC void
+xfs_inode_free_callback(
+ struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ struct xfs_inode *ip = XFS_I(inode);
+
+ switch (VFS_I(ip)->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ xfs_idestroy_fork(&ip->i_df);
+ break;
+ }
+
+ xfs_ifork_zap_attr(ip);
+
+ if (ip->i_cowfp) {
+ xfs_idestroy_fork(ip->i_cowfp);
+ kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
+ }
+ if (ip->i_itemp) {
+ ASSERT(!test_bit(XFS_LI_IN_AIL,
+ &ip->i_itemp->ili_item.li_flags));
+ xfs_inode_item_destroy(ip);
+ ip->i_itemp = NULL;
+ }
+
+ kmem_cache_free(xfs_inode_cache, ip);
+}
+
+static void
+__xfs_inode_free(
+ struct xfs_inode *ip)
+{
+ /* asserts to verify all state is correct here */
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
+ XFS_STATS_DEC(ip->i_mount, vn_active);
+
+ call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+void
+xfs_inode_free(
+ struct xfs_inode *ip)
+{
+ ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
+
+ /*
+ * Because we use RCU freeing we need to ensure the inode always
+ * appears to be reclaimed with an invalid inode number when in the
+ * free state. The ip->i_flags_lock provides the barrier against lookup
+ * races.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags = XFS_IRECLAIM;
+ ip->i_ino = 0;
+ spin_unlock(&ip->i_flags_lock);
+
+ __xfs_inode_free(ip);
+}
+
+/*
+ * Queue background inode reclaim work if there are reclaimable inodes and there
+ * isn't reclaim work already scheduled or in progress.
+ */
+static void
+xfs_reclaim_work_queue(
+ struct xfs_mount *mp)
+{
+
+ rcu_read_lock();
+ if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+ queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
+ msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Background scanning to trim preallocated space. This is queued based on the
+ * 'speculative_prealloc_lifetime' tunable (5m by default).
+ */
+static inline void
+xfs_blockgc_queue(
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ if (!xfs_is_blockgc_enabled(mp))
+ return;
+
+ rcu_read_lock();
+ if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
+ queue_delayed_work(pag->pag_mount->m_blockgc_wq,
+ &pag->pag_blockgc_work,
+ msecs_to_jiffies(xfs_blockgc_secs * 1000));
+ rcu_read_unlock();
+}
+
+/* Set a tag on both the AG incore inode tree and the AG radix tree. */
+static void
+xfs_perag_set_inode_tag(
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ unsigned int tag)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ bool was_tagged;
+
+ lockdep_assert_held(&pag->pag_ici_lock);
+
+ was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
+ radix_tree_tag_set(&pag->pag_ici_root, agino, tag);
+
+ if (tag == XFS_ICI_RECLAIM_TAG)
+ pag->pag_ici_reclaimable++;
+
+ if (was_tagged)
+ return;
+
+ /* propagate the tag up into the perag radix tree */
+ spin_lock(&mp->m_perag_lock);
+ radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
+ spin_unlock(&mp->m_perag_lock);
+
+ /* start background work */
+ switch (tag) {
+ case XFS_ICI_RECLAIM_TAG:
+ xfs_reclaim_work_queue(mp);
+ break;
+ case XFS_ICI_BLOCKGC_TAG:
+ xfs_blockgc_queue(pag);
+ break;
+ }
+
+ trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
+}
+
+/* Clear a tag on both the AG incore inode tree and the AG radix tree. */
+static void
+xfs_perag_clear_inode_tag(
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ unsigned int tag)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ lockdep_assert_held(&pag->pag_ici_lock);
+
+ /*
+ * Reclaim can signal (with a null agino) that it cleared its own tag
+ * by removing the inode from the radix tree.
+ */
+ if (agino != NULLAGINO)
+ radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
+ else
+ ASSERT(tag == XFS_ICI_RECLAIM_TAG);
+
+ if (tag == XFS_ICI_RECLAIM_TAG)
+ pag->pag_ici_reclaimable--;
+
+ if (radix_tree_tagged(&pag->pag_ici_root, tag))
+ return;
+
+ /* clear the tag from the perag radix tree */
+ spin_lock(&mp->m_perag_lock);
+ radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
+ spin_unlock(&mp->m_perag_lock);
+
+ trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
+}
+
+/*
+ * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
+ * part of the structure. This is made more complex by the fact we store
+ * information about the on-disk values in the VFS inode and so we can't just
+ * overwrite the values unconditionally. Hence we save the parameters we
+ * need to retain across reinitialisation, and rewrite them into the VFS inode
+ * after reinitialisation even if it fails.
+ */
+static int
+xfs_reinit_inode(
+ struct xfs_mount *mp,
+ struct inode *inode)
+{
+ int error;
+ uint32_t nlink = inode->i_nlink;
+ uint32_t generation = inode->i_generation;
+ uint64_t version = inode_peek_iversion(inode);
+ umode_t mode = inode->i_mode;
+ dev_t dev = inode->i_rdev;
+ kuid_t uid = inode->i_uid;
+ kgid_t gid = inode->i_gid;
+
+ error = inode_init_always(mp->m_super, inode);
+
+ set_nlink(inode, nlink);
+ inode->i_generation = generation;
+ inode_set_iversion_queried(inode, version);
+ inode->i_mode = mode;
+ inode->i_rdev = dev;
+ inode->i_uid = uid;
+ inode->i_gid = gid;
+ mapping_set_large_folios(inode->i_mapping);
+ return error;
+}
+
+/*
+ * Carefully nudge an inode whose VFS state has been torn down back into a
+ * usable state. Drops the i_flags_lock and the rcu read lock.
+ */
+static int
+xfs_iget_recycle(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip) __releases(&ip->i_flags_lock)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ int error;
+
+ trace_xfs_iget_recycle(ip);
+
+ /*
+ * We need to make it look like the inode is being reclaimed to prevent
+ * the actual reclaim workers from stomping over us while we recycle
+ * the inode. We can't clear the radix tree tag yet as it requires
+ * pag_ici_lock to be held exclusive.
+ */
+ ip->i_flags |= XFS_IRECLAIM;
+
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+
+ ASSERT(!rwsem_is_locked(&inode->i_rwsem));
+ error = xfs_reinit_inode(mp, inode);
+ if (error) {
+ /*
+ * Re-initializing the inode failed, and we are in deep
+ * trouble. Try to re-add it to the reclaim list.
+ */
+ rcu_read_lock();
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+ ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+
+ trace_xfs_iget_recycle_fail(ip);
+ return error;
+ }
+
+ spin_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ /*
+ * Clear the per-lifetime state in the inode as we are now effectively
+ * a new inode and need to return to the initial state before reuse
+ * occurs.
+ */
+ ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
+ ip->i_flags |= XFS_INEW;
+ xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ inode->i_state = I_NEW;
+ spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&pag->pag_ici_lock);
+
+ return 0;
+}
+
+/*
+ * If we are allocating a new inode, then check what was returned is
+ * actually a free, empty inode. If we are not allocating an inode,
+ * then check we didn't find a free inode.
+ *
+ * Returns:
+ * 0 if the inode free state matches the lookup context
+ * -ENOENT if the inode is free and we are not allocating
+ * -EFSCORRUPTED if there is any state mismatch at all
+ */
+static int
+xfs_iget_check_free_state(
+ struct xfs_inode *ip,
+ int flags)
+{
+ if (flags & XFS_IGET_CREATE) {
+ /* should be a free inode */
+ if (VFS_I(ip)->i_mode != 0) {
+ xfs_warn(ip->i_mount,
+"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
+ ip->i_ino, VFS_I(ip)->i_mode);
+ return -EFSCORRUPTED;
+ }
+
+ if (ip->i_nblocks != 0) {
+ xfs_warn(ip->i_mount,
+"Corruption detected! Free inode 0x%llx has blocks allocated!",
+ ip->i_ino);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+ }
+
+ /* should be an allocated inode */
+ if (VFS_I(ip)->i_mode == 0)
+ return -ENOENT;
+
+ return 0;
+}
+
+/* Make all pending inactivation work start immediately. */
+static bool
+xfs_inodegc_queue_all(
+ struct xfs_mount *mp)
+{
+ struct xfs_inodegc *gc;
+ int cpu;
+ bool ret = false;
+
+ for_each_online_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ if (!llist_empty(&gc->list)) {
+ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
+ ret = true;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Check the validity of the inode we just found in the cache.
+ */
+static int
+xfs_iget_cache_hit(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip,
+ xfs_ino_t ino,
+ int flags,
+ int lock_flags) __releases(RCU)
+{
+ struct inode *inode = VFS_I(ip);
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ /*
+ * check for re-use of an inode within an RCU grace period due to the
+ * radix tree nodes not being updated yet. We monitor for this by
+ * setting the inode number to zero before freeing the inode structure.
+ * If the inode has been reallocated and set up, then the inode number
+ * will not match, so check for that, too.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != ino)
+ goto out_skip;
+
+ /*
+ * If we are racing with another cache hit that is currently
+ * instantiating this inode or currently recycling it out of
+ * reclaimable state, wait for the initialisation to complete
+ * before continuing.
+ *
+ * If we're racing with the inactivation worker we also want to wait.
+ * If we're creating a new file, it's possible that the worker
+ * previously marked the inode as free on disk but hasn't finished
+ * updating the incore state yet. The AGI buffer will be dirty and
+ * locked to the icreate transaction, so a synchronous push of the
+ * inodegc workers would result in deadlock. For a regular iget, the
+ * worker is running already, so we might as well wait.
+ *
+ * XXX(hch): eventually we should do something equivalent to
+ * wait_on_inode to wait for these flags to be cleared
+ * instead of polling for it.
+ */
+ if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
+ goto out_skip;
+
+ if (ip->i_flags & XFS_NEED_INACTIVE) {
+ /* Unlinked inodes cannot be re-grabbed. */
+ if (VFS_I(ip)->i_nlink == 0) {
+ error = -ENOENT;
+ goto out_error;
+ }
+ goto out_inodegc_flush;
+ }
+
+ /*
+ * Check the inode free state is valid. This also detects lookup
+ * racing with unlinks.
+ */
+ error = xfs_iget_check_free_state(ip, flags);
+ if (error)
+ goto out_error;
+
+ /* Skip inodes that have no vfs state. */
+ if ((flags & XFS_IGET_INCORE) &&
+ (ip->i_flags & XFS_IRECLAIMABLE))
+ goto out_skip;
+
+ /* The inode fits the selection criteria; process it. */
+ if (ip->i_flags & XFS_IRECLAIMABLE) {
+ /* Drops i_flags_lock and RCU read lock. */
+ error = xfs_iget_recycle(pag, ip);
+ if (error)
+ return error;
+ } else {
+ /* If the VFS inode is being torn down, pause and try again. */
+ if (!igrab(inode))
+ goto out_skip;
+
+ /* We've got a live one. */
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ trace_xfs_iget_hit(ip);
+ }
+
+ if (lock_flags != 0)
+ xfs_ilock(ip, lock_flags);
+
+ if (!(flags & XFS_IGET_INCORE))
+ xfs_iflags_clear(ip, XFS_ISTALE);
+ XFS_STATS_INC(mp, xs_ig_found);
+
+ return 0;
+
+out_skip:
+ trace_xfs_iget_skip(ip);
+ XFS_STATS_INC(mp, xs_ig_frecycle);
+ error = -EAGAIN;
+out_error:
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ return error;
+
+out_inodegc_flush:
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ /*
+ * Do not wait for the workers, because the caller could hold an AGI
+ * buffer lock. We're just going to sleep in a loop anyway.
+ */
+ if (xfs_is_inodegc_enabled(mp))
+ xfs_inodegc_queue_all(mp);
+ return -EAGAIN;
+}
+
+static int
+xfs_iget_cache_miss(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ xfs_trans_t *tp,
+ xfs_ino_t ino,
+ struct xfs_inode **ipp,
+ int flags,
+ int lock_flags)
+{
+ struct xfs_inode *ip;
+ int error;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
+ int iflags;
+
+ ip = xfs_inode_alloc(mp, ino);
+ if (!ip)
+ return -ENOMEM;
+
+ error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
+ if (error)
+ goto out_destroy;
+
+ /*
+ * For version 5 superblocks, if we are initialising a new inode and we
+ * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
+ * simply build the new inode core with a random generation number.
+ *
+ * For version 4 (and older) superblocks, log recovery is dependent on
+ * the i_flushiter field being initialised from the current on-disk
+ * value and hence we must also read the inode off disk even when
+ * initializing new inodes.
+ */
+ if (xfs_has_v3inodes(mp) &&
+ (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
+ VFS_I(ip)->i_generation = get_random_u32();
+ } else {
+ struct xfs_buf *bp;
+
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
+ if (error)
+ goto out_destroy;
+
+ error = xfs_inode_from_disk(ip,
+ xfs_buf_offset(bp, ip->i_imap.im_boffset));
+ if (!error)
+ xfs_buf_set_ref(bp, XFS_INO_REF);
+ xfs_trans_brelse(tp, bp);
+
+ if (error)
+ goto out_destroy;
+ }
+
+ trace_xfs_iget_miss(ip);
+
+ /*
+ * Check the inode free state is valid. This also detects lookup
+ * racing with unlinks.
+ */
+ error = xfs_iget_check_free_state(ip, flags);
+ if (error)
+ goto out_destroy;
+
+ /*
+ * Preload the radix tree so we can insert safely under the
+ * write spinlock. Note that we cannot sleep inside the preload
+ * region. Since we can be called from transaction context, don't
+ * recurse into the file system.
+ */
+ if (radix_tree_preload(GFP_NOFS)) {
+ error = -EAGAIN;
+ goto out_destroy;
+ }
+
+ /*
+ * Because the inode hasn't been added to the radix-tree yet it can't
+ * be found by another thread, so we can do the non-sleeping lock here.
+ */
+ if (lock_flags) {
+ if (!xfs_ilock_nowait(ip, lock_flags))
+ BUG();
+ }
+
+ /*
+ * These values must be set before inserting the inode into the radix
+ * tree as the moment it is inserted a concurrent lookup (allowed by the
+ * RCU locking mechanism) can find it and that lookup must see that this
+ * is an inode currently under construction (i.e. that XFS_INEW is set).
+ * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+ * memory barrier that ensures this detection works correctly at lookup
+ * time.
+ */
+ iflags = XFS_INEW;
+ if (flags & XFS_IGET_DONTCACHE)
+ d_mark_dontcache(VFS_I(ip));
+ ip->i_udquot = NULL;
+ ip->i_gdquot = NULL;
+ ip->i_pdquot = NULL;
+ xfs_iflags_set(ip, iflags);
+
+ /* insert the new inode */
+ spin_lock(&pag->pag_ici_lock);
+ error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
+ if (unlikely(error)) {
+ WARN_ON(error != -EEXIST);
+ XFS_STATS_INC(mp, xs_ig_dup);
+ error = -EAGAIN;
+ goto out_preload_end;
+ }
+ spin_unlock(&pag->pag_ici_lock);
+ radix_tree_preload_end();
+
+ *ipp = ip;
+ return 0;
+
+out_preload_end:
+ spin_unlock(&pag->pag_ici_lock);
+ radix_tree_preload_end();
+ if (lock_flags)
+ xfs_iunlock(ip, lock_flags);
+out_destroy:
+ __destroy_inode(VFS_I(ip));
+ xfs_inode_free(ip);
+ return error;
+}
+
+/*
+ * Look up an inode by number in the given file system. The inode is looked up
+ * in the cache held in each AG. If the inode is found in the cache, initialise
+ * the vfs inode if necessary.
+ *
+ * If it is not in core, read it in from the file system's device, add it to the
+ * cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * Inode lookup is only done during metadata operations and not as part of the
+ * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
+ */
+int
+xfs_iget(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ uint flags,
+ uint lock_flags,
+ struct xfs_inode **ipp)
+{
+ struct xfs_inode *ip;
+ struct xfs_perag *pag;
+ xfs_agino_t agino;
+ int error;
+
+ ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
+
+ /* reject inode numbers outside existing AGs */
+ if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+ return -EINVAL;
+
+ XFS_STATS_INC(mp, xs_ig_attempts);
+
+ /* get the perag structure and ensure that it's inode capable */
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
+ agino = XFS_INO_TO_AGINO(mp, ino);
+
+again:
+ error = 0;
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+ if (ip) {
+ error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
+ if (error)
+ goto out_error_or_again;
+ } else {
+ rcu_read_unlock();
+ if (flags & XFS_IGET_INCORE) {
+ error = -ENODATA;
+ goto out_error_or_again;
+ }
+ XFS_STATS_INC(mp, xs_ig_missed);
+
+ error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
+ flags, lock_flags);
+ if (error)
+ goto out_error_or_again;
+ }
+ xfs_perag_put(pag);
+
+ *ipp = ip;
+
+ /*
+ * If we have a real type for an on-disk inode, we can setup the inode
+ * now. If it's a new inode being created, xfs_init_new_inode will
+ * handle it.
+ */
+ if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
+ xfs_setup_existing_inode(ip);
+ return 0;
+
+out_error_or_again:
+ if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
+ delay(1);
+ goto again;
+ }
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * "Is this a cached inode that's also allocated?"
+ *
+ * Look up an inode by number in the given file system. If the inode is
+ * in cache and isn't in purgatory, return 1 if the inode is allocated
+ * and 0 if it is not. For all other cases (not in cache, being torn
+ * down, etc.), return a negative error code.
+ *
+ * The caller has to prevent inode allocation and freeing activity,
+ * presumably by locking the AGI buffer. This is to ensure that an
+ * inode cannot transition from allocated to freed until the caller is
+ * ready to allow that. If the inode is in an intermediate state (new,
+ * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
+ * inode is not in the cache, -ENOENT will be returned. The caller must
+ * deal with these scenarios appropriately.
+ *
+ * This is a specialized use case for the online scrubber; if you're
+ * reading this, you probably want xfs_iget.
+ */
+int
+xfs_icache_inode_is_allocated(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ bool *inuse)
+{
+ struct xfs_inode *ip;
+ int error;
+
+ error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
+ if (error)
+ return error;
+
+ *inuse = !!(VFS_I(ip)->i_mode);
+ xfs_irele(ip);
+ return 0;
+}
+
+/*
+ * Grab the inode for reclaim exclusively.
+ *
+ * We have found this inode via a lookup under RCU, so the inode may have
+ * already been freed, or it may be in the process of being recycled by
+ * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
+ * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
+ * will not be set. Hence we need to check for both these flag conditions to
+ * avoid inodes that are no longer reclaim candidates.
+ *
+ * Note: checking for other state flags here, under the i_flags_lock or not, is
+ * racy and should be avoided. Those races should be resolved only after we have
+ * ensured that we are able to reclaim this inode and the world can see that we
+ * are going to reclaim it.
+ *
+ * Return true if we grabbed it, false otherwise.
+ */
+static bool
+xfs_reclaim_igrab(
+ struct xfs_inode *ip,
+ struct xfs_icwalk *icw)
+{
+ ASSERT(rcu_read_lock_held());
+
+ spin_lock(&ip->i_flags_lock);
+ if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+ __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+ /* not a reclaim candidate. */
+ spin_unlock(&ip->i_flags_lock);
+ return false;
+ }
+
+ /* Don't reclaim a sick inode unless the caller asked for it. */
+ if (ip->i_sick &&
+ (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
+ spin_unlock(&ip->i_flags_lock);
+ return false;
+ }
+
+ __xfs_iflags_set(ip, XFS_IRECLAIM);
+ spin_unlock(&ip->i_flags_lock);
+ return true;
+}
+
+/*
+ * Inode reclaim is non-blocking, so the default action if progress cannot be
+ * made is to "requeue" the inode for reclaim by unlocking it and clearing the
+ * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about
+ * blocking anymore and hence can wait until the inode can be reclaimed.
+ *
+ * We do no IO here - if callers require inodes to be cleaned they must push the
+ * AIL first to trigger writeback of dirty inodes. This enables writeback to be
+ * done in the background in a non-blocking manner, and enables memory reclaim
+ * to make progress without blocking.
+ */
+static void
+xfs_reclaim_inode(
+ struct xfs_inode *ip,
+ struct xfs_perag *pag)
+{
+ xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ goto out;
+ if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
+ goto out_iunlock;
+
+ /*
+ * Check for log shutdown because aborting the inode can move the log
+ * tail and corrupt in memory state. This is fine if the log is shut
+ * down, but if the log is still active and only the mount is shut down
+ * then the in-memory log tail movement caused by the abort can be
+ * incorrectly propagated to disk.
+ */
+ if (xlog_is_shutdown(ip->i_mount->m_log)) {
+ xfs_iunpin_wait(ip);
+ xfs_iflush_shutdown_abort(ip);
+ goto reclaim;
+ }
+ if (xfs_ipincount(ip))
+ goto out_clear_flush;
+ if (!xfs_inode_clean(ip))
+ goto out_clear_flush;
+
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+reclaim:
+ trace_xfs_inode_reclaiming(ip);
+
+ /*
+ * Because we use RCU freeing we need to ensure the inode always appears
+ * to be reclaimed with an invalid inode number when in the free state.
+ * We do this as early as possible under the ILOCK so that
+ * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
+ * detect races with us here. By doing this, we guarantee that once
+ * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
+ * it will see either a valid inode that will serialise correctly, or it
+ * will see an invalid inode that it can skip.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags = XFS_IRECLAIM;
+ ip->i_ino = 0;
+ ip->i_sick = 0;
+ ip->i_checked = 0;
+ spin_unlock(&ip->i_flags_lock);
+
+ ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
+ /*
+ * Remove the inode from the per-AG radix tree.
+ *
+ * Because radix_tree_delete won't complain even if the item was never
+ * added to the tree assert that it's been there before to catch
+ * problems with the inode life time early on.
+ */
+ spin_lock(&pag->pag_ici_lock);
+ if (!radix_tree_delete(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ino)))
+ ASSERT(0);
+ xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
+ spin_unlock(&pag->pag_ici_lock);
+
+ /*
+ * Here we do an (almost) spurious inode lock in order to coordinate
+ * with inode cache radix tree lookups. This is because the lookup
+ * can reference the inodes in the cache without taking references.
+ *
+ * We make that OK here by ensuring that we wait until the inode is
+ * unlocked after the lookup before we go ahead and free it.
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ ASSERT(xfs_inode_clean(ip));
+
+ __xfs_inode_free(ip);
+ return;
+
+out_clear_flush:
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+out_iunlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+ xfs_iflags_clear(ip, XFS_IRECLAIM);
+}
+
+/* Reclaim sick inodes if we're unmounting or the fs went down. */
+static inline bool
+xfs_want_reclaim_sick(
+ struct xfs_mount *mp)
+{
+ return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) ||
+ xfs_is_shutdown(mp);
+}
+
+void
+xfs_reclaim_inodes(
+ struct xfs_mount *mp)
+{
+ struct xfs_icwalk icw = {
+ .icw_flags = 0,
+ };
+
+ if (xfs_want_reclaim_sick(mp))
+ icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
+
+ while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+ xfs_ail_push_all_sync(mp->m_ail);
+ xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
+ }
+}
+
+/*
+ * The shrinker infrastructure determines how many inodes we should scan for
+ * reclaim. We want as many clean inodes ready to reclaim as possible, so we
+ * push the AIL here. We also want to proactively free up memory if we can, to
+ * minimise the amount of work memory reclaim has to do, so we kick the
+ * background reclaim if it isn't already scheduled.
+ */
+long
+xfs_reclaim_inodes_nr(
+ struct xfs_mount *mp,
+ unsigned long nr_to_scan)
+{
+ struct xfs_icwalk icw = {
+ .icw_flags = XFS_ICWALK_FLAG_SCAN_LIMIT,
+ .icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan),
+ };
+
+ if (xfs_want_reclaim_sick(mp))
+ icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
+
+ /* kick background reclaimer and push the AIL */
+ xfs_reclaim_work_queue(mp);
+ xfs_ail_push_all(mp->m_ail);
+
+ xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
+ return 0;
+}
+
+/*
+ * Return the number of reclaimable inodes in the filesystem for
+ * the shrinker to determine how much to reclaim.
+ */
+long
+xfs_reclaim_inodes_count(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t ag = 0;
+ long reclaimable = 0;
+
+ while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+ ag = pag->pag_agno + 1;
+ reclaimable += pag->pag_ici_reclaimable;
+ xfs_perag_put(pag);
+ }
+ return reclaimable;
+}
+
+STATIC bool
+xfs_icwalk_match_id(
+ struct xfs_inode *ip,
+ struct xfs_icwalk *icw)
+{
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
+ !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
+ return false;
+
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
+ !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
+ return false;
+
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
+ ip->i_projid != icw->icw_prid)
+ return false;
+
+ return true;
+}
+
+/*
+ * A union-based inode filtering algorithm. Process the inode if any of the
+ * criteria match. This is for global/internal scans only.
+ */
+STATIC bool
+xfs_icwalk_match_id_union(
+ struct xfs_inode *ip,
+ struct xfs_icwalk *icw)
+{
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
+ uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
+ return true;
+
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
+ gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
+ return true;
+
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
+ ip->i_projid == icw->icw_prid)
+ return true;
+
+ return false;
+}
+
+/*
+ * Is this inode @ip eligible for eof/cow block reclamation, given some
+ * filtering parameters @icw? The inode is eligible if @icw is null or
+ * if the predicate functions match.
+ */
+static bool
+xfs_icwalk_match(
+ struct xfs_inode *ip,
+ struct xfs_icwalk *icw)
+{
+ bool match;
+
+ if (!icw)
+ return true;
+
+ if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
+ match = xfs_icwalk_match_id_union(ip, icw);
+ else
+ match = xfs_icwalk_match_id(ip, icw);
+ if (!match)
+ return false;
+
+ /* skip the inode if the file size is too small */
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
+ XFS_ISIZE(ip) < icw->icw_min_file_size)
+ return false;
+
+ return true;
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low.
+ */
+void
+xfs_reclaim_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_reclaim_work);
+
+ xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
+ xfs_reclaim_work_queue(mp);
+}
+
+STATIC int
+xfs_inode_free_eofblocks(
+ struct xfs_inode *ip,
+ struct xfs_icwalk *icw,
+ unsigned int *lockflags)
+{
+ bool wait;
+
+ wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
+
+ if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
+ return 0;
+
+ /*
+ * If the mapping is dirty the operation can block and wait for some
+ * time. Unless we are waiting, skip it.
+ */
+ if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
+ return 0;
+
+ if (!xfs_icwalk_match(ip, icw))
+ return 0;
+
+ /*
+ * If the caller is waiting, return -EAGAIN to keep the background
+ * scanner moving and revisit the inode in a subsequent pass.
+ */
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ if (wait)
+ return -EAGAIN;
+ return 0;
+ }
+ *lockflags |= XFS_IOLOCK_EXCL;
+
+ if (xfs_can_free_eofblocks(ip, false))
+ return xfs_free_eofblocks(ip);
+
+ /* inode could be preallocated or append-only */
+ trace_xfs_inode_free_eofblocks_invalid(ip);
+ xfs_inode_clear_eofblocks_tag(ip);
+ return 0;
+}
+
+static void
+xfs_blockgc_set_iflag(
+ struct xfs_inode *ip,
+ unsigned long iflag)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+
+ ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
+
+ /*
+ * Don't bother locking the AG and looking up in the radix trees
+ * if we already know that we have the tag set.
+ */
+ if (ip->i_flags & iflag)
+ return;
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags |= iflag;
+ spin_unlock(&ip->i_flags_lock);
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ spin_lock(&pag->pag_ici_lock);
+
+ xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ XFS_ICI_BLOCKGC_TAG);
+
+ spin_unlock(&pag->pag_ici_lock);
+ xfs_perag_put(pag);
+}
+
+void
+xfs_inode_set_eofblocks_tag(
+ xfs_inode_t *ip)
+{
+ trace_xfs_inode_set_eofblocks_tag(ip);
+ return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
+}
+
+static void
+xfs_blockgc_clear_iflag(
+ struct xfs_inode *ip,
+ unsigned long iflag)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ bool clear_tag;
+
+ ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
+
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags &= ~iflag;
+ clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
+ spin_unlock(&ip->i_flags_lock);
+
+ if (!clear_tag)
+ return;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ spin_lock(&pag->pag_ici_lock);
+
+ xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ XFS_ICI_BLOCKGC_TAG);
+
+ spin_unlock(&pag->pag_ici_lock);
+ xfs_perag_put(pag);
+}
+
+void
+xfs_inode_clear_eofblocks_tag(
+ xfs_inode_t *ip)
+{
+ trace_xfs_inode_clear_eofblocks_tag(ip);
+ return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
+}
+
+/*
+ * Set ourselves up to free CoW blocks from this file. If it's already clean
+ * then we can bail out quickly, but otherwise we must back off if the file
+ * is undergoing some kind of write.
+ */
+static bool
+xfs_prep_free_cowblocks(
+ struct xfs_inode *ip)
+{
+ /*
+ * Just clear the tag if we have an empty cow fork or none at all. It's
+ * possible the inode was fully unshared since it was originally tagged.
+ */
+ if (!xfs_inode_has_cow_data(ip)) {
+ trace_xfs_inode_free_cowblocks_invalid(ip);
+ xfs_inode_clear_cowblocks_tag(ip);
+ return false;
+ }
+
+ /*
+ * If the mapping is dirty or under writeback we cannot touch the
+ * CoW fork. Leave it alone if we're in the midst of a directio.
+ */
+ if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
+ mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
+ mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
+ atomic_read(&VFS_I(ip)->i_dio_count))
+ return false;
+
+ return true;
+}
+
+/*
+ * Automatic CoW Reservation Freeing
+ *
+ * These functions automatically garbage collect leftover CoW reservations
+ * that were made on behalf of a cowextsize hint when we start to run out
+ * of quota or when the reservations sit around for too long. If the file
+ * has dirty pages or is undergoing writeback, its CoW reservations will
+ * be retained.
+ *
+ * The actual garbage collection piggybacks off the same code that runs
+ * the speculative EOF preallocation garbage collector.
+ */
+STATIC int
+xfs_inode_free_cowblocks(
+ struct xfs_inode *ip,
+ struct xfs_icwalk *icw,
+ unsigned int *lockflags)
+{
+ bool wait;
+ int ret = 0;
+
+ wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
+
+ if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
+ return 0;
+
+ if (!xfs_prep_free_cowblocks(ip))
+ return 0;
+
+ if (!xfs_icwalk_match(ip, icw))
+ return 0;
+
+ /*
+ * If the caller is waiting, return -EAGAIN to keep the background
+ * scanner moving and revisit the inode in a subsequent pass.
+ */
+ if (!(*lockflags & XFS_IOLOCK_EXCL) &&
+ !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ if (wait)
+ return -EAGAIN;
+ return 0;
+ }
+ *lockflags |= XFS_IOLOCK_EXCL;
+
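+	/*
+	 * The MMAPLOCK is needed as well before cancelling the CoW
+	 * reservations; apply the same -EAGAIN treatment if the caller
+	 * is waiting.
+	 */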
+ if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
+ if (wait)
+ return -EAGAIN;
+ return 0;
+ }
+ *lockflags |= XFS_MMAPLOCK_EXCL;
+
+ /*
+ * Check again, nobody else should be able to dirty blocks or change
+ * the reflink iflag now that we have the first two locks held.
+ */
+ if (xfs_prep_free_cowblocks(ip))
+ ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
+ return ret;
+}
+
+void
+xfs_inode_set_cowblocks_tag(
+ xfs_inode_t *ip)
+{
+ trace_xfs_inode_set_cowblocks_tag(ip);
+ return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
+}
+
+void
+xfs_inode_clear_cowblocks_tag(
+ xfs_inode_t *ip)
+{
+ trace_xfs_inode_clear_cowblocks_tag(ip);
+ return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
+}
+
+/* Disable post-EOF and CoW block auto-reclamation. */
+void
+xfs_blockgc_stop(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ if (!xfs_clear_blockgc_enabled(mp))
+ return;
+
+ for_each_perag(mp, agno, pag)
+ cancel_delayed_work_sync(&pag->pag_blockgc_work);
+ trace_xfs_blockgc_stop(mp, __return_address);
+}
+
+/* Enable post-EOF and CoW block auto-reclamation. */
+void
+xfs_blockgc_start(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ if (xfs_set_blockgc_enabled(mp))
+ return;
+
+ trace_xfs_blockgc_start(mp, __return_address);
+ for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ xfs_blockgc_queue(pag);
+}
+
+/* Don't try to run block gc on an inode that's in any of these states. */
+#define XFS_BLOCKGC_NOGRAB_IFLAGS (XFS_INEW | \
+ XFS_NEED_INACTIVE | \
+ XFS_INACTIVATING | \
+ XFS_IRECLAIMABLE | \
+ XFS_IRECLAIM)
+/*
+ * Decide if the given @ip is eligible for garbage collection of speculative
+ * preallocations, and grab it if so. Returns true if it's ready to go or
+ * false if we should just ignore it.
+ */
+static bool
+xfs_blockgc_igrab(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = VFS_I(ip);
+
+ ASSERT(rcu_read_lock_held());
+
+ /* Check for stale RCU freed inode */
+ spin_lock(&ip->i_flags_lock);
+ if (!ip->i_ino)
+ goto out_unlock_noent;
+
+ if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
+ goto out_unlock_noent;
+ spin_unlock(&ip->i_flags_lock);
+
+ /* nothing to sync during shutdown */
+ if (xfs_is_shutdown(ip->i_mount))
+ return false;
+
+	/* If we can't grab the inode, it must be on its way to reclaim. */
+ if (!igrab(inode))
+ return false;
+
+ /* inode is valid */
+ return true;
+
+out_unlock_noent:
+ spin_unlock(&ip->i_flags_lock);
+ return false;
+}
+
+/* Scan one incore inode for block preallocations that we can remove. */
+static int
+xfs_blockgc_scan_inode(
+ struct xfs_inode *ip,
+ struct xfs_icwalk *icw)
+{
+ unsigned int lockflags = 0;
+ int error;
+
+ error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
+ if (error)
+ goto unlock;
+
+ error = xfs_inode_free_cowblocks(ip, icw, &lockflags);
+unlock:
+ if (lockflags)
+ xfs_iunlock(ip, lockflags);
+ xfs_irele(ip);
+ return error;
+}
+
+/* Background worker that trims preallocated space. */
+void
+xfs_blockgc_worker(
+ struct work_struct *work)
+{
+ struct xfs_perag *pag = container_of(to_delayed_work(work),
+ struct xfs_perag, pag_blockgc_work);
+ struct xfs_mount *mp = pag->pag_mount;
+ int error;
+
+ trace_xfs_blockgc_worker(mp, __return_address);
+
+ error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
+ if (error)
+ xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
+ pag->pag_agno, error);
+ xfs_blockgc_queue(pag);
+}
+
+/*
+ * Try to free space in the filesystem by purging inactive inodes, eofblocks
+ * and cowblocks.
+ */
+int
+xfs_blockgc_free_space(
+ struct xfs_mount *mp,
+ struct xfs_icwalk *icw)
+{
+ int error;
+
+ trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
+
+ error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
+ if (error)
+ return error;
+
+ xfs_inodegc_flush(mp);
+ return 0;
+}
+
+/*
+ * Reclaim all the free space that we can by scheduling the background blockgc
+ * and inodegc workers immediately and waiting for them all to clear.
+ */
+void
+xfs_blockgc_flush_all(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ trace_xfs_blockgc_flush_all(mp, __return_address);
+
+ /*
+ * For each blockgc worker, move its queue time up to now. If it
+ * wasn't queued, it will not be requeued. Then flush whatever's
+ * left.
+ */
+ for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ mod_delayed_work(pag->pag_mount->m_blockgc_wq,
+ &pag->pag_blockgc_work, 0);
+
+ for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ flush_delayed_work(&pag->pag_blockgc_work);
+
+ xfs_inodegc_flush(mp);
+}
+
+/*
+ * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which
+ * quota caused an allocation failure, so we make a best effort by including
+ * each quota under low free space conditions (less than 1% free space) in the
+ * scan.
+ *
+ * Callers must not hold any inode's ILOCK. If requesting a synchronous scan
+ * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
+ * MMAPLOCK.
+ */
+int
+xfs_blockgc_free_dquots(
+ struct xfs_mount *mp,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ unsigned int iwalk_flags)
+{
+ struct xfs_icwalk icw = {0};
+ bool do_work = false;
+
+ if (!udqp && !gdqp && !pdqp)
+ return 0;
+
+ /*
+ * Run a scan to free blocks using the union filter to cover all
+ * applicable quotas in a single scan.
+ */
+ icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;
+
+ if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
+ icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
+ icw.icw_flags |= XFS_ICWALK_FLAG_UID;
+ do_work = true;
+ }
+
+	if (XFS_IS_GQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
+ icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
+ icw.icw_flags |= XFS_ICWALK_FLAG_GID;
+ do_work = true;
+ }
+
+ if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
+ icw.icw_prid = pdqp->q_id;
+ icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
+ do_work = true;
+ }
+
+ if (!do_work)
+ return 0;
+
+ return xfs_blockgc_free_space(mp, &icw);
+}
+
+/* Run cow/eofblocks scans on the quotas attached to the inode. */
+int
+xfs_blockgc_free_quota(
+ struct xfs_inode *ip,
+ unsigned int iwalk_flags)
+{
+ return xfs_blockgc_free_dquots(ip->i_mount,
+ xfs_inode_dquot(ip, XFS_DQTYPE_USER),
+ xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
+ xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
+}
+
+/* XFS Inode Cache Walking Code */
+
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH 32
+
+
+/*
+ * Decide if we want to grab this inode in anticipation of doing work towards
+ * the goal.
+ */
+static inline bool
+xfs_icwalk_igrab(
+ enum xfs_icwalk_goal goal,
+ struct xfs_inode *ip,
+ struct xfs_icwalk *icw)
+{
+ switch (goal) {
+ case XFS_ICWALK_BLOCKGC:
+ return xfs_blockgc_igrab(ip);
+ case XFS_ICWALK_RECLAIM:
+ return xfs_reclaim_igrab(ip, icw);
+ default:
+ return false;
+ }
+}
+
+/*
+ * Process an inode. Each processing function must handle any state changes
+ * made by the icwalk igrab function. Return -EAGAIN to skip an inode.
+ */
+static inline int
+xfs_icwalk_process_inode(
+ enum xfs_icwalk_goal goal,
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ struct xfs_icwalk *icw)
+{
+ int error = 0;
+
+ switch (goal) {
+ case XFS_ICWALK_BLOCKGC:
+ error = xfs_blockgc_scan_inode(ip, icw);
+ break;
+ case XFS_ICWALK_RECLAIM:
+ xfs_reclaim_inode(ip, pag);
+ break;
+ }
+ return error;
+}
+
+/*
+ * For a given per-AG structure @pag and a goal, grab qualifying inodes and
+ * process them in some manner.
+ */
+static int
+xfs_icwalk_ag(
+ struct xfs_perag *pag,
+ enum xfs_icwalk_goal goal,
+ struct xfs_icwalk *icw)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ uint32_t first_index;
+ int last_error = 0;
+ int skipped;
+ bool done;
+ int nr_found;
+
+restart:
+ done = false;
+ skipped = 0;
+ if (goal == XFS_ICWALK_RECLAIM)
+ first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
+ else
+ first_index = 0;
+ nr_found = 0;
+ do {
+ struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+ int error = 0;
+ int i;
+
+ rcu_read_lock();
+
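+		/*
+		 * Look up the next batch of tagged inodes under RCU; the walk
+		 * goal is passed as the radix tree tag to search for.
+		 */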
+ nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
+ (void **) batch, first_index,
+ XFS_LOOKUP_BATCH, goal);
+ if (!nr_found) {
+ done = true;
+ rcu_read_unlock();
+ break;
+ }
+
+ /*
+		 * Grab the inodes before we drop the lock. If we found
+		 * nothing, nr_found == 0 and the loop will be skipped.
+ */
+ for (i = 0; i < nr_found; i++) {
+ struct xfs_inode *ip = batch[i];
+
+ if (done || !xfs_icwalk_igrab(goal, ip, icw))
+ batch[i] = NULL;
+
+ /*
+ * Update the index for the next lookup. Catch
+ * overflows into the next AG range which can occur if
+ * we have inodes in the last block of the AG and we
+ * are currently pointing to the last inode.
+ *
+ * Because we may see inodes that are from the wrong AG
+ * due to RCU freeing and reallocation, only update the
+			 * index if it lies in this AG. It was a race that led
+ * us to see this inode, so another lookup from the
+ * same index will not find it again.
+ */
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+ continue;
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+ done = true;
+ }
+
+		/* Unlock now that we've grabbed the inodes. */
+ rcu_read_unlock();
+
+ for (i = 0; i < nr_found; i++) {
+ if (!batch[i])
+ continue;
+ error = xfs_icwalk_process_inode(goal, batch[i], pag,
+ icw);
+ if (error == -EAGAIN) {
+ skipped++;
+ continue;
+ }
+ if (error && last_error != -EFSCORRUPTED)
+ last_error = error;
+ }
+
+ /* bail out if the filesystem is corrupted. */
+ if (error == -EFSCORRUPTED)
+ break;
+
+ cond_resched();
+
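+		/*
+		 * Honour the caller's scan limit: each batch charges a full
+		 * XFS_LOOKUP_BATCH even if fewer inodes were found.
+		 */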
+ if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
+ icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
+ if (icw->icw_scan_limit <= 0)
+ break;
+ }
+ } while (nr_found && !done);
+
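+	/* Remember where the reclaim scan stopped so the next pass resumes there. */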
+ if (goal == XFS_ICWALK_RECLAIM) {
+ if (done)
+ first_index = 0;
+ WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
+ }
+
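+	/* Give skipped (-EAGAIN) inodes a moment to settle, then rescan. */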
+ if (skipped) {
+ delay(1);
+ goto restart;
+ }
+ return last_error;
+}
+
+/* Walk all incore inodes to achieve a given goal. */
+static int
+xfs_icwalk(
+ struct xfs_mount *mp,
+ enum xfs_icwalk_goal goal,
+ struct xfs_icwalk *icw)
+{
+ struct xfs_perag *pag;
+ int error = 0;
+ int last_error = 0;
+ xfs_agnumber_t agno;
+
+ for_each_perag_tag(mp, agno, pag, goal) {
+ error = xfs_icwalk_ag(pag, goal, icw);
+ if (error) {
+ last_error = error;
+ if (error == -EFSCORRUPTED) {
+ xfs_perag_put(pag);
+ break;
+ }
+ }
+ }
+ return last_error;
+ BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
+}
+
+#ifdef DEBUG
+static void
+xfs_check_delalloc(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+
+ if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
+ return;
+ do {
+ if (isnullstartblock(got.br_startblock)) {
+ xfs_warn(ip->i_mount,
+ "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
+ ip->i_ino,
+ whichfork == XFS_DATA_FORK ? "data" : "cow",
+ got.br_startoff, got.br_blockcount);
+ }
+ } while (xfs_iext_next_extent(ifp, &icur, &got));
+}
+#else
+#define xfs_check_delalloc(ip, whichfork) do { } while (0)
+#endif
+
+/* Schedule the inode for reclaim. */
+static void
+xfs_inodegc_set_reclaimable(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+
+ if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
+ xfs_check_delalloc(ip, XFS_DATA_FORK);
+ xfs_check_delalloc(ip, XFS_COW_FORK);
+ ASSERT(0);
+ }
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ spin_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ trace_xfs_inode_set_reclaimable(ip);
+ ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
+ ip->i_flags |= XFS_IRECLAIMABLE;
+ xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+
+ spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&pag->pag_ici_lock);
+ xfs_perag_put(pag);
+}
+
+/*
+ * Free all speculative preallocations and possibly even the inode itself.
+ * This is the last chance to make changes to an otherwise unreferenced file
+ * before incore reclamation happens.
+ */
+static void
+xfs_inodegc_inactivate(
+ struct xfs_inode *ip)
+{
+ trace_xfs_inode_inactivating(ip);
+ xfs_inactive(ip);
+ xfs_inodegc_set_reclaimable(ip);
+}
+
+void
+xfs_inodegc_worker(
+ struct work_struct *work)
+{
+ struct xfs_inodegc *gc = container_of(to_delayed_work(work),
+ struct xfs_inodegc, work);
+ struct llist_node *node = llist_del_all(&gc->list);
+ struct xfs_inode *ip, *n;
+
+ ASSERT(gc->cpu == smp_processor_id());
+
+ WRITE_ONCE(gc->items, 0);
+
+ if (!node)
+ return;
+
+ ip = llist_entry(node, struct xfs_inode, i_gclist);
+ trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
+
+ WRITE_ONCE(gc->shrinker_hits, 0);
+ llist_for_each_entry_safe(ip, n, node, i_gclist) {
+ xfs_iflags_set(ip, XFS_INACTIVATING);
+ xfs_inodegc_inactivate(ip);
+ }
+}
+
+/*
+ * Expedite all pending inodegc work to run immediately. This does not wait for
+ * completion of the work.
+ */
+void
+xfs_inodegc_push(
+ struct xfs_mount *mp)
+{
+ if (!xfs_is_inodegc_enabled(mp))
+ return;
+ trace_xfs_inodegc_push(mp, __return_address);
+ xfs_inodegc_queue_all(mp);
+}
+
+/*
+ * Force all currently queued inode inactivation work to run immediately and
+ * wait for the work to finish.
+ */
+void
+xfs_inodegc_flush(
+ struct xfs_mount *mp)
+{
+ xfs_inodegc_push(mp);
+ trace_xfs_inodegc_flush(mp, __return_address);
+ flush_workqueue(mp->m_inodegc_wq);
+}
+
+/*
+ * Flush all the pending work and then disable the inode inactivation background
+ * workers and wait for them to stop. Caller must hold sb->s_umount to
+ * coordinate changes in the inodegc_enabled state.
+ */
+void
+xfs_inodegc_stop(
+ struct xfs_mount *mp)
+{
+ bool rerun;
+
+ if (!xfs_clear_inodegc_enabled(mp))
+ return;
+
+ /*
+ * Drain all pending inodegc work, including inodes that could be
+ * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
+ * threads that sample the inodegc state just prior to us clearing it.
+ * The inodegc flag state prevents new threads from queuing more
+ * inodes, so we queue pending work items and flush the workqueue until
+ * all inodegc lists are empty. IOWs, we cannot use drain_workqueue
+ * here because it does not allow other unserialized mechanisms to
+ * reschedule inodegc work while this draining is in progress.
+ */
+ xfs_inodegc_queue_all(mp);
+ do {
+ flush_workqueue(mp->m_inodegc_wq);
+ rerun = xfs_inodegc_queue_all(mp);
+ } while (rerun);
+
+ trace_xfs_inodegc_stop(mp, __return_address);
+}
+
+/*
+ * Enable the inode inactivation background workers and schedule deferred inode
+ * inactivation work if there is any. Caller must hold sb->s_umount to
+ * coordinate changes in the inodegc_enabled state.
+ */
+void
+xfs_inodegc_start(
+ struct xfs_mount *mp)
+{
+ if (xfs_set_inodegc_enabled(mp))
+ return;
+
+ trace_xfs_inodegc_start(mp, __return_address);
+ xfs_inodegc_queue_all(mp);
+}
+
+#ifdef CONFIG_XFS_RT
+static inline bool
+xfs_inodegc_want_queue_rt_file(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (!XFS_IS_REALTIME_INODE(ip))
+ return false;
+
+ if (__percpu_counter_compare(&mp->m_frextents,
+ mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
+ XFS_FDBLOCKS_BATCH) < 0)
+ return true;
+
+ return false;
+}
+#else
+# define xfs_inodegc_want_queue_rt_file(ip) (false)
+#endif /* CONFIG_XFS_RT */
+
+/*
+ * Schedule the inactivation worker when:
+ *
+ * - We've accumulated more than one inode cluster buffer's worth of inodes.
+ * - There is less than 5% free space left.
+ * - Any of the quotas for this inode are near an enforcement limit.
+ */
+static inline bool
+xfs_inodegc_want_queue_work(
+ struct xfs_inode *ip,
+ unsigned int items)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (items > mp->m_ino_geo.inodes_per_cluster)
+ return true;
+
+ if (__percpu_counter_compare(&mp->m_fdblocks,
+ mp->m_low_space[XFS_LOWSP_5_PCNT],
+ XFS_FDBLOCKS_BATCH) < 0)
+ return true;
+
+ if (xfs_inodegc_want_queue_rt_file(ip))
+ return true;
+
+ if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
+ return true;
+
+ if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
+ return true;
+
+ if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
+ return true;
+
+ return false;
+}
+
+/*
+ * Upper bound on the number of inodes that can be queued for inactivation on
+ * each per-cpu list at any given time, to avoid monopolizing the workqueue.
+ */
+#define XFS_INODEGC_MAX_BACKLOG (4 * XFS_INODES_PER_CHUNK)
+
+/*
+ * Make the frontend wait for inactivations when:
+ *
+ * - Memory shrinkers queued the inactivation worker and it hasn't finished.
+ * - The queue depth exceeds the maximum allowable percpu backlog.
+ *
+ * Note: If the current thread is running a transaction, we don't ever want to
+ * wait for other transactions because that could introduce a deadlock.
+ */
+static inline bool
+xfs_inodegc_want_flush_work(
+ struct xfs_inode *ip,
+ unsigned int items,
+ unsigned int shrinker_hits)
+{
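+	/* current->journal_info is set while this thread has a transaction running. */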
+ if (current->journal_info)
+ return false;
+
+ if (shrinker_hits > 0)
+ return true;
+
+ if (items > XFS_INODEGC_MAX_BACKLOG)
+ return true;
+
+ return false;
+}
+
+/*
+ * Queue a background inactivation worker if there are inodes that need to be
+ * inactivated and higher level xfs code hasn't disabled the background
+ * workers.
+ */
+static void
+xfs_inodegc_queue(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_inodegc *gc;
+ int items;
+ unsigned int shrinker_hits;
+ unsigned long queue_delay = 1;
+
+ trace_xfs_inode_set_need_inactive(ip);
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags |= XFS_NEED_INACTIVE;
+ spin_unlock(&ip->i_flags_lock);
+
+ gc = get_cpu_ptr(mp->m_inodegc);
+ llist_add(&ip->i_gclist, &gc->list);
+ items = READ_ONCE(gc->items);
+ WRITE_ONCE(gc->items, items + 1);
+ shrinker_hits = READ_ONCE(gc->shrinker_hits);
+
+ /*
+ * We queue the work while holding the current CPU so that the work
+ * is scheduled to run on this CPU.
+ */
+ if (!xfs_is_inodegc_enabled(mp)) {
+ put_cpu_ptr(gc);
+ return;
+ }
+
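+	/* Drop the queueing delay if backlog, space or quota pressure says to run now. */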
+ if (xfs_inodegc_want_queue_work(ip, items))
+ queue_delay = 0;
+
+ trace_xfs_inodegc_queue(mp, __return_address);
+ mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
+ queue_delay);
+ put_cpu_ptr(gc);
+
+ if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
+ trace_xfs_inodegc_throttle(mp, __return_address);
+ flush_delayed_work(&gc->work);
+ }
+}
+
+/*
+ * Fold the dead CPU's inodegc queue into the current CPU's queue.
+ */
+void
+xfs_inodegc_cpu_dead(
+ struct xfs_mount *mp,
+ unsigned int dead_cpu)
+{
+ struct xfs_inodegc *dead_gc, *gc;
+ struct llist_node *first, *last;
+ unsigned int count = 0;
+
+ dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
+ cancel_delayed_work_sync(&dead_gc->work);
+
+ if (llist_empty(&dead_gc->list))
+ return;
+
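+	/* Detach the dead CPU's list, finding its tail and counting its inodes. */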
+ first = dead_gc->list.first;
+ last = first;
+ while (last->next) {
+ last = last->next;
+ count++;
+ }
+ dead_gc->list.first = NULL;
+ dead_gc->items = 0;
+
+ /* Add pending work to current CPU */
+ gc = get_cpu_ptr(mp->m_inodegc);
+ llist_add_batch(first, last, &gc->list);
+ count += READ_ONCE(gc->items);
+ WRITE_ONCE(gc->items, count);
+
+ if (xfs_is_inodegc_enabled(mp)) {
+ trace_xfs_inodegc_queue(mp, __return_address);
+ mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
+ 0);
+ }
+ put_cpu_ptr(gc);
+}
+
+/*
+ * We set the inode flag atomically with the radix tree tag. Once we get tag
+ * lookups on the radix tree, this inode flag can go away.
+ *
+ * We always use background reclaim here because even if the inode is clean, it
+ * still may be under IO and hence we have to wait for IO completion to occur
+ * before we can reclaim the inode. The background reclaim path handles this
+ * more efficiently than we can here, so simply let background reclaim tear down
+ * all inodes.
+ */
+void
+xfs_inode_mark_reclaimable(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ bool need_inactive;
+
+ XFS_STATS_INC(mp, vn_reclaim);
+
+ /*
+ * We should never get here with any of the reclaim flags already set.
+ */
+ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));
+
+ need_inactive = xfs_inode_needs_inactive(ip);
+ if (need_inactive) {
+ xfs_inodegc_queue(ip);
+ return;
+ }
+
+ /* Going straight to reclaim, so drop the dquots. */
+ xfs_qm_dqdetach(ip);
+ xfs_inodegc_set_reclaimable(ip);
+}
+
+/*
+ * Register a phony shrinker so that we can run background inodegc sooner when
+ * there's memory pressure. Inactivation does not itself free any memory but
+ * it does make inodes reclaimable, which eventually frees memory.
+ *
+ * The count function, seek value, and batch value are crafted to trigger the
+ * scan function during the second round of scanning. Hopefully this means
+ * that we reclaimed enough memory that initiating metadata transactions won't
+ * make things worse.
+ */
+#define XFS_INODEGC_SHRINKER_COUNT (1UL << DEF_PRIORITY)
+#define XFS_INODEGC_SHRINKER_BATCH ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)
+
+static unsigned long
+xfs_inodegc_shrinker_count(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_mount *mp = container_of(shrink, struct xfs_mount,
+ m_inodegc_shrinker);
+ struct xfs_inodegc *gc;
+ int cpu;
+
+ if (!xfs_is_inodegc_enabled(mp))
+ return 0;
+
+ for_each_online_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ if (!llist_empty(&gc->list))
+ return XFS_INODEGC_SHRINKER_COUNT;
+ }
+
+ return 0;
+}
+
+static unsigned long
+xfs_inodegc_shrinker_scan(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_mount *mp = container_of(shrink, struct xfs_mount,
+ m_inodegc_shrinker);
+ struct xfs_inodegc *gc;
+ int cpu;
+ bool no_items = true;
+
+ if (!xfs_is_inodegc_enabled(mp))
+ return SHRINK_STOP;
+
+ trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
+
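+	/* Record a shrinker hit and kick the worker on every CPU with queued inodes. */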
+ for_each_online_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ if (!llist_empty(&gc->list)) {
+ unsigned int h = READ_ONCE(gc->shrinker_hits);
+
+ WRITE_ONCE(gc->shrinker_hits, h + 1);
+ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
+ no_items = false;
+ }
+ }
+
+ /*
+ * If there are no inodes to inactivate, we don't want the shrinker
+ * to think there's deferred work to call us back about.
+ */
+ if (no_items)
+ return LONG_MAX;
+
+ return SHRINK_STOP;
+}
+
+/* Register a shrinker so we can accelerate inodegc and throttle queuing. */
+int
+xfs_inodegc_register_shrinker(
+ struct xfs_mount *mp)
+{
+ struct shrinker *shrink = &mp->m_inodegc_shrinker;
+
+ shrink->count_objects = xfs_inodegc_shrinker_count;
+ shrink->scan_objects = xfs_inodegc_shrinker_scan;
+ shrink->seeks = 0;
+ shrink->flags = SHRINKER_NONSLAB;
+ shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
+
+ return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id);
+}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
new file mode 100644
index 000000000..6cd180721
--- /dev/null
+++ b/fs/xfs/xfs_icache.h
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef XFS_SYNC_H
+#define XFS_SYNC_H 1
+
+struct xfs_mount;
+struct xfs_perag;
+
+struct xfs_icwalk {
+ __u32 icw_flags;
+ kuid_t icw_uid;
+ kgid_t icw_gid;
+ prid_t icw_prid;
+ __u64 icw_min_file_size;
+ long icw_scan_limit;
+};
+
+/* Flags that reflect xfs_fs_eofblocks functionality. */
+#define XFS_ICWALK_FLAG_SYNC (1U << 0) /* sync/wait mode scan */
+#define XFS_ICWALK_FLAG_UID (1U << 1) /* filter by uid */
+#define XFS_ICWALK_FLAG_GID (1U << 2) /* filter by gid */
+#define XFS_ICWALK_FLAG_PRID (1U << 3) /* filter by project id */
+#define XFS_ICWALK_FLAG_MINFILESIZE (1U << 4) /* filter by min file size */
+
+#define XFS_ICWALK_FLAGS_VALID (XFS_ICWALK_FLAG_SYNC | \
+ XFS_ICWALK_FLAG_UID | \
+ XFS_ICWALK_FLAG_GID | \
+ XFS_ICWALK_FLAG_PRID | \
+ XFS_ICWALK_FLAG_MINFILESIZE)
+
+/*
+ * Flags for xfs_iget()
+ */
+#define XFS_IGET_CREATE 0x1
+#define XFS_IGET_UNTRUSTED 0x2
+#define XFS_IGET_DONTCACHE 0x4
+#define XFS_IGET_INCORE 0x8 /* don't read from disk or reinit */
+
+int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
+ uint flags, uint lock_flags, xfs_inode_t **ipp);
+
+/* recovery needs direct inode allocation capability */
+struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino);
+void xfs_inode_free(struct xfs_inode *ip);
+
+void xfs_reclaim_worker(struct work_struct *work);
+
+void xfs_reclaim_inodes(struct xfs_mount *mp);
+long xfs_reclaim_inodes_count(struct xfs_mount *mp);
+long xfs_reclaim_inodes_nr(struct xfs_mount *mp, unsigned long nr_to_scan);
+
+void xfs_inode_mark_reclaimable(struct xfs_inode *ip);
+
+int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
+ unsigned int iwalk_flags);
+int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags);
+int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icw);
+void xfs_blockgc_flush_all(struct xfs_mount *mp);
+
+void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
+void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
+
+void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip);
+void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
+
+void xfs_blockgc_worker(struct work_struct *work);
+
+int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_ino_t ino, bool *inuse);
+
+void xfs_blockgc_stop(struct xfs_mount *mp);
+void xfs_blockgc_start(struct xfs_mount *mp);
+
+void xfs_inodegc_worker(struct work_struct *work);
+void xfs_inodegc_push(struct xfs_mount *mp);
+void xfs_inodegc_flush(struct xfs_mount *mp);
+void xfs_inodegc_stop(struct xfs_mount *mp);
+void xfs_inodegc_start(struct xfs_mount *mp);
+void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
+int xfs_inodegc_register_shrinker(struct xfs_mount *mp);
+
+#endif
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
new file mode 100644
index 000000000..b05314d48
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2008-2010, 2013 Dave Chinner
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_icreate_item.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+#include "xfs_ialloc.h"
+#include "xfs_trace.h"
+
+struct kmem_cache *xfs_icreate_cache; /* inode create item */
+
+static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_icreate_item, ic_item);
+}
+
+/*
+ * This returns the number of iovecs needed to log the given inode create item.
+ *
+ * We only need one iovec for the icreate log structure.
+ */
+STATIC void
+xfs_icreate_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_icreate_log);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given inode create log item.
+ */
+STATIC void
+xfs_icreate_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE,
+ &icp->ic_format,
+ sizeof(struct xfs_icreate_log));
+}
+
+STATIC void
+xfs_icreate_item_release(
+ struct xfs_log_item *lip)
+{
+ kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
+ kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip));
+}
+
+static const struct xfs_item_ops xfs_icreate_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .iop_size = xfs_icreate_item_size,
+ .iop_format = xfs_icreate_item_format,
+ .iop_release = xfs_icreate_item_release,
+};
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ *
+ * Inode extents can only reside within an AG. Hence specify the starting
+ * block for the inode chunk by offset within an AG as well as the
+ * length of the allocated extent.
+ *
+ * This joins the item to the transaction and marks it dirty so
+ * that we don't need a separate call to do this, nor does the
+ * caller need to know anything about the icreate item.
+ */
+void
+xfs_icreate_log(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ unsigned int count,
+ unsigned int inode_size,
+ xfs_agblock_t length,
+ unsigned int generation)
+{
+ struct xfs_icreate_item *icp;
+
+ icp = kmem_cache_zalloc(xfs_icreate_cache, GFP_KERNEL | __GFP_NOFAIL);
+
+ xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
+ &xfs_icreate_item_ops);
+
+ icp->ic_format.icl_type = XFS_LI_ICREATE;
+ icp->ic_format.icl_size = 1; /* single vector */
+ icp->ic_format.icl_ag = cpu_to_be32(agno);
+ icp->ic_format.icl_agbno = cpu_to_be32(agbno);
+ icp->ic_format.icl_count = cpu_to_be32(count);
+ icp->ic_format.icl_isize = cpu_to_be32(inode_size);
+ icp->ic_format.icl_length = cpu_to_be32(length);
+ icp->ic_format.icl_gen = cpu_to_be32(generation);
+
+ xfs_trans_add_item(tp, &icp->ic_item);
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &icp->ic_item.li_flags);
+}
+
+static enum xlog_recover_reorder
+xlog_recover_icreate_reorder(
+ struct xlog_recover_item *item)
+{
+ /*
+ * Inode allocation buffers must be replayed before subsequent inode
+ * items try to modify those buffers. ICREATE items are the logical
+ * equivalent of logging a newly initialized inode buffer, so recover
+ * these at the same time that we recover logged buffers.
+ */
+ return XLOG_REORDER_BUFFER_LIST;
+}
+
+/*
+ * This routine is called when an inode create format structure is found in a
+ * committed transaction in the log. Its purpose is to initialise the inodes
+ * being allocated on disk. This requires us to get inode cluster buffers that
+ * match the range to be initialised, stamped with inode templates and written
+ * by delayed write so that subsequent modifications will hit the cached buffer
+ * and only need writing out at the end of recovery.
+ */
+STATIC int
+xlog_recover_icreate_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_icreate_log *icl;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ unsigned int count;
+ unsigned int isize;
+ xfs_agblock_t length;
+ int bb_per_cluster;
+ int cancel_count;
+ int nbufs;
+ int i;
+
+ icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+ if (icl->icl_type != XFS_LI_ICREATE) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
+ return -EINVAL;
+ }
+
+ if (icl->icl_size != 1) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
+ return -EINVAL;
+ }
+
+ agno = be32_to_cpu(icl->icl_ag);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
+ return -EINVAL;
+ }
+ agbno = be32_to_cpu(icl->icl_agbno);
+ if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
+ return -EINVAL;
+ }
+ isize = be32_to_cpu(icl->icl_isize);
+ if (isize != mp->m_sb.sb_inodesize) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
+ return -EINVAL;
+ }
+ count = be32_to_cpu(icl->icl_count);
+ if (!count) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
+ return -EINVAL;
+ }
+ length = be32_to_cpu(icl->icl_length);
+ if (!length || length >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
+ return -EINVAL;
+ }
+
+ /*
+ * The inode chunk is either full or sparse and we only support
+ * m_ino_geo.ialloc_min_blks sized sparse allocations at this time.
+ */
+ if (length != igeo->ialloc_blks &&
+ length != igeo->ialloc_min_blks) {
+ xfs_warn(log->l_mp,
+ "%s: unsupported chunk length", __func__);
+ return -EINVAL;
+ }
+
+ /* verify inode count is consistent with extent length */
+ if ((count >> mp->m_sb.sb_inopblog) != length) {
+ xfs_warn(log->l_mp,
+ "%s: inconsistent inode count and chunk length",
+ __func__);
+ return -EINVAL;
+ }
+
+ /*
+ * The icreate transaction can cover multiple cluster buffers and these
+ * buffers could have been freed and reused. Check the individual
+ * buffers for cancellation so we don't overwrite anything written after
+ * a cancellation.
+ */
+ bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
+ nbufs = length / igeo->blocks_per_cluster;
+ for (i = 0, cancel_count = 0; i < nbufs; i++) {
+ xfs_daddr_t daddr;
+
+ daddr = XFS_AGB_TO_DADDR(mp, agno,
+ agbno + i * igeo->blocks_per_cluster);
+ if (xlog_is_buffer_cancelled(log, daddr, bb_per_cluster))
+ cancel_count++;
+ }
+
+ /*
+ * We currently only use icreate for a single allocation at a time. This
+ * means we should expect either all or none of the buffers to be
+ * cancelled. Be conservative and skip replay if at least one buffer is
+ * cancelled, but warn the user that something is awry if the buffers
+ * are not consistent.
+ *
+ * XXX: This must be refined to only skip cancelled clusters once we use
+ * icreate for multiple chunk allocations.
+ */
+ ASSERT(!cancel_count || cancel_count == nbufs);
+ if (cancel_count) {
+ if (cancel_count != nbufs)
+ xfs_warn(mp,
+ "WARNING: partial inode chunk cancellation, skipped icreate.");
+ trace_xfs_log_recover_icreate_cancel(log, icl);
+ return 0;
+ }
+
+ trace_xfs_log_recover_icreate_recover(log, icl);
+ return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
+ length, be32_to_cpu(icl->icl_gen));
+}
+
+const struct xlog_recover_item_ops xlog_icreate_item_ops = {
+ .item_type = XFS_LI_ICREATE,
+ .reorder = xlog_recover_icreate_reorder,
+ .commit_pass2 = xlog_recover_icreate_commit_pass2,
+};
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
new file mode 100644
index 000000000..649928231
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.h
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2008-2010, Dave Chinner
+ * All Rights Reserved.
+ */
+#ifndef XFS_ICREATE_ITEM_H
+#define XFS_ICREATE_ITEM_H 1
+
+/* in memory log item structure */
+struct xfs_icreate_item {
+ struct xfs_log_item ic_item;
+ struct xfs_icreate_log ic_format;
+};
+
+extern struct kmem_cache *xfs_icreate_cache; /* inode create item */
+
+void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, unsigned int count,
+ unsigned int inode_size, xfs_agblock_t length,
+ unsigned int generation);
+
+#endif /* XFS_ICREATE_ITEM_H */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
new file mode 100644
index 000000000..aa303be11
--- /dev/null
+++ b/fs/xfs/xfs_inode.c
@@ -0,0 +1,3540 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include <linux/iversion.h>
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_attr.h"
+#include "xfs_trans_space.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode_item.h"
+#include "xfs_iunlink_item.h"
+#include "xfs_ialloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_filestream.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_symlink.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_reflink.h"
+#include "xfs_ag.h"
+#include "xfs_log_priv.h"
+
+struct kmem_cache *xfs_inode_cache;
+
+/*
+ * Used in xfs_itruncate_extents(). This is the maximum number of extents
+ * freed from a file in a single transaction.
+ */
+#define XFS_ITRUNC_MAX_EXTENTS 2
+
+STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
+STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
+ struct xfs_inode *);
+
+/*
+ * helper function to extract extent size hint from inode
+ */
+xfs_extlen_t
+xfs_get_extsz_hint(
+ struct xfs_inode *ip)
+{
+ /*
+ * No point in aligning allocations if we need to COW to actually
+ * write to them.
+ */
+ if (xfs_is_always_cow_inode(ip))
+ return 0;
+ if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
+ return ip->i_extsize;
+ if (XFS_IS_REALTIME_INODE(ip))
+ return ip->i_mount->m_sb.sb_rextsize;
+ return 0;
+}
+
+/*
+ * Helper function to extract CoW extent size hint from inode.
+ * Between the extent size hint and the CoW extent size hint, we
+ * return the greater of the two. If the value is zero (automatic),
+ * use the default size.
+ */
+xfs_extlen_t
+xfs_get_cowextsz_hint(
+ struct xfs_inode *ip)
+{
+ xfs_extlen_t a, b;
+
+ a = 0;
+ if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
+ a = ip->i_cowextsize;
+ b = xfs_get_extsz_hint(ip);
+
+ a = max(a, b);
+ if (a == 0)
+ return XFS_DEFAULT_COWEXTSZ_HINT;
+ return a;
+}
+
+/*
+ * These two are wrapper routines around the xfs_ilock() routine used to
+ * centralize some grungy code. They are used in places that wish to lock the
+ * inode solely for reading the extents. The reason these places can't just
+ * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
+ * bringing in of the extents from disk for a file in b-tree format. If the
+ * inode is in b-tree format, then we need to lock the inode exclusively until
+ * the extents are read in. Locking it exclusively all the time would limit
+ * our parallelism unnecessarily, though. What we do instead is check to see
+ * if the extents have been read in yet, and only lock the inode exclusively
+ * if they have not.
+ *
+ * The functions return a value which should be given to the corresponding
+ * xfs_iunlock() call.
+ */
+uint
+xfs_ilock_data_map_shared(
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ if (xfs_need_iread_extents(&ip->i_df))
+ lock_mode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
+
+uint
+xfs_ilock_attr_map_shared(
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
+ lock_mode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
+
+/*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
+ * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
+ * to set in lock_flags.
+ */
+static inline void
+xfs_lock_flags_assert(
+ uint lock_flags)
+{
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+ (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ ASSERT(lock_flags != 0);
+}
+
+/*
+ * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
+ * multi-reader locks: invalidate_lock and the i_lock. This routine allows
+ * various combinations of the locks to be obtained.
+ *
+ * The 3 locks should always be ordered so that the IO lock is obtained first,
+ * the mmap lock second and the ilock last in order to prevent deadlock.
+ *
+ * Basic locking order:
+ *
+ * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
+ *
+ * mmap_lock locking order:
+ *
+ * i_rwsem -> page lock -> mmap_lock
+ * mmap_lock -> invalidate_lock -> page_lock
+ *
+ * The difference in mmap_lock locking order means that we cannot hold the
+ * invalidate_lock over syscall-based read(2)/write(2) IO. These IO paths
+ * can fault in pages during copy in/out (for buffered IO) or require the
+ * mmap_lock in get_user_pages() to map the user pages into the kernel address
+ * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
+ * fault because page faults already hold the mmap_lock.
+ *
+ * Hence to serialise fully against both syscall and mmap based IO, we need to
+ * take both the i_rwsem and the invalidate_lock. These locks should *only* be
+ * both taken in places where we need to invalidate the page cache in a race
+ * free manner (e.g. truncate, hole punch and other extent manipulation
+ * functions).
+ */
+void
+xfs_ilock(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ trace_xfs_ilock(ip, lock_flags, _RET_IP_);
+
+ xfs_lock_flags_assert(lock_flags);
+
+ if (lock_flags & XFS_IOLOCK_EXCL) {
+ down_write_nested(&VFS_I(ip)->i_rwsem,
+ XFS_IOLOCK_DEP(lock_flags));
+ } else if (lock_flags & XFS_IOLOCK_SHARED) {
+ down_read_nested(&VFS_I(ip)->i_rwsem,
+ XFS_IOLOCK_DEP(lock_flags));
+ }
+
+ if (lock_flags & XFS_MMAPLOCK_EXCL) {
+ down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+ XFS_MMAPLOCK_DEP(lock_flags));
+ } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
+ down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+ XFS_MMAPLOCK_DEP(lock_flags));
+ }
+
+ if (lock_flags & XFS_ILOCK_EXCL)
+ mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+ else if (lock_flags & XFS_ILOCK_SHARED)
+ mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+}
+
+/*
+ * This is just like xfs_ilock(), except that the caller
+ * is guaranteed not to sleep. It returns 1 if it gets
+ * the requested locks and 0 otherwise. If the IO lock is
+ * obtained but the inode lock cannot be, then the IO lock
+ * is dropped before returning.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ *	locked. See the comment for xfs_ilock() for a list
+ * of valid values.
+ */
+int
+xfs_ilock_nowait(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
+
+ xfs_lock_flags_assert(lock_flags);
+
+ if (lock_flags & XFS_IOLOCK_EXCL) {
+ if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
+ goto out;
+ } else if (lock_flags & XFS_IOLOCK_SHARED) {
+ if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
+ goto out;
+ }
+
+ if (lock_flags & XFS_MMAPLOCK_EXCL) {
+ if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
+ goto out_undo_iolock;
+ } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
+ if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
+ goto out_undo_iolock;
+ }
+
+ if (lock_flags & XFS_ILOCK_EXCL) {
+ if (!mrtryupdate(&ip->i_lock))
+ goto out_undo_mmaplock;
+ } else if (lock_flags & XFS_ILOCK_SHARED) {
+ if (!mrtryaccess(&ip->i_lock))
+ goto out_undo_mmaplock;
+ }
+ return 1;
+
+out_undo_mmaplock:
+ if (lock_flags & XFS_MMAPLOCK_EXCL)
+ up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
+ else if (lock_flags & XFS_MMAPLOCK_SHARED)
+ up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
+out_undo_iolock:
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ up_write(&VFS_I(ip)->i_rwsem);
+ else if (lock_flags & XFS_IOLOCK_SHARED)
+ up_read(&VFS_I(ip)->i_rwsem);
+out:
+ return 0;
+}
+
+/*
+ * xfs_iunlock() is used to drop the inode locks acquired with
+ * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
+ * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
+ * that we know which locks to drop.
+ *
+ * ip -- the inode being unlocked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ *	unlocked. See the comment for xfs_ilock() for a list
+ * of valid values for this parameter.
+ *
+ */
+void
+xfs_iunlock(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ xfs_lock_flags_assert(lock_flags);
+
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ up_write(&VFS_I(ip)->i_rwsem);
+ else if (lock_flags & XFS_IOLOCK_SHARED)
+ up_read(&VFS_I(ip)->i_rwsem);
+
+ if (lock_flags & XFS_MMAPLOCK_EXCL)
+ up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
+ else if (lock_flags & XFS_MMAPLOCK_SHARED)
+ up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
+
+ if (lock_flags & XFS_ILOCK_EXCL)
+ mrunlock_excl(&ip->i_lock);
+ else if (lock_flags & XFS_ILOCK_SHARED)
+ mrunlock_shared(&ip->i_lock);
+
+ trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
+}
+
+/*
+ * Give up write locks. The I/O lock cannot be held nested
+ * if it is being demoted.
+ */
+void
+xfs_ilock_demote(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
+ ASSERT((lock_flags &
+ ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+
+ if (lock_flags & XFS_ILOCK_EXCL)
+ mrdemote(&ip->i_lock);
+ if (lock_flags & XFS_MMAPLOCK_EXCL)
+ downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ downgrade_write(&VFS_I(ip)->i_rwsem);
+
+ trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
+}
+
+#if defined(DEBUG) || defined(XFS_WARN)
+static inline bool
+__xfs_rwsem_islocked(
+ struct rw_semaphore *rwsem,
+ bool shared)
+{
+ if (!debug_locks)
+ return rwsem_is_locked(rwsem);
+
+ if (!shared)
+ return lockdep_is_held_type(rwsem, 0);
+
+ /*
+ * We are checking that the lock is held at least in shared
+ * mode but don't care that it might be held exclusively
+ * (i.e. shared | excl). Hence we check if the lock is held
+ * in any mode rather than an explicit shared mode.
+ */
+ return lockdep_is_held_type(rwsem, -1);
+}
+
+bool
+xfs_isilocked(
+ struct xfs_inode *ip,
+ uint lock_flags)
+{
+ if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
+ if (!(lock_flags & XFS_ILOCK_SHARED))
+ return !!ip->i_lock.mr_writer;
+ return rwsem_is_locked(&ip->i_lock.mr_lock);
+ }
+
+ if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
+ return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
+ (lock_flags & XFS_MMAPLOCK_SHARED));
+ }
+
+ if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
+ return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
+ (lock_flags & XFS_IOLOCK_SHARED));
+ }
+
+ ASSERT(0);
+ return false;
+}
+#endif
+
+/*
+ * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
+ * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
+ * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
+ * errors and warnings.
+ */
+#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
+static bool
+xfs_lockdep_subclass_ok(
+ int subclass)
+{
+ return subclass < MAX_LOCKDEP_SUBCLASSES;
+}
+#else
+#define xfs_lockdep_subclass_ok(subclass) (true)
+#endif
+
+/*
+ * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
+ * value. This can be called for any type of inode lock combination, including
+ * parent locking. Care must be taken to ensure we don't overrun the subclass
+ * storage fields in the class mask we build.
+ */
+static inline uint
+xfs_lock_inumorder(
+ uint lock_mode,
+ uint subclass)
+{
+ uint class = 0;
+
+ ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
+ XFS_ILOCK_RTSUM)));
+ ASSERT(xfs_lockdep_subclass_ok(subclass));
+
+ if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
+ ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
+ class += subclass << XFS_IOLOCK_SHIFT;
+ }
+
+ if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
+ ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
+ class += subclass << XFS_MMAPLOCK_SHIFT;
+ }
+
+ if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
+ ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
+ class += subclass << XFS_ILOCK_SHIFT;
+ }
+
+ return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
+}
+
+/*
+ * The following routine will lock n inodes in exclusive mode. We assume the
+ * caller calls us with the inodes in i_ino order.
+ *
+ * We need to detect deadlock where an inode that we lock is in the AIL and we
+ * start waiting for another inode that is locked by a thread in a long running
+ * transaction (such as truncate). This can result in deadlock since the long
+ * running trans might need to wait for the inode we just locked in order to
+ * push the tail and free space in the log.
+ *
+ * xfs_lock_inodes() can only be used to lock one type of lock at a time -
+ * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
+ * lock more than one at a time, lockdep will report false positives saying we
+ * have violated locking orders.
+ */
+static void
+xfs_lock_inodes(
+ struct xfs_inode **ips,
+ int inodes,
+ uint lock_mode)
+{
+ int attempts = 0;
+ uint i;
+ int j;
+ bool try_lock;
+ struct xfs_log_item *lp;
+
+ /*
+ * Currently supports between 2 and 5 inodes with exclusive locking. We
+ * support an arbitrary depth of locking here, but absolute limits on
+ * inodes depend on the type of locking and the limits placed by
+ * lockdep annotations in xfs_lock_inumorder. These are all checked by
+ * the asserts.
+ */
+ ASSERT(ips && inodes >= 2 && inodes <= 5);
+ ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
+ XFS_ILOCK_EXCL));
+ ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
+ XFS_ILOCK_SHARED)));
+ ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
+ inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
+ ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
+ inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
+
+ if (lock_mode & XFS_IOLOCK_EXCL) {
+ ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
+ } else if (lock_mode & XFS_MMAPLOCK_EXCL)
+ ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
+
+again:
+ try_lock = false;
+ i = 0;
+ for (; i < inodes; i++) {
+ ASSERT(ips[i]);
+
+ if (i && (ips[i] == ips[i - 1])) /* Already locked */
+ continue;
+
+ /*
+ * If try_lock is not set yet, make sure all locked inodes are
+ * not in the AIL. If any are, set try_lock to be used later.
+ */
+ if (!try_lock) {
+ for (j = (i - 1); j >= 0 && !try_lock; j--) {
+ lp = &ips[j]->i_itemp->ili_item;
+ if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
+ try_lock = true;
+ }
+ }
+
+ /*
+ * If any of the previous locks we have locked is in the AIL,
+ * we must TRY to get the second and subsequent locks. If
+ * we can't get any, we must release all we have
+ * and try again.
+ */
+ if (!try_lock) {
+ xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
+ continue;
+ }
+
+ /* try_lock means we have an inode locked that is in the AIL. */
+ ASSERT(i != 0);
+ if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
+ continue;
+
+ /*
+ * Unlock all previous guys and try again. xfs_iunlock will try
+ * to push the tail if the inode is in the AIL.
+ */
+ attempts++;
+ for (j = i - 1; j >= 0; j--) {
+ /*
+ * Check to see if we've already unlocked this one. Not
+ * the first one going back, and the inode ptr is the
+ * same.
+ */
+ if (j != (i - 1) && ips[j] == ips[j + 1])
+ continue;
+
+ xfs_iunlock(ips[j], lock_mode);
+ }
+
+ if ((attempts % 5) == 0) {
+ delay(1); /* Don't just spin the CPU */
+ }
+ goto again;
+ }
+}
+
+/*
+ * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
+ * mmaplock must be double-locked separately since we use i_rwsem and
+ * invalidate_lock for that. We now support taking one lock EXCL and the
+ * other SHARED.
+ */
+void
+xfs_lock_two_inodes(
+ struct xfs_inode *ip0,
+ uint ip0_mode,
+ struct xfs_inode *ip1,
+ uint ip1_mode)
+{
+ int attempts = 0;
+ struct xfs_log_item *lp;
+
+ ASSERT(hweight32(ip0_mode) == 1);
+ ASSERT(hweight32(ip1_mode) == 1);
+ ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
+ ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
+ ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
+ ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
+ ASSERT(ip0->i_ino != ip1->i_ino);
+
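+	/* Always lock in ascending inode number order to avoid ABBA deadlocks. */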
+ if (ip0->i_ino > ip1->i_ino) {
+ swap(ip0, ip1);
+ swap(ip0_mode, ip1_mode);
+ }
+
+ again:
+ xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0));
+
+ /*
+ * If the first lock we have locked is in the AIL, we must TRY to get
+ * the second lock. If we can't get it, we must release the first one
+ * and try again.
+ */
+ lp = &ip0->i_itemp->ili_item;
+ if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
+ if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
+ xfs_iunlock(ip0, ip0_mode);
+ if ((++attempts % 5) == 0)
+ delay(1); /* Don't just spin the CPU */
+ goto again;
+ }
+ } else {
+ xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
+ }
+}
+
+uint
+xfs_ip2xflags(
+ struct xfs_inode *ip)
+{
+ uint flags = 0;
+
+ if (ip->i_diflags & XFS_DIFLAG_ANY) {
+ if (ip->i_diflags & XFS_DIFLAG_REALTIME)
+ flags |= FS_XFLAG_REALTIME;
+ if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
+ flags |= FS_XFLAG_PREALLOC;
+ if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
+ flags |= FS_XFLAG_IMMUTABLE;
+ if (ip->i_diflags & XFS_DIFLAG_APPEND)
+ flags |= FS_XFLAG_APPEND;
+ if (ip->i_diflags & XFS_DIFLAG_SYNC)
+ flags |= FS_XFLAG_SYNC;
+ if (ip->i_diflags & XFS_DIFLAG_NOATIME)
+ flags |= FS_XFLAG_NOATIME;
+ if (ip->i_diflags & XFS_DIFLAG_NODUMP)
+ flags |= FS_XFLAG_NODUMP;
+ if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
+ flags |= FS_XFLAG_RTINHERIT;
+ if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
+ flags |= FS_XFLAG_PROJINHERIT;
+ if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
+ flags |= FS_XFLAG_NOSYMLINKS;
+ if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
+ flags |= FS_XFLAG_EXTSIZE;
+ if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
+ flags |= FS_XFLAG_EXTSZINHERIT;
+ if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
+ flags |= FS_XFLAG_NODEFRAG;
+ if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
+ flags |= FS_XFLAG_FILESTREAM;
+ }
+
+ if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
+ if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
+ flags |= FS_XFLAG_DAX;
+ if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
+ flags |= FS_XFLAG_COWEXTSIZE;
+ }
+
+ if (xfs_inode_has_attr_fork(ip))
+ flags |= FS_XFLAG_HASATTR;
+ return flags;
+}
+
+/*
+ * Looks up an inode from "name". If ci_name is not NULL, then a CI match
+ * is allowed, otherwise it has to be an exact match. If a CI match is found,
+ * ci_name->name will point to the actual name (caller must free) or
+ * will be set to NULL if an exact match is found.
+ */
+int
+xfs_lookup(
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ struct xfs_inode **ipp,
+ struct xfs_name *ci_name)
+{
+ xfs_ino_t inum;
+ int error;
+
+ trace_xfs_lookup(dp, name);
+
+ if (xfs_is_shutdown(dp->i_mount))
+ return -EIO;
+
+ error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
+ if (error)
+ goto out_unlock;
+
+ error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
+ if (error)
+ goto out_free_name;
+
+ return 0;
+
+out_free_name:
+ if (ci_name)
+ kmem_free(ci_name->name);
+out_unlock:
+ *ipp = NULL;
+ return error;
+}
+
+/* Propagate di_flags from a parent inode to a child inode. */
+static void
+xfs_inode_inherit_flags(
+ struct xfs_inode *ip,
+ const struct xfs_inode *pip)
+{
+ unsigned int di_flags = 0;
+ xfs_failaddr_t failaddr;
+ umode_t mode = VFS_I(ip)->i_mode;
+
+ if (S_ISDIR(mode)) {
+ if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
+ di_flags |= XFS_DIFLAG_RTINHERIT;
+ if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
+ di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+ ip->i_extsize = pip->i_extsize;
+ }
+ if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
+ di_flags |= XFS_DIFLAG_PROJINHERIT;
+ } else if (S_ISREG(mode)) {
+ if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ xfs_has_realtime(ip->i_mount))
+ di_flags |= XFS_DIFLAG_REALTIME;
+ if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
+ di_flags |= XFS_DIFLAG_EXTSIZE;
+ ip->i_extsize = pip->i_extsize;
+ }
+ }
+ if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
+ xfs_inherit_noatime)
+ di_flags |= XFS_DIFLAG_NOATIME;
+ if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
+ xfs_inherit_nodump)
+ di_flags |= XFS_DIFLAG_NODUMP;
+ if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
+ xfs_inherit_sync)
+ di_flags |= XFS_DIFLAG_SYNC;
+ if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
+ xfs_inherit_nosymlinks)
+ di_flags |= XFS_DIFLAG_NOSYMLINKS;
+ if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
+ xfs_inherit_nodefrag)
+ di_flags |= XFS_DIFLAG_NODEFRAG;
+ if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
+ di_flags |= XFS_DIFLAG_FILESTREAM;
+
+ ip->i_diflags |= di_flags;
+
+ /*
+ * Inode verifiers on older kernels only checked that the extent size
+ * hint is an integer multiple of the rt extent size on realtime files.
+ * They did not check the hint alignment on a directory with both
+ * rtinherit and extszinherit flags set. If the misaligned hint is
+ * propagated from a directory into a new realtime file, new file
+ * allocations will fail due to math errors in the rt allocator and/or
+ * trip the verifiers. Validate the hint settings in the new file so
+ * that we don't let broken hints propagate.
+ */
+ failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
+ VFS_I(ip)->i_mode, ip->i_diflags);
+ if (failaddr) {
+ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ ip->i_extsize = 0;
+ }
+}
+
+/* Propagate di_flags2 from a parent inode to a child inode. */
+static void
+xfs_inode_inherit_flags2(
+ struct xfs_inode *ip,
+ const struct xfs_inode *pip)
+{
+ xfs_failaddr_t failaddr;
+
+ if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
+ ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_cowextsize = pip->i_cowextsize;
+ }
+ if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
+ ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+
+ /* Don't let invalid cowextsize hints propagate. */
+ failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
+ VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
+ if (failaddr) {
+ ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_cowextsize = 0;
+ }
+}
+
+/*
+ * Initialise a newly allocated inode and return the in-core inode to the
+ * caller locked exclusively.
+ */
+int
+xfs_init_new_inode(
+ struct user_namespace *mnt_userns,
+ struct xfs_trans *tp,
+ struct xfs_inode *pip,
+ xfs_ino_t ino,
+ umode_t mode,
+ xfs_nlink_t nlink,
+ dev_t rdev,
+ prid_t prid,
+ bool init_xattrs,
+ struct xfs_inode **ipp)
+{
+ struct inode *dir = pip ? VFS_I(pip) : NULL;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *ip;
+ unsigned int flags;
+ int error;
+ struct timespec64 tv;
+ struct inode *inode;
+
+ /*
+ * Protect against obviously corrupt allocation btree records. Later
+ * xfs_iget checks will catch re-allocation of other active in-memory
+ * and on-disk inodes. If we don't catch reallocating the parent inode
+ * here we will deadlock in xfs_iget() so we have to do these checks
+ * first.
+ */
+ if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
+ xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Get the in-core inode with the lock held exclusively to prevent
+ * others from looking at it until we're done.
+ */
+ error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
+ if (error)
+ return error;
+
+ ASSERT(ip != NULL);
+ inode = VFS_I(ip);
+ set_nlink(inode, nlink);
+ inode->i_rdev = rdev;
+ ip->i_projid = prid;
+
+ if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
+ inode_fsuid_set(inode, mnt_userns);
+ inode->i_gid = dir->i_gid;
+ inode->i_mode = mode;
+ } else {
+ inode_init_owner(mnt_userns, inode, dir, mode);
+ }
+
+ /*
+ * If the group ID of the new file does not match the effective group
+ * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
+ * (and only if the irix_sgid_inherit compatibility variable is set).
+ */
+ if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
+ !vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)))
+ inode->i_mode &= ~S_ISGID;
+
+ ip->i_disk_size = 0;
+ ip->i_df.if_nextents = 0;
+ ASSERT(ip->i_nblocks == 0);
+
+ tv = current_time(inode);
+ inode->i_mtime = tv;
+ inode->i_atime = tv;
+ inode->i_ctime = tv;
+
+ ip->i_extsize = 0;
+ ip->i_diflags = 0;
+
+ if (xfs_has_v3inodes(mp)) {
+ inode_set_iversion(inode, 1);
+ ip->i_cowextsize = 0;
+ ip->i_crtime = tv;
+ }
+
+ flags = XFS_ILOG_CORE;
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ ip->i_df.if_format = XFS_DINODE_FMT_DEV;
+ flags |= XFS_ILOG_DEV;
+ break;
+ case S_IFREG:
+ case S_IFDIR:
+ if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
+ xfs_inode_inherit_flags(ip, pip);
+ if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
+ xfs_inode_inherit_flags2(ip, pip);
+ fallthrough;
+ case S_IFLNK:
+ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
+ ip->i_df.if_bytes = 0;
+ ip->i_df.if_u1.if_root = NULL;
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ /*
+ * If we need to create attributes immediately after allocating the
+ * inode, initialise an empty attribute fork right now. We use the
+ * default fork offset for attributes here as we don't know exactly what
+ * size or how many attributes we might be adding. We can do this
+ * safely here because we know the data fork is completely empty and
+ * this saves us from needing to run a separate transaction to set the
+ * fork offset in the immediate future.
+ */
+ if (init_xattrs && xfs_has_attr(mp)) {
+ ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
+ xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
+ }
+
+ /*
+ * Log the new values stuffed into the inode.
+ */
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, flags);
+
+ /* now that we have an i_mode we can setup the inode structure */
+ xfs_setup_inode(ip);
+
+ *ipp = ip;
+ return 0;
+}
+
+/*
+ * Decrement the link count on an inode & log the change. If this causes the
+ * link count to go to zero, move the inode to the AGI unlinked list so that it can
+ * be freed when the last active reference goes away via xfs_inactive().
+ */
+static int /* error */
+xfs_droplink(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip)
+{
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+ drop_nlink(VFS_I(ip));
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ if (VFS_I(ip)->i_nlink)
+ return 0;
+
+ return xfs_iunlink(tp, ip);
+}
+
+/*
+ * Increment the link count on an inode & log the change.
+ */
+static void
+xfs_bumplink(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip)
+{
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+ inc_nlink(VFS_I(ip));
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+int
+xfs_create(
+ struct user_namespace *mnt_userns,
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ umode_t mode,
+ dev_t rdev,
+ bool init_xattrs,
+ xfs_inode_t **ipp)
+{
+ int is_dir = S_ISDIR(mode);
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_inode *ip = NULL;
+ struct xfs_trans *tp = NULL;
+ int error;
+ bool unlock_dp_on_error = false;
+ prid_t prid;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
+ struct xfs_trans_res *tres;
+ uint resblks;
+ xfs_ino_t ino;
+
+ trace_xfs_create(dp, name);
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ prid = xfs_get_initial_prid(dp);
+
+ /*
+ * Make sure that we have allocated dquot(s) on disk.
+ */
+ error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
+ mapped_fsgid(mnt_userns, &init_user_ns), prid,
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+ &udqp, &gdqp, &pdqp);
+ if (error)
+ return error;
+
+ if (is_dir) {
+ resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+ tres = &M_RES(mp)->tr_mkdir;
+ } else {
+ resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+ tres = &M_RES(mp)->tr_create;
+ }
+
+ /*
+ * Initially assume that the file does not exist and
+ * reserve the resources for that case. If that is not
+ * the case we'll drop the one we have and get a more
+ * appropriate transaction later.
+ */
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
+ &tp);
+ if (error == -ENOSPC) {
+ /* flush outstanding delalloc blocks and retry */
+ xfs_flush_inodes(mp);
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
+ resblks, &tp);
+ }
+ if (error)
+ goto out_release_dquots;
+
+ xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+ unlock_dp_on_error = true;
+
+ /*
+ * A newly created regular or special file just has one directory
+ * entry pointing to it, but a directory also has the "." entry
+ * pointing to itself.
+ */
+ error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+ if (!error)
+ error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode,
+ is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * Now we join the directory inode to the transaction. We do not do it
+ * earlier because xfs_dialloc might commit the previous transaction
+ * (and release all the locks). An error from here on will result in
+ * the transaction cancel unlocking dp so don't do it explicitly in the
+ * error path.
+ */
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ unlock_dp_on_error = false;
+
+ error = xfs_dir_createname(tp, dp, name, ip->i_ino,
+ resblks - XFS_IALLOC_SPACE_RES(mp));
+ if (error) {
+ ASSERT(error != -ENOSPC);
+ goto out_trans_cancel;
+ }
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+ if (is_dir) {
+ error = xfs_dir_init(tp, ip, dp);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_bumplink(tp, dp);
+ }
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * create transaction goes to disk before returning to
+ * the user.
+ */
+ if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
+ xfs_trans_set_sync(tp);
+
+ /*
+ * Attach the dquot(s) to the inodes and modify them incore.
+ * The IDs of the inode couldn't have changed since the new
+ * inode has been locked ever since it was created.
+ */
+ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_release_inode;
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ *ipp = ip;
+ return 0;
+
+ out_trans_cancel:
+ xfs_trans_cancel(tp);
+ out_release_inode:
+ /*
+ * Wait until after the current transaction is aborted to finish the
+ * setup of the inode and release the inode. This prevents recursive
+ * transactions and deadlocks from xfs_inactive.
+ */
+ if (ip) {
+ xfs_finish_inode_setup(ip);
+ xfs_irele(ip);
+ }
+ out_release_dquots:
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ if (unlock_dp_on_error)
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ return error;
+}
+
+int
+xfs_create_tmpfile(
+ struct user_namespace *mnt_userns,
+ struct xfs_inode *dp,
+ umode_t mode,
+ struct xfs_inode **ipp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_inode *ip = NULL;
+ struct xfs_trans *tp = NULL;
+ int error;
+ prid_t prid;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
+ struct xfs_trans_res *tres;
+ uint resblks;
+ xfs_ino_t ino;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ prid = xfs_get_initial_prid(dp);
+
+ /*
+ * Make sure that we have allocated dquot(s) on disk.
+ */
+ error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
+ mapped_fsgid(mnt_userns, &init_user_ns), prid,
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+ &udqp, &gdqp, &pdqp);
+ if (error)
+ return error;
+
+ resblks = XFS_IALLOC_SPACE_RES(mp);
+ tres = &M_RES(mp)->tr_create_tmpfile;
+
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
+ &tp);
+ if (error)
+ goto out_release_dquots;
+
+ error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+ if (!error)
+ error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode,
+ 0, 0, prid, false, &ip);
+ if (error)
+ goto out_trans_cancel;
+
+ if (xfs_has_wsync(mp))
+ xfs_trans_set_sync(tp);
+
+ /*
+ * Attach the dquot(s) to the inodes and modify them incore.
+ * The IDs of the inode couldn't have changed since the new
+ * inode has been locked ever since it was created.
+ */
+ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+ error = xfs_iunlink(tp, ip);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_release_inode;
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ *ipp = ip;
+ return 0;
+
+ out_trans_cancel:
+ xfs_trans_cancel(tp);
+ out_release_inode:
+ /*
+ * Wait until after the current transaction is aborted to finish the
+ * setup of the inode and release the inode. This prevents recursive
+ * transactions and deadlocks from xfs_inactive.
+ */
+ if (ip) {
+ xfs_finish_inode_setup(ip);
+ xfs_irele(ip);
+ }
+ out_release_dquots:
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ return error;
+}
+
+int
+xfs_link(
+ xfs_inode_t *tdp,
+ xfs_inode_t *sip,
+ struct xfs_name *target_name)
+{
+ xfs_mount_t *mp = tdp->i_mount;
+ xfs_trans_t *tp;
+ int error, nospace_error = 0;
+ int resblks;
+
+ trace_xfs_link(tdp, target_name);
+
+ ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ error = xfs_qm_dqattach(sip);
+ if (error)
+ goto std_return;
+
+ error = xfs_qm_dqattach(tdp);
+ if (error)
+ goto std_return;
+
+ resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
+ error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
+ &tp, &nospace_error);
+ if (error)
+ goto std_return;
+
+ /*
+ * If we are using project inheritance, we only allow hard link
+ * creation in our tree when the project IDs are the same; else
+ * the tree quota mechanism could be circumvented.
+ */
+ if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
+ tdp->i_projid != sip->i_projid)) {
+ error = -EXDEV;
+ goto error_return;
+ }
+
+ if (!resblks) {
+ error = xfs_dir_canenter(tp, tdp, target_name);
+ if (error)
+ goto error_return;
+ }
+
+ /*
+ * Handle initial link state of O_TMPFILE inode
+ */
+ if (VFS_I(sip)->i_nlink == 0) {
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
+ error = xfs_iunlink_remove(tp, pag, sip);
+ xfs_perag_put(pag);
+ if (error)
+ goto error_return;
+ }
+
+ error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
+ resblks);
+ if (error)
+ goto error_return;
+ xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
+
+ xfs_bumplink(tp, sip);
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * link transaction goes to disk before returning to
+ * the user.
+ */
+ if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
+ xfs_trans_set_sync(tp);
+
+ return xfs_trans_commit(tp);
+
+ error_return:
+ xfs_trans_cancel(tp);
+ std_return:
+ if (error == -ENOSPC && nospace_error)
+ error = nospace_error;
+ return error;
+}
+
+/* Clear the reflink flag and the cowblocks tag if possible. */
+static void
+xfs_itruncate_clear_reflink_flags(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *dfork;
+ struct xfs_ifork *cfork;
+
+ if (!xfs_is_reflink_inode(ip))
+ return;
+ dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
+ if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
+ ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ if (cfork->if_bytes == 0)
+ xfs_inode_clear_cowblocks_tag(ip);
+}
+
+/*
+ * Free up the underlying blocks past new_size. The new size must be smaller
+ * than the current size. This routine can be used both for the attribute and
+ * data fork, and does not modify the inode size, which is left to the caller.
+ *
+ * The transaction passed to this routine must have made a permanent log
+ * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
+ * given transaction and start new ones, so make sure everything involved in
+ * the transaction is tidy before calling here. Some transaction will be
+ * returned to the caller to be committed. The incoming transaction must
+ * already include the inode, and both inode locks must be held exclusively.
+ * The inode must also be "held" within the transaction. On return the inode
+ * will be "held" within the returned transaction. This routine does NOT
+ * require any disk space to be reserved for it within the transaction.
+ *
+ * If we get an error, we must return with the inode locked and linked into the
+ * current transaction. This keeps things simple for the higher level code,
+ * because it always knows that the inode is locked and held in the transaction
+ * that returns to it whether errors occur or not. We don't mark the inode
+ * dirty on error so that transactions can be easily aborted if possible.
+ */
+int
+xfs_itruncate_extents_flags(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fsize_t new_size,
+ int flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp = *tpp;
+ xfs_fileoff_t first_unmap_block;
+ xfs_filblks_t unmap_len;
+ int error = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
+ xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(new_size <= XFS_ISIZE(ip));
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ ASSERT(ip->i_itemp != NULL);
+ ASSERT(ip->i_itemp->ili_lock_flags == 0);
+ ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+
+ trace_xfs_itruncate_extents_start(ip, new_size);
+
+ flags |= xfs_bmapi_aflag(whichfork);
+
+ /*
+ * Since it is possible for space to become allocated beyond
+ * the end of the file (in a crash where the space is allocated
+ * but the inode size is not yet updated), simply remove any
+ * blocks which show up between the new EOF and the maximum
+ * possible file size.
+ *
+ * We have to free all the blocks to the bmbt maximum offset, even if
+ * the page cache can't scale that far.
+ */
+ first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
+ if (!xfs_verify_fileoff(mp, first_unmap_block)) {
+ WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
+ return 0;
+ }
+
+ unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
+ while (unmap_len > 0) {
+ ASSERT(tp->t_firstblock == NULLFSBLOCK);
+ error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
+ flags, XFS_ITRUNC_MAX_EXTENTS);
+ if (error)
+ goto out;
+
+ /* free the just unmapped extents */
+ error = xfs_defer_finish(&tp);
+ if (error)
+ goto out;
+ }
+
+ if (whichfork == XFS_DATA_FORK) {
+ /* Remove all pending CoW reservations. */
+ error = xfs_reflink_cancel_cow_blocks(ip, &tp,
+ first_unmap_block, XFS_MAX_FILEOFF, true);
+ if (error)
+ goto out;
+
+ xfs_itruncate_clear_reflink_flags(ip);
+ }
+
+ /*
+ * Always re-log the inode so that our permanent transaction can keep
+ * on rolling it forward in the log.
+ */
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ trace_xfs_itruncate_extents_end(ip, new_size);
+
+out:
+ *tpp = tp;
+ return error;
+}
+
+int
+xfs_release(
+ xfs_inode_t *ip)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ int error = 0;
+
+ if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
+ return 0;
+
+ /* If this is a read-only mount, don't do this (would generate I/O) */
+ if (xfs_is_readonly(mp))
+ return 0;
+
+ if (!xfs_is_shutdown(mp)) {
+ int truncated;
+
+ /*
+ * If we previously truncated this file and removed old data
+ * in the process, we want to initiate "early" writeout on
+ * the last close. This is an attempt to combat the notorious
+ * NULL files problem which is particularly noticeable from a
+ * truncate down, buffered (re-)write (delalloc), followed by
+ * a crash. What we are effectively doing here is
+ * significantly reducing the time window where we'd otherwise
+ * be exposed to that problem.
+ */
+ truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
+ if (truncated) {
+ xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
+ if (ip->i_delayed_blks > 0) {
+ error = filemap_flush(VFS_I(ip)->i_mapping);
+ if (error)
+ return error;
+ }
+ }
+ }
+
+ if (VFS_I(ip)->i_nlink == 0)
+ return 0;
+
+ /*
+ * If we can't get the iolock just skip truncating the blocks past EOF
+ * because we could deadlock with the mmap_lock otherwise. We'll get
+ * another chance to drop them once the last reference to the inode is
+ * dropped, so we'll never leak blocks permanently.
+ */
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
+ return 0;
+
+ if (xfs_can_free_eofblocks(ip, false)) {
+ /*
+ * If the inode is being opened, written and closed frequently
+ * and we have delayed allocation blocks outstanding (e.g.
+ * streaming writes from the NFS server), truncating the blocks
+ * past EOF will cause fragmentation to occur.
+ *
+ * In this case don't do the truncation, but we have to be
+ * careful how we detect this case. Blocks beyond EOF show up as
+ * i_delayed_blks even when the inode is clean, so we need to
+ * truncate them away first before checking for a dirty release.
+ * Hence on the first dirty close we will still remove the
+ * speculative allocation, but after that we will leave it in
+ * place.
+ */
+ if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+ goto out_unlock;
+
+ error = xfs_free_eofblocks(ip);
+ if (error)
+ goto out_unlock;
+
+ /* delalloc blocks after truncation means it really is dirty */
+ if (ip->i_delayed_blks)
+ xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
+ }
+
+out_unlock:
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+}
+
+/*
+ * xfs_inactive_truncate
+ *
+ * Called to perform a truncate when an inode becomes unlinked.
+ */
+STATIC int
+xfs_inactive_truncate(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ if (error) {
+ ASSERT(xfs_is_shutdown(mp));
+ return error;
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /*
+ * Log the inode size first to prevent stale data exposure in the event
+ * of a system crash before the truncate completes. See the related
+ * comment in xfs_vn_setattr_size() for details.
+ */
+ ip->i_disk_size = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
+ if (error)
+ goto error_trans_cancel;
+
+ ASSERT(ip->i_df.if_nextents == 0);
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto error_unlock;
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+
+error_trans_cancel:
+ xfs_trans_cancel(tp);
+error_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * xfs_inactive_ifree()
+ *
+ * Perform the inode free when an inode is unlinked.
+ */
+STATIC int
+xfs_inactive_ifree(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ /*
+ * We try to use a per-AG reservation for any block needed by the finobt
+ * tree, but as the finobt feature predates the per-AG reservation
+ * support a degraded file system might not have enough space for the
+ * reservation at mount time. In that case try to dip into the reserved
+ * pool and pray.
+ *
+ * Send a warning if the reservation does happen to fail, as the inode
+ * now remains allocated and sits on the unlinked list until the fs is
+ * repaired.
+ */
+ if (unlikely(mp->m_finobt_nores)) {
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
+ XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
+ &tp);
+ } else {
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
+ }
+ if (error) {
+ if (error == -ENOSPC) {
+ xfs_warn_ratelimited(mp,
+ "Failed to remove inode(s) from unlinked list. "
+ "Please free space, unmount and run xfs_repair.");
+ } else {
+ ASSERT(xfs_is_shutdown(mp));
+ }
+ return error;
+ }
+
+ /*
+ * We do not hold the inode locked across the entire rolling transaction
+ * here. We only need to hold it for the first transaction that
+ * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
+ * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
+ * here breaks the relationship between cluster buffer invalidation and
+ * stale inode invalidation on cluster buffer item journal commit
+ * completion, and can result in leaving dirty stale inodes hanging
+ * around in memory.
+ *
+ * We have no need for serialising this inode operation against other
+ * operations - we freed the inode and hence reallocation is required
+ * and that will serialise on reallocating the space the deferops need
+ * to free. Hence we can unlock the inode on the first commit of
+ * the transaction rather than roll it right through the deferops. This
+ * avoids relogging the XFS_ISTALE inode.
+ *
+ * We check that xfs_ifree() hasn't grown an internal transaction roll
+ * by asserting that the inode is still locked when it returns.
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ error = xfs_ifree(tp, ip);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (error) {
+ /*
+ * If we fail to free the inode, shut down. The cancel
+ * might do that, we need to make sure. Otherwise the
+ * inode might be lost for a long time or forever.
+ */
+ if (!xfs_is_shutdown(mp)) {
+ xfs_notice(mp, "%s: xfs_ifree returned error %d",
+ __func__, error);
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ }
+ xfs_trans_cancel(tp);
+ return error;
+ }
+
+ /*
+ * Credit the quota account(s). The inode is gone.
+ */
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
+
+ /*
+ * Just ignore errors at this point. There is nothing we can do except
+ * to try to keep going. Make sure it's not a silent error.
+ */
+ error = xfs_trans_commit(tp);
+ if (error)
+ xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
+ __func__, error);
+
+ return 0;
+}
+
+/*
+ * Returns true if we need to update the on-disk metadata before we can free
+ * the memory used by this inode. Updates include freeing post-eof
+ * preallocations; freeing COW staging extents; and marking the inode free in
+ * the inobt if it is on the unlinked list.
+ */
+bool
+xfs_inode_needs_inactive(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
+
+ /*
+ * If the inode is already free, then there can be nothing
+ * to clean up here.
+ */
+ if (VFS_I(ip)->i_mode == 0)
+ return false;
+
+ /* If this is a read-only mount, don't do this (would generate I/O) */
+ if (xfs_is_readonly(mp))
+ return false;
+
+ /* If the log isn't running, push inodes straight to reclaim. */
+ if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp))
+ return false;
+
+ /* Metadata inodes require explicit resource cleanup. */
+ if (xfs_is_metadata_inode(ip))
+ return false;
+
+ /* Want to clean out the cow blocks if there are any. */
+ if (cow_ifp && cow_ifp->if_bytes > 0)
+ return true;
+
+ /* Unlinked files must be freed. */
+ if (VFS_I(ip)->i_nlink == 0)
+ return true;
+
+ /*
+ * This file isn't being freed, so check if there are post-eof blocks
+ * to free. @force is true because we are evicting an inode from the
+ * cache. Post-eof blocks must be freed, lest we end up with broken
+ * free space accounting.
+ *
+ * Note: don't bother with iolock here since lockdep complains about
+ * acquiring it in reclaim context. We have the only reference to the
+ * inode at this point anyways.
+ */
+ return xfs_can_free_eofblocks(ip, true);
+}
+
+/*
+ * xfs_inactive
+ *
+ * This is called when the vnode's reference count goes to
+ * zero. If the file has been unlinked, then it must
+ * now be truncated. Also, we clear all of the read-ahead state
+ * kept for the inode here since the file is now closed.
+ */
+void
+xfs_inactive(
+ xfs_inode_t *ip)
+{
+ struct xfs_mount *mp;
+ int error;
+ int truncate = 0;
+
+ /*
+ * If the inode is already free, then there can be nothing
+ * to clean up here.
+ */
+ if (VFS_I(ip)->i_mode == 0) {
+ ASSERT(ip->i_df.if_broot_bytes == 0);
+ goto out;
+ }
+
+ mp = ip->i_mount;
+ ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
+
+ /* If this is a read-only mount, don't do this (would generate I/O) */
+ if (xfs_is_readonly(mp))
+ goto out;
+
+ /* Metadata inodes require explicit resource cleanup. */
+ if (xfs_is_metadata_inode(ip))
+ goto out;
+
+ /* Try to clean out the cow blocks if there are any. */
+ if (xfs_inode_has_cow_data(ip))
+ xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
+
+ if (VFS_I(ip)->i_nlink != 0) {
+ /*
+ * force is true because we are evicting an inode from the
+ * cache. Post-eof blocks must be freed, lest we end up with
+ * broken free space accounting.
+ *
+ * Note: don't bother with iolock here since lockdep complains
+ * about acquiring it in reclaim context. We have the only
+ * reference to the inode at this point anyways.
+ */
+ if (xfs_can_free_eofblocks(ip, true))
+ xfs_free_eofblocks(ip);
+
+ goto out;
+ }
+
+ if (S_ISREG(VFS_I(ip)->i_mode) &&
+ (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 ||
+ ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
+ truncate = 1;
+
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ goto out;
+
+ if (S_ISLNK(VFS_I(ip)->i_mode))
+ error = xfs_inactive_symlink(ip);
+ else if (truncate)
+ error = xfs_inactive_truncate(ip);
+ if (error)
+ goto out;
+
+ /*
+ * If there are attributes associated with the file then blow them away
+ * now. The code calls a routine that recursively deconstructs the
+ * attribute fork. It also blows away the in-core attribute fork.
+ */
+ if (xfs_inode_has_attr_fork(ip)) {
+ error = xfs_attr_inactive(ip);
+ if (error)
+ goto out;
+ }
+
+ ASSERT(ip->i_forkoff == 0);
+
+ /*
+ * Free the inode.
+ */
+ xfs_inactive_ifree(ip);
+
+out:
+ /*
+ * We're done making metadata updates for this inode, so we can release
+ * the attached dquots.
+ */
+ xfs_qm_dqdetach(ip);
+}
+
+/*
+ * In-Core Unlinked List Lookups
+ * =============================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory. Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain. This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * Hence we keep an in-memory doubly linked list to link each inode on an
+ * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
+ * based lists would require having 64 list heads in the perag, one for each
+ * list. This is expensive in terms of memory (think millions of AGs) and cache
+ * misses on lookups. Instead, use the fact that inodes on the unlinked list
+ * must be referenced at the VFS level to keep them on the list and hence we
+ * have an existence guarantee for inodes on the unlinked list.
+ *
+ * Given we have an existence guarantee, we can use lockless inode cache lookups
+ * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
+ * for the doubly linked unlinked list, and we don't need any extra locking to
+ * keep the list safe as all manipulations are done under the AGI buffer lock.
+ * Keeping the list up to date does not require memory allocation, just finding
+ * the XFS inode and updating the next/prev unlinked list aginos.
+ */
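+
+/*
+ * A sketch of one unlinked bucket (editorial illustration): only the AGI
+ * bucket head and the forward (next) agino pointers exist on disk; the
+ * backwards (prev) pointers live purely in the incore inodes.
+ *
+ *     agi_unlinked[b] -> ino A -> ino B -> ino C -> NULLAGINO   (on disk)
+ *
+ *     incore:  A.i_prev_unlinked = NULLAGINO
+ *              B.i_prev_unlinked = agino(A)
+ *              C.i_prev_unlinked = agino(B)
+ */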
+
+/*
+ * Find an inode on the unlinked list. This does not take references to the
+ * inode as we have existence guarantees by holding the AGI buffer lock and that
+ * only unlinked, referenced inodes can be on the unlinked inode list. If we
+ * don't find the inode in cache, then let the caller handle the situation.
+ */
+static struct xfs_inode *
+xfs_iunlink_lookup(
+ struct xfs_perag *pag,
+ xfs_agino_t agino)
+{
+ struct xfs_inode *ip;
+
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+ /*
+ * An inode missing from memory or in RCU freeing limbo should not happen.
+ * Warn about this and let the caller handle the failure.
+ */
+ if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM));
+ rcu_read_unlock();
+ return ip;
+}
+
+/* Update the prev pointer of the next agino. */
+static int
+xfs_iunlink_update_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t prev_agino,
+ xfs_agino_t next_agino)
+{
+ struct xfs_inode *ip;
+
+ /* No update necessary if we are at the end of the list. */
+ if (next_agino == NULLAGINO)
+ return 0;
+
+ ip = xfs_iunlink_lookup(pag, next_agino);
+ if (!ip)
+ return -EFSCORRUPTED;
+ ip->i_prev_unlinked = prev_agino;
+ return 0;
+}
+
+/*
+ * Point the AGI unlinked bucket at an inode and log the results. The caller
+ * is responsible for validating the old value.
+ */
+STATIC int
+xfs_iunlink_update_bucket(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_buf *agibp,
+ unsigned int bucket_index,
+ xfs_agino_t new_agino)
+{
+ struct xfs_agi *agi = agibp->b_addr;
+ xfs_agino_t old_value;
+ int offset;
+
+ ASSERT(xfs_verify_agino_or_null(pag, new_agino));
+
+ old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
+ old_value, new_agino);
+
+ /*
+ * We should never find the head of the list already set to the value
+ * passed in because either we're adding or removing ourselves from the
+ * head of the list.
+ */
+ if (old_value == new_agino) {
+ xfs_buf_mark_corrupt(agibp);
+ return -EFSCORRUPTED;
+ }
+
+ agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
+ offset = offsetof(struct xfs_agi, agi_unlinked) +
+ (sizeof(xfs_agino_t) * bucket_index);
+ xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+}
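+
+/*
+ * Worked example (editorial): agi_unlinked[] holds 32-bit aginos, so for
+ * bucket_index 3 the logged range covers the 4 bytes from
+ * offsetof(struct xfs_agi, agi_unlinked) + 12 through + 15, i.e. exactly
+ * that one bucket head and nothing around it.
+ */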
+
+static int
+xfs_iunlink_insert_inode(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_buf *agibp,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = agibp->b_addr;
+ xfs_agino_t next_agino;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
+
+ /*
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the pointer isn't garbage and that this inode
+ * isn't already on the list.
+ */
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (next_agino == agino ||
+ !xfs_verify_agino_or_null(pag, next_agino)) {
+ xfs_buf_mark_corrupt(agibp);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Update the prev pointer in the next inode to point back to this
+ * inode.
+ */
+ error = xfs_iunlink_update_backref(pag, agino, next_agino);
+ if (error)
+ return error;
+
+ if (next_agino != NULLAGINO) {
+ /*
+ * There is already another inode in the bucket, so point this
+ * inode to the current head of the list.
+ */
+ error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
+ if (error)
+ return error;
+ ip->i_next_unlinked = next_agino;
+ }
+
+ /* Point the head of the list to point to this inode. */
+ return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
+}
+
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
+ *
+ * We place the on-disk inode on a list in the AGI. It will be pulled from this
+ * list when the inode is freed.
+ */
+STATIC int
+xfs_iunlink(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ struct xfs_buf *agibp;
+ int error;
+
+ ASSERT(VFS_I(ip)->i_nlink == 0);
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ trace_xfs_iunlink(ip);
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = xfs_read_agi(pag, tp, &agibp);
+ if (error)
+ goto out;
+
+ error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
+out:
+ xfs_perag_put(pag);
+ return error;
+}
+
+static int
+xfs_iunlink_remove_inode(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_buf *agibp,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = agibp->b_addr;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ xfs_agino_t head_agino;
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
+
+ trace_xfs_iunlink_remove(ip);
+
+ /*
+ * Get the index into the agi hash table for the list this inode is
+ * on. Make sure the head pointer isn't garbage.
+ */
+ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (!xfs_verify_agino(pag, head_agino)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ agi, sizeof(*agi));
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Set our inode's next_unlinked pointer to NULL and then return
+ * the old pointer value so that we can update whatever was previous
+ * to us in the list to point to whatever was next in the list.
+ */
+ error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
+ if (error)
+ return error;
+
+ /*
+ * Update the prev pointer in the next inode to point back to previous
+ * inode in the chain.
+ */
+ error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
+ ip->i_next_unlinked);
+ if (error)
+ return error;
+
+ if (head_agino != agino) {
+ struct xfs_inode *prev_ip;
+
+ prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
+ if (!prev_ip)
+ return -EFSCORRUPTED;
+
+ error = xfs_iunlink_log_inode(tp, prev_ip, pag,
+ ip->i_next_unlinked);
+ prev_ip->i_next_unlinked = ip->i_next_unlinked;
+ } else {
+ /* Point the head of the list to the next unlinked inode. */
+ error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
+ ip->i_next_unlinked);
+ }
+
+ ip->i_next_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = NULLAGINO;
+ return error;
+}
+
+/*
+ * Pull the on-disk inode from the AGI unlinked list.
+ */
+STATIC int
+xfs_iunlink_remove(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ struct xfs_buf *agibp;
+ int error;
+
+ trace_xfs_iunlink_remove(ip);
+
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = xfs_read_agi(pag, tp, &agibp);
+ if (error)
+ return error;
+
+ return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
+}
+
+/*
+ * Look up the inode number specified and, if it is not already marked
+ * XFS_ISTALE, mark it stale. We should only find clean inodes in this
+ * lookup that aren't already stale.
+ */
+static void
+xfs_ifree_mark_inode_stale(
+ struct xfs_perag *pag,
+ struct xfs_inode *free_ip,
+ xfs_ino_t inum)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_inode_log_item *iip;
+ struct xfs_inode *ip;
+
+retry:
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
+
+ /* Inode not in memory, nothing to do */
+ if (!ip) {
+ rcu_read_unlock();
+ return;
+ }
+
+ /*
+ * Because this is an RCU protected lookup, we could find a recently
+ * freed or even reallocated inode during the lookup. We need to check
+ * under the i_flags_lock for a valid inode here. Skip it if it is not
+ * valid, the wrong inode or stale.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE))
+ goto out_iflags_unlock;
+
+ /*
+ * Don't try to lock/unlock the current inode, but we _cannot_ skip the
+ * other inodes that we did not find in the list attached to the buffer
+ * and are not already marked stale. If we can't lock it, back off and
+ * retry.
+ */
+ if (ip != free_ip) {
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ delay(1);
+ goto retry;
+ }
+ }
+ ip->i_flags |= XFS_ISTALE;
+
+ /*
+ * If the inode is flushing, it is already attached to the buffer. All
+ * we needed to do here is mark the inode stale so buffer IO completion
+ * will remove it from the AIL.
+ */
+ iip = ip->i_itemp;
+ if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
+ ASSERT(!list_empty(&iip->ili_item.li_bio_list));
+ ASSERT(iip->ili_last_fields);
+ goto out_iunlock;
+ }
+
+ /*
+ * Inodes not attached to the buffer can be released immediately.
+ * Everything else has to go through xfs_iflush_abort() on journal
+ * commit as the flock synchronises removal of the inode from the
+ * cluster buffer against inode reclaim.
+ */
+ if (!iip || list_empty(&iip->ili_item.li_bio_list))
+ goto out_iunlock;
+
+ __xfs_iflags_set(ip, XFS_IFLUSHING);
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+
+ /* we have a dirty inode in memory that has not yet been flushed. */
+ spin_lock(&iip->ili_lock);
+ iip->ili_last_fields = iip->ili_fields;
+ iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
+ spin_unlock(&iip->ili_lock);
+ ASSERT(iip->ili_last_fields);
+
+ if (ip != free_ip)
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return;
+
+out_iunlock:
+ if (ip != free_ip)
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_iflags_unlock:
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+}
+
+/*
+ * A big issue when freeing the inode cluster is that we _cannot_ skip any
+ * inodes that are in memory - they all must be marked stale and attached to
+ * the cluster buffer.
+ */
+static int
+xfs_ifree_cluster(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_inode *free_ip,
+ struct xfs_icluster *xic)
+{
+ struct xfs_mount *mp = free_ip->i_mount;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ struct xfs_buf *bp;
+ xfs_daddr_t blkno;
+ xfs_ino_t inum = xic->first_ino;
+ int nbufs;
+ int i, j;
+ int ioffset;
+ int error;
+
+ nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
+
+ for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
+ /*
+ * The allocation bitmap tells us which inodes of the chunk were
+ * physically allocated. Skip the cluster if an inode falls into
+ * a sparse region.
+ */
+ ioffset = inum - xic->first_ino;
+ if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
+ ASSERT(ioffset % igeo->inodes_per_cluster == 0);
+ continue;
+ }
+
+ blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
+ XFS_INO_TO_AGBNO(mp, inum));
+
+ /*
+ * We obtain and lock the backing buffer first in the process
+ * here to ensure dirty inodes attached to the buffer remain in
+ * the flushing state while we mark them stale.
+ *
+ * If we scan the in-memory inodes first, then buffer IO can
+ * complete before we get a lock on it, and hence we may fail
+ * to mark all the active inodes on the buffer stale.
+ */
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
+ mp->m_bsize * igeo->blocks_per_cluster,
+ XBF_UNMAPPED, &bp);
+ if (error)
+ return error;
+
+ /*
+ * This buffer may not have been correctly initialised as we
+ * didn't read it from disk. That's not important because we are
+ * only using it to mark the buffer as stale in the log, and to
+ * attach stale cached inodes on it. That means it will never be
+ * dispatched for IO. If it is, we want to know about it, and we
+ * want it to fail. We can achieve this by adding a write
+ * verifier to the buffer.
+ */
+ bp->b_ops = &xfs_inode_buf_ops;
+
+ /*
+ * Now we need to set all the cached clean inodes as XFS_ISTALE,
+ * too. This requires lookups, and will skip inodes that we've
+ * already marked XFS_ISTALE.
+ */
+ for (i = 0; i < igeo->inodes_per_cluster; i++)
+ xfs_ifree_mark_inode_stale(pag, free_ip, inum + i);
+
+ xfs_trans_stale_inode_buf(tp, bp);
+ xfs_trans_binval(tp, bp);
+ }
+ return 0;
+}
+
+/*
+ * This is called to return an inode to the inode free list. The inode should
+ * already be truncated to 0 length and have no pages associated with it. This
+ * routine also assumes that the inode is already a part of the transaction.
+ *
+ * The on-disk copy of the inode will have been added to the list of unlinked
+ * inodes in the AGI. We need to remove the inode from that list atomically with
+ * respect to freeing it here.
+ */
+int
+xfs_ifree(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ struct xfs_icluster xic = { 0 };
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ int error;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(VFS_I(ip)->i_nlink == 0);
+ ASSERT(ip->i_df.if_nextents == 0);
+ ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
+ ASSERT(ip->i_nblocks == 0);
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+
+ /*
+ * Free the inode first so that we guarantee that the AGI lock is going
+ * to be taken before we remove the inode from the unlinked list. This
+ * makes the AGI lock -> unlinked list modification order the same as
+ * used in O_TMPFILE creation.
+ */
+ error = xfs_difree(tp, pag, ip->i_ino, &xic);
+ if (error)
+ goto out;
+
+ error = xfs_iunlink_remove(tp, pag, ip);
+ if (error)
+ goto out;
+
+ /*
+ * Free any local-format data sitting around before we reset the
+ * data fork to extents format. Note that the attr fork data has
+ * already been freed by xfs_attr_inactive.
+ */
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ kmem_free(ip->i_df.if_u1.if_data);
+ ip->i_df.if_u1.if_data = NULL;
+ ip->i_df.if_bytes = 0;
+ }
+
+ VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
+ ip->i_diflags = 0;
+ ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
+ ip->i_forkoff = 0; /* mark the attr fork not in use */
+ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
+ if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS))
+ xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS);
+
+ /* Don't attempt to replay owner changes for a deleted inode */
+ spin_lock(&iip->ili_lock);
+ iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
+ spin_unlock(&iip->ili_lock);
+
+ /*
+ * Bump the generation count so no one will be confused
+ * by reincarnations of this inode.
+ */
+ VFS_I(ip)->i_generation++;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ if (xic.deleted)
+ error = xfs_ifree_cluster(tp, pag, ip, &xic);
+out:
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * This is called to unpin an inode. The caller must have the inode locked
+ * in at least shared mode so that the buffer cannot be subsequently pinned
+ * once someone is waiting for it to be unpinned.
+ */
+static void
+xfs_iunpin(
+ struct xfs_inode *ip)
+{
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+
+ trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
+
+ /* Give the log a push to start the unpinning I/O */
+ xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
+
+}
+
+static void
+__xfs_iunpin_wait(
+ struct xfs_inode *ip)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
+
+ xfs_iunpin(ip);
+
+ do {
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (xfs_ipincount(ip))
+ io_schedule();
+ } while (xfs_ipincount(ip));
+ finish_wait(wq, &wait.wq_entry);
+}
+
+void
+xfs_iunpin_wait(
+ struct xfs_inode *ip)
+{
+ if (xfs_ipincount(ip))
+ __xfs_iunpin_wait(ip);
+}
+
+/*
+ * Removing an inode from the namespace involves removing the directory entry
+ * and dropping the link count on the inode. Removing the directory entry can
+ * result in locking an AGF (directory blocks were freed) and removing a link
+ * count can result in placing the inode on an unlinked list which results in
+ * locking an AGI.
+ *
+ * The big problem here is that we have an ordering constraint on AGF and AGI
+ * locking - inode allocation locks the AGI, then can allocate a new extent for
+ * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
+ * removes the inode from the unlinked list, requiring that we lock the AGI
+ * first, and then freeing the inode can result in an inode chunk being freed
+ * and hence freeing disk space requiring that we lock an AGF.
+ *
+ * Hence the ordering that is imposed by other parts of the code is AGI before
+ * AGF. This means we cannot remove the directory entry before we drop the inode
+ * reference count and put it on the unlinked list as this results in a lock
+ * order of AGF then AGI, and this can deadlock against inode allocation and
+ * freeing. Therefore we must drop the link counts before we remove the
+ * directory entry.
+ *
+ * This is still safe from a transactional point of view - it is not until we
+ * get to xfs_defer_finish() that we have the possibility of multiple
+ * transactions in this operation. Hence as long as we remove the directory
+ * entry and drop the link count in the first transaction of the remove
+ * operation, there are no transactional constraints on the ordering here.
+ */
+int
+xfs_remove(
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ xfs_inode_t *ip)
+{
+ xfs_mount_t *mp = dp->i_mount;
+ xfs_trans_t *tp = NULL;
+ int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
+ int dontcare;
+ int error = 0;
+ uint resblks;
+
+ trace_xfs_remove(dp, name);
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ error = xfs_qm_dqattach(dp);
+ if (error)
+ goto std_return;
+
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ goto std_return;
+
+ /*
+ * We try to get the real space reservation first, allowing for
+ * directory btree deletion(s) implying possible bmap insert(s). If we
+ * can't get the space reservation then we use 0 instead, and avoid the
+ * bmap btree insert(s) in the directory code by, if the bmap insert
+ * tries to happen, instead trimming the LAST block from the directory.
+ *
+ * Ignore EDQUOT and ENOSPC being returned via nospace_error because
+ * the directory code can handle a reservationless update and we don't
+ * want to prevent a user from trying to free space by deleting things.
+ */
+ resblks = XFS_REMOVE_SPACE_RES(mp);
+ error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
+ &tp, &dontcare);
+ if (error) {
+ ASSERT(error != -ENOSPC);
+ goto std_return;
+ }
+
+ /*
+ * If we're removing a directory perform some additional validation.
+ */
+ if (is_dir) {
+ ASSERT(VFS_I(ip)->i_nlink >= 2);
+ if (VFS_I(ip)->i_nlink != 2) {
+ error = -ENOTEMPTY;
+ goto out_trans_cancel;
+ }
+ if (!xfs_dir_isempty(ip)) {
+ error = -ENOTEMPTY;
+ goto out_trans_cancel;
+ }
+
+ /* Drop the link from ip's "..". */
+ error = xfs_droplink(tp, dp);
+ if (error)
+ goto out_trans_cancel;
+
+ /* Drop the "." link from ip to self. */
+ error = xfs_droplink(tp, ip);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * Point the unlinked child directory's ".." entry to the root
+ * directory to eliminate back-references to inodes that may
+ * get freed before the child directory is closed. If the fs
+ * gets shrunk, this can lead to dirent inode validation errors.
+ */
+ if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
+ error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
+ tp->t_mountp->m_sb.sb_rootino, 0);
+ if (error)
+ goto out_trans_cancel;
+ }
+ } else {
+ /*
+ * When removing a non-directory we need to log the parent
+ * inode here. For a directory this is done implicitly
+ * by the xfs_droplink call for the ".." entry.
+ */
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+ }
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ /* Drop the link from dp to ip. */
+ error = xfs_droplink(tp, ip);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks);
+ if (error) {
+ ASSERT(error != -ENOENT);
+ goto out_trans_cancel;
+ }
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * remove transaction goes to disk before returning to
+ * the user.
+ */
+ if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
+ xfs_trans_set_sync(tp);
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto std_return;
+
+ if (is_dir && xfs_inode_is_filestream(ip))
+ xfs_filestream_deassociate(ip);
+
+ return 0;
+
+ out_trans_cancel:
+ xfs_trans_cancel(tp);
+ std_return:
+ return error;
+}
+
+/*
+ * Enter all inodes for a rename transaction into a sorted array.
+ */
+#define __XFS_SORT_INODES 5
+STATIC void
+xfs_sort_for_rename(
+ struct xfs_inode *dp1, /* in: old (source) directory inode */
+ struct xfs_inode *dp2, /* in: new (target) directory inode */
+ struct xfs_inode *ip1, /* in: inode of old entry */
+ struct xfs_inode *ip2, /* in: inode of new entry */
+ struct xfs_inode *wip, /* in: whiteout inode */
+ struct xfs_inode **i_tab,/* out: sorted array of inodes */
+ int *num_inodes) /* in/out: inodes in array */
+{
+ int i, j;
+
+ ASSERT(*num_inodes == __XFS_SORT_INODES);
+ memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
+
+ /*
+ * i_tab contains a list of pointers to inodes. We initialize
+ * the table here & we'll sort it. We will then use it to
+ * order the acquisition of the inode locks.
+ *
+ * Note that the table may contain duplicates; e.g., dp1 == dp2.
+ */
+ i = 0;
+ i_tab[i++] = dp1;
+ i_tab[i++] = dp2;
+ i_tab[i++] = ip1;
+ if (ip2)
+ i_tab[i++] = ip2;
+ if (wip)
+ i_tab[i++] = wip;
+ *num_inodes = i;
+
+ /*
+ * Sort the elements via bubble sort. (Remember, there are at
+ * most 5 elements to sort, so this is adequate.)
+ */
+ for (i = 0; i < *num_inodes; i++) {
+ for (j = 1; j < *num_inodes; j++) {
+ if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
+ struct xfs_inode *temp = i_tab[j];
+ i_tab[j] = i_tab[j-1];
+ i_tab[j-1] = temp;
+ }
+ }
+ }
+}
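+
+/*
+ * Worked example (editorial): renaming within a single directory, so
+ * dp1 == dp2 with i_ino 5, with source inode 17, target inode 9 and no
+ * whiteout, sorts i_tab to {5, 5, 9, 17}. xfs_lock_inodes() can then
+ * take the ilocks in ascending inode-number order, locking each
+ * distinct inode only once.
+ */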
+
+static int
+xfs_finish_rename(
+ struct xfs_trans *tp)
+{
+ /*
+ * If this is a synchronous mount, make sure that the rename transaction
+ * goes to disk before returning to the user.
+ */
+ if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp))
+ xfs_trans_set_sync(tp);
+
+ return xfs_trans_commit(tp);
+}
+
+/*
+ * xfs_cross_rename()
+ *
+ * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall
+ */
+STATIC int
+xfs_cross_rename(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp1,
+ struct xfs_name *name1,
+ struct xfs_inode *ip1,
+ struct xfs_inode *dp2,
+ struct xfs_name *name2,
+ struct xfs_inode *ip2,
+ int spaceres)
+{
+ int error = 0;
+ int ip1_flags = 0;
+ int ip2_flags = 0;
+ int dp2_flags = 0;
+
+ /* Swap inode number for dirent in first parent */
+ error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
+ if (error)
+ goto out_trans_abort;
+
+ /* Swap inode number for dirent in second parent */
+ error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres);
+ if (error)
+ goto out_trans_abort;
+
+ /*
+ * If we're renaming one or more directories across different parents,
+ * update the respective ".." entries (and link counts) to match the new
+ * parents.
+ */
+ if (dp1 != dp2) {
+ dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+
+ if (S_ISDIR(VFS_I(ip2)->i_mode)) {
+ error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
+ dp1->i_ino, spaceres);
+ if (error)
+ goto out_trans_abort;
+
+ /* transfer ip2 ".." reference to dp1 */
+ if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
+ error = xfs_droplink(tp, dp2);
+ if (error)
+ goto out_trans_abort;
+ xfs_bumplink(tp, dp1);
+ }
+
+ /*
+ * Although ip1 isn't changed here, userspace needs
+ * to be warned about the change, so that applications
+ * relying on it (like backup tools) are properly
+ * notified of the change.
+ */
+ ip1_flags |= XFS_ICHGTIME_CHG;
+ ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+ }
+
+ if (S_ISDIR(VFS_I(ip1)->i_mode)) {
+ error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
+ dp2->i_ino, spaceres);
+ if (error)
+ goto out_trans_abort;
+
+ /* transfer ip1 ".." reference to dp2 */
+ if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
+ error = xfs_droplink(tp, dp1);
+ if (error)
+ goto out_trans_abort;
+ xfs_bumplink(tp, dp2);
+ }
+
+ /*
+ * Although ip2 isn't changed here, userspace needs
+ * to be warned about the change, so that applications
+ * relying on it (like backup tools) are properly
+ * notified of the change.
+ */
+ ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+ ip2_flags |= XFS_ICHGTIME_CHG;
+ }
+ }
+
+ if (ip1_flags) {
+ xfs_trans_ichgtime(tp, ip1, ip1_flags);
+ xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
+ }
+ if (ip2_flags) {
+ xfs_trans_ichgtime(tp, ip2, ip2_flags);
+ xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
+ }
+ if (dp2_flags) {
+ xfs_trans_ichgtime(tp, dp2, dp2_flags);
+ xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
+ }
+ xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
+ return xfs_finish_rename(tp);
+
+out_trans_abort:
+ xfs_trans_cancel(tp);
+ return error;
+}
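+
+/*
+ * For reference, a userspace exchange rename along the lines of
+ *
+ *	renameat2(AT_FDCWD, "a", AT_FDCWD, "b", RENAME_EXCHANGE);
+ *
+ * ends up in xfs_cross_rename() above, which swaps the two directory
+ * entries atomically within a single transaction.
+ */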
+
+/*
+ * xfs_rename_alloc_whiteout()
+ *
+ * Return a referenced, unlinked, unlocked inode that can be used as a
+ * whiteout in a rename transaction. We use a tmpfile inode here so that if we
+ * crash between allocating the inode and linking it into the rename
+ * transaction, log recovery will free the inode and we won't leak it.
+ */
+static int
+xfs_rename_alloc_whiteout(
+ struct user_namespace *mnt_userns,
+ struct xfs_name *src_name,
+ struct xfs_inode *dp,
+ struct xfs_inode **wip)
+{
+ struct xfs_inode *tmpfile;
+ struct qstr name;
+ int error;
+
+ error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE,
+ &tmpfile);
+ if (error)
+ return error;
+
+ name.name = src_name->name;
+ name.len = src_name->len;
+ error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name);
+ if (error) {
+ xfs_finish_inode_setup(tmpfile);
+ xfs_irele(tmpfile);
+ return error;
+ }
+
+ /*
+ * Prepare the tmpfile inode as if it were created through the VFS.
+ * Complete the inode setup and flag it as linkable. nlink is already
+ * zero, so we can skip the drop_nlink.
+ */
+ xfs_setup_iops(tmpfile);
+ xfs_finish_inode_setup(tmpfile);
+ VFS_I(tmpfile)->i_state |= I_LINKABLE;
+
+ *wip = tmpfile;
+ return 0;
+}
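+
+/*
+ * Note: RENAME_WHITEOUT is typically issued on behalf of overlayfs. The
+ * whiteout allocated here is a character device inode (S_IFCHR with device
+ * number 0), which is what the VFS treats as a whiteout entry.
+ */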
+
+/*
+ * xfs_rename
+ */
+int
+xfs_rename(
+ struct user_namespace *mnt_userns,
+ struct xfs_inode *src_dp,
+ struct xfs_name *src_name,
+ struct xfs_inode *src_ip,
+ struct xfs_inode *target_dp,
+ struct xfs_name *target_name,
+ struct xfs_inode *target_ip,
+ unsigned int flags)
+{
+ struct xfs_mount *mp = src_dp->i_mount;
+ struct xfs_trans *tp;
+ struct xfs_inode *wip = NULL; /* whiteout inode */
+ struct xfs_inode *inodes[__XFS_SORT_INODES];
+ int i;
+ int num_inodes = __XFS_SORT_INODES;
+ bool new_parent = (src_dp != target_dp);
+ bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
+ int spaceres;
+ bool retried = false;
+ int error, nospace_error = 0;
+
+ trace_xfs_rename(src_dp, target_dp, src_name, target_name);
+
+ if ((flags & RENAME_EXCHANGE) && !target_ip)
+ return -EINVAL;
+
+ /*
+ * If we are doing a whiteout operation, allocate the whiteout inode
+ * we will be leaving in place of the source entry and ensure the type
+ * is set appropriately.
+ */
+ if (flags & RENAME_WHITEOUT) {
+ error = xfs_rename_alloc_whiteout(mnt_userns, src_name,
+ target_dp, &wip);
+ if (error)
+ return error;
+
+ /* set up the source dirent info as a whiteout */
+ src_name->type = XFS_DIR3_FT_CHRDEV;
+ }
+
+ xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
+ inodes, &num_inodes);
+
+retry:
+ nospace_error = 0;
+ spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
+ if (error == -ENOSPC) {
+ nospace_error = error;
+ spaceres = 0;
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
+ &tp);
+ }
+ if (error)
+ goto out_release_wip;
+
+ /*
+ * Attach the dquots to the inodes
+ */
+ error = xfs_qm_vop_rename_dqattach(inodes);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * Lock all the participating inodes. Depending upon whether
+ * the target_name exists in the target directory, and
+ * whether the target directory is the same as the source
+ * directory, we can lock from 2 to 5 inodes.
+ */
+ xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
+
+ /*
+ * Join all the inodes to the transaction. From this point on,
+ * we can rely on either trans_commit or trans_cancel to unlock
+ * them.
+ */
+ xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+ if (new_parent)
+ xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
+ if (target_ip)
+ xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
+ if (wip)
+ xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
+
+ /*
+ * If we are using project inheritance, we only allow renames
+ * into our tree when the project IDs are the same; else the
+ * tree quota mechanism would be circumvented.
+ */
+ if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
+ target_dp->i_projid != src_ip->i_projid)) {
+ error = -EXDEV;
+ goto out_trans_cancel;
+ }
+
+ /* RENAME_EXCHANGE is unique from here on. */
+ if (flags & RENAME_EXCHANGE)
+ return xfs_cross_rename(tp, src_dp, src_name, src_ip,
+ target_dp, target_name, target_ip,
+ spaceres);
+
+ /*
+ * Try to reserve quota to handle an expansion of the target directory.
+ * We'll allow the rename to continue in reservationless mode if we hit
+ * a space usage constraint. If we trigger reservationless mode, save
+ * the errno if there isn't any free space in the target directory.
+ */
+ if (spaceres != 0) {
+ error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres,
+ 0, false);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ if (!retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_quota(target_dp, 0);
+ retried = true;
+ goto retry;
+ }
+
+ nospace_error = error;
+ spaceres = 0;
+ error = 0;
+ }
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Check for expected errors before we dirty the transaction
+ * so we can return an error without a transaction abort.
+ */
+ if (target_ip == NULL) {
+ /*
+ * If there's no space reservation, check that the entry will
+ * fit before actually inserting it.
+ */
+ if (!spaceres) {
+ error = xfs_dir_canenter(tp, target_dp, target_name);
+ if (error)
+ goto out_trans_cancel;
+ }
+ } else {
+ /*
+ * If the target exists and it's a directory, check whether
+ * it can be destroyed.
+ */
+ if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
+ (!xfs_dir_isempty(target_ip) ||
+ (VFS_I(target_ip)->i_nlink > 2))) {
+ error = -EEXIST;
+ goto out_trans_cancel;
+ }
+ }
+
+ /*
+ * Lock the AGI buffers we need to handle bumping the nlink of the
+ * whiteout inode off the unlinked list and to handle dropping the
+ * nlink of the target inode. Per locking order rules, do this in
+ * increasing AG order and before directory block allocation tries to
+ * grab AGFs because we grab AGIs before AGFs.
+ *
+ * The (vfs) caller must ensure that if src is a directory then
+ * target_ip is either null or an empty directory.
+ */
+ for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
+ if (inodes[i] == wip ||
+ (inodes[i] == target_ip &&
+ (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
+ struct xfs_perag *pag;
+ struct xfs_buf *bp;
+
+ pag = xfs_perag_get(mp,
+ XFS_INO_TO_AGNO(mp, inodes[i]->i_ino));
+ error = xfs_read_agi(pag, tp, &bp);
+ xfs_perag_put(pag);
+ if (error)
+ goto out_trans_cancel;
+ }
+ }
+
+ /*
+ * Directory entry creation below may acquire the AGF. Remove
+ * the whiteout from the unlinked list first to preserve correct
+ * AGI/AGF locking order. This dirties the transaction so failures
+ * after this point will abort and log recovery will clean up the
+ * mess.
+ *
+ * For whiteouts, we need to bump the link count on the whiteout
+ * inode. After this point we have a real link, so clear the
+ * tmpfile state flag from the inode to keep it from accidentally
+ * being misused in the future.
+ */
+ if (wip) {
+ struct xfs_perag *pag;
+
+ ASSERT(VFS_I(wip)->i_nlink == 0);
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino));
+ error = xfs_iunlink_remove(tp, pag, wip);
+ xfs_perag_put(pag);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_bumplink(tp, wip);
+ VFS_I(wip)->i_state &= ~I_LINKABLE;
+ }
+
+ /*
+ * Set up the target.
+ */
+ if (target_ip == NULL) {
+ /*
+ * If target does not exist and the rename crosses
+ * directories, adjust the target directory link count
+ * to account for the ".." reference from the new entry.
+ */
+ error = xfs_dir_createname(tp, target_dp, target_name,
+ src_ip->i_ino, spaceres);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_trans_ichgtime(tp, target_dp,
+ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ if (new_parent && src_is_directory) {
+ xfs_bumplink(tp, target_dp);
+ }
+ } else { /* target_ip != NULL */
+ /*
+ * Link the source inode under the target name.
+ * If the source inode is a directory and we are moving
+ * it across directories, its ".." entry will be
+ * inconsistent until we replace that down below.
+ *
+ * In case there is already an entry with the same
+ * name at the destination directory, remove it first.
+ */
+ error = xfs_dir_replace(tp, target_dp, target_name,
+ src_ip->i_ino, spaceres);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_trans_ichgtime(tp, target_dp,
+ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ /*
+ * Decrement the link count on the target since the target
+ * dir no longer points to it.
+ */
+ error = xfs_droplink(tp, target_ip);
+ if (error)
+ goto out_trans_cancel;
+
+ if (src_is_directory) {
+ /*
+ * Drop the link from the old "." entry.
+ */
+ error = xfs_droplink(tp, target_ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+ } /* target_ip != NULL */
+
+ /*
+ * Remove the source.
+ */
+ if (new_parent && src_is_directory) {
+ /*
+ * Rewrite the ".." entry to point to the new
+ * directory.
+ */
+ error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
+ target_dp->i_ino, spaceres);
+ ASSERT(error != -EEXIST);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
+ * We always want to hit the ctime on the source inode.
+ *
+ * This isn't strictly required by the standards since the source
+ * inode isn't really being changed, but old unix file systems did
+ * it and some incremental backup programs won't work without it.
+ */
+ xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
+
+ /*
+ * Adjust the link count on src_dp. This is necessary when
+ * renaming a directory, either within one parent when
+ * the target existed, or across two parent directories.
+ */
+ if (src_is_directory && (new_parent || target_ip != NULL)) {
+
+ /*
+ * Decrement link count on src_directory since the
+ * entry that's moved no longer points to it.
+ */
+ error = xfs_droplink(tp, src_dp);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
+ * For whiteouts, we only need to update the source dirent with the
+ * inode number of the whiteout inode rather than removing it
+ * altogether.
+ */
+ if (wip)
+ error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
+ spaceres);
+ else
+ error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+ spaceres);
+
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
+ if (new_parent)
+ xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+
+ error = xfs_finish_rename(tp);
+ if (wip)
+ xfs_irele(wip);
+ return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+out_release_wip:
+ if (wip)
+ xfs_irele(wip);
+ if (error == -ENOSPC && nospace_error)
+ error = nospace_error;
+ return error;
+}
+
+static int
+xfs_iflush(
+ struct xfs_inode *ip,
+ struct xfs_buf *bp)
+{
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct xfs_dinode *dip;
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
+ ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
+ ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
+ ASSERT(iip->ili_item.li_buf == bp);
+
+ dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
+
+ /*
+ * We don't flush the inode if any of the following checks fail, but we
+ * do still update the log item and attach to the backing buffer as if
+ * the flush happened. This is a formality to facilitate predictable
+ * error handling as the caller will shut down and fail the buffer.
+ */
+ error = -EFSCORRUPTED;
+ if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
+ mp, XFS_ERRTAG_IFLUSH_1)) {
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT,
+ __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
+ goto flush_out;
+ }
+ if (S_ISREG(VFS_I(ip)->i_mode)) {
+ if (XFS_TEST_ERROR(
+ ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
+ mp, XFS_ERRTAG_IFLUSH_3)) {
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: Bad regular inode %llu, ptr "PTR_FMT,
+ __func__, ip->i_ino, ip);
+ goto flush_out;
+ }
+ } else if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ if (XFS_TEST_ERROR(
+ ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
+ ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
+ mp, XFS_ERRTAG_IFLUSH_4)) {
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: Bad directory inode %llu, ptr "PTR_FMT,
+ __func__, ip->i_ino, ip);
+ goto flush_out;
+ }
+ }
+ if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
+ ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: detected corrupt incore inode %llu, "
+ "total extents = %llu nblocks = %lld, ptr "PTR_FMT,
+ __func__, ip->i_ino,
+ ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af),
+ ip->i_nblocks, ip);
+ goto flush_out;
+ }
+ if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize,
+ mp, XFS_ERRTAG_IFLUSH_6)) {
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT,
+ __func__, ip->i_ino, ip->i_forkoff, ip);
+ goto flush_out;
+ }
+
+ /*
+ * Inode item log recovery for v2 inodes is dependent on the flushiter
+ * count for correct sequencing. We bump the flush iteration count so
+ * we can detect flushes which postdate a log record during recovery.
+ * This is redundant as we now log every change and hence this can't
+ * happen, but we still need to do it to ensure backwards compatibility
+ * with old kernels that predate logging all inode changes.
+ */
+ if (!xfs_has_v3inodes(mp))
+ ip->i_flushiter++;
+
+ /*
+ * If there are inline format data / attr forks attached to this inode,
+ * make sure they are not corrupt.
+ */
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
+ xfs_ifork_verify_local_data(ip))
+ goto flush_out;
+ if (xfs_inode_has_attr_fork(ip) &&
+ ip->i_af.if_format == XFS_DINODE_FMT_LOCAL &&
+ xfs_ifork_verify_local_attr(ip))
+ goto flush_out;
+
+ /*
+ * Copy the dirty parts of the inode into the on-disk inode. We always
+ * copy out the core of the inode, because if the inode is dirty at all
+ * the core must be.
+ */
+ xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
+
+ /* Wrap, we never let the log put out DI_MAX_FLUSH */
+ if (!xfs_has_v3inodes(mp)) {
+ if (ip->i_flushiter == DI_MAX_FLUSH)
+ ip->i_flushiter = 0;
+ }
+
+ xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
+ if (xfs_inode_has_attr_fork(ip))
+ xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
+
+ /*
+ * We've recorded everything logged in the inode, so we'd like to clear
+ * the ili_fields bits so we don't log and flush things unnecessarily.
+ * However, we can't stop logging all this information until the data
+ * we've copied into the disk buffer is written to disk. If we did we
+ * might overwrite the copy of the inode in the log with all the data
+ * after re-logging only part of it, and in the face of a crash we
+ * wouldn't have all the data we need to recover.
+ *
+ * What we do is move the bits to the ili_last_fields field. When
+ * logging the inode, these bits are moved back to the ili_fields field.
+ * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
+ * we know that the information those bits represent is permanently on
+ * disk. As long as the flush completes before the inode is logged
+ * again, then both ili_fields and ili_last_fields will be cleared.
+ */
+ error = 0;
+flush_out:
+ spin_lock(&iip->ili_lock);
+ iip->ili_last_fields = iip->ili_fields;
+ iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
+ spin_unlock(&iip->ili_lock);
+
+ /*
+ * Store the current LSN of the inode so that we can tell whether the
+ * item has moved in the AIL from xfs_buf_inode_iodone().
+ */
+ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+ &iip->ili_item.li_lsn);
+
+ /* generate the checksum. */
+ xfs_dinode_calc_crc(mp, dip);
+ return error;
+}
+
+/*
+ * Non-blocking flush of dirty inode metadata into the backing buffer.
+ *
+ * The caller must have a reference to the inode and hold the cluster buffer
+ * locked. The function walks all the inodes attached to the cluster buffer
+ * that it can find and lock without blocking, and flushes them to the buffer.
+ *
+ * On successful flushing of at least one inode, the caller must write out the
+ * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
+ * the caller needs to release the buffer. On failure, the filesystem will be
+ * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
+ * will be returned.
+ */
+int
+xfs_iflush_cluster(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_log_item *lip, *n;
+ struct xfs_inode *ip;
+ struct xfs_inode_log_item *iip;
+ int clcount = 0;
+ int error = 0;
+
+ /*
+ * We must use the safe variant here as on shutdown xfs_iflush_abort()
+ * will remove itself from the list.
+ */
+ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+ iip = (struct xfs_inode_log_item *)lip;
+ ip = iip->ili_inode;
+
+ /*
+ * Quick and dirty check to avoid locks if possible.
+ */
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING))
+ continue;
+ if (xfs_ipincount(ip))
+ continue;
+
+ /*
+ * The inode is still attached to the buffer, which means it is
+ * dirty but reclaim might try to grab it. Check carefully for
+ * that, and grab the ilock while still holding the i_flags_lock
+ * to guarantee reclaim will not be able to reclaim this inode
+ * once we drop the i_flags_lock.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) {
+ spin_unlock(&ip->i_flags_lock);
+ continue;
+ }
+
+ /*
+ * ILOCK will pin the inode against reclaim and prevent
+ * concurrent transactions modifying the inode while we are
+ * flushing the inode. If we get the lock, set the flushing
+ * state before we drop the i_flags_lock.
+ */
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+ spin_unlock(&ip->i_flags_lock);
+ continue;
+ }
+ __xfs_iflags_set(ip, XFS_IFLUSHING);
+ spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Abort flushing this inode if we are shut down because the
+ * inode may not currently be in the AIL. This can occur when
+ * log I/O failure unpins the inode without inserting into the
+ * AIL, leaving a dirty/unpinned inode attached to the buffer
+ * that otherwise looks like it should be flushed.
+ */
+ if (xlog_is_shutdown(mp->m_log)) {
+ xfs_iunpin_wait(ip);
+ xfs_iflush_abort(ip);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ error = -EIO;
+ continue;
+ }
+
+ /* don't block waiting on a log force to unpin dirty inodes */
+ if (xfs_ipincount(ip)) {
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ continue;
+ }
+
+ if (!xfs_inode_clean(ip))
+ error = xfs_iflush(ip, bp);
+ else
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (error)
+ break;
+ clcount++;
+ }
+
+ if (error) {
+ /*
+ * Shutdown first so we kill the log before we release this
+ * buffer. If it is an INODE_ALLOC buffer and pins the tail
+ * of the log, failing it before the _log_ is shut down can
+ * result in the log tail being moved forward in the journal
+ * on disk because log writes can still be taking place. Hence
+ * unpinning the tail will allow the ICREATE intent to be
+ * removed from the log and recovery will fail with uninitialised
+ * inode cluster buffers.
+ */
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ bp->b_flags |= XBF_ASYNC;
+ xfs_buf_ioend_fail(bp);
+ return error;
+ }
+
+ if (!clcount)
+ return -EAGAIN;
+
+ XFS_STATS_INC(mp, xs_icluster_flushcnt);
+ XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
+ return 0;
+
+}
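+
+/*
+ * The AIL push path, xfs_inode_item_push() in xfs_inode_item.c below, is the
+ * typical caller: when it pushes a dirty inode log item, every dirty inode
+ * it can lock on the same cluster buffer is flushed here and the buffer is
+ * then queued for delwri submission.
+ */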
+
+/* Release an inode. */
+void
+xfs_irele(
+ struct xfs_inode *ip)
+{
+ trace_xfs_irele(ip, _RET_IP_);
+ iput(VFS_I(ip));
+}
+
+/*
+ * Ensure all committed transactions touching the inode are written to the log.
+ */
+int
+xfs_log_force_inode(
+ struct xfs_inode *ip)
+{
+ xfs_csn_t seq = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip))
+ seq = ip->i_itemp->ili_commit_seq;
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!seq)
+ return 0;
+ return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
+}
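+
+/*
+ * If the inode isn't pinned there is nothing to force and this returns
+ * immediately; otherwise the log is forced synchronously up to the inode's
+ * last commit sequence, so callers that need an inode's committed changes
+ * stable on disk before returning (fsync-style paths) can rely on it.
+ */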
+
+/*
+ * Grab the exclusive iolock for a data copy from src to dest, making sure to
+ * abide vfs locking order (lowest pointer value goes first) and breaking the
+ * layout leases before proceeding. The loop is needed because we cannot call
+ * the blocking break_layout() with the iolocks held, and therefore have to
+ * back out both locks.
+ */
+static int
+xfs_iolock_two_inodes_and_break_layout(
+ struct inode *src,
+ struct inode *dest)
+{
+ int error;
+
+ if (src > dest)
+ swap(src, dest);
+
+retry:
+ /* Wait to break both inodes' layouts before we start locking. */
+ error = break_layout(src, true);
+ if (error)
+ return error;
+ if (src != dest) {
+ error = break_layout(dest, true);
+ if (error)
+ return error;
+ }
+
+ /* Lock one inode and make sure nobody got in and leased it. */
+ inode_lock(src);
+ error = break_layout(src, false);
+ if (error) {
+ inode_unlock(src);
+ if (error == -EWOULDBLOCK)
+ goto retry;
+ return error;
+ }
+
+ if (src == dest)
+ return 0;
+
+ /* Lock the other inode and make sure nobody got in and leased it. */
+ inode_lock_nested(dest, I_MUTEX_NONDIR2);
+ error = break_layout(dest, false);
+ if (error) {
+ inode_unlock(src);
+ inode_unlock(dest);
+ if (error == -EWOULDBLOCK)
+ goto retry;
+ return error;
+ }
+
+ return 0;
+}
+
+static int
+xfs_mmaplock_two_inodes_and_break_dax_layout(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ int error;
+ bool retry;
+ struct page *page;
+
+ if (ip1->i_ino > ip2->i_ino)
+ swap(ip1, ip2);
+
+again:
+ retry = false;
+ /* Lock the first inode */
+ xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
+ error = xfs_break_dax_layouts(VFS_I(ip1), &retry);
+ if (error || retry) {
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ if (error == 0 && retry)
+ goto again;
+ return error;
+ }
+
+ if (ip1 == ip2)
+ return 0;
+
+ /* Nested lock the second inode */
+ xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1));
+ /*
+ * We cannot use xfs_break_dax_layouts() directly here because it may
+ * need to unlock and relock XFS_MMAPLOCK_EXCL, which is not suitable
+ * for this nested lock case.
+ */
+ page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
+ if (page && page_ref_count(page) != 1) {
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ goto again;
+ }
+
+ return 0;
+}
+
+/*
+ * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
+ * mmap activity.
+ */
+int
+xfs_ilock2_io_mmap(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ int ret;
+
+ ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
+ if (ret)
+ return ret;
+
+ if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
+ ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2);
+ if (ret) {
+ inode_unlock(VFS_I(ip2));
+ if (ip1 != ip2)
+ inode_unlock(VFS_I(ip1));
+ return ret;
+ }
+ } else
+ filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
+ VFS_I(ip2)->i_mapping);
+
+ return 0;
+}
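+
+/*
+ * Ordering note: xfs_ilock2_io_mmap() above takes the IOLOCK (VFS
+ * inode_lock) before the MMAPLOCK / invalidate_lock; xfs_iunlock2_io_mmap()
+ * below drops them in the opposite order.
+ */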
+
+/* Unlock both inodes to allow IO and mmap activity. */
+void
+xfs_iunlock2_io_mmap(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+ if (ip1 != ip2)
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ } else
+ filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
+ VFS_I(ip2)->i_mapping);
+
+ inode_unlock(VFS_I(ip2));
+ if (ip1 != ip2)
+ inode_unlock(VFS_I(ip1));
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
new file mode 100644
index 000000000..fa780f08d
--- /dev/null
+++ b/fs/xfs/xfs_inode.h
@@ -0,0 +1,578 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_INODE_H__
+#define __XFS_INODE_H__
+
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+
+/*
+ * Kernel only inode definitions
+ */
+struct xfs_dinode;
+struct xfs_inode;
+struct xfs_buf;
+struct xfs_bmbt_irec;
+struct xfs_inode_log_item;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dquot;
+
+typedef struct xfs_inode {
+ /* Inode linking and identification information. */
+ struct xfs_mount *i_mount; /* fs mount struct ptr */
+ struct xfs_dquot *i_udquot; /* user dquot */
+ struct xfs_dquot *i_gdquot; /* group dquot */
+ struct xfs_dquot *i_pdquot; /* project dquot */
+
+ /* Inode location stuff */
+ xfs_ino_t i_ino; /* inode number (agno/agino)*/
+ struct xfs_imap i_imap; /* location for xfs_imap() */
+
+ /* Extent information. */
+ struct xfs_ifork *i_cowfp; /* copy on write extents */
+ struct xfs_ifork i_df; /* data fork */
+ struct xfs_ifork i_af; /* attribute fork */
+
+ /* Transaction and locking information. */
+ struct xfs_inode_log_item *i_itemp; /* logging information */
+ mrlock_t i_lock; /* inode lock */
+ atomic_t i_pincount; /* inode pin count */
+ struct llist_node i_gclist; /* deferred inactivation list */
+
+ /*
+ * Bitsets of inode metadata that have been checked and/or are sick.
+ * Callers must hold i_flags_lock before accessing this field.
+ */
+ uint16_t i_checked;
+ uint16_t i_sick;
+
+ spinlock_t i_flags_lock; /* inode i_flags lock */
+ /* Miscellaneous state. */
+ unsigned long i_flags; /* see defined flags below */
+ uint64_t i_delayed_blks; /* count of delay alloc blks */
+ xfs_fsize_t i_disk_size; /* number of bytes in file */
+ xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */
+ prid_t i_projid; /* owner's project id */
+ xfs_extlen_t i_extsize; /* basic/minimum extent size */
+ /* cowextsize is only used for v3 inodes, flushiter for v1/2 */
+ union {
+ xfs_extlen_t i_cowextsize; /* basic cow extent size */
+ uint16_t i_flushiter; /* incremented on flush */
+ };
+ uint8_t i_forkoff; /* attr fork offset >> 3 */
+ uint16_t i_diflags; /* XFS_DIFLAG_... */
+ uint64_t i_diflags2; /* XFS_DIFLAG2_... */
+ struct timespec64 i_crtime; /* time created */
+
+ /* unlinked list pointers */
+ xfs_agino_t i_next_unlinked;
+ xfs_agino_t i_prev_unlinked;
+
+ /* VFS inode */
+ struct inode i_vnode; /* embedded VFS inode */
+
+ /* pending io completions */
+ spinlock_t i_ioend_lock;
+ struct work_struct i_ioend_work;
+ struct list_head i_ioend_list;
+} xfs_inode_t;
+
+static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip)
+{
+ return ip->i_forkoff > 0;
+}
+
+static inline struct xfs_ifork *
+xfs_ifork_ptr(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ return &ip->i_df;
+ case XFS_ATTR_FORK:
+ if (!xfs_inode_has_attr_fork(ip))
+ return NULL;
+ return &ip->i_af;
+ case XFS_COW_FORK:
+ return ip->i_cowfp;
+ default:
+ ASSERT(0);
+ return NULL;
+ }
+}
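+
+/*
+ * For example, xfs_ifork_ptr(ip, XFS_ATTR_FORK) returns NULL when the inode
+ * has no attribute fork, so callers must check the result before
+ * dereferencing it.
+ */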
+
+static inline unsigned int xfs_inode_fork_boff(struct xfs_inode *ip)
+{
+ return ip->i_forkoff << 3;
+}
+
+static inline unsigned int xfs_inode_data_fork_size(struct xfs_inode *ip)
+{
+ if (xfs_inode_has_attr_fork(ip))
+ return xfs_inode_fork_boff(ip);
+
+ return XFS_LITINO(ip->i_mount);
+}
+
+static inline unsigned int xfs_inode_attr_fork_size(struct xfs_inode *ip)
+{
+ if (xfs_inode_has_attr_fork(ip))
+ return XFS_LITINO(ip->i_mount) - xfs_inode_fork_boff(ip);
+ return 0;
+}
+
+static inline unsigned int
+xfs_inode_fork_size(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ return xfs_inode_data_fork_size(ip);
+ case XFS_ATTR_FORK:
+ return xfs_inode_attr_fork_size(ip);
+ default:
+ return 0;
+ }
+}
+
+/* Convert from vfs inode to xfs inode */
+static inline struct xfs_inode *XFS_I(struct inode *inode)
+{
+ return container_of(inode, struct xfs_inode, i_vnode);
+}
+
+/* convert from xfs inode to vfs inode */
+static inline struct inode *VFS_I(struct xfs_inode *ip)
+{
+ return &ip->i_vnode;
+}
+
+/*
+ * For regular files we only update the on-disk filesize when actually
+ * writing data back to disk. Until then only the copy in the VFS inode
+ * is uptodate.
+ */
+static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
+{
+ if (S_ISREG(VFS_I(ip)->i_mode))
+ return i_size_read(VFS_I(ip));
+ return ip->i_disk_size;
+}
+
+/*
+ * If this I/O goes past the on-disk inode size, update it unless it would
+ * be past the current in-core inode size.
+ */
+static inline xfs_fsize_t
+xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
+{
+ xfs_fsize_t i_size = i_size_read(VFS_I(ip));
+
+ if (new_size > i_size || new_size < 0)
+ new_size = i_size;
+ return new_size > ip->i_disk_size ? new_size : 0;
+}
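+
+/*
+ * Worked example for xfs_new_eof() with illustrative values: given an
+ * in-core size of 100 bytes and an on-disk size of 50, an I/O ending at
+ * offset 80 returns 80, an I/O ending at 120 is clamped to the in-core size
+ * and returns 100, and an I/O ending at 30 returns 0 (no update needed).
+ */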
+
+/*
+ * i_flags helper functions
+ */
+static inline void
+__xfs_iflags_set(xfs_inode_t *ip, unsigned short flags)
+{
+ ip->i_flags |= flags;
+}
+
+static inline void
+xfs_iflags_set(xfs_inode_t *ip, unsigned short flags)
+{
+ spin_lock(&ip->i_flags_lock);
+ __xfs_iflags_set(ip, flags);
+ spin_unlock(&ip->i_flags_lock);
+}
+
+static inline void
+xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags)
+{
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags &= ~flags;
+ spin_unlock(&ip->i_flags_lock);
+}
+
+static inline int
+__xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
+{
+ return (ip->i_flags & flags);
+}
+
+static inline int
+xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
+{
+ int ret;
+ spin_lock(&ip->i_flags_lock);
+ ret = __xfs_iflags_test(ip, flags);
+ spin_unlock(&ip->i_flags_lock);
+ return ret;
+}
+
+static inline int
+xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
+{
+ int ret;
+
+ spin_lock(&ip->i_flags_lock);
+ ret = ip->i_flags & flags;
+ if (ret)
+ ip->i_flags &= ~flags;
+ spin_unlock(&ip->i_flags_lock);
+ return ret;
+}
+
+static inline int
+xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
+{
+ int ret;
+
+ spin_lock(&ip->i_flags_lock);
+ ret = ip->i_flags & flags;
+ if (!ret)
+ ip->i_flags |= flags;
+ spin_unlock(&ip->i_flags_lock);
+ return ret;
+}
+
+static inline prid_t
+xfs_get_initial_prid(struct xfs_inode *dp)
+{
+ if (dp->i_diflags & XFS_DIFLAG_PROJINHERIT)
+ return dp->i_projid;
+
+ return XFS_PROJID_DEFAULT;
+}
+
+static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
+{
+ return ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
+}
+
+static inline bool xfs_is_metadata_inode(struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ return ip == mp->m_rbmip || ip == mp->m_rsumip ||
+ xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
+}
+
+/*
+ * Check if an inode has any data in the COW fork. This is often false even
+ * for inodes with the reflink flag when there is no pending COW operation.
+ */
+static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip)
+{
+ return ip->i_cowfp && ip->i_cowfp->if_bytes;
+}
+
+static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
+{
+ return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME;
+}
+
+static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
+{
+ return ip->i_diflags2 & XFS_DIFLAG2_NREXT64;
+}
+
+/*
+ * Return the buftarg used for data allocations on a given inode.
+ */
+#define xfs_inode_buftarg(ip) \
+ (XFS_IS_REALTIME_INODE(ip) ? \
+ (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
+
+/*
+ * In-core inode flags.
+ */
+#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */
+#define XFS_ISTALE (1 << 1) /* inode has been staled */
+#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
+#define XFS_INEW (1 << 3) /* inode has just been allocated */
+#define XFS_IPRESERVE_DM_FIELDS (1 << 4) /* has legacy DMAPI fields set */
+#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
+#define XFS_IFLUSHING (1 << 7) /* inode is being flushed */
+#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
+#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
+#define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */
+#define XFS_NEED_INACTIVE (1 << 10) /* see XFS_INACTIVATING below */
+/*
+ * If this unlinked inode is in the middle of recovery, don't let drop_inode
+ * truncate and free the inode. This can happen if we iget the inode during
+ * log recovery to replay a bmap operation on the inode.
+ */
+#define XFS_IRECOVERY (1 << 11)
+#define XFS_ICOWBLOCKS (1 << 12)/* has the cowblocks tag set */
+
+/*
+ * If we need to update on-disk metadata before this IRECLAIMABLE inode can be
+ * freed, then NEED_INACTIVE will be set. Once we start the updates, the
+ * INACTIVATING bit will be set to keep iget away from this inode. After the
+ * inactivation completes, both flags will be cleared and the inode is a
+ * plain old IRECLAIMABLE inode.
+ */
+#define XFS_INACTIVATING (1 << 13)
+
+/* All inode state flags related to inode reclaim. */
+#define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \
+ XFS_IRECLAIM | \
+ XFS_NEED_INACTIVE | \
+ XFS_INACTIVATING)
+
+/*
+ * Per-lifetime flags need to be reset when re-using a reclaimable inode during
+ * inode lookup. This prevents unintended behaviour on the new inode from
+ * occurring.
+ */
+#define XFS_IRECLAIM_RESET_FLAGS \
+ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \
+ XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \
+ XFS_INACTIVATING)
+
+/*
+ * Flags for inode locking.
+ * Bit ranges: 1<<0 - 1<<16-1 -- iolock/ilock modes (bitfield)
+ * 1<<16 - 1<<32-1 -- lockdep annotation (integers)
+ */
+#define XFS_IOLOCK_EXCL (1u << 0)
+#define XFS_IOLOCK_SHARED (1u << 1)
+#define XFS_ILOCK_EXCL (1u << 2)
+#define XFS_ILOCK_SHARED (1u << 3)
+#define XFS_MMAPLOCK_EXCL (1u << 4)
+#define XFS_MMAPLOCK_SHARED (1u << 5)
+
+#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
+ | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
+ | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)
+
+#define XFS_LOCK_FLAGS \
+ { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \
+ { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \
+ { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \
+ { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \
+ { XFS_MMAPLOCK_EXCL, "MMAPLOCK_EXCL" }, \
+ { XFS_MMAPLOCK_SHARED, "MMAPLOCK_SHARED" }
+
+
+/*
+ * Flags for lockdep annotations.
+ *
+ * XFS_LOCK_PARENT - for directory operations that require locking a
+ * parent directory inode and a child entry inode. IOLOCK requires nesting,
+ * MMAPLOCK does not support this class, ILOCK requires a single subclass
+ * to differentiate parent from child.
+ *
+ * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
+ * inodes do not participate in the normal lock order, and thus have their
+ * own subclasses.
+ *
+ * XFS_LOCK_INUMORDER - for locking several inodes at the same time
+ * with xfs_lock_inodes(). This flag is used as the starting subclass
+ * and each subsequent lock acquired will increment the subclass by one.
+ * However, MAX_LOCKDEP_SUBCLASSES == 8, which means we are greatly
+ * limited in the subclasses we can represent via nesting. We need at least
+ * 5 inodes nest depth for the ILOCK through rename, and we also have to support
+ * XFS_ILOCK_PARENT, which gives 6 subclasses. Then we have XFS_ILOCK_RTBITMAP
+ * and XFS_ILOCK_RTSUM, which are another 2 unique subclasses, so that's all
+ * 8 subclasses supported by lockdep.
+ *
+ * This also means we have to number the sub-classes in the lowest bits of
+ * the mask we keep, and we have to ensure we never exceed 3 bits of lockdep
+ * mask and we can't use bit-masking to build the subclasses. What a mess.
+ *
+ * Bit layout:
+ *
+ * Bit Lock Region
+ * 16-19 XFS_IOLOCK_SHIFT dependencies
+ * 20-23 XFS_MMAPLOCK_SHIFT dependencies
+ * 24-31 XFS_ILOCK_SHIFT dependencies
+ *
+ * IOLOCK values
+ *
+ * 0-3 subclass value
+ * 4-7 unused
+ *
+ * MMAPLOCK values
+ *
+ * 0-3 subclass value
+ * 4-7 unused
+ *
+ * ILOCK values
+ * 0-4 subclass values
+ * 5 PARENT subclass (not nestable)
+ * 6 RTBITMAP subclass (not nestable)
+ * 7 RTSUM subclass (not nestable)
+ *
+ */
+#define XFS_IOLOCK_SHIFT 16
+#define XFS_IOLOCK_MAX_SUBCLASS 3
+#define XFS_IOLOCK_DEP_MASK 0x000f0000u
+
+#define XFS_MMAPLOCK_SHIFT 20
+#define XFS_MMAPLOCK_NUMORDER 0
+#define XFS_MMAPLOCK_MAX_SUBCLASS 3
+#define XFS_MMAPLOCK_DEP_MASK 0x00f00000u
+
+#define XFS_ILOCK_SHIFT 24
+#define XFS_ILOCK_PARENT_VAL 5u
+#define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1)
+#define XFS_ILOCK_RTBITMAP_VAL 6u
+#define XFS_ILOCK_RTSUM_VAL 7u
+#define XFS_ILOCK_DEP_MASK 0xff000000u
+#define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT)
+#define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT)
+#define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT)
+
+#define XFS_LOCK_SUBCLASS_MASK (XFS_IOLOCK_DEP_MASK | \
+ XFS_MMAPLOCK_DEP_MASK | \
+ XFS_ILOCK_DEP_MASK)
+
+#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) \
+ >> XFS_IOLOCK_SHIFT)
+#define XFS_MMAPLOCK_DEP(flags) (((flags) & XFS_MMAPLOCK_DEP_MASK) \
+ >> XFS_MMAPLOCK_SHIFT)
+#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \
+ >> XFS_ILOCK_SHIFT)
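+
+/*
+ * Example of the encoding: XFS_ILOCK_PARENT is 5u << 24 == 0x05000000, so
+ * XFS_ILOCK_DEP(XFS_ILOCK_PARENT | XFS_ILOCK_EXCL) evaluates to 5; the
+ * lockdep subclass is recovered from a combined lock flags value by masking
+ * and shifting.
+ */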
+
+/*
+ * Layouts are broken in the BREAK_WRITE case to ensure that
+ * layout-holders do not collide with local writes. Additionally,
+ * layouts are broken in the BREAK_UNMAP case to make sure the
+ * layout-holder has a consistent view of the file's extent map. While
+ * BREAK_WRITE breaks can be satisfied by recalling FL_LAYOUT leases,
+ * BREAK_UNMAP breaks additionally require waiting for busy dax-pages to
+ * go idle.
+ */
+enum layout_break_reason {
+ BREAK_WRITE,
+ BREAK_UNMAP,
+};
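+
+/*
+ * For example, write paths typically only need a BREAK_WRITE layout break,
+ * while operations that change the extent map, such as truncation or hole
+ * punching, use BREAK_UNMAP and may additionally have to wait for busy DAX
+ * pages to become idle.
+ */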
+
+/*
+ * For multiple groups support: if S_ISGID bit is set in the parent
+ * directory, group of new file is set to that of the parent, and
+ * new subdirectory gets S_ISGID bit from parent.
+ */
+#define XFS_INHERIT_GID(pip) \
+ (xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID))
+
+int xfs_release(struct xfs_inode *ip);
+void xfs_inactive(struct xfs_inode *ip);
+int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name,
+ struct xfs_inode **ipp, struct xfs_name *ci_name);
+int xfs_create(struct user_namespace *mnt_userns,
+ struct xfs_inode *dp, struct xfs_name *name,
+ umode_t mode, dev_t rdev, bool need_xattr,
+ struct xfs_inode **ipp);
+int xfs_create_tmpfile(struct user_namespace *mnt_userns,
+ struct xfs_inode *dp, umode_t mode,
+ struct xfs_inode **ipp);
+int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
+ struct xfs_inode *ip);
+int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
+ struct xfs_name *target_name);
+int xfs_rename(struct user_namespace *mnt_userns,
+ struct xfs_inode *src_dp, struct xfs_name *src_name,
+ struct xfs_inode *src_ip, struct xfs_inode *target_dp,
+ struct xfs_name *target_name,
+ struct xfs_inode *target_ip, unsigned int flags);
+
+void xfs_ilock(xfs_inode_t *, uint);
+int xfs_ilock_nowait(xfs_inode_t *, uint);
+void xfs_iunlock(xfs_inode_t *, uint);
+void xfs_ilock_demote(xfs_inode_t *, uint);
+bool xfs_isilocked(struct xfs_inode *, uint);
+uint xfs_ilock_data_map_shared(struct xfs_inode *);
+uint xfs_ilock_attr_map_shared(struct xfs_inode *);
+
+uint xfs_ip2xflags(struct xfs_inode *);
+int xfs_ifree(struct xfs_trans *, struct xfs_inode *);
+int xfs_itruncate_extents_flags(struct xfs_trans **,
+ struct xfs_inode *, int, xfs_fsize_t, int);
+void xfs_iext_realloc(xfs_inode_t *, int, int);
+
+int xfs_log_force_inode(struct xfs_inode *ip);
+void xfs_iunpin_wait(xfs_inode_t *);
+#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
+
+int xfs_iflush_cluster(struct xfs_buf *);
+void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
+ struct xfs_inode *ip1, uint ip1_mode);
+
+xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
+xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip);
+
+int xfs_init_new_inode(struct user_namespace *mnt_userns, struct xfs_trans *tp,
+ struct xfs_inode *pip, xfs_ino_t ino, umode_t mode,
+ xfs_nlink_t nlink, dev_t rdev, prid_t prid, bool init_xattrs,
+ struct xfs_inode **ipp);
+
+static inline int
+xfs_itruncate_extents(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fsize_t new_size)
+{
+ return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0);
+}
+
+/* from xfs_file.c */
+int xfs_break_dax_layouts(struct inode *inode, bool *retry);
+int xfs_break_layouts(struct inode *inode, uint *iolock,
+ enum layout_break_reason reason);
+
+/* from xfs_iops.c */
+extern void xfs_setup_inode(struct xfs_inode *ip);
+extern void xfs_setup_iops(struct xfs_inode *ip);
+extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
+
+/*
+ * When setting up a newly allocated inode, we need to call
+ * xfs_finish_inode_setup() once the inode is fully instantiated at
+ * the VFS level to prevent the rest of the world seeing the inode
+ * before we've completed instantiation. Otherwise we can do it
+ * the moment the inode lookup is complete.
+ */
+static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
+{
+ xfs_iflags_clear(ip, XFS_INEW);
+ barrier();
+ unlock_new_inode(VFS_I(ip));
+}
+
+static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
+{
+ xfs_setup_inode(ip);
+ xfs_setup_iops(ip);
+ xfs_finish_inode_setup(ip);
+}
+
+void xfs_irele(struct xfs_inode *ip);
+
+extern struct kmem_cache *xfs_inode_cache;
+
+/* The default CoW extent size hint. */
+#define XFS_DEFAULT_COWEXTSZ_HINT 32
+
+bool xfs_inode_needs_inactive(struct xfs_inode *ip);
+
+void xfs_end_io(struct work_struct *work);
+
+int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+
+#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
new file mode 100644
index 000000000..ca2941ab6
--- /dev/null
+++ b/fs/xfs/xfs_inode_item.c
@@ -0,0 +1,1012 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_trace.h"
+#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_error.h"
+
+#include <linux/iversion.h>
+
+struct kmem_cache *xfs_ili_cache; /* inode log item */
+
+static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_inode_log_item, ili_item);
+}
+
+/*
+ * The logged size of an inode fork is always the current size of the inode
+ * fork. This means that when an inode fork is relogged, the size of the logged
+ * region is determined by the current state, not the combination of the
+ * previously logged state + the current state. This is different relogging
+ * behaviour to most other log items which will retain the size of the
+ * previously logged changes when smaller regions are relogged.
+ *
+ * Hence, for operations that remove data from the inode fork (e.g. shortform
+ * dir/attr remove, extent-format extent removal, etc.), the size of the
+ * relogged inode gets -smaller- rather than staying the same as the previously
+ * logged size, and this can result in the committing transaction reducing the
+ * amount of space being consumed by the CIL.
+ */
+STATIC void
+xfs_inode_item_data_fork_size(
+ struct xfs_inode_log_item *iip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_inode *ip = iip->ili_inode;
+
+ switch (ip->i_df.if_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ if ((iip->ili_fields & XFS_ILOG_DEXT) &&
+ ip->i_df.if_nextents > 0 &&
+ ip->i_df.if_bytes > 0) {
+ /* worst case, doesn't subtract delalloc extents */
+ *nbytes += xfs_inode_data_fork_size(ip);
+ *nvecs += 1;
+ }
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
+ ip->i_df.if_broot_bytes > 0) {
+ *nbytes += ip->i_df.if_broot_bytes;
+ *nvecs += 1;
+ }
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ if ((iip->ili_fields & XFS_ILOG_DDATA) &&
+ ip->i_df.if_bytes > 0) {
+ *nbytes += xlog_calc_iovec_len(ip->i_df.if_bytes);
+ *nvecs += 1;
+ }
+ break;
+
+ case XFS_DINODE_FMT_DEV:
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+
+STATIC void
+xfs_inode_item_attr_fork_size(
+ struct xfs_inode_log_item *iip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_inode *ip = iip->ili_inode;
+
+ switch (ip->i_af.if_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ if ((iip->ili_fields & XFS_ILOG_AEXT) &&
+ ip->i_af.if_nextents > 0 &&
+ ip->i_af.if_bytes > 0) {
+ /* worst case, doesn't subtract unused space */
+ *nbytes += xfs_inode_attr_fork_size(ip);
+ *nvecs += 1;
+ }
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
+ ip->i_af.if_broot_bytes > 0) {
+ *nbytes += ip->i_af.if_broot_bytes;
+ *nvecs += 1;
+ }
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ if ((iip->ili_fields & XFS_ILOG_ADATA) &&
+ ip->i_af.if_bytes > 0) {
+ *nbytes += xlog_calc_iovec_len(ip->i_af.if_bytes);
+ *nvecs += 1;
+ }
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+
+/*
+ * This returns the number of iovecs needed to log the given inode item.
+ *
+ * We need one iovec for the inode log format structure, one for the
+ * inode core, and possibly one for the inode data/extents/b-tree root
+ * and one for the inode attribute data/extents/b-tree root.
+ */
+STATIC void
+xfs_inode_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+
+ *nvecs += 2;
+ *nbytes += sizeof(struct xfs_inode_log_format) +
+ xfs_log_dinode_size(ip->i_mount);
+
+ xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
+ if (xfs_inode_has_attr_fork(ip))
+ xfs_inode_item_attr_fork_size(iip, nvecs, nbytes);
+}
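+
+/*
+ * For example, following the accounting above, a dirty local format
+ * directory with no attribute fork reports nvecs == 3: one iovec for the
+ * inode log format structure, one for the inode core and one for the
+ * shortform directory data.
+ */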
+
+STATIC void
+xfs_inode_item_format_data_fork(
+ struct xfs_inode_log_item *iip,
+ struct xfs_inode_log_format *ilf,
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp)
+{
+ struct xfs_inode *ip = iip->ili_inode;
+ size_t data_bytes;
+
+ switch (ip->i_df.if_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ iip->ili_fields &=
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
+
+ if ((iip->ili_fields & XFS_ILOG_DEXT) &&
+ ip->i_df.if_nextents > 0 &&
+ ip->i_df.if_bytes > 0) {
+ struct xfs_bmbt_rec *p;
+
+ ASSERT(xfs_iext_count(&ip->i_df) > 0);
+
+ p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
+ data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK);
+ xlog_finish_iovec(lv, *vecp, data_bytes);
+
+ ASSERT(data_bytes <= ip->i_df.if_bytes);
+
+ ilf->ilf_dsize = data_bytes;
+ ilf->ilf_size++;
+ } else {
+ iip->ili_fields &= ~XFS_ILOG_DEXT;
+ }
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ iip->ili_fields &=
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV);
+
+ if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
+ ip->i_df.if_broot_bytes > 0) {
+ ASSERT(ip->i_df.if_broot != NULL);
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT,
+ ip->i_df.if_broot,
+ ip->i_df.if_broot_bytes);
+ ilf->ilf_dsize = ip->i_df.if_broot_bytes;
+ ilf->ilf_size++;
+ } else {
+ ASSERT(!(iip->ili_fields &
+ XFS_ILOG_DBROOT));
+ iip->ili_fields &= ~XFS_ILOG_DBROOT;
+ }
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ iip->ili_fields &=
+ ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
+ if ((iip->ili_fields & XFS_ILOG_DDATA) &&
+ ip->i_df.if_bytes > 0) {
+ ASSERT(ip->i_df.if_u1.if_data != NULL);
+ ASSERT(ip->i_disk_size > 0);
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
+ ip->i_df.if_u1.if_data,
+ ip->i_df.if_bytes);
+ ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes;
+ ilf->ilf_size++;
+ } else {
+ iip->ili_fields &= ~XFS_ILOG_DDATA;
+ }
+ break;
+ case XFS_DINODE_FMT_DEV:
+ iip->ili_fields &=
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT);
+ if (iip->ili_fields & XFS_ILOG_DEV)
+ ilf->ilf_u.ilfu_rdev = sysv_encode_dev(VFS_I(ip)->i_rdev);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+
+STATIC void
+xfs_inode_item_format_attr_fork(
+ struct xfs_inode_log_item *iip,
+ struct xfs_inode_log_format *ilf,
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp)
+{
+ struct xfs_inode *ip = iip->ili_inode;
+ size_t data_bytes;
+
+ switch (ip->i_af.if_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ iip->ili_fields &=
+ ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
+
+ if ((iip->ili_fields & XFS_ILOG_AEXT) &&
+ ip->i_af.if_nextents > 0 &&
+ ip->i_af.if_bytes > 0) {
+ struct xfs_bmbt_rec *p;
+
+ ASSERT(xfs_iext_count(&ip->i_af) ==
+ ip->i_af.if_nextents);
+
+ p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
+ data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
+ xlog_finish_iovec(lv, *vecp, data_bytes);
+
+ ilf->ilf_asize = data_bytes;
+ ilf->ilf_size++;
+ } else {
+ iip->ili_fields &= ~XFS_ILOG_AEXT;
+ }
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ iip->ili_fields &=
+ ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
+
+ if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
+ ip->i_af.if_broot_bytes > 0) {
+ ASSERT(ip->i_af.if_broot != NULL);
+
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT,
+ ip->i_af.if_broot,
+ ip->i_af.if_broot_bytes);
+ ilf->ilf_asize = ip->i_af.if_broot_bytes;
+ ilf->ilf_size++;
+ } else {
+ iip->ili_fields &= ~XFS_ILOG_ABROOT;
+ }
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ iip->ili_fields &=
+ ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
+
+ if ((iip->ili_fields & XFS_ILOG_ADATA) &&
+ ip->i_af.if_bytes > 0) {
+ ASSERT(ip->i_af.if_u1.if_data != NULL);
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
+ ip->i_af.if_u1.if_data,
+ ip->i_af.if_bytes);
+ ilf->ilf_asize = (unsigned)ip->i_af.if_bytes;
+ ilf->ilf_size++;
+ } else {
+ iip->ili_fields &= ~XFS_ILOG_ADATA;
+ }
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+
+/*
+ * Convert an incore timestamp to a log timestamp. Note that the log format
+ * specifies host endian format!
+ */
+static inline xfs_log_timestamp_t
+xfs_inode_to_log_dinode_ts(
+ struct xfs_inode *ip,
+ const struct timespec64 tv)
+{
+ struct xfs_log_legacy_timestamp *lits;
+ xfs_log_timestamp_t its;
+
+ if (xfs_inode_has_bigtime(ip))
+ return xfs_inode_encode_bigtime(tv);
+
+ lits = (struct xfs_log_legacy_timestamp *)&its;
+ lits->t_sec = tv.tv_sec;
+ lits->t_nsec = tv.tv_nsec;
+
+ return its;
+}
+
+/*
+ * The legacy DMAPI fields are only present in the on-disk and in-log inodes,
+ * but not in the in-memory one. But we are guaranteed to have an inode buffer
+ * in memory when logging an inode, so we can just copy it from the on-disk
+ * inode to the in-log inode here so that recovery of a filesystem with these
+ * fields set to non-zero values doesn't lose them. For all other cases we zero
+ * the fields.
+ */
+static void
+xfs_copy_dm_fields_to_log_dinode(
+ struct xfs_inode *ip,
+ struct xfs_log_dinode *to)
+{
+ struct xfs_dinode *dip;
+
+ dip = xfs_buf_offset(ip->i_itemp->ili_item.li_buf,
+ ip->i_imap.im_boffset);
+
+ if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS)) {
+ to->di_dmevmask = be32_to_cpu(dip->di_dmevmask);
+ to->di_dmstate = be16_to_cpu(dip->di_dmstate);
+ } else {
+ to->di_dmevmask = 0;
+ to->di_dmstate = 0;
+ }
+}
+
+static inline void
+xfs_inode_to_log_dinode_iext_counters(
+ struct xfs_inode *ip,
+ struct xfs_log_dinode *to)
+{
+ if (xfs_inode_has_large_extent_counts(ip)) {
+ to->di_big_nextents = xfs_ifork_nextents(&ip->i_df);
+ to->di_big_anextents = xfs_ifork_nextents(&ip->i_af);
+ to->di_nrext64_pad = 0;
+ } else {
+ to->di_nextents = xfs_ifork_nextents(&ip->i_df);
+ to->di_anextents = xfs_ifork_nextents(&ip->i_af);
+ }
+}
+
+static void
+xfs_inode_to_log_dinode(
+ struct xfs_inode *ip,
+ struct xfs_log_dinode *to,
+ xfs_lsn_t lsn)
+{
+ struct inode *inode = VFS_I(ip);
+
+ to->di_magic = XFS_DINODE_MAGIC;
+ to->di_format = xfs_ifork_format(&ip->i_df);
+ to->di_uid = i_uid_read(inode);
+ to->di_gid = i_gid_read(inode);
+ to->di_projid_lo = ip->i_projid & 0xffff;
+ to->di_projid_hi = ip->i_projid >> 16;
+
+ memset(to->di_pad3, 0, sizeof(to->di_pad3));
+ to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
+ to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
+ to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime);
+ to->di_nlink = inode->i_nlink;
+ to->di_gen = inode->i_generation;
+ to->di_mode = inode->i_mode;
+
+ to->di_size = ip->i_disk_size;
+ to->di_nblocks = ip->i_nblocks;
+ to->di_extsize = ip->i_extsize;
+ to->di_forkoff = ip->i_forkoff;
+ to->di_aformat = xfs_ifork_format(&ip->i_af);
+ to->di_flags = ip->i_diflags;
+
+ xfs_copy_dm_fields_to_log_dinode(ip, to);
+
+ /* log a dummy value to ensure log structure is fully initialised */
+ to->di_next_unlinked = NULLAGINO;
+
+ if (xfs_has_v3inodes(ip->i_mount)) {
+ to->di_version = 3;
+ to->di_changecount = inode_peek_iversion(inode);
+ to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime);
+ to->di_flags2 = ip->i_diflags2;
+ to->di_cowextsize = ip->i_cowextsize;
+ to->di_ino = ip->i_ino;
+ to->di_lsn = lsn;
+ memset(to->di_pad2, 0, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
+ to->di_v3_pad = 0;
+ } else {
+ to->di_version = 2;
+ to->di_flushiter = ip->i_flushiter;
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
+ }
+
+ xfs_inode_to_log_dinode_iext_counters(ip, to);
+}
+
+/*
+ * Format the inode core. Current timestamp data is only in the VFS inode
+ * fields, so we need to grab them from there. Hence rather than just copying
+ * the XFS inode core structure, format the fields directly into the iovec.
+ */
+static void
+xfs_inode_item_format_core(
+ struct xfs_inode *ip,
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp)
+{
+ struct xfs_log_dinode *dic;
+
+ dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE);
+ xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn);
+ xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_mount));
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the given inode
+ * log item. It fills the first item with an inode log format structure,
+ * the second with the on-disk inode structure, and a possible third and/or
+ * fourth with the inode data/extents/b-tree root and inode attributes
+ * data/extents/b-tree root.
+ *
+ * Note: Always use the 64 bit inode log format structure so we don't
+ * leave an uninitialised hole in the format item on 64 bit systems. Log
+ * recovery on 32 bit systems handles this just fine, so there's no reason
+ * for not using and initialising the properly padded structure all the time.
+ */
+STATIC void
+xfs_inode_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+ struct xfs_log_iovec *vecp = NULL;
+ struct xfs_inode_log_format *ilf;
+
+ ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
+ ilf->ilf_type = XFS_LI_INODE;
+ ilf->ilf_ino = ip->i_ino;
+ ilf->ilf_blkno = ip->i_imap.im_blkno;
+ ilf->ilf_len = ip->i_imap.im_len;
+ ilf->ilf_boffset = ip->i_imap.im_boffset;
+ ilf->ilf_fields = XFS_ILOG_CORE;
+ ilf->ilf_size = 2; /* format + core */
+
+ /*
+ * make sure we don't leak uninitialised data into the log in the case
+ * when we don't log every field in the inode.
+ */
+ ilf->ilf_dsize = 0;
+ ilf->ilf_asize = 0;
+ ilf->ilf_pad = 0;
+ memset(&ilf->ilf_u, 0, sizeof(ilf->ilf_u));
+
+ xlog_finish_iovec(lv, vecp, sizeof(*ilf));
+
+ xfs_inode_item_format_core(ip, lv, &vecp);
+ xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
+ if (xfs_inode_has_attr_fork(ip)) {
+ xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
+ } else {
+ iip->ili_fields &=
+ ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
+ }
+
+ /* update the format with the exact fields we actually logged */
+ ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
+}
+
+/*
+ * This is called to pin the inode associated with the inode log
+ * item in memory so it cannot be written out.
+ */
+STATIC void
+xfs_inode_item_pin(
+ struct xfs_log_item *lip)
+{
+ struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(lip->li_buf);
+
+ trace_xfs_inode_pin(ip, _RET_IP_);
+ atomic_inc(&ip->i_pincount);
+}
+
+
+/*
+ * This is called to unpin the inode associated with the inode log
+ * item which was previously pinned with a call to xfs_inode_item_pin().
+ *
+ * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
+ *
+ * Note that unpin can race with inode cluster buffer freeing marking the buffer
+ * stale. In that case, flush completions are run from the buffer unpin call,
+ * which may happen before the inode is unpinned. If we lose the race, there
+ * will be no buffer attached to the log item, but the inode will be marked
+ * XFS_ISTALE.
+ */
+STATIC void
+xfs_inode_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
+
+ trace_xfs_inode_unpin(ip, _RET_IP_);
+ ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE));
+ ASSERT(atomic_read(&ip->i_pincount) > 0);
+ if (atomic_dec_and_test(&ip->i_pincount))
+ wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
+}
+
+STATIC uint
+xfs_inode_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+ __releases(&lip->li_ailp->ail_lock)
+ __acquires(&lip->li_ailp->ail_lock)
+{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+ struct xfs_buf *bp = lip->li_buf;
+ uint rval = XFS_ITEM_SUCCESS;
+ int error;
+
+ if (!bp || (ip->i_flags & XFS_ISTALE)) {
+ /*
+ * Inode item/buffer is being aborted due to cluster
+ * buffer deletion. Trigger a log force to have that operation
+ * completed and items removed from the AIL before the next push
+ * attempt.
+ */
+ return XFS_ITEM_PINNED;
+ }
+
+ if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp))
+ return XFS_ITEM_PINNED;
+
+ if (xfs_iflags_test(ip, XFS_IFLUSHING))
+ return XFS_ITEM_FLUSHING;
+
+ if (!xfs_buf_trylock(bp))
+ return XFS_ITEM_LOCKED;
+
+ spin_unlock(&lip->li_ailp->ail_lock);
+
+ /*
+ * We need to hold a reference for flushing the cluster buffer as it may
+ * fail the buffer without IO submission. If that happens, we need a
+ * reference for that completion because otherwise we don't get a
+ * reference for IO until we queue the buffer for delwri submission.
+ */
+ xfs_buf_hold(bp);
+ error = xfs_iflush_cluster(bp);
+ if (!error) {
+ if (!xfs_buf_delwri_queue(bp, buffer_list))
+ rval = XFS_ITEM_FLUSHING;
+ xfs_buf_relse(bp);
+ } else {
+ /*
+ * Release the buffer if we were unable to flush anything. On
+ * any other error, the buffer has already been released.
+ */
+ if (error == -EAGAIN)
+ xfs_buf_relse(bp);
+ rval = XFS_ITEM_LOCKED;
+ }
+
+ spin_lock(&lip->li_ailp->ail_lock);
+ return rval;
+}
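+
+/*
+ * To summarise the push return values above: XFS_ITEM_PINNED means the item
+ * is pinned, stale or being aborted and needs a log force before it can be
+ * flushed; XFS_ITEM_FLUSHING means a flush is already in progress or the
+ * buffer is already queued for write; XFS_ITEM_LOCKED means the cluster
+ * buffer could not be locked or flushed this time around; XFS_ITEM_SUCCESS
+ * means the inode was flushed and the buffer queued for delwri submission.
+ */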
+
+/*
+ * Unlock the inode associated with the inode log item.
+ */
+STATIC void
+xfs_inode_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+ unsigned short lock_flags;
+
+ ASSERT(ip->i_itemp != NULL);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ lock_flags = iip->ili_lock_flags;
+ iip->ili_lock_flags = 0;
+ if (lock_flags)
+ xfs_iunlock(ip, lock_flags);
+}
+
+/*
+ * This is called to find out where the oldest active copy of the inode log
+ * item in the on disk log resides now that the last log write of it completed
+ * at the given lsn. Since we always re-log all dirty data in an inode, the
+ * latest copy in the on disk log is the only one that matters. Therefore,
+ * simply return the given lsn.
+ *
+ * If the inode has been marked stale because the cluster is being freed, we
+ * don't want to (re-)insert this inode into the AIL. There is a race condition
+ * where the cluster buffer may be unpinned before the inode is inserted into
+ * the AIL during transaction committed processing. If the buffer is unpinned
+ * before the inode item has been committed and inserted, then it is possible
+ * for the buffer to be written and IO completes before the inode is inserted
+ * into the AIL. In that case, we'd be inserting a clean, stale inode into the
+ * AIL which will never get removed. It will, however, get reclaimed which
+ * triggers an assert in xfs_inode_free() complaining about freeing an inode
+ * still in the AIL.
+ *
+ * To avoid this, just unpin the inode directly and return an LSN of -1 so the
+ * transaction committed code knows that it does not need to do any further
+ * processing on the item.
+ */
+STATIC xfs_lsn_t
+xfs_inode_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+
+ if (xfs_iflags_test(ip, XFS_ISTALE)) {
+ xfs_inode_item_unpin(lip, 0);
+ return -1;
+ }
+ return lsn;
+}
+
+STATIC void
+xfs_inode_item_committing(
+ struct xfs_log_item *lip,
+ xfs_csn_t seq)
+{
+ INODE_ITEM(lip)->ili_commit_seq = seq;
+ return xfs_inode_item_release(lip);
+}
+
+static const struct xfs_item_ops xfs_inode_item_ops = {
+ .iop_size = xfs_inode_item_size,
+ .iop_format = xfs_inode_item_format,
+ .iop_pin = xfs_inode_item_pin,
+ .iop_unpin = xfs_inode_item_unpin,
+ .iop_release = xfs_inode_item_release,
+ .iop_committed = xfs_inode_item_committed,
+ .iop_push = xfs_inode_item_push,
+ .iop_committing = xfs_inode_item_committing,
+};
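+
+/*
+ * These callbacks wire the inode log item into the generic log item
+ * machinery: sizing and formatting of the log vectors at commit time,
+ * pin/unpin to hold the inode in memory while it has changes in the
+ * in-flight log, push for AIL writeback, and the committed/committing/
+ * release hooks for transaction completion.
+ */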
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ */
+void
+xfs_inode_item_init(
+ struct xfs_inode *ip,
+ struct xfs_mount *mp)
+{
+ struct xfs_inode_log_item *iip;
+
+ ASSERT(ip->i_itemp == NULL);
+ iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+
+ iip->ili_inode = ip;
+ spin_lock_init(&iip->ili_lock);
+ xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
+ &xfs_inode_item_ops);
+}
+
+/*
+ * Free the inode log item and any memory hanging off of it.
+ */
+void
+xfs_inode_item_destroy(
+ struct xfs_inode *ip)
+{
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+
+ ASSERT(iip->ili_item.li_buf == NULL);
+
+ ip->i_itemp = NULL;
+ kmem_free(iip->ili_item.li_lv_shadow);
+ kmem_cache_free(xfs_ili_cache, iip);
+}
+
+
+/*
+ * We only want to pull the item from the AIL if it is actually there
+ * and its location in the log has not changed since we started the
+ * flush. Thus, we only bother if the inode's lsn has not changed.
+ */
+static void
+xfs_iflush_ail_updates(
+ struct xfs_ail *ailp,
+ struct list_head *list)
+{
+ struct xfs_log_item *lip;
+ xfs_lsn_t tail_lsn = 0;
+
+ /* this is an opencoded batch version of xfs_trans_ail_delete */
+ spin_lock(&ailp->ail_lock);
+ list_for_each_entry(lip, list, li_bio_list) {
+ xfs_lsn_t lsn;
+
+ clear_bit(XFS_LI_FAILED, &lip->li_flags);
+ if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn)
+ continue;
+
+ /*
+ * dgc: Not sure how this happens, but it happens very
+ * occasionally via generic/388. xfs_iflush_abort() also
+ * silently handles this same "under writeback but not in AIL at
+ * shutdown" condition via xfs_trans_ail_delete().
+ */
+ if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
+ ASSERT(xlog_is_shutdown(lip->li_log));
+ continue;
+ }
+
+ lsn = xfs_ail_delete_one(ailp, lip);
+ if (!tail_lsn && lsn)
+ tail_lsn = lsn;
+ }
+ xfs_ail_update_finish(ailp, tail_lsn);
+}
+
+/*
+ * Walk the list of inodes that have completed their IOs. If they are clean
+ * remove them from the list and dissociate them from the buffer. Inodes that
+ * are still dirty remain attached to the buffer and on the list. Caller must
+ * handle them appropriately.
+ */
+static void
+xfs_iflush_finish(
+ struct xfs_buf *bp,
+ struct list_head *list)
+{
+ struct xfs_log_item *lip, *n;
+
+ list_for_each_entry_safe(lip, n, list, li_bio_list) {
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ bool drop_buffer = false;
+
+ spin_lock(&iip->ili_lock);
+
+ /*
+ * Remove the reference to the cluster buffer if the inode is
+ * clean in memory and drop the buffer reference once we've
+ * dropped the locks we hold.
+ */
+ ASSERT(iip->ili_item.li_buf == bp);
+ if (!iip->ili_fields) {
+ iip->ili_item.li_buf = NULL;
+ list_del_init(&lip->li_bio_list);
+ drop_buffer = true;
+ }
+ iip->ili_last_fields = 0;
+ iip->ili_flush_lsn = 0;
+ spin_unlock(&iip->ili_lock);
+ xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING);
+ if (drop_buffer)
+ xfs_buf_rele(bp);
+ }
+}
+
+/*
+ * Inode buffer IO completion routine. It is responsible for removing inodes
+ * attached to the buffer from the AIL if they have not been re-logged and
+ * completing the inode flush.
+ */
+void
+xfs_buf_inode_iodone(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip, *n;
+ LIST_HEAD(flushed_inodes);
+ LIST_HEAD(ail_updates);
+
+ /*
+ * Pull the attached inodes from the buffer one at a time and take the
+ * appropriate action on them.
+ */
+ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+
+ if (xfs_iflags_test(iip->ili_inode, XFS_ISTALE)) {
+ xfs_iflush_abort(iip->ili_inode);
+ continue;
+ }
+ if (!iip->ili_last_fields)
+ continue;
+
+ /* Do an unlocked check for needing the AIL lock. */
+ if (iip->ili_flush_lsn == lip->li_lsn ||
+ test_bit(XFS_LI_FAILED, &lip->li_flags))
+ list_move_tail(&lip->li_bio_list, &ail_updates);
+ else
+ list_move_tail(&lip->li_bio_list, &flushed_inodes);
+ }
+
+ if (!list_empty(&ail_updates)) {
+ xfs_iflush_ail_updates(bp->b_mount->m_ail, &ail_updates);
+ list_splice_tail(&ail_updates, &flushed_inodes);
+ }
+
+ xfs_iflush_finish(bp, &flushed_inodes);
+ if (!list_empty(&flushed_inodes))
+ list_splice_tail(&flushed_inodes, &bp->b_li_list);
+}
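+
+/*
+ * In other words, IO completion partitions the attached inodes: stale inodes
+ * have their flush aborted immediately, inodes with no flush pending are left
+ * on the buffer list untouched, inodes whose flush_lsn still matches their
+ * position in the AIL (or that were marked failed) are removed from the AIL
+ * first, and everything else only needs its flush state cleaned up.
+ */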
+
+void
+xfs_buf_inode_io_fail(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip;
+
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
+ set_bit(XFS_LI_FAILED, &lip->li_flags);
+}
+
+/*
+ * Clear the inode logging fields so no more flushes are attempted. If we are
+ * on a buffer list, it is now safe to remove it because the buffer is
+ * guaranteed to be locked. The caller will drop the reference to the buffer
+ * the log item held.
+ */
+static void
+xfs_iflush_abort_clean(
+ struct xfs_inode_log_item *iip)
+{
+ iip->ili_last_fields = 0;
+ iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
+ iip->ili_flush_lsn = 0;
+ iip->ili_item.li_buf = NULL;
+ list_del_init(&iip->ili_item.li_bio_list);
+}
+
+/*
+ * Abort flushing the inode from a context holding the cluster buffer locked.
+ *
+ * This is the normal runtime method of aborting writeback of an inode that is
+ * attached to a cluster buffer. It occurs when the inode and the backing
+ * cluster buffer have been freed (i.e. inode is XFS_ISTALE), or when cluster
+ * flushing or buffer IO completion encounters a log shutdown situation.
+ *
+ * If we need to abort inode writeback and we don't already hold the buffer
+ * locked, call xfs_iflush_shutdown_abort() instead as this should only ever be
+ * necessary in a shutdown situation.
+ */
+void
+xfs_iflush_abort(
+ struct xfs_inode *ip)
+{
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct xfs_buf *bp;
+
+ if (!iip) {
+ /* clean inode, nothing to do */
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ return;
+ }
+
+ /*
+ * Remove the inode item from the AIL before we clear its internal
+ * state. Whilst the inode is in the AIL, it should have a valid buffer
+ * pointer for push operations to access - it is only safe to remove the
+ * inode from the buffer once it has been removed from the AIL.
+ *
+ * We also clear the failed bit before removing the item from the AIL
+ * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer
+ * references the inode item owns and needs to hold until we've fully
+ * aborted the inode log item and detached it from the buffer.
+ */
+ clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
+ xfs_trans_ail_delete(&iip->ili_item, 0);
+
+ /*
+ * Grab the inode buffer so we can release the reference the inode log
+ * item holds on it.
+ */
+ spin_lock(&iip->ili_lock);
+ bp = iip->ili_item.li_buf;
+ xfs_iflush_abort_clean(iip);
+ spin_unlock(&iip->ili_lock);
+
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ if (bp)
+ xfs_buf_rele(bp);
+}
+
+/*
+ * Abort an inode flush in the case of a shutdown filesystem. This can be called
+ * from anywhere with just an inode reference and does not require holding the
+ * inode cluster buffer locked. If the inode is attached to a cluster buffer,
+ * it will grab and lock it safely, then abort the inode flush.
+ */
+void
+xfs_iflush_shutdown_abort(
+ struct xfs_inode *ip)
+{
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct xfs_buf *bp;
+
+ if (!iip) {
+ /* clean inode, nothing to do */
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ return;
+ }
+
+ spin_lock(&iip->ili_lock);
+ bp = iip->ili_item.li_buf;
+ if (!bp) {
+ spin_unlock(&iip->ili_lock);
+ xfs_iflush_abort(ip);
+ return;
+ }
+
+ /*
+ * We have to take a reference to the buffer so that it doesn't get
+ * freed when we drop the ili_lock and then wait to lock the buffer.
+ * We'll clean up the extra reference after we pick up the ili_lock
+ * again.
+ */
+ xfs_buf_hold(bp);
+ spin_unlock(&iip->ili_lock);
+ xfs_buf_lock(bp);
+
+ spin_lock(&iip->ili_lock);
+ if (!iip->ili_item.li_buf) {
+ /*
+ * Raced with another removal; we now hold the only reference
+ * to bp. The inode should not be in the AIL now, so just clean
+ * up and return.
+ */
+ ASSERT(list_empty(&iip->ili_item.li_bio_list));
+ ASSERT(!test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags));
+ xfs_iflush_abort_clean(iip);
+ spin_unlock(&iip->ili_lock);
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ xfs_buf_relse(bp);
+ return;
+ }
+
+ /*
+ * Got two references to bp. The first will get dropped by
+ * xfs_iflush_abort() when the item is removed from the buffer list, but
+ * we can't drop our reference until _abort() returns because we have to
+ * unlock the buffer as well. Hence we abort and then unlock and release
+ * our reference to the buffer.
+ */
+ ASSERT(iip->ili_item.li_buf == bp);
+ spin_unlock(&iip->ili_lock);
+ xfs_iflush_abort(ip);
+ xfs_buf_relse(bp);
+}
+
+
+/*
+ * convert an xfs_inode_log_format struct from the old 32 bit version
+ * (which can have different field alignments) to the native 64 bit version
+ */
+int
+xfs_inode_item_format_convert(
+ struct xfs_log_iovec *buf,
+ struct xfs_inode_log_format *in_f)
+{
+ struct xfs_inode_log_format_32 *in_f32 = buf->i_addr;
+
+ if (buf->i_len != sizeof(*in_f32)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ return -EFSCORRUPTED;
+ }
+
+ in_f->ilf_type = in_f32->ilf_type;
+ in_f->ilf_size = in_f32->ilf_size;
+ in_f->ilf_fields = in_f32->ilf_fields;
+ in_f->ilf_asize = in_f32->ilf_asize;
+ in_f->ilf_dsize = in_f32->ilf_dsize;
+ in_f->ilf_ino = in_f32->ilf_ino;
+ memcpy(&in_f->ilf_u, &in_f32->ilf_u, sizeof(in_f->ilf_u));
+ in_f->ilf_blkno = in_f32->ilf_blkno;
+ in_f->ilf_len = in_f32->ilf_len;
+ in_f->ilf_boffset = in_f32->ilf_boffset;
+ return 0;
+}
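+
+/*
+ * Note that the conversion above copies each field individually rather than
+ * doing a single structure copy precisely because the 32 bit format packs
+ * its fields with different alignment than the native 64 bit structure.
+ */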
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
new file mode 100644
index 000000000..bbd836a44
--- /dev/null
+++ b/fs/xfs/xfs_inode_item.h
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_INODE_ITEM_H__
+#define __XFS_INODE_ITEM_H__
+
+/* kernel only definitions */
+
+struct xfs_buf;
+struct xfs_bmbt_rec;
+struct xfs_inode;
+struct xfs_mount;
+
+struct xfs_inode_log_item {
+ struct xfs_log_item ili_item; /* common portion */
+ struct xfs_inode *ili_inode; /* inode ptr */
+ unsigned short ili_lock_flags; /* inode lock flags */
+ /*
+ * The ili_lock protects the interactions between the dirty state and
+ * the flush state of the inode log item. This allows us to do atomic
+ * modifications of multiple state fields without having to hold a
+ * specific inode lock to serialise them.
+ *
+ * We need atomic changes between inode dirtying, inode flushing and
+ * inode completion, but these all hold different combinations of
+ * ILOCK and IFLUSHING and hence we need some other method of
+ * serialising updates to the flush state.
+ */
+ spinlock_t ili_lock; /* flush state lock */
+ unsigned int ili_last_fields; /* fields when flushed */
+ unsigned int ili_fields; /* fields to be logged */
+ unsigned int ili_fsync_fields; /* logged since last fsync */
+ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
+ xfs_csn_t ili_commit_seq; /* last transaction commit */
+};
+
+static inline int xfs_inode_clean(struct xfs_inode *ip)
+{
+ return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
+}
+
+extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
+extern void xfs_inode_item_destroy(struct xfs_inode *);
+extern void xfs_iflush_abort(struct xfs_inode *);
+extern void xfs_iflush_shutdown_abort(struct xfs_inode *);
+extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
+ struct xfs_inode_log_format *);
+
+extern struct kmem_cache *xfs_ili_cache;
+
+#endif /* __XFS_INODE_ITEM_H__ */
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
new file mode 100644
index 000000000..e6609067e
--- /dev/null
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -0,0 +1,552 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_trace.h"
+#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_btree.h"
+
+STATIC void
+xlog_recover_inode_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
+ struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr;
+
+ xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
+ &xfs_inode_buf_ra_ops);
+ } else {
+ struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr;
+
+ xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
+ &xfs_inode_buf_ra_ops);
+ }
+}
+
+/*
+ * Inode fork owner changes
+ *
+ * If we have been told that we have to reparent the inode fork, it's because an
+ * extent swap operation on a CRC enabled filesystem has been done and we are
+ * replaying it. We need to walk the BMBT of the appropriate fork and change the
+ * owners of it.
+ *
+ * The complexity here is that we don't have an inode context to work with, so
+ * after we've replayed the inode we need to instantiate one. This is where the
+ * fun begins.
+ *
+ * We are in the middle of log recovery, so we can't run transactions. That
+ * means we cannot use cache coherent inode instantiation via xfs_iget(), as
+ * that will result in the corresponding iput() running the inode through
+ * xfs_inactive(). If we've just replayed an inode core that changes the link
+ * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
+ * transactions (bad!).
+ *
+ * So, to avoid this, we instantiate an inode directly from the inode core we've
+ * just recovered. We have the buffer still locked, and all we really need to
+ * instantiate is the inode core and the forks being modified. We can do this
+ * manually, then run the inode btree owner change, and then tear down the
+ * xfs_inode without having to run any transactions at all.
+ *
+ * Also, we don't have a transaction context available here, but we need to
+ * gather all the buffers we modify for writeback, so we pass the buffer_list
+ * to the operation for it to use instead.
+ */
+
+STATIC int
+xfs_recover_inode_owner_change(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip,
+ struct xfs_inode_log_format *in_f,
+ struct list_head *buffer_list)
+{
+ struct xfs_inode *ip;
+ int error;
+
+ ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
+
+ ip = xfs_inode_alloc(mp, in_f->ilf_ino);
+ if (!ip)
+ return -ENOMEM;
+
+ /* instantiate the inode */
+ ASSERT(dip->di_version >= 3);
+
+ error = xfs_inode_from_disk(ip, dip);
+ if (error)
+ goto out_free_ip;
+
+ if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
+ ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
+ error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
+ ip->i_ino, buffer_list);
+ if (error)
+ goto out_free_ip;
+ }
+
+ if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
+ ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
+ error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
+ ip->i_ino, buffer_list);
+ if (error)
+ goto out_free_ip;
+ }
+
+out_free_ip:
+ xfs_inode_free(ip);
+ return error;
+}
+
+static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld)
+{
+ return ld->di_version >= 3 &&
+ (ld->di_flags2 & XFS_DIFLAG2_BIGTIME);
+}
+
+/* Convert a log timestamp to an ondisk timestamp. */
+static inline xfs_timestamp_t
+xfs_log_dinode_to_disk_ts(
+ struct xfs_log_dinode *from,
+ const xfs_log_timestamp_t its)
+{
+ struct xfs_legacy_timestamp *lts;
+ struct xfs_log_legacy_timestamp *lits;
+ xfs_timestamp_t ts;
+
+ if (xfs_log_dinode_has_bigtime(from))
+ return cpu_to_be64(its);
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ lits = (struct xfs_log_legacy_timestamp *)&its;
+ lts->t_sec = cpu_to_be32(lits->t_sec);
+ lts->t_nsec = cpu_to_be32(lits->t_nsec);
+
+ return ts;
+}
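+
+/*
+ * That is, a bigtime timestamp is copied out as a single 64 bit big endian
+ * value, while a legacy timestamp is split into separate 32 bit big endian
+ * second and nanosecond words, so the conversion depends on which flavour
+ * the log dinode carries.
+ */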
+
+static inline bool xfs_log_dinode_has_large_extent_counts(
+ const struct xfs_log_dinode *ld)
+{
+ return ld->di_version >= 3 &&
+ (ld->di_flags2 & XFS_DIFLAG2_NREXT64);
+}
+
+static inline void
+xfs_log_dinode_to_disk_iext_counters(
+ struct xfs_log_dinode *from,
+ struct xfs_dinode *to)
+{
+ if (xfs_log_dinode_has_large_extent_counts(from)) {
+ to->di_big_nextents = cpu_to_be64(from->di_big_nextents);
+ to->di_big_anextents = cpu_to_be32(from->di_big_anextents);
+ to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad);
+ } else {
+ to->di_nextents = cpu_to_be32(from->di_nextents);
+ to->di_anextents = cpu_to_be16(from->di_anextents);
+ }
+
+}
+
+STATIC void
+xfs_log_dinode_to_disk(
+ struct xfs_log_dinode *from,
+ struct xfs_dinode *to,
+ xfs_lsn_t lsn)
+{
+ to->di_magic = cpu_to_be16(from->di_magic);
+ to->di_mode = cpu_to_be16(from->di_mode);
+ to->di_version = from->di_version;
+ to->di_format = from->di_format;
+ to->di_onlink = 0;
+ to->di_uid = cpu_to_be32(from->di_uid);
+ to->di_gid = cpu_to_be32(from->di_gid);
+ to->di_nlink = cpu_to_be32(from->di_nlink);
+ to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+ to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+
+ to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime);
+ to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime);
+ to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime);
+
+ to->di_size = cpu_to_be64(from->di_size);
+ to->di_nblocks = cpu_to_be64(from->di_nblocks);
+ to->di_extsize = cpu_to_be32(from->di_extsize);
+ to->di_forkoff = from->di_forkoff;
+ to->di_aformat = from->di_aformat;
+ to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+ to->di_dmstate = cpu_to_be16(from->di_dmstate);
+ to->di_flags = cpu_to_be16(from->di_flags);
+ to->di_gen = cpu_to_be32(from->di_gen);
+
+ if (from->di_version == 3) {
+ to->di_changecount = cpu_to_be64(from->di_changecount);
+ to->di_crtime = xfs_log_dinode_to_disk_ts(from,
+ from->di_crtime);
+ to->di_flags2 = cpu_to_be64(from->di_flags2);
+ to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
+ to->di_ino = cpu_to_be64(from->di_ino);
+ to->di_lsn = cpu_to_be64(lsn);
+ memset(to->di_pad2, 0, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &from->di_uuid);
+ to->di_v3_pad = 0;
+ } else {
+ to->di_flushiter = cpu_to_be16(from->di_flushiter);
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
+ }
+
+ xfs_log_dinode_to_disk_iext_counters(from, to);
+}
+
+STATIC int
+xlog_dinode_verify_extent_counts(
+ struct xfs_mount *mp,
+ struct xfs_log_dinode *ldip)
+{
+ xfs_extnum_t nextents;
+ xfs_aextnum_t anextents;
+
+ if (xfs_log_dinode_has_large_extent_counts(ldip)) {
+ if (!xfs_has_large_extent_counts(mp) ||
+ (ldip->di_nrext64_pad != 0)) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode large extent count format",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, large extent counts %d, padding 0x%x",
+ ldip->di_ino, xfs_has_large_extent_counts(mp),
+ ldip->di_nrext64_pad);
+ return -EFSCORRUPTED;
+ }
+
+ nextents = ldip->di_big_nextents;
+ anextents = ldip->di_big_anextents;
+ } else {
+ if (ldip->di_version == 3 && ldip->di_v3_pad != 0) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode di_v3_pad",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, di_v3_pad 0x%llx",
+ ldip->di_ino, ldip->di_v3_pad);
+ return -EFSCORRUPTED;
+ }
+
+ nextents = ldip->di_nextents;
+ anextents = ldip->di_anextents;
+ }
+
+ if (unlikely(nextents + anextents > ldip->di_nblocks)) {
+ XFS_CORRUPTION_ERROR("Bad log dinode extent counts",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx",
+ ldip->di_ino, xfs_has_large_extent_counts(mp), nextents,
+ anextents, ldip->di_nblocks);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+STATIC int
+xlog_recover_inode_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
+{
+ struct xfs_inode_log_format *in_f;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_buf *bp;
+ struct xfs_dinode *dip;
+ int len;
+ char *src;
+ char *dest;
+ int error;
+ int attr_index;
+ uint fields;
+ struct xfs_log_dinode *ldip;
+ uint isize;
+ int need_free = 0;
+
+ if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
+ in_f = item->ri_buf[0].i_addr;
+ } else {
+ in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
+ need_free = 1;
+ error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
+ if (error)
+ goto error;
+ }
+
+ /*
+ * Inode buffers can be freed, look out for it,
+ * and do not replay the inode.
+ */
+ if (xlog_is_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len)) {
+ error = 0;
+ trace_xfs_log_recover_inode_cancel(log, in_f);
+ goto error;
+ }
+ trace_xfs_log_recover_inode_recover(log, in_f);
+
+ error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
+ 0, &bp, &xfs_inode_buf_ops);
+ if (error)
+ goto error;
+ ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
+ dip = xfs_buf_offset(bp, in_f->ilf_boffset);
+
+ /*
+ * Make sure the place we're flushing out to really looks
+ * like an inode!
+ */
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) {
+ xfs_alert(mp,
+ "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %lld",
+ __func__, dip, bp, in_f->ilf_ino);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ ldip = item->ri_buf[1].i_addr;
+ if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) {
+ xfs_alert(mp,
+ "%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld",
+ __func__, item, in_f->ilf_ino);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+
+ /*
+ * If the inode has an LSN in it, recover the inode only if the on-disk
+ * inode's LSN is older than the lsn of the transaction we are
+ * replaying. We can have multiple checkpoints with the same start LSN,
+ * so the current LSN being equal to the on-disk LSN doesn't necessarily
+ * mean that the on-disk inode is more recent than the change being
+ * replayed.
+ *
+ * We must check the current_lsn against the on-disk inode
+ * here because we can't trust the log dinode to contain a valid LSN
+ * (see comment below before replaying the log dinode for details).
+ *
+ * Note: we still need to replay an owner change even though the inode
+ * is more recent than the transaction as there is no guarantee that all
+ * the btree blocks are more recent than this transaction, too.
+ */
+ if (dip->di_version >= 3) {
+ xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
+
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) {
+ trace_xfs_log_recover_inode_skip(log, in_f);
+ error = 0;
+ goto out_owner_change;
+ }
+ }
+
+ /*
+ * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
+ * are transactional, and if ordering is necessary we can determine that
+ * more accurately from the LSN field in the V3 inode core. Don't trust
+ * the inode versions, as we might be changing them here - use the
+ * superblock flag to determine whether we need to look at di_flushiter
+ * to skip replay when the on disk inode is newer than the log one.
+ */
+ if (!xfs_has_v3inodes(mp)) {
+ if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ /*
+ * Deal with the counter wrapping: once it wraps, an on-disk
+ * value of DI_MAX_FLUSH is effectively older than small values
+ */
+ if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
+ ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
+ /* do nothing */
+ } else {
+ trace_xfs_log_recover_inode_skip(log, in_f);
+ error = 0;
+ goto out_release;
+ }
+ }
+
+ /* Take the opportunity to reset the flush iteration count */
+ ldip->di_flushiter = 0;
+ }
+
+
+ if (unlikely(S_ISREG(ldip->di_mode))) {
+ if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
+ (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode data fork format for regular file",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, data fork format 0x%x",
+ in_f->ilf_ino, ldip->di_format);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ } else if (unlikely(S_ISDIR(ldip->di_mode))) {
+ if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
+ (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
+ (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode data fork format for directory",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, data fork format 0x%x",
+ in_f->ilf_ino, ldip->di_format);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ }
+
+ error = xlog_dinode_verify_extent_counts(mp, ldip);
+ if (error)
+ goto out_release;
+
+ if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
+ XFS_CORRUPTION_ERROR("Bad log dinode fork offset",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, di_forkoff 0x%x",
+ in_f->ilf_ino, ldip->di_forkoff);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ isize = xfs_log_dinode_size(mp);
+ if (unlikely(item->ri_buf[1].i_len > isize)) {
+ XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW,
+ mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx log dinode size 0x%x",
+ in_f->ilf_ino, item->ri_buf[1].i_len);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+
+ /*
+ * Recover the log dinode into the on disk inode.
+ *
+ * The LSN in the log dinode is garbage - it can be zero or reflect
+ * stale in-memory runtime state that isn't coherent with the changes
+ * logged in this transaction or the changes written to the on-disk
+ * inode. Hence we write the current LSN into the inode because that
+ * matches what xfs_iflush() would write into the inode when flushing
+ * the changes in this transaction.
+ */
+ xfs_log_dinode_to_disk(ldip, dip, current_lsn);
+
+ fields = in_f->ilf_fields;
+ if (fields & XFS_ILOG_DEV)
+ xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
+
+ if (in_f->ilf_size == 2)
+ goto out_owner_change;
+ len = item->ri_buf[2].i_len;
+ src = item->ri_buf[2].i_addr;
+ ASSERT(in_f->ilf_size <= 4);
+ ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
+ ASSERT(!(fields & XFS_ILOG_DFORK) ||
+ (len == xlog_calc_iovec_len(in_f->ilf_dsize)));
+
+ switch (fields & XFS_ILOG_DFORK) {
+ case XFS_ILOG_DDATA:
+ case XFS_ILOG_DEXT:
+ memcpy(XFS_DFORK_DPTR(dip), src, len);
+ break;
+
+ case XFS_ILOG_DBROOT:
+ xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
+ (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip),
+ XFS_DFORK_DSIZE(dip, mp));
+ break;
+
+ default:
+ /*
+ * There are no data fork flags set.
+ */
+ ASSERT((fields & XFS_ILOG_DFORK) == 0);
+ break;
+ }
+
+ /*
+ * If we logged any attribute data, recover it. There may or
+ * may not have been any other non-core data logged in this
+ * transaction.
+ */
+ if (in_f->ilf_fields & XFS_ILOG_AFORK) {
+ if (in_f->ilf_fields & XFS_ILOG_DFORK) {
+ attr_index = 3;
+ } else {
+ attr_index = 2;
+ }
+ len = item->ri_buf[attr_index].i_len;
+ src = item->ri_buf[attr_index].i_addr;
+ ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize));
+
+ switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
+ case XFS_ILOG_ADATA:
+ case XFS_ILOG_AEXT:
+ dest = XFS_DFORK_APTR(dip);
+ ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
+ memcpy(dest, src, len);
+ break;
+
+ case XFS_ILOG_ABROOT:
+ dest = XFS_DFORK_APTR(dip);
+ xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
+ len, (struct xfs_bmdr_block *)dest,
+ XFS_DFORK_ASIZE(dip, mp));
+ break;
+
+ default:
+ xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ }
+
+out_owner_change:
+ /* Recover the swapext owner change unless the inode has been deleted */
+ if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) &&
+ (dip->di_mode != 0))
+ error = xfs_recover_inode_owner_change(mp, dip, in_f,
+ buffer_list);
+ /* re-generate the checksum. */
+ xfs_dinode_calc_crc(log->l_mp, dip);
+
+ ASSERT(bp->b_mount == mp);
+ bp->b_flags |= _XBF_LOGRECOVERY;
+ xfs_buf_delwri_queue(bp, buffer_list);
+
+out_release:
+ xfs_buf_relse(bp);
+error:
+ if (need_free)
+ kmem_free(in_f);
+ return error;
+}
+
+const struct xlog_recover_item_ops xlog_inode_item_ops = {
+ .item_type = XFS_LI_INODE,
+ .ra_pass2 = xlog_recover_inode_ra_pass2,
+ .commit_pass2 = xlog_recover_inode_commit_pass2,
+};
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
new file mode 100644
index 000000000..1f783e979
--- /dev/null
+++ b/fs/xfs/xfs_ioctl.c
@@ -0,0 +1,2159 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtalloc.h"
+#include "xfs_iwalk.h"
+#include "xfs_itable.h"
+#include "xfs_error.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_fsops.h"
+#include "xfs_discard.h"
+#include "xfs_quota.h"
+#include "xfs_export.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_trans.h"
+#include "xfs_acl.h"
+#include "xfs_btree.h"
+#include <linux/fsmap.h>
+#include "xfs_fsmap.h"
+#include "scrub/xfs_scrub.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_health.h"
+#include "xfs_reflink.h"
+#include "xfs_ioctl.h"
+#include "xfs_xattr.h"
+
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/fileattr.h>
+
+/*
+ * xfs_find_handle maps from a userspace xfs_fsop_handlereq structure to
+ * a file or fs handle.
+ *
+ * XFS_IOC_PATH_TO_FSHANDLE
+ * returns fs handle for a mount point or path within that mount point
+ * XFS_IOC_FD_TO_HANDLE
+ * returns full handle for a FD opened in user space
+ * XFS_IOC_PATH_TO_HANDLE
+ * returns full handle for a path
+ */
+int
+xfs_find_handle(
+ unsigned int cmd,
+ xfs_fsop_handlereq_t *hreq)
+{
+ int hsize;
+ xfs_handle_t handle;
+ struct inode *inode;
+ struct fd f = {NULL};
+ struct path path;
+ int error;
+ struct xfs_inode *ip;
+
+ if (cmd == XFS_IOC_FD_TO_HANDLE) {
+ f = fdget(hreq->fd);
+ if (!f.file)
+ return -EBADF;
+ inode = file_inode(f.file);
+ } else {
+ error = user_path_at(AT_FDCWD, hreq->path, 0, &path);
+ if (error)
+ return error;
+ inode = d_inode(path.dentry);
+ }
+ ip = XFS_I(inode);
+
+ /*
+ * We can only generate handles for inodes residing on an XFS filesystem,
+ * and only for regular files, directories or symbolic links.
+ */
+ error = -EINVAL;
+ if (inode->i_sb->s_magic != XFS_SB_MAGIC)
+ goto out_put;
+
+ error = -EBADF;
+ if (!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
+ goto out_put;
+
+
+ memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
+
+ if (cmd == XFS_IOC_PATH_TO_FSHANDLE) {
+ /*
+ * This handle only contains an fsid, zero the rest.
+ */
+ memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
+ hsize = sizeof(xfs_fsid_t);
+ } else {
+ handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
+ sizeof(handle.ha_fid.fid_len);
+ handle.ha_fid.fid_pad = 0;
+ handle.ha_fid.fid_gen = inode->i_generation;
+ handle.ha_fid.fid_ino = ip->i_ino;
+ hsize = sizeof(xfs_handle_t);
+ }
+
+ error = -EFAULT;
+ if (copy_to_user(hreq->ohandle, &handle, hsize) ||
+ copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
+ goto out_put;
+
+ error = 0;
+
+ out_put:
+ if (cmd == XFS_IOC_FD_TO_HANDLE)
+ fdput(f);
+ else
+ path_put(&path);
+ return error;
+}
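+
+/*
+ * To recap the handle layout produced above: a full xfs_handle_t is the
+ * filesystem's fixed fsid followed by an xfs_fid_t holding the fid length,
+ * padding, inode generation and inode number. XFS_IOC_PATH_TO_FSHANDLE
+ * copies out only the fsid portion; the other two commands copy out the
+ * whole handle.
+ */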
+
+/*
+ * No need to do permission checks on the various pathname components
+ * as the handle operations are privileged.
+ */
+STATIC int
+xfs_handle_acceptable(
+ void *context,
+ struct dentry *dentry)
+{
+ return 1;
+}
+
+/*
+ * Convert userspace handle data into a dentry.
+ */
+struct dentry *
+xfs_handle_to_dentry(
+ struct file *parfilp,
+ void __user *uhandle,
+ u32 hlen)
+{
+ xfs_handle_t handle;
+ struct xfs_fid64 fid;
+
+ /*
+ * Only allow handle opens under a directory.
+ */
+ if (!S_ISDIR(file_inode(parfilp)->i_mode))
+ return ERR_PTR(-ENOTDIR);
+
+ if (hlen != sizeof(xfs_handle_t))
+ return ERR_PTR(-EINVAL);
+ if (copy_from_user(&handle, uhandle, hlen))
+ return ERR_PTR(-EFAULT);
+ if (handle.ha_fid.fid_len !=
+ sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len))
+ return ERR_PTR(-EINVAL);
+
+ memset(&fid, 0, sizeof(struct fid));
+ fid.ino = handle.ha_fid.fid_ino;
+ fid.gen = handle.ha_fid.fid_gen;
+
+ return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3,
+ FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG,
+ xfs_handle_acceptable, NULL);
+}
+
+STATIC struct dentry *
+xfs_handlereq_to_dentry(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq)
+{
+ return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen);
+}
+
+int
+xfs_open_by_handle(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq)
+{
+ const struct cred *cred = current_cred();
+ int error;
+ int fd;
+ int permflag;
+ struct file *filp;
+ struct inode *inode;
+ struct dentry *dentry;
+ fmode_t fmode;
+ struct path path;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ inode = d_inode(dentry);
+
+ /* Restrict xfs_open_by_handle to directories & regular files. */
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
+ error = -EPERM;
+ goto out_dput;
+ }
+
+#if BITS_PER_LONG != 32
+ hreq->oflags |= O_LARGEFILE;
+#endif
+
+ permflag = hreq->oflags;
+ fmode = OPEN_FMODE(permflag);
+ if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
+ (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
+ error = -EPERM;
+ goto out_dput;
+ }
+
+ if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
+ error = -EPERM;
+ goto out_dput;
+ }
+
+ /* Can't write directories. */
+ if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
+ error = -EISDIR;
+ goto out_dput;
+ }
+
+ fd = get_unused_fd_flags(0);
+ if (fd < 0) {
+ error = fd;
+ goto out_dput;
+ }
+
+ path.mnt = parfilp->f_path.mnt;
+ path.dentry = dentry;
+ filp = dentry_open(&path, hreq->oflags, cred);
+ dput(dentry);
+ if (IS_ERR(filp)) {
+ put_unused_fd(fd);
+ return PTR_ERR(filp);
+ }
+
+ if (S_ISREG(inode->i_mode)) {
+ filp->f_flags |= O_NOATIME;
+ filp->f_mode |= FMODE_NOCMTIME;
+ }
+
+ fd_install(fd, filp);
+ return fd;
+
+ out_dput:
+ dput(dentry);
+ return error;
+}
+
+int
+xfs_readlink_by_handle(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq)
+{
+ struct dentry *dentry;
+ __u32 olen;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ /* Restrict this handle operation to symlinks only. */
+ if (!d_is_symlink(dentry)) {
+ error = -EINVAL;
+ goto out_dput;
+ }
+
+ if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
+ error = -EFAULT;
+ goto out_dput;
+ }
+
+ error = vfs_readlink(dentry, hreq->ohandle, olen);
+
+ out_dput:
+ dput(dentry);
+ return error;
+}
+
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later;
+ * we may be reading them directly out of a user buffer.
+ */
+static void
+xfs_ioc_attr_put_listent(
+ struct xfs_attr_list_context *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ int valuelen)
+{
+ struct xfs_attrlist *alist = context->buffer;
+ struct xfs_attrlist_ent *aep;
+ int arraytop;
+
+ ASSERT(!context->seen_enough);
+ ASSERT(context->count >= 0);
+ ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+ ASSERT(context->firstu >= sizeof(*alist));
+ ASSERT(context->firstu <= context->bufsize);
+
+ /*
+ * Only list entries in the right namespace.
+ */
+ if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK))
+ return;
+
+ arraytop = sizeof(*alist) +
+ context->count * sizeof(alist->al_offset[0]);
+
+ /* decrement by the actual bytes used by the attr */
+ context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) +
+ namelen + 1, sizeof(uint32_t));
+ if (context->firstu < arraytop) {
+ trace_xfs_attr_list_full(context);
+ alist->al_more = 1;
+ context->seen_enough = 1;
+ return;
+ }
+
+ aep = context->buffer + context->firstu;
+ aep->a_valuelen = valuelen;
+ memcpy(aep->a_name, name, namelen);
+ aep->a_name[namelen] = 0;
+ alist->al_offset[context->count++] = context->firstu;
+ alist->al_count = context->count;
+ trace_xfs_attr_list_add(context);
+}
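+
+/*
+ * Note that the listent callback above fills the output buffer from both
+ * ends: the al_offset[] array grows forwards from the xfs_attrlist header
+ * while the variable length xfs_attrlist_ent records are packed backwards
+ * from the end of the buffer (tracked by firstu). Once the two would
+ * collide, al_more is set so userspace knows to call again with the
+ * updated cursor.
+ */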
+
+static unsigned int
+xfs_attr_filter(
+ u32 ioc_flags)
+{
+ if (ioc_flags & XFS_IOC_ATTR_ROOT)
+ return XFS_ATTR_ROOT;
+ if (ioc_flags & XFS_IOC_ATTR_SECURE)
+ return XFS_ATTR_SECURE;
+ return 0;
+}
+
+static unsigned int
+xfs_attr_flags(
+ u32 ioc_flags)
+{
+ if (ioc_flags & XFS_IOC_ATTR_CREATE)
+ return XATTR_CREATE;
+ if (ioc_flags & XFS_IOC_ATTR_REPLACE)
+ return XATTR_REPLACE;
+ return 0;
+}
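+
+/*
+ * The two helpers above translate the XFS_IOC_ATTR_* ioctl flags: one into
+ * the on-disk attr namespace filter (root or secure; no flag selects user
+ * attributes) and the other into the XATTR_CREATE/XATTR_REPLACE semantics
+ * used when modifying an attribute.
+ */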
+
+int
+xfs_ioc_attr_list(
+ struct xfs_inode *dp,
+ void __user *ubuf,
+ size_t bufsize,
+ int flags,
+ struct xfs_attrlist_cursor __user *ucursor)
+{
+ struct xfs_attr_list_context context = { };
+ struct xfs_attrlist *alist;
+ void *buffer;
+ int error;
+
+ if (bufsize < sizeof(struct xfs_attrlist) ||
+ bufsize > XFS_XATTR_LIST_MAX)
+ return -EINVAL;
+
+ /*
+ * Reject unknown flags; only namespace selection flags are allowed.
+ */
+ if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+ if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+
+ /*
+ * Validate the cursor.
+ */
+ if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor)))
+ return -EFAULT;
+ if (context.cursor.pad1 || context.cursor.pad2)
+ return -EINVAL;
+ if (!context.cursor.initted &&
+ (context.cursor.hashval || context.cursor.blkno ||
+ context.cursor.offset))
+ return -EINVAL;
+
+ buffer = kvzalloc(bufsize, GFP_KERNEL);
+ if (!buffer)
+ return -ENOMEM;
+
+ /*
+ * Initialize the output buffer.
+ */
+ context.dp = dp;
+ context.resynch = 1;
+ context.attr_filter = xfs_attr_filter(flags);
+ context.buffer = buffer;
+ context.bufsize = round_down(bufsize, sizeof(uint32_t));
+ context.firstu = context.bufsize;
+ context.put_listent = xfs_ioc_attr_put_listent;
+
+ alist = context.buffer;
+ alist->al_count = 0;
+ alist->al_more = 0;
+ alist->al_offset[0] = context.bufsize;
+
+ error = xfs_attr_list(&context);
+ if (error)
+ goto out_free;
+
+ if (copy_to_user(ubuf, buffer, bufsize) ||
+ copy_to_user(ucursor, &context.cursor, sizeof(context.cursor)))
+ error = -EFAULT;
+out_free:
+ kmem_free(buffer);
+ return error;
+}
+
+STATIC int
+xfs_attrlist_by_handle(
+ struct file *parfilp,
+ struct xfs_fsop_attrlist_handlereq __user *p)
+{
+ struct xfs_fsop_attrlist_handlereq al_hreq;
+ struct dentry *dentry;
+ int error = -ENOMEM;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&al_hreq, p, sizeof(al_hreq)))
+ return -EFAULT;
+
+ dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer,
+ al_hreq.buflen, al_hreq.flags, &p->pos);
+ dput(dentry);
+ return error;
+}
+
+static int
+xfs_attrmulti_attr_get(
+ struct inode *inode,
+ unsigned char *name,
+ unsigned char __user *ubuf,
+ uint32_t *len,
+ uint32_t flags)
+{
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = xfs_attr_filter(flags),
+ .attr_flags = xfs_attr_flags(flags),
+ .name = name,
+ .namelen = strlen(name),
+ .valuelen = *len,
+ };
+ int error;
+
+ if (*len > XFS_XATTR_SIZE_MAX)
+ return -EINVAL;
+
+ error = xfs_attr_get(&args);
+ if (error)
+ goto out_kfree;
+
+ *len = args.valuelen;
+ if (copy_to_user(ubuf, args.value, args.valuelen))
+ error = -EFAULT;
+
+out_kfree:
+ kmem_free(args.value);
+ return error;
+}
+
+static int
+xfs_attrmulti_attr_set(
+ struct inode *inode,
+ unsigned char *name,
+ const unsigned char __user *ubuf,
+ uint32_t len,
+ uint32_t flags)
+{
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = xfs_attr_filter(flags),
+ .attr_flags = xfs_attr_flags(flags),
+ .name = name,
+ .namelen = strlen(name),
+ };
+ int error;
+
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ return -EPERM;
+
+ if (ubuf) {
+ if (len > XFS_XATTR_SIZE_MAX)
+ return -EINVAL;
+ args.value = memdup_user(ubuf, len);
+ if (IS_ERR(args.value))
+ return PTR_ERR(args.value);
+ args.valuelen = len;
+ }
+
+ error = xfs_attr_change(&args);
+ if (!error && (flags & XFS_IOC_ATTR_ROOT))
+ xfs_forget_acl(inode, name);
+ kfree(args.value);
+ return error;
+}
+
+int
+xfs_ioc_attrmulti_one(
+ struct file *parfilp,
+ struct inode *inode,
+ uint32_t opcode,
+ void __user *uname,
+ void __user *value,
+ uint32_t *len,
+ uint32_t flags)
+{
+ unsigned char *name;
+ int error;
+
+ if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+
+ name = strndup_user(uname, MAXNAMELEN);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ switch (opcode) {
+ case ATTR_OP_GET:
+ error = xfs_attrmulti_attr_get(inode, name, value, len, flags);
+ break;
+ case ATTR_OP_REMOVE:
+ value = NULL;
+ *len = 0;
+ fallthrough;
+ case ATTR_OP_SET:
+ error = mnt_want_write_file(parfilp);
+ if (error)
+ break;
+ error = xfs_attrmulti_attr_set(inode, name, value, *len, flags);
+ mnt_drop_write_file(parfilp);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ kfree(name);
+ return error;
+}
+
+STATIC int
+xfs_attrmulti_by_handle(
+ struct file *parfilp,
+ void __user *arg)
+{
+ int error;
+ xfs_attr_multiop_t *ops;
+ xfs_fsop_attrmulti_handlereq_t am_hreq;
+ struct dentry *dentry;
+ unsigned int i, size;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
+ return -EFAULT;
+
+ /* overflow check */
+ if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
+ return -E2BIG;
+
+ dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ error = -E2BIG;
+ size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
+ if (!size || size > 16 * PAGE_SIZE)
+ goto out_dput;
+
+ ops = memdup_user(am_hreq.ops, size);
+ if (IS_ERR(ops)) {
+ error = PTR_ERR(ops);
+ goto out_dput;
+ }
+
+ error = 0;
+ for (i = 0; i < am_hreq.opcount; i++) {
+ ops[i].am_error = xfs_ioc_attrmulti_one(parfilp,
+ d_inode(dentry), ops[i].am_opcode,
+ ops[i].am_attrname, ops[i].am_attrvalue,
+ &ops[i].am_length, ops[i].am_flags);
+ }
+
+ if (copy_to_user(am_hreq.ops, ops, size))
+ error = -EFAULT;
+
+ kfree(ops);
+ out_dput:
+ dput(dentry);
+ return error;
+}
+
+/* Return 0 on success or negative error */
+int
+xfs_fsbulkstat_one_fmt(
+ struct xfs_ibulk *breq,
+ const struct xfs_bulkstat *bstat)
+{
+ struct xfs_bstat bs1;
+
+ xfs_bulkstat_to_bstat(breq->mp, &bs1, bstat);
+ if (copy_to_user(breq->ubuffer, &bs1, sizeof(bs1)))
+ return -EFAULT;
+ return xfs_ibulk_advance(breq, sizeof(struct xfs_bstat));
+}
+
+int
+xfs_fsinumbers_fmt(
+ struct xfs_ibulk *breq,
+ const struct xfs_inumbers *igrp)
+{
+ struct xfs_inogrp ig1;
+
+ xfs_inumbers_to_inogrp(&ig1, igrp);
+ if (copy_to_user(breq->ubuffer, &ig1, sizeof(struct xfs_inogrp)))
+ return -EFAULT;
+ return xfs_ibulk_advance(breq, sizeof(struct xfs_inogrp));
+}
+
+STATIC int
+xfs_ioc_fsbulkstat(
+ struct file *file,
+ unsigned int cmd,
+ void __user *arg)
+{
+ struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
+ struct xfs_fsop_bulkreq bulkreq;
+ struct xfs_ibulk breq = {
+ .mp = mp,
+ .mnt_userns = file_mnt_user_ns(file),
+ .ocount = 0,
+ };
+ xfs_ino_t lastino;
+ int error;
+
+ /* done = 1 if there are more stats to get and if bulkstat */
+ /* should be called again (unused here, but used in dmapi) */
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ if (copy_from_user(&bulkreq, arg, sizeof(struct xfs_fsop_bulkreq)))
+ return -EFAULT;
+
+ if (copy_from_user(&lastino, bulkreq.lastip, sizeof(__s64)))
+ return -EFAULT;
+
+ if (bulkreq.icount <= 0)
+ return -EINVAL;
+
+ if (bulkreq.ubuffer == NULL)
+ return -EINVAL;
+
+ breq.ubuffer = bulkreq.ubuffer;
+ breq.icount = bulkreq.icount;
+
+ /*
+ * FSBULKSTAT_SINGLE expects that *lastip contains the inode number
+ * that we want to stat. However, FSINUMBERS and FSBULKSTAT expect
+ * that *lastip contains either zero or the number of the last inode to
+ * be examined by the previous call and return results starting with
+ * the next inode after that. The new bulk request back end functions
+ * take the inode to start with, so we have to compute the startino
+ * parameter from lastino to maintain correct function. lastino == 0
+ * is a special case because it has traditionally meant "first inode
+ * in filesystem".
+ */
+ if (cmd == XFS_IOC_FSINUMBERS) {
+ breq.startino = lastino ? lastino + 1 : 0;
+ error = xfs_inumbers(&breq, xfs_fsinumbers_fmt);
+ lastino = breq.startino - 1;
+ } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) {
+ breq.startino = lastino;
+ breq.icount = 1;
+ error = xfs_bulkstat_one(&breq, xfs_fsbulkstat_one_fmt);
+ } else { /* XFS_IOC_FSBULKSTAT */
+ breq.startino = lastino ? lastino + 1 : 0;
+ error = xfs_bulkstat(&breq, xfs_fsbulkstat_one_fmt);
+ lastino = breq.startino - 1;
+ }
+
+ if (error)
+ return error;
+
+ if (bulkreq.lastip != NULL &&
+ copy_to_user(bulkreq.lastip, &lastino, sizeof(xfs_ino_t)))
+ return -EFAULT;
+
+ if (bulkreq.ocount != NULL &&
+ copy_to_user(bulkreq.ocount, &breq.ocount, sizeof(__s32)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* Return 0 on success or negative error */
+static int
+xfs_bulkstat_fmt(
+ struct xfs_ibulk *breq,
+ const struct xfs_bulkstat *bstat)
+{
+ if (copy_to_user(breq->ubuffer, bstat, sizeof(struct xfs_bulkstat)))
+ return -EFAULT;
+ return xfs_ibulk_advance(breq, sizeof(struct xfs_bulkstat));
+}
+
+/*
+ * Check the incoming bulk request @hdr from userspace and initialize the
+ * internal @breq bulk request appropriately. Returns 0 if the bulk request
+ * should proceed; -ECANCELED if there's nothing to do; or the usual
+ * negative error code.
+ */
+static int
+xfs_bulk_ireq_setup(
+ struct xfs_mount *mp,
+ struct xfs_bulk_ireq *hdr,
+ struct xfs_ibulk *breq,
+ void __user *ubuffer)
+{
+ if (hdr->icount == 0 ||
+ (hdr->flags & ~XFS_BULK_IREQ_FLAGS_ALL) ||
+ memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
+ return -EINVAL;
+
+ breq->startino = hdr->ino;
+ breq->ubuffer = ubuffer;
+ breq->icount = hdr->icount;
+ breq->ocount = 0;
+ breq->flags = 0;
+
+ /*
+ * The @ino parameter is a special value, so we must look it up here.
+ * We're not allowed to have IREQ_AGNO, and we only return one inode
+ * worth of data.
+ */
+ if (hdr->flags & XFS_BULK_IREQ_SPECIAL) {
+ if (hdr->flags & XFS_BULK_IREQ_AGNO)
+ return -EINVAL;
+
+ switch (hdr->ino) {
+ case XFS_BULK_IREQ_SPECIAL_ROOT:
+ hdr->ino = mp->m_sb.sb_rootino;
+ break;
+ default:
+ return -EINVAL;
+ }
+ breq->icount = 1;
+ }
+
+ /*
+ * The IREQ_AGNO flag means that we only want results from a given AG.
+ * If @hdr->ino is zero, we start iterating in that AG. If @hdr->ino is
+ * beyond the specified AG then we return no results.
+ */
+ if (hdr->flags & XFS_BULK_IREQ_AGNO) {
+ if (hdr->agno >= mp->m_sb.sb_agcount)
+ return -EINVAL;
+
+ if (breq->startino == 0)
+ breq->startino = XFS_AGINO_TO_INO(mp, hdr->agno, 0);
+ else if (XFS_INO_TO_AGNO(mp, breq->startino) < hdr->agno)
+ return -EINVAL;
+
+ breq->flags |= XFS_IBULK_SAME_AG;
+
+ /* Asking for an inode past the end of the AG? We're done! */
+ if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno)
+ return -ECANCELED;
+ } else if (hdr->agno)
+ return -EINVAL;
+
+ /* Asking for an inode past the end of the FS? We're done! */
+ if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
+ return -ECANCELED;
+
+ if (hdr->flags & XFS_BULK_IREQ_NREXT64)
+ breq->flags |= XFS_IBULK_NREXT64;
+
+ return 0;
+}
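+
+/*
+ * On return from the setup above, breq->startino has been derived from the
+ * user supplied hdr->ino (including the special root inode value and per-AG
+ * starting points), so the bulkstat/inumbers walkers can be driven directly
+ * from the breq.
+ */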
+
+/*
+ * Update the userspace bulk request @hdr to reflect the end state of the
+ * internal bulk request @breq.
+ */
+static void
+xfs_bulk_ireq_teardown(
+ struct xfs_bulk_ireq *hdr,
+ struct xfs_ibulk *breq)
+{
+ hdr->ino = breq->startino;
+ hdr->ocount = breq->ocount;
+}
+
+/* Handle the v5 bulkstat ioctl. */
+STATIC int
+xfs_ioc_bulkstat(
+ struct file *file,
+ unsigned int cmd,
+ struct xfs_bulkstat_req __user *arg)
+{
+ struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
+ struct xfs_bulk_ireq hdr;
+ struct xfs_ibulk breq = {
+ .mp = mp,
+ .mnt_userns = file_mnt_user_ns(file),
+ };
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr)))
+ return -EFAULT;
+
+ error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat);
+ if (error == -ECANCELED)
+ goto out_teardown;
+ if (error < 0)
+ return error;
+
+ error = xfs_bulkstat(&breq, xfs_bulkstat_fmt);
+ if (error)
+ return error;
+
+out_teardown:
+ xfs_bulk_ireq_teardown(&hdr, &breq);
+ if (copy_to_user(&arg->hdr, &hdr, sizeof(hdr)))
+ return -EFAULT;
+
+ return 0;
+}
+
+STATIC int
+xfs_inumbers_fmt(
+ struct xfs_ibulk *breq,
+ const struct xfs_inumbers *igrp)
+{
+ if (copy_to_user(breq->ubuffer, igrp, sizeof(struct xfs_inumbers)))
+ return -EFAULT;
+ return xfs_ibulk_advance(breq, sizeof(struct xfs_inumbers));
+}
+
+/* Handle the v5 inumbers ioctl. */
+STATIC int
+xfs_ioc_inumbers(
+ struct xfs_mount *mp,
+ unsigned int cmd,
+ struct xfs_inumbers_req __user *arg)
+{
+ struct xfs_bulk_ireq hdr;
+ struct xfs_ibulk breq = {
+ .mp = mp,
+ };
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr)))
+ return -EFAULT;
+
+ error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers);
+ if (error == -ECANCELED)
+ goto out_teardown;
+ if (error < 0)
+ return error;
+
+ error = xfs_inumbers(&breq, xfs_inumbers_fmt);
+ if (error)
+ return error;
+
+out_teardown:
+ xfs_bulk_ireq_teardown(&hdr, &breq);
+ if (copy_to_user(&arg->hdr, &hdr, sizeof(hdr)))
+ return -EFAULT;
+
+ return 0;
+}
+
+STATIC int
+xfs_ioc_fsgeometry(
+ struct xfs_mount *mp,
+ void __user *arg,
+ int struct_version)
+{
+ struct xfs_fsop_geom fsgeo;
+ size_t len;
+
+ xfs_fs_geometry(mp, &fsgeo, struct_version);
+
+ if (struct_version <= 3)
+ len = sizeof(struct xfs_fsop_geom_v1);
+ else if (struct_version == 4)
+ len = sizeof(struct xfs_fsop_geom_v4);
+ else {
+ xfs_fsop_geom_health(mp, &fsgeo);
+ len = sizeof(fsgeo);
+ }
+
+ if (copy_to_user(arg, &fsgeo, len))
+ return -EFAULT;
+ return 0;
+}
+
+STATIC int
+xfs_ioc_ag_geometry(
+ struct xfs_mount *mp,
+ void __user *arg)
+{
+ struct xfs_perag *pag;
+ struct xfs_ag_geometry ageo;
+ int error;
+
+ if (copy_from_user(&ageo, arg, sizeof(ageo)))
+ return -EFAULT;
+ if (ageo.ag_flags)
+ return -EINVAL;
+ if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved)))
+ return -EINVAL;
+
+ pag = xfs_perag_get(mp, ageo.ag_number);
+ if (!pag)
+ return -EINVAL;
+
+ error = xfs_ag_get_geometry(pag, &ageo);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ if (copy_to_user(arg, &ageo, sizeof(ageo)))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * Linux extended inode flags interface.
+ */
+
+static void
+xfs_fill_fsxattr(
+ struct xfs_inode *ip,
+ int whichfork,
+ struct fileattr *fa)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+
+ fileattr_fill_xflags(fa, xfs_ip2xflags(ip));
+
+ if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) {
+ fa->fsx_extsize = XFS_FSB_TO_B(mp, ip->i_extsize);
+ } else if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
+ /*
+ * Don't let a misaligned extent size hint on a directory
+ * escape to userspace if it won't pass the setattr checks
+ * later.
+ */
+ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ ip->i_extsize % mp->m_sb.sb_rextsize > 0) {
+ fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE |
+ FS_XFLAG_EXTSZINHERIT);
+ fa->fsx_extsize = 0;
+ } else {
+ fa->fsx_extsize = XFS_FSB_TO_B(mp, ip->i_extsize);
+ }
+ }
+
+ if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
+ fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize);
+ fa->fsx_projid = ip->i_projid;
+ if (ifp && !xfs_need_iread_extents(ifp))
+ fa->fsx_nextents = xfs_iext_count(ifp);
+ else
+ fa->fsx_nextents = xfs_ifork_nextents(ifp);
+}
+
+STATIC int
+xfs_ioc_fsgetxattra(
+ xfs_inode_t *ip,
+ void __user *arg)
+{
+ struct fileattr fa;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ xfs_fill_fsxattr(ip, XFS_ATTR_FORK, &fa);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ return copy_fsxattr_to_user(&fa, arg);
+}
+
+int
+xfs_fileattr_get(
+ struct dentry *dentry,
+ struct fileattr *fa)
+{
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
+
+ if (d_is_special(dentry))
+ return -ENOTTY;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ xfs_fill_fsxattr(ip, XFS_DATA_FORK, fa);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ return 0;
+}
+
+STATIC uint16_t
+xfs_flags2diflags(
+ struct xfs_inode *ip,
+ unsigned int xflags)
+{
+ /* can't set PREALLOC this way, just preserve it */
+ uint16_t di_flags =
+ (ip->i_diflags & XFS_DIFLAG_PREALLOC);
+
+ if (xflags & FS_XFLAG_IMMUTABLE)
+ di_flags |= XFS_DIFLAG_IMMUTABLE;
+ if (xflags & FS_XFLAG_APPEND)
+ di_flags |= XFS_DIFLAG_APPEND;
+ if (xflags & FS_XFLAG_SYNC)
+ di_flags |= XFS_DIFLAG_SYNC;
+ if (xflags & FS_XFLAG_NOATIME)
+ di_flags |= XFS_DIFLAG_NOATIME;
+ if (xflags & FS_XFLAG_NODUMP)
+ di_flags |= XFS_DIFLAG_NODUMP;
+ if (xflags & FS_XFLAG_NODEFRAG)
+ di_flags |= XFS_DIFLAG_NODEFRAG;
+ if (xflags & FS_XFLAG_FILESTREAM)
+ di_flags |= XFS_DIFLAG_FILESTREAM;
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ if (xflags & FS_XFLAG_RTINHERIT)
+ di_flags |= XFS_DIFLAG_RTINHERIT;
+ if (xflags & FS_XFLAG_NOSYMLINKS)
+ di_flags |= XFS_DIFLAG_NOSYMLINKS;
+ if (xflags & FS_XFLAG_EXTSZINHERIT)
+ di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+ if (xflags & FS_XFLAG_PROJINHERIT)
+ di_flags |= XFS_DIFLAG_PROJINHERIT;
+ } else if (S_ISREG(VFS_I(ip)->i_mode)) {
+ if (xflags & FS_XFLAG_REALTIME)
+ di_flags |= XFS_DIFLAG_REALTIME;
+ if (xflags & FS_XFLAG_EXTSIZE)
+ di_flags |= XFS_DIFLAG_EXTSIZE;
+ }
+
+ return di_flags;
+}
+
+STATIC uint64_t
+xfs_flags2diflags2(
+ struct xfs_inode *ip,
+ unsigned int xflags)
+{
+ uint64_t di_flags2 =
+ (ip->i_diflags2 & (XFS_DIFLAG2_REFLINK |
+ XFS_DIFLAG2_BIGTIME |
+ XFS_DIFLAG2_NREXT64));
+
+ if (xflags & FS_XFLAG_DAX)
+ di_flags2 |= XFS_DIFLAG2_DAX;
+ if (xflags & FS_XFLAG_COWEXTSIZE)
+ di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+
+ return di_flags2;
+}
+
+static int
+xfs_ioctl_setattr_xflags(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct fileattr *fa)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ uint64_t i_flags2;
+
+ /* Can't change realtime flag if any extents are allocated. */
+ if ((ip->i_df.if_nextents || ip->i_delayed_blks) &&
+ XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME))
+ return -EINVAL;
+
+ /* If realtime flag is set then must have realtime device */
+ if (fa->fsx_xflags & FS_XFLAG_REALTIME) {
+ if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
+ (ip->i_extsize % mp->m_sb.sb_rextsize))
+ return -EINVAL;
+ }
+
+ /* Clear reflink if we are actually able to set the rt flag. */
+ if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip))
+ ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+
+ /* Don't allow us to set DAX mode for a reflinked file for now. */
+ if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip))
+ return -EINVAL;
+
+ /* diflags2 only valid for v3 inodes. */
+ i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
+ if (i_flags2 && !xfs_has_v3inodes(mp))
+ return -EINVAL;
+
+ ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
+ ip->i_diflags2 = i_flags2;
+
+ xfs_diflags_to_iflags(ip, false);
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
+ return 0;
+}
+
+static void
+xfs_ioctl_setattr_prepare_dax(
+ struct xfs_inode *ip,
+ struct fileattr *fa)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ if (xfs_has_dax_always(mp) || xfs_has_dax_never(mp))
+ return;
+
+ if (((fa->fsx_xflags & FS_XFLAG_DAX) &&
+ !(ip->i_diflags2 & XFS_DIFLAG2_DAX)) ||
+ (!(fa->fsx_xflags & FS_XFLAG_DAX) &&
+ (ip->i_diflags2 & XFS_DIFLAG2_DAX)))
+ d_mark_dontcache(inode);
+}
+
+/*
+ * Set up the transaction structure for the setattr operation, checking that we
+ * have permission to do so. On success, return a clean transaction and the
+ * inode locked exclusively ready for further operation specific checks. On
+ * failure, return an error without modifying or locking the inode.
+ */
+static struct xfs_trans *
+xfs_ioctl_setattr_get_trans(
+ struct xfs_inode *ip,
+ struct xfs_dquot *pdqp)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error = -EROFS;
+
+ if (xfs_is_readonly(mp))
+ goto out_error;
+ error = -EIO;
+ if (xfs_is_shutdown(mp))
+ goto out_error;
+
+ error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp,
+ has_capability_noaudit(current, CAP_FOWNER), &tp);
+ if (error)
+ goto out_error;
+
+ if (xfs_has_wsync(mp))
+ xfs_trans_set_sync(tp);
+
+ return tp;
+
+out_error:
+ return ERR_PTR(error);
+}
+
+/*
+ * Validate a proposed extent size hint. For regular files, the hint can only
+ * be changed if no extents are allocated.
+ */
+static int
+xfs_ioctl_setattr_check_extsize(
+ struct xfs_inode *ip,
+ struct fileattr *fa)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_failaddr_t failaddr;
+ uint16_t new_diflags;
+
+ if (!fa->fsx_valid)
+ return 0;
+
+ if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents &&
+ XFS_FSB_TO_B(mp, ip->i_extsize) != fa->fsx_extsize)
+ return -EINVAL;
+
+ if (fa->fsx_extsize & mp->m_blockmask)
+ return -EINVAL;
+
+ new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
+
+ /*
+ * Inode verifiers do not check that the extent size hint is an integer
+ * multiple of the rt extent size on a directory with both rtinherit
+ * and extszinherit flags set. Don't let sysadmins misconfigure
+ * directories.
+ */
+ if ((new_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (new_diflags & XFS_DIFLAG_EXTSZINHERIT)) {
+ unsigned int rtextsize_bytes;
+
+ rtextsize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+ if (fa->fsx_extsize % rtextsize_bytes)
+ return -EINVAL;
+ }
+
+ failaddr = xfs_inode_validate_extsize(ip->i_mount,
+ XFS_B_TO_FSB(mp, fa->fsx_extsize),
+ VFS_I(ip)->i_mode, new_diflags);
+ return failaddr != NULL ? -EINVAL : 0;
+}
+
+static int
+xfs_ioctl_setattr_check_cowextsize(
+ struct xfs_inode *ip,
+ struct fileattr *fa)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_failaddr_t failaddr;
+ uint64_t new_diflags2;
+ uint16_t new_diflags;
+
+ if (!fa->fsx_valid)
+ return 0;
+
+ if (fa->fsx_cowextsize & mp->m_blockmask)
+ return -EINVAL;
+
+ new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
+ new_diflags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
+
+ failaddr = xfs_inode_validate_cowextsize(ip->i_mount,
+ XFS_B_TO_FSB(mp, fa->fsx_cowextsize),
+ VFS_I(ip)->i_mode, new_diflags, new_diflags2);
+ return failaddr != NULL ? -EINVAL : 0;
+}
+
+static int
+xfs_ioctl_setattr_check_projid(
+ struct xfs_inode *ip,
+ struct fileattr *fa)
+{
+ if (!fa->fsx_valid)
+ return 0;
+
+ /* Disallow 32bit project ids if 32bit IDs are not enabled. */
+ if (fa->fsx_projid > (uint16_t)-1 &&
+ !xfs_has_projid32(ip->i_mount))
+ return -EINVAL;
+ return 0;
+}
+
+int
+xfs_fileattr_set(
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ struct fileattr *fa)
+{
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ struct xfs_dquot *pdqp = NULL;
+ struct xfs_dquot *olddquot = NULL;
+ int error;
+
+ trace_xfs_ioctl_setattr(ip);
+
+ if (d_is_special(dentry))
+ return -ENOTTY;
+
+ if (!fa->fsx_valid) {
+ if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL |
+ FS_NOATIME_FL | FS_NODUMP_FL |
+ FS_SYNC_FL | FS_DAX_FL | FS_PROJINHERIT_FL))
+ return -EOPNOTSUPP;
+ }
+
+ error = xfs_ioctl_setattr_check_projid(ip, fa);
+ if (error)
+ return error;
+
+ /*
+	 * If disk quotas are on, we make sure that the dquots do exist on disk,
+ * before we start any other transactions. Trying to do this later
+ * is messy. We don't care to take a readlock to look at the ids
+ * in inode here, because we can't hold it across the trans_reserve.
+ * If the IDs do change before we take the ilock, we're covered
+ * because the i_*dquot fields will get updated anyway.
+ */
+ if (fa->fsx_valid && XFS_IS_QUOTA_ON(mp)) {
+ error = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid,
+ VFS_I(ip)->i_gid, fa->fsx_projid,
+ XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp);
+ if (error)
+ return error;
+ }
+
+ xfs_ioctl_setattr_prepare_dax(ip, fa);
+
+ tp = xfs_ioctl_setattr_get_trans(ip, pdqp);
+ if (IS_ERR(tp)) {
+ error = PTR_ERR(tp);
+ goto error_free_dquots;
+ }
+
+ error = xfs_ioctl_setattr_check_extsize(ip, fa);
+ if (error)
+ goto error_trans_cancel;
+
+ error = xfs_ioctl_setattr_check_cowextsize(ip, fa);
+ if (error)
+ goto error_trans_cancel;
+
+ error = xfs_ioctl_setattr_xflags(tp, ip, fa);
+ if (error)
+ goto error_trans_cancel;
+
+ if (!fa->fsx_valid)
+ goto skip_xattr;
+ /*
+ * Change file ownership. Must be the owner or privileged. CAP_FSETID
+ * overrides the following restrictions:
+ *
+ * The set-user-ID and set-group-ID bits of a file will be cleared upon
+ * successful return from chown()
+ */
+
+ if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) &&
+ !capable_wrt_inode_uidgid(mnt_userns, VFS_I(ip), CAP_FSETID))
+ VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID);
+
+ /* Change the ownerships and register project quota modifications */
+ if (ip->i_projid != fa->fsx_projid) {
+ if (XFS_IS_PQUOTA_ON(mp)) {
+ olddquot = xfs_qm_vop_chown(tp, ip,
+ &ip->i_pdquot, pdqp);
+ }
+ ip->i_projid = fa->fsx_projid;
+ }
+
+ /*
+ * Only set the extent size hint if we've already determined that the
+ * extent size hint should be set on the inode. If no extent size flags
+ * are set on the inode then unconditionally clear the extent size hint.
+ */
+ if (ip->i_diflags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
+ ip->i_extsize = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+ else
+ ip->i_extsize = 0;
+
+ if (xfs_has_v3inodes(mp)) {
+ if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
+ ip->i_cowextsize = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
+ else
+ ip->i_cowextsize = 0;
+ }
+
+skip_xattr:
+ error = xfs_trans_commit(tp);
+
+ /*
+ * Release any dquot(s) the inode had kept before chown.
+ */
+ xfs_qm_dqrele(olddquot);
+ xfs_qm_dqrele(pdqp);
+
+ return error;
+
+error_trans_cancel:
+ xfs_trans_cancel(tp);
+error_free_dquots:
+ xfs_qm_dqrele(pdqp);
+ return error;
+}
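
xfs_fileattr_get() and xfs_fileattr_set() sit behind the VFS fileattr interface, which userspace normally reaches through the generic FS_IOC_FSGETXATTR/FS_IOC_FSSETXATTR ioctls. A hedged sketch of setting a project ID and an extent size hint that way (struct fsxattr and the flag names come from <linux/fs.h>; the 1 MiB hint is an arbitrary example value):

#include <sys/ioctl.h>
#include <linux/fs.h>

/* Set a project ID and a 1 MiB extent size hint on an open fd. */
static int set_proj_and_extsize(int fd, unsigned int projid)
{
	struct fsxattr fsx;

	if (ioctl(fd, FS_IOC_FSGETXATTR, &fsx) < 0)
		return -1;

	fsx.fsx_projid = projid;
	fsx.fsx_xflags |= FS_XFLAG_EXTSIZE;
	fsx.fsx_extsize = 1024 * 1024;	/* bytes; must be a multiple of the
					 * filesystem block size, and is
					 * rejected on files that already
					 * have extents (see the checks in
					 * xfs_ioctl_setattr_check_extsize). */
	return ioctl(fd, FS_IOC_FSSETXATTR, &fsx);
}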
+
+static bool
+xfs_getbmap_format(
+ struct kgetbmap *p,
+ struct getbmapx __user *u,
+ size_t recsize)
+{
+ if (put_user(p->bmv_offset, &u->bmv_offset) ||
+ put_user(p->bmv_block, &u->bmv_block) ||
+ put_user(p->bmv_length, &u->bmv_length) ||
+ put_user(0, &u->bmv_count) ||
+ put_user(0, &u->bmv_entries))
+ return false;
+ if (recsize < sizeof(struct getbmapx))
+ return true;
+ if (put_user(0, &u->bmv_iflags) ||
+ put_user(p->bmv_oflags, &u->bmv_oflags) ||
+ put_user(0, &u->bmv_unused1) ||
+ put_user(0, &u->bmv_unused2))
+ return false;
+ return true;
+}
+
+STATIC int
+xfs_ioc_getbmap(
+ struct file *file,
+ unsigned int cmd,
+ void __user *arg)
+{
+ struct getbmapx bmx = { 0 };
+ struct kgetbmap *buf;
+ size_t recsize;
+ int error, i;
+
+ switch (cmd) {
+ case XFS_IOC_GETBMAPA:
+ bmx.bmv_iflags = BMV_IF_ATTRFORK;
+ fallthrough;
+ case XFS_IOC_GETBMAP:
+ /* struct getbmap is a strict subset of struct getbmapx. */
+ recsize = sizeof(struct getbmap);
+ break;
+ case XFS_IOC_GETBMAPX:
+ recsize = sizeof(struct getbmapx);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (copy_from_user(&bmx, arg, recsize))
+ return -EFAULT;
+
+ if (bmx.bmv_count < 2)
+ return -EINVAL;
+ if (bmx.bmv_count >= INT_MAX / recsize)
+ return -ENOMEM;
+
+ buf = kvcalloc(bmx.bmv_count, sizeof(*buf), GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, buf);
+ if (error)
+ goto out_free_buf;
+
+ error = -EFAULT;
+ if (copy_to_user(arg, &bmx, recsize))
+ goto out_free_buf;
+ arg += recsize;
+
+ for (i = 0; i < bmx.bmv_entries; i++) {
+ if (!xfs_getbmap_format(buf + i, arg, recsize))
+ goto out_free_buf;
+ arg += recsize;
+ }
+
+ error = 0;
+out_free_buf:
+ kmem_free(buf);
+ return error;
+}
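
A hedged userspace sketch of the GETBMAPX path above, assuming the xfsprogs <xfs/xfs.h> header for struct getbmapx. Element 0 of the array is the request header; the kernel fills records starting at element 1 and reports how many in bmv_entries (offsets, lengths and blocks are in 512-byte units):

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>

static int dump_extents(int fd)
{
	int nrec = 32, i;
	/* Element 0 is the request header; records follow it. */
	struct getbmapx *map = calloc(nrec + 1, sizeof(*map));

	if (!map)
		return -1;
	map[0].bmv_offset = 0;
	map[0].bmv_length = -1;		/* -1: map to end of file */
	map[0].bmv_count = nrec + 1;	/* header + records, must be >= 2 */

	if (ioctl(fd, XFS_IOC_GETBMAPX, map) < 0) {
		free(map);
		return -1;
	}
	for (i = 1; i <= map[0].bmv_entries; i++)
		printf("offset %lld len %lld block %lld\n",
		       (long long)map[i].bmv_offset,
		       (long long)map[i].bmv_length,
		       (long long)map[i].bmv_block);
	free(map);
	return 0;
}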
+
+STATIC int
+xfs_ioc_getfsmap(
+ struct xfs_inode *ip,
+ struct fsmap_head __user *arg)
+{
+ struct xfs_fsmap_head xhead = {0};
+ struct fsmap_head head;
+ struct fsmap *recs;
+ unsigned int count;
+ __u32 last_flags = 0;
+ bool done = false;
+ int error;
+
+ if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
+ return -EFAULT;
+ if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) ||
+ memchr_inv(head.fmh_keys[0].fmr_reserved, 0,
+ sizeof(head.fmh_keys[0].fmr_reserved)) ||
+ memchr_inv(head.fmh_keys[1].fmr_reserved, 0,
+ sizeof(head.fmh_keys[1].fmr_reserved)))
+ return -EINVAL;
+
+ /*
+ * Use an internal memory buffer so that we don't have to copy fsmap
+ * data to userspace while holding locks. Start by trying to allocate
+ * up to 128k for the buffer, but fall back to a single page if needed.
+ */
+ count = min_t(unsigned int, head.fmh_count,
+ 131072 / sizeof(struct fsmap));
+ recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
+ if (!recs) {
+ count = min_t(unsigned int, head.fmh_count,
+ PAGE_SIZE / sizeof(struct fsmap));
+ recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
+ if (!recs)
+ return -ENOMEM;
+ }
+
+ xhead.fmh_iflags = head.fmh_iflags;
+ xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]);
+ xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]);
+
+ trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
+ trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]);
+
+ head.fmh_entries = 0;
+ do {
+ struct fsmap __user *user_recs;
+ struct fsmap *last_rec;
+
+ user_recs = &arg->fmh_recs[head.fmh_entries];
+ xhead.fmh_entries = 0;
+ xhead.fmh_count = min_t(unsigned int, count,
+ head.fmh_count - head.fmh_entries);
+
+ /* Run query, record how many entries we got. */
+ error = xfs_getfsmap(ip->i_mount, &xhead, recs);
+ switch (error) {
+ case 0:
+ /*
+ * There are no more records in the result set. Copy
+ * whatever we got to userspace and break out.
+ */
+ done = true;
+ break;
+ case -ECANCELED:
+ /*
+ * The internal memory buffer is full. Copy whatever
+ * records we got to userspace and go again if we have
+ * not yet filled the userspace buffer.
+ */
+ error = 0;
+ break;
+ default:
+ goto out_free;
+ }
+ head.fmh_entries += xhead.fmh_entries;
+ head.fmh_oflags = xhead.fmh_oflags;
+
+ /*
+ * If the caller wanted a record count or there aren't any
+ * new records to return, we're done.
+ */
+ if (head.fmh_count == 0 || xhead.fmh_entries == 0)
+ break;
+
+ /* Copy all the records we got out to userspace. */
+ if (copy_to_user(user_recs, recs,
+ xhead.fmh_entries * sizeof(struct fsmap))) {
+ error = -EFAULT;
+ goto out_free;
+ }
+
+ /* Remember the last record flags we copied to userspace. */
+ last_rec = &recs[xhead.fmh_entries - 1];
+ last_flags = last_rec->fmr_flags;
+
+ /* Set up the low key for the next iteration. */
+ xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec);
+ trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
+ } while (!done && head.fmh_entries < head.fmh_count);
+
+ /*
+ * If there are no more records in the query result set and we're not
+ * in counting mode, mark the last record returned with the LAST flag.
+ */
+ if (done && head.fmh_count > 0 && head.fmh_entries > 0) {
+ struct fsmap __user *user_rec;
+
+ last_flags |= FMR_OF_LAST;
+ user_rec = &arg->fmh_recs[head.fmh_entries - 1];
+
+ if (copy_to_user(&user_rec->fmr_flags, &last_flags,
+ sizeof(last_flags))) {
+ error = -EFAULT;
+ goto out_free;
+ }
+ }
+
+ /* copy back header */
+ if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) {
+ error = -EFAULT;
+ goto out_free;
+ }
+
+out_free:
+ kmem_free(recs);
+ return error;
+}
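
A hedged userspace sketch of FS_IOC_GETFSMAP using the uapi <linux/fsmap.h> definitions. The low key is left at zero and the high key is set to all-ones so the whole filesystem is queried; a real caller would loop, feeding the last returned record back in as the next low key, much as the handler above does internally:

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fsmap.h>

static int dump_fsmap(int fd)
{
	unsigned int nr = 128, i;
	struct fsmap_head *head;

	head = calloc(1, sizeof(*head) + nr * sizeof(struct fsmap));
	if (!head)
		return -1;
	head->fmh_count = nr;
	/* Low key stays zeroed; set the high key to "everything". */
	head->fmh_keys[1].fmr_device = ~0U;
	head->fmh_keys[1].fmr_flags = ~0U;
	head->fmh_keys[1].fmr_physical = ~0ULL;
	head->fmh_keys[1].fmr_owner = ~0ULL;
	head->fmh_keys[1].fmr_offset = ~0ULL;

	if (ioctl(fd, FS_IOC_GETFSMAP, head) < 0) {
		free(head);
		return -1;
	}
	for (i = 0; i < head->fmh_entries; i++)
		printf("dev %u phys %llu len %llu\n",
		       head->fmh_recs[i].fmr_device,
		       (unsigned long long)head->fmh_recs[i].fmr_physical,
		       (unsigned long long)head->fmh_recs[i].fmr_length);
	free(head);
	return 0;
}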
+
+STATIC int
+xfs_ioc_scrub_metadata(
+ struct file *file,
+ void __user *arg)
+{
+ struct xfs_scrub_metadata scrub;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&scrub, arg, sizeof(scrub)))
+ return -EFAULT;
+
+ error = xfs_scrub_metadata(file, &scrub);
+ if (error)
+ return error;
+
+ if (copy_to_user(arg, &scrub, sizeof(scrub)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int
+xfs_ioc_swapext(
+ xfs_swapext_t *sxp)
+{
+ xfs_inode_t *ip, *tip;
+ struct fd f, tmp;
+ int error = 0;
+
+ /* Pull information for the target fd */
+ f = fdget((int)sxp->sx_fdtarget);
+ if (!f.file) {
+ error = -EINVAL;
+ goto out;
+ }
+
+ if (!(f.file->f_mode & FMODE_WRITE) ||
+ !(f.file->f_mode & FMODE_READ) ||
+ (f.file->f_flags & O_APPEND)) {
+ error = -EBADF;
+ goto out_put_file;
+ }
+
+ tmp = fdget((int)sxp->sx_fdtmp);
+ if (!tmp.file) {
+ error = -EINVAL;
+ goto out_put_file;
+ }
+
+ if (!(tmp.file->f_mode & FMODE_WRITE) ||
+ !(tmp.file->f_mode & FMODE_READ) ||
+ (tmp.file->f_flags & O_APPEND)) {
+ error = -EBADF;
+ goto out_put_tmp_file;
+ }
+
+ if (IS_SWAPFILE(file_inode(f.file)) ||
+ IS_SWAPFILE(file_inode(tmp.file))) {
+ error = -EINVAL;
+ goto out_put_tmp_file;
+ }
+
+ /*
+ * We need to ensure that the fds passed in point to XFS inodes
+ * before we cast and access them as XFS structures as we have no
+ * control over what the user passes us here.
+ */
+ if (f.file->f_op != &xfs_file_operations ||
+ tmp.file->f_op != &xfs_file_operations) {
+ error = -EINVAL;
+ goto out_put_tmp_file;
+ }
+
+ ip = XFS_I(file_inode(f.file));
+ tip = XFS_I(file_inode(tmp.file));
+
+ if (ip->i_mount != tip->i_mount) {
+ error = -EINVAL;
+ goto out_put_tmp_file;
+ }
+
+ if (ip->i_ino == tip->i_ino) {
+ error = -EINVAL;
+ goto out_put_tmp_file;
+ }
+
+ if (xfs_is_shutdown(ip->i_mount)) {
+ error = -EIO;
+ goto out_put_tmp_file;
+ }
+
+ error = xfs_swap_extents(ip, tip, sxp);
+
+ out_put_tmp_file:
+ fdput(tmp);
+ out_put_file:
+ fdput(f);
+ out:
+ return error;
+}
+
+static int
+xfs_ioc_getlabel(
+ struct xfs_mount *mp,
+ char __user *user_label)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+ char label[XFSLABEL_MAX + 1];
+
+ /* Paranoia */
+ BUILD_BUG_ON(sizeof(sbp->sb_fname) > FSLABEL_MAX);
+
+ /* 1 larger than sb_fname, so this ensures a trailing NUL char */
+ memset(label, 0, sizeof(label));
+ spin_lock(&mp->m_sb_lock);
+ strncpy(label, sbp->sb_fname, XFSLABEL_MAX);
+ spin_unlock(&mp->m_sb_lock);
+
+ if (copy_to_user(user_label, label, sizeof(label)))
+ return -EFAULT;
+ return 0;
+}
+
+static int
+xfs_ioc_setlabel(
+ struct file *filp,
+ struct xfs_mount *mp,
+ char __user *newlabel)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+ char label[XFSLABEL_MAX + 1];
+ size_t len;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ /*
+ * The generic ioctl allows up to FSLABEL_MAX chars, but XFS is much
+ * smaller, at 12 bytes. We copy one more to be sure we find the
+ * (required) NULL character to test the incoming label length.
+ * NB: The on disk label doesn't need to be null terminated.
+ */
+ if (copy_from_user(label, newlabel, XFSLABEL_MAX + 1))
+ return -EFAULT;
+ len = strnlen(label, XFSLABEL_MAX + 1);
+ if (len > sizeof(sbp->sb_fname))
+ return -EINVAL;
+
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+
+ spin_lock(&mp->m_sb_lock);
+ memset(sbp->sb_fname, 0, sizeof(sbp->sb_fname));
+ memcpy(sbp->sb_fname, label, len);
+ spin_unlock(&mp->m_sb_lock);
+
+ /*
+ * Now we do several things to satisfy userspace.
+ * In addition to normal logging of the primary superblock, we also
+ * immediately write these changes to sector zero for the primary, then
+ * update all backup supers (as xfs_db does for a label change), then
+ * invalidate the block device page cache. This is so that any prior
+ * buffered reads from userspace (i.e. from blkid) are invalidated,
+ * and userspace will see the newly-written label.
+ */
+ error = xfs_sync_sb_buf(mp);
+ if (error)
+ goto out;
+ /*
+ * growfs also updates backup supers so lock against that.
+ */
+ mutex_lock(&mp->m_growlock);
+ error = xfs_update_secondary_sbs(mp);
+ mutex_unlock(&mp->m_growlock);
+
+ invalidate_bdev(mp->m_ddev_targp->bt_bdev);
+
+out:
+ mnt_drop_write_file(filp);
+ return error;
+}
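
A minimal userspace sketch of the label interface implemented by the two helpers above, using the generic FS_IOC_GETFSLABEL/FS_IOC_SETFSLABEL ioctls and FSLABEL_MAX from <linux/fs.h>; as noted in the comment, XFS itself rejects labels longer than 12 bytes:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

static int relabel(int fd, const char *newlabel)
{
	char label[FSLABEL_MAX] = { 0 };

	if (ioctl(fd, FS_IOC_GETFSLABEL, label) < 0)
		return -1;
	printf("old label: %s\n", label);

	memset(label, 0, sizeof(label));
	/* XFS returns EINVAL for labels longer than 12 bytes. */
	strncpy(label, newlabel, sizeof(label) - 1);
	return ioctl(fd, FS_IOC_SETFSLABEL, label);
}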
+
+static inline int
+xfs_fs_eofblocks_from_user(
+ struct xfs_fs_eofblocks *src,
+ struct xfs_icwalk *dst)
+{
+ if (src->eof_version != XFS_EOFBLOCKS_VERSION)
+ return -EINVAL;
+
+ if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
+ return -EINVAL;
+
+ if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
+ memchr_inv(src->pad64, 0, sizeof(src->pad64)))
+ return -EINVAL;
+
+ dst->icw_flags = 0;
+ if (src->eof_flags & XFS_EOF_FLAGS_SYNC)
+ dst->icw_flags |= XFS_ICWALK_FLAG_SYNC;
+ if (src->eof_flags & XFS_EOF_FLAGS_UID)
+ dst->icw_flags |= XFS_ICWALK_FLAG_UID;
+ if (src->eof_flags & XFS_EOF_FLAGS_GID)
+ dst->icw_flags |= XFS_ICWALK_FLAG_GID;
+ if (src->eof_flags & XFS_EOF_FLAGS_PRID)
+ dst->icw_flags |= XFS_ICWALK_FLAG_PRID;
+ if (src->eof_flags & XFS_EOF_FLAGS_MINFILESIZE)
+ dst->icw_flags |= XFS_ICWALK_FLAG_MINFILESIZE;
+
+ dst->icw_prid = src->eof_prid;
+ dst->icw_min_file_size = src->eof_min_file_size;
+
+ dst->icw_uid = INVALID_UID;
+ if (src->eof_flags & XFS_EOF_FLAGS_UID) {
+ dst->icw_uid = make_kuid(current_user_ns(), src->eof_uid);
+ if (!uid_valid(dst->icw_uid))
+ return -EINVAL;
+ }
+
+ dst->icw_gid = INVALID_GID;
+ if (src->eof_flags & XFS_EOF_FLAGS_GID) {
+ dst->icw_gid = make_kgid(current_user_ns(), src->eof_gid);
+ if (!gid_valid(dst->icw_gid))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * These long-unused ioctls were removed from the official ioctl API in 5.17,
+ * but we retain their definitions here so that we can log warnings about them.
+ */
+#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64)
+#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64)
+#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64)
+#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64)
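
As the warning in xfs_file_ioctl() below indicates, callers of the removed ALLOCSP/FREESP ioctls are expected to switch to fallocate(). A rough userspace sketch of the replacement calls (illustrative only; the old FREESP semantics are only approximated by a bounded hole punch):

#define _GNU_SOURCE
#include <fcntl.h>

/* Roughly what XFS_IOC_ALLOCSP did: preallocate blocks in the range. */
static int prealloc_range(int fd, off_t offset, off_t len)
{
	return fallocate(fd, 0, offset, len);
}

/* Rough XFS_IOC_FREESP equivalent for a bounded range: punch out the
 * blocks without changing the file size. */
static int free_range(int fd, off_t offset, off_t len)
{
	return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			 offset, len);
}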
+
+/*
+ * Note: some of the ioctls return positive numbers as a
+ * byte count indicating success, such as readlink_by_handle.
+ * So we don't "sign flip" like most other routines. This means
+ * true errors need to be returned as a negative value.
+ */
+long
+xfs_file_ioctl(
+ struct file *filp,
+ unsigned int cmd,
+ unsigned long p)
+{
+ struct inode *inode = file_inode(filp);
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ void __user *arg = (void __user *)p;
+ int error;
+
+ trace_xfs_file_ioctl(ip);
+
+ switch (cmd) {
+ case FITRIM:
+ return xfs_ioc_trim(mp, arg);
+ case FS_IOC_GETFSLABEL:
+ return xfs_ioc_getlabel(mp, arg);
+ case FS_IOC_SETFSLABEL:
+ return xfs_ioc_setlabel(filp, mp, arg);
+ case XFS_IOC_ALLOCSP:
+ case XFS_IOC_FREESP:
+ case XFS_IOC_ALLOCSP64:
+ case XFS_IOC_FREESP64:
+ xfs_warn_once(mp,
+ "%s should use fallocate; XFS_IOC_{ALLOC,FREE}SP ioctl unsupported",
+ current->comm);
+ return -ENOTTY;
+ case XFS_IOC_DIOINFO: {
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct dioattr da;
+
+ da.d_mem = da.d_miniosz = target->bt_logical_sectorsize;
+ da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
+
+ if (copy_to_user(arg, &da, sizeof(da)))
+ return -EFAULT;
+ return 0;
+ }
+
+ case XFS_IOC_FSBULKSTAT_SINGLE:
+ case XFS_IOC_FSBULKSTAT:
+ case XFS_IOC_FSINUMBERS:
+ return xfs_ioc_fsbulkstat(filp, cmd, arg);
+
+ case XFS_IOC_BULKSTAT:
+ return xfs_ioc_bulkstat(filp, cmd, arg);
+ case XFS_IOC_INUMBERS:
+ return xfs_ioc_inumbers(mp, cmd, arg);
+
+ case XFS_IOC_FSGEOMETRY_V1:
+ return xfs_ioc_fsgeometry(mp, arg, 3);
+ case XFS_IOC_FSGEOMETRY_V4:
+ return xfs_ioc_fsgeometry(mp, arg, 4);
+ case XFS_IOC_FSGEOMETRY:
+ return xfs_ioc_fsgeometry(mp, arg, 5);
+
+ case XFS_IOC_AG_GEOMETRY:
+ return xfs_ioc_ag_geometry(mp, arg);
+
+ case XFS_IOC_GETVERSION:
+ return put_user(inode->i_generation, (int __user *)arg);
+
+ case XFS_IOC_FSGETXATTRA:
+ return xfs_ioc_fsgetxattra(ip, arg);
+
+ case XFS_IOC_GETBMAP:
+ case XFS_IOC_GETBMAPA:
+ case XFS_IOC_GETBMAPX:
+ return xfs_ioc_getbmap(filp, cmd, arg);
+
+ case FS_IOC_GETFSMAP:
+ return xfs_ioc_getfsmap(ip, arg);
+
+ case XFS_IOC_SCRUB_METADATA:
+ return xfs_ioc_scrub_metadata(filp, arg);
+
+ case XFS_IOC_FD_TO_HANDLE:
+ case XFS_IOC_PATH_TO_HANDLE:
+ case XFS_IOC_PATH_TO_FSHANDLE: {
+ xfs_fsop_handlereq_t hreq;
+
+ if (copy_from_user(&hreq, arg, sizeof(hreq)))
+ return -EFAULT;
+ return xfs_find_handle(cmd, &hreq);
+ }
+ case XFS_IOC_OPEN_BY_HANDLE: {
+ xfs_fsop_handlereq_t hreq;
+
+ if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
+ return -EFAULT;
+ return xfs_open_by_handle(filp, &hreq);
+ }
+
+ case XFS_IOC_READLINK_BY_HANDLE: {
+ xfs_fsop_handlereq_t hreq;
+
+ if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
+ return -EFAULT;
+ return xfs_readlink_by_handle(filp, &hreq);
+ }
+ case XFS_IOC_ATTRLIST_BY_HANDLE:
+ return xfs_attrlist_by_handle(filp, arg);
+
+ case XFS_IOC_ATTRMULTI_BY_HANDLE:
+ return xfs_attrmulti_by_handle(filp, arg);
+
+ case XFS_IOC_SWAPEXT: {
+ struct xfs_swapext sxp;
+
+ if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
+ return -EFAULT;
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+ error = xfs_ioc_swapext(&sxp);
+ mnt_drop_write_file(filp);
+ return error;
+ }
+
+ case XFS_IOC_FSCOUNTS: {
+ xfs_fsop_counts_t out;
+
+ xfs_fs_counts(mp, &out);
+
+ if (copy_to_user(arg, &out, sizeof(out)))
+ return -EFAULT;
+ return 0;
+ }
+
+ case XFS_IOC_SET_RESBLKS: {
+ xfs_fsop_resblks_t inout;
+ uint64_t in;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (xfs_is_readonly(mp))
+ return -EROFS;
+
+ if (copy_from_user(&inout, arg, sizeof(inout)))
+ return -EFAULT;
+
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+
+ /* input parameter is passed in resblks field of structure */
+ in = inout.resblks;
+ error = xfs_reserve_blocks(mp, &in, &inout);
+ mnt_drop_write_file(filp);
+ if (error)
+ return error;
+
+ if (copy_to_user(arg, &inout, sizeof(inout)))
+ return -EFAULT;
+ return 0;
+ }
+
+ case XFS_IOC_GET_RESBLKS: {
+ xfs_fsop_resblks_t out;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ error = xfs_reserve_blocks(mp, NULL, &out);
+ if (error)
+ return error;
+
+ if (copy_to_user(arg, &out, sizeof(out)))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case XFS_IOC_FSGROWFSDATA: {
+ struct xfs_growfs_data in;
+
+ if (copy_from_user(&in, arg, sizeof(in)))
+ return -EFAULT;
+
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+ error = xfs_growfs_data(mp, &in);
+ mnt_drop_write_file(filp);
+ return error;
+ }
+
+ case XFS_IOC_FSGROWFSLOG: {
+ struct xfs_growfs_log in;
+
+ if (copy_from_user(&in, arg, sizeof(in)))
+ return -EFAULT;
+
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+ error = xfs_growfs_log(mp, &in);
+ mnt_drop_write_file(filp);
+ return error;
+ }
+
+ case XFS_IOC_FSGROWFSRT: {
+ xfs_growfs_rt_t in;
+
+ if (copy_from_user(&in, arg, sizeof(in)))
+ return -EFAULT;
+
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+ error = xfs_growfs_rt(mp, &in);
+ mnt_drop_write_file(filp);
+ return error;
+ }
+
+ case XFS_IOC_GOINGDOWN: {
+ uint32_t in;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(in, (uint32_t __user *)arg))
+ return -EFAULT;
+
+ return xfs_fs_goingdown(mp, in);
+ }
+
+ case XFS_IOC_ERROR_INJECTION: {
+ xfs_error_injection_t in;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&in, arg, sizeof(in)))
+ return -EFAULT;
+
+ return xfs_errortag_add(mp, in.errtag);
+ }
+
+ case XFS_IOC_ERROR_CLEARALL:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return xfs_errortag_clearall(mp);
+
+ case XFS_IOC_FREE_EOFBLOCKS: {
+ struct xfs_fs_eofblocks eofb;
+ struct xfs_icwalk icw;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (xfs_is_readonly(mp))
+ return -EROFS;
+
+ if (copy_from_user(&eofb, arg, sizeof(eofb)))
+ return -EFAULT;
+
+ error = xfs_fs_eofblocks_from_user(&eofb, &icw);
+ if (error)
+ return error;
+
+ trace_xfs_ioc_free_eofblocks(mp, &icw, _RET_IP_);
+
+ sb_start_write(mp->m_super);
+ error = xfs_blockgc_free_space(mp, &icw);
+ sb_end_write(mp->m_super);
+ return error;
+ }
+
+ default:
+ return -ENOTTY;
+ }
+}
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
new file mode 100644
index 000000000..d4abba2c1
--- /dev/null
+++ b/fs/xfs/xfs_ioctl.h
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_IOCTL_H__
+#define __XFS_IOCTL_H__
+
+struct xfs_bstat;
+struct xfs_ibulk;
+struct xfs_inogrp;
+
+int
+xfs_ioc_swapext(
+ xfs_swapext_t *sxp);
+
+extern int
+xfs_find_handle(
+ unsigned int cmd,
+ xfs_fsop_handlereq_t *hreq);
+
+extern int
+xfs_open_by_handle(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq);
+
+extern int
+xfs_readlink_by_handle(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq);
+
+int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode,
+ uint32_t opcode, void __user *uname, void __user *value,
+ uint32_t *len, uint32_t flags);
+int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf,
+ size_t bufsize, int flags,
+ struct xfs_attrlist_cursor __user *ucursor);
+
+extern struct dentry *
+xfs_handle_to_dentry(
+ struct file *parfilp,
+ void __user *uhandle,
+ u32 hlen);
+
+extern int
+xfs_fileattr_get(
+ struct dentry *dentry,
+ struct fileattr *fa);
+
+extern int
+xfs_fileattr_set(
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ struct fileattr *fa);
+
+extern long
+xfs_file_ioctl(
+ struct file *filp,
+ unsigned int cmd,
+ unsigned long p);
+
+extern long
+xfs_file_compat_ioctl(
+ struct file *file,
+ unsigned int cmd,
+ unsigned long arg);
+
+int xfs_fsbulkstat_one_fmt(struct xfs_ibulk *breq,
+ const struct xfs_bulkstat *bstat);
+int xfs_fsinumbers_fmt(struct xfs_ibulk *breq, const struct xfs_inumbers *igrp);
+
+#endif
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
new file mode 100644
index 000000000..2f54b701e
--- /dev/null
+++ b/fs/xfs/xfs_ioctl32.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include <linux/mount.h>
+#include <linux/fsmap.h>
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_iwalk.h"
+#include "xfs_itable.h"
+#include "xfs_fsops.h"
+#include "xfs_rtalloc.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_ioctl.h"
+#include "xfs_ioctl32.h"
+#include "xfs_trace.h"
+#include "xfs_sb.h"
+
+#define _NATIVE_IOC(cmd, type) \
+ _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
+
+#ifdef BROKEN_X86_ALIGNMENT
+STATIC int
+xfs_compat_ioc_fsgeometry_v1(
+ struct xfs_mount *mp,
+ compat_xfs_fsop_geom_v1_t __user *arg32)
+{
+ struct xfs_fsop_geom fsgeo;
+
+ xfs_fs_geometry(mp, &fsgeo, 3);
+ /* The 32-bit variant simply has some padding at the end */
+ if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
+ return -EFAULT;
+ return 0;
+}
+
+STATIC int
+xfs_compat_growfs_data_copyin(
+ struct xfs_growfs_data *in,
+ compat_xfs_growfs_data_t __user *arg32)
+{
+ if (get_user(in->newblocks, &arg32->newblocks) ||
+ get_user(in->imaxpct, &arg32->imaxpct))
+ return -EFAULT;
+ return 0;
+}
+
+STATIC int
+xfs_compat_growfs_rt_copyin(
+ struct xfs_growfs_rt *in,
+ compat_xfs_growfs_rt_t __user *arg32)
+{
+ if (get_user(in->newblocks, &arg32->newblocks) ||
+ get_user(in->extsize, &arg32->extsize))
+ return -EFAULT;
+ return 0;
+}
+
+STATIC int
+xfs_fsinumbers_fmt_compat(
+ struct xfs_ibulk *breq,
+ const struct xfs_inumbers *ig)
+{
+ struct compat_xfs_inogrp __user *p32 = breq->ubuffer;
+ struct xfs_inogrp ig1;
+ struct xfs_inogrp *igrp = &ig1;
+
+ xfs_inumbers_to_inogrp(&ig1, ig);
+
+ if (put_user(igrp->xi_startino, &p32->xi_startino) ||
+ put_user(igrp->xi_alloccount, &p32->xi_alloccount) ||
+ put_user(igrp->xi_allocmask, &p32->xi_allocmask))
+ return -EFAULT;
+
+ return xfs_ibulk_advance(breq, sizeof(struct compat_xfs_inogrp));
+}
+
+#else
+#define xfs_fsinumbers_fmt_compat xfs_fsinumbers_fmt
+#endif /* BROKEN_X86_ALIGNMENT */
+
+STATIC int
+xfs_ioctl32_bstime_copyin(
+ xfs_bstime_t *bstime,
+ compat_xfs_bstime_t __user *bstime32)
+{
+ old_time32_t sec32; /* tv_sec differs on 64 vs. 32 */
+
+ if (get_user(sec32, &bstime32->tv_sec) ||
+ get_user(bstime->tv_nsec, &bstime32->tv_nsec))
+ return -EFAULT;
+ bstime->tv_sec = sec32;
+ return 0;
+}
+
+/*
+ * struct xfs_bstat has differing alignment on intel, and bstime_t sizes
+ * differ everywhere
+ */
+STATIC int
+xfs_ioctl32_bstat_copyin(
+ struct xfs_bstat *bstat,
+ struct compat_xfs_bstat __user *bstat32)
+{
+ if (get_user(bstat->bs_ino, &bstat32->bs_ino) ||
+ get_user(bstat->bs_mode, &bstat32->bs_mode) ||
+ get_user(bstat->bs_nlink, &bstat32->bs_nlink) ||
+ get_user(bstat->bs_uid, &bstat32->bs_uid) ||
+ get_user(bstat->bs_gid, &bstat32->bs_gid) ||
+ get_user(bstat->bs_rdev, &bstat32->bs_rdev) ||
+ get_user(bstat->bs_blksize, &bstat32->bs_blksize) ||
+ get_user(bstat->bs_size, &bstat32->bs_size) ||
+ xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) ||
+ xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) ||
+ xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) ||
+ get_user(bstat->bs_blocks, &bstat32->bs_blocks) ||
+ get_user(bstat->bs_xflags, &bstat32->bs_xflags) ||
+ get_user(bstat->bs_extsize, &bstat32->bs_extsize) ||
+ get_user(bstat->bs_extents, &bstat32->bs_extents) ||
+ get_user(bstat->bs_gen, &bstat32->bs_gen) ||
+ get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
+ get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
+ get_user(bstat->bs_forkoff, &bstat32->bs_forkoff) ||
+ get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
+ get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) ||
+ get_user(bstat->bs_aextents, &bstat32->bs_aextents))
+ return -EFAULT;
+ return 0;
+}
+
+/* XFS_IOC_FSBULKSTAT and friends */
+
+STATIC int
+xfs_bstime_store_compat(
+ compat_xfs_bstime_t __user *p32,
+ const xfs_bstime_t *p)
+{
+ __s32 sec32;
+
+ sec32 = p->tv_sec;
+ if (put_user(sec32, &p32->tv_sec) ||
+ put_user(p->tv_nsec, &p32->tv_nsec))
+ return -EFAULT;
+ return 0;
+}
+
+/* Return 0 on success or a negative error (to xfs_bulkstat()) */
+STATIC int
+xfs_fsbulkstat_one_fmt_compat(
+ struct xfs_ibulk *breq,
+ const struct xfs_bulkstat *bstat)
+{
+ struct compat_xfs_bstat __user *p32 = breq->ubuffer;
+ struct xfs_bstat bs1;
+ struct xfs_bstat *buffer = &bs1;
+
+ xfs_bulkstat_to_bstat(breq->mp, &bs1, bstat);
+
+ if (put_user(buffer->bs_ino, &p32->bs_ino) ||
+ put_user(buffer->bs_mode, &p32->bs_mode) ||
+ put_user(buffer->bs_nlink, &p32->bs_nlink) ||
+ put_user(buffer->bs_uid, &p32->bs_uid) ||
+ put_user(buffer->bs_gid, &p32->bs_gid) ||
+ put_user(buffer->bs_rdev, &p32->bs_rdev) ||
+ put_user(buffer->bs_blksize, &p32->bs_blksize) ||
+ put_user(buffer->bs_size, &p32->bs_size) ||
+ xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) ||
+ xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) ||
+ xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) ||
+ put_user(buffer->bs_blocks, &p32->bs_blocks) ||
+ put_user(buffer->bs_xflags, &p32->bs_xflags) ||
+ put_user(buffer->bs_extsize, &p32->bs_extsize) ||
+ put_user(buffer->bs_extents, &p32->bs_extents) ||
+ put_user(buffer->bs_gen, &p32->bs_gen) ||
+ put_user(buffer->bs_projid, &p32->bs_projid) ||
+ put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) ||
+ put_user(buffer->bs_forkoff, &p32->bs_forkoff) ||
+ put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
+ put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
+ put_user(buffer->bs_aextents, &p32->bs_aextents))
+ return -EFAULT;
+
+ return xfs_ibulk_advance(breq, sizeof(struct compat_xfs_bstat));
+}
+
+/* copied from xfs_ioctl.c */
+STATIC int
+xfs_compat_ioc_fsbulkstat(
+ struct file *file,
+ unsigned int cmd,
+ struct compat_xfs_fsop_bulkreq __user *p32)
+{
+ struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
+ u32 addr;
+ struct xfs_fsop_bulkreq bulkreq;
+ struct xfs_ibulk breq = {
+ .mp = mp,
+ .mnt_userns = file_mnt_user_ns(file),
+ .ocount = 0,
+ };
+ xfs_ino_t lastino;
+ int error;
+
+ /*
+ * Output structure handling functions. Depending on the command,
+ * either the xfs_bstat or the xfs_inogrp structure is written out
+ * to userspace memory via bulkreq.ubuffer. Normally the compat
+ * functions and structure size are the correct ones to use ...
+ */
+ inumbers_fmt_pf inumbers_func = xfs_fsinumbers_fmt_compat;
+ bulkstat_one_fmt_pf bs_one_func = xfs_fsbulkstat_one_fmt_compat;
+
+#ifdef CONFIG_X86_X32_ABI
+ if (in_x32_syscall()) {
+ /*
+ * ... but on x32 the input xfs_fsop_bulkreq has pointers
+ * which must be handled in the "compat" (32-bit) way, while
+ * the xfs_bstat and xfs_inogrp structures follow native 64-
+ * bit layout convention. So adjust accordingly, otherwise
+ * the data written out in compat layout will not match what
+ * x32 userspace expects.
+ */
+ inumbers_func = xfs_fsinumbers_fmt;
+ bs_one_func = xfs_fsbulkstat_one_fmt;
+ }
+#endif
+
+ /*
+ * done = 1 if there are more stats to get and if bulkstat should be
+ * called again (unused here, but used in dmapi).
+ */
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ if (get_user(addr, &p32->lastip))
+ return -EFAULT;
+ bulkreq.lastip = compat_ptr(addr);
+ if (get_user(bulkreq.icount, &p32->icount) ||
+ get_user(addr, &p32->ubuffer))
+ return -EFAULT;
+ bulkreq.ubuffer = compat_ptr(addr);
+ if (get_user(addr, &p32->ocount))
+ return -EFAULT;
+ bulkreq.ocount = compat_ptr(addr);
+
+ if (copy_from_user(&lastino, bulkreq.lastip, sizeof(__s64)))
+ return -EFAULT;
+
+ if (bulkreq.icount <= 0)
+ return -EINVAL;
+
+ if (bulkreq.ubuffer == NULL)
+ return -EINVAL;
+
+ breq.ubuffer = bulkreq.ubuffer;
+ breq.icount = bulkreq.icount;
+
+ /*
+ * FSBULKSTAT_SINGLE expects that *lastip contains the inode number
+ * that we want to stat. However, FSINUMBERS and FSBULKSTAT expect
+ * that *lastip contains either zero or the number of the last inode to
+ * be examined by the previous call and return results starting with
+ * the next inode after that. The new bulk request back end functions
+ * take the inode to start with, so we have to compute the startino
+ * parameter from lastino to maintain correct function. lastino == 0
+ * is a special case because it has traditionally meant "first inode
+ * in filesystem".
+ */
+ if (cmd == XFS_IOC_FSINUMBERS_32) {
+ breq.startino = lastino ? lastino + 1 : 0;
+ error = xfs_inumbers(&breq, inumbers_func);
+ lastino = breq.startino - 1;
+ } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
+ breq.startino = lastino;
+ breq.icount = 1;
+ error = xfs_bulkstat_one(&breq, bs_one_func);
+ lastino = breq.startino;
+ } else if (cmd == XFS_IOC_FSBULKSTAT_32) {
+ breq.startino = lastino ? lastino + 1 : 0;
+ error = xfs_bulkstat(&breq, bs_one_func);
+ lastino = breq.startino - 1;
+ } else {
+ error = -EINVAL;
+ }
+ if (error)
+ return error;
+
+ if (bulkreq.lastip != NULL &&
+ copy_to_user(bulkreq.lastip, &lastino, sizeof(xfs_ino_t)))
+ return -EFAULT;
+
+ if (bulkreq.ocount != NULL &&
+ copy_to_user(bulkreq.ocount, &breq.ocount, sizeof(__s32)))
+ return -EFAULT;
+
+ return 0;
+}
+
+STATIC int
+xfs_compat_handlereq_copyin(
+ xfs_fsop_handlereq_t *hreq,
+ compat_xfs_fsop_handlereq_t __user *arg32)
+{
+ compat_xfs_fsop_handlereq_t hreq32;
+
+ if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t)))
+ return -EFAULT;
+
+ hreq->fd = hreq32.fd;
+ hreq->path = compat_ptr(hreq32.path);
+ hreq->oflags = hreq32.oflags;
+ hreq->ihandle = compat_ptr(hreq32.ihandle);
+ hreq->ihandlen = hreq32.ihandlen;
+ hreq->ohandle = compat_ptr(hreq32.ohandle);
+ hreq->ohandlen = compat_ptr(hreq32.ohandlen);
+
+ return 0;
+}
+
+STATIC struct dentry *
+xfs_compat_handlereq_to_dentry(
+ struct file *parfilp,
+ compat_xfs_fsop_handlereq_t *hreq)
+{
+ return xfs_handle_to_dentry(parfilp,
+ compat_ptr(hreq->ihandle), hreq->ihandlen);
+}
+
+STATIC int
+xfs_compat_attrlist_by_handle(
+ struct file *parfilp,
+ compat_xfs_fsop_attrlist_handlereq_t __user *p)
+{
+ compat_xfs_fsop_attrlist_handlereq_t al_hreq;
+ struct dentry *dentry;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&al_hreq, p, sizeof(al_hreq)))
+ return -EFAULT;
+
+ dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)),
+ compat_ptr(al_hreq.buffer), al_hreq.buflen,
+ al_hreq.flags, &p->pos);
+ dput(dentry);
+ return error;
+}
+
+STATIC int
+xfs_compat_attrmulti_by_handle(
+ struct file *parfilp,
+ void __user *arg)
+{
+ int error;
+ compat_xfs_attr_multiop_t *ops;
+ compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
+ struct dentry *dentry;
+ unsigned int i, size;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&am_hreq, arg,
+ sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
+ return -EFAULT;
+
+ /* overflow check */
+ if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
+ return -E2BIG;
+
+ dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ error = -E2BIG;
+ size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
+ if (!size || size > 16 * PAGE_SIZE)
+ goto out_dput;
+
+ ops = memdup_user(compat_ptr(am_hreq.ops), size);
+ if (IS_ERR(ops)) {
+ error = PTR_ERR(ops);
+ goto out_dput;
+ }
+
+ error = 0;
+ for (i = 0; i < am_hreq.opcount; i++) {
+ ops[i].am_error = xfs_ioc_attrmulti_one(parfilp,
+ d_inode(dentry), ops[i].am_opcode,
+ compat_ptr(ops[i].am_attrname),
+ compat_ptr(ops[i].am_attrvalue),
+ &ops[i].am_length, ops[i].am_flags);
+ }
+
+ if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
+ error = -EFAULT;
+
+ kfree(ops);
+ out_dput:
+ dput(dentry);
+ return error;
+}
+
+long
+xfs_file_compat_ioctl(
+ struct file *filp,
+ unsigned cmd,
+ unsigned long p)
+{
+ struct inode *inode = file_inode(filp);
+ struct xfs_inode *ip = XFS_I(inode);
+ void __user *arg = compat_ptr(p);
+ int error;
+
+ trace_xfs_file_compat_ioctl(ip);
+
+ switch (cmd) {
+#if defined(BROKEN_X86_ALIGNMENT)
+ case XFS_IOC_FSGEOMETRY_V1_32:
+ return xfs_compat_ioc_fsgeometry_v1(ip->i_mount, arg);
+ case XFS_IOC_FSGROWFSDATA_32: {
+ struct xfs_growfs_data in;
+
+ if (xfs_compat_growfs_data_copyin(&in, arg))
+ return -EFAULT;
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+ error = xfs_growfs_data(ip->i_mount, &in);
+ mnt_drop_write_file(filp);
+ return error;
+ }
+ case XFS_IOC_FSGROWFSRT_32: {
+ struct xfs_growfs_rt in;
+
+ if (xfs_compat_growfs_rt_copyin(&in, arg))
+ return -EFAULT;
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+ error = xfs_growfs_rt(ip->i_mount, &in);
+ mnt_drop_write_file(filp);
+ return error;
+ }
+#endif
+ /* long changes size, but xfs only copies out 32 bits */
+ case XFS_IOC_GETVERSION_32:
+ cmd = _NATIVE_IOC(cmd, long);
+ return xfs_file_ioctl(filp, cmd, p);
+ case XFS_IOC_SWAPEXT_32: {
+ struct xfs_swapext sxp;
+ struct compat_xfs_swapext __user *sxu = arg;
+
+ /* Bulk copy in up to the sx_stat field, then copy bstat */
+ if (copy_from_user(&sxp, sxu,
+ offsetof(struct xfs_swapext, sx_stat)) ||
+ xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
+ return -EFAULT;
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+ error = xfs_ioc_swapext(&sxp);
+ mnt_drop_write_file(filp);
+ return error;
+ }
+ case XFS_IOC_FSBULKSTAT_32:
+ case XFS_IOC_FSBULKSTAT_SINGLE_32:
+ case XFS_IOC_FSINUMBERS_32:
+ return xfs_compat_ioc_fsbulkstat(filp, cmd, arg);
+ case XFS_IOC_FD_TO_HANDLE_32:
+ case XFS_IOC_PATH_TO_HANDLE_32:
+ case XFS_IOC_PATH_TO_FSHANDLE_32: {
+ struct xfs_fsop_handlereq hreq;
+
+ if (xfs_compat_handlereq_copyin(&hreq, arg))
+ return -EFAULT;
+ cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
+ return xfs_find_handle(cmd, &hreq);
+ }
+ case XFS_IOC_OPEN_BY_HANDLE_32: {
+ struct xfs_fsop_handlereq hreq;
+
+ if (xfs_compat_handlereq_copyin(&hreq, arg))
+ return -EFAULT;
+ return xfs_open_by_handle(filp, &hreq);
+ }
+ case XFS_IOC_READLINK_BY_HANDLE_32: {
+ struct xfs_fsop_handlereq hreq;
+
+ if (xfs_compat_handlereq_copyin(&hreq, arg))
+ return -EFAULT;
+ return xfs_readlink_by_handle(filp, &hreq);
+ }
+ case XFS_IOC_ATTRLIST_BY_HANDLE_32:
+ return xfs_compat_attrlist_by_handle(filp, arg);
+ case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
+ return xfs_compat_attrmulti_by_handle(filp, arg);
+ default:
+ /* try the native version */
+ return xfs_file_ioctl(filp, cmd, (unsigned long)arg);
+ }
+}
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
new file mode 100644
index 000000000..c14852362
--- /dev/null
+++ b/fs/xfs/xfs_ioctl32.h
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_IOCTL32_H__
+#define __XFS_IOCTL32_H__
+
+#include <linux/compat.h>
+
+/*
+ * on 32-bit arches, ioctl argument structures may have different sizes
+ * and/or alignment. We define compat structures which match the
+ * 32-bit sizes/alignments here, and their associated ioctl numbers.
+ *
+ * xfs_ioctl32.c contains routines to copy these structures in and out.
+ */
+
+/* stock kernel-level ioctls we support */
+#define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION
+
+/*
+ * On intel, even if sizes match, alignment and/or padding may differ.
+ */
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+#define BROKEN_X86_ALIGNMENT
+#define __compat_packed __attribute__((packed))
+#else
+#define __compat_packed
+#endif
+
+typedef struct compat_xfs_bstime {
+ old_time32_t tv_sec; /* seconds */
+ __s32 tv_nsec; /* and nanoseconds */
+} compat_xfs_bstime_t;
+
+struct compat_xfs_bstat {
+ __u64 bs_ino; /* inode number */
+ __u16 bs_mode; /* type and mode */
+ __u16 bs_nlink; /* number of links */
+ __u32 bs_uid; /* user id */
+ __u32 bs_gid; /* group id */
+ __u32 bs_rdev; /* device value */
+ __s32 bs_blksize; /* block size */
+ __s64 bs_size; /* file size */
+ compat_xfs_bstime_t bs_atime; /* access time */
+ compat_xfs_bstime_t bs_mtime; /* modify time */
+ compat_xfs_bstime_t bs_ctime; /* inode change time */
+ int64_t bs_blocks; /* number of blocks */
+ __u32 bs_xflags; /* extended flags */
+ __s32 bs_extsize; /* extent size */
+ __s32 bs_extents; /* number of extents */
+ __u32 bs_gen; /* generation count */
+ __u16 bs_projid_lo; /* lower part of project id */
+#define bs_projid bs_projid_lo /* (previously just bs_projid) */
+ __u16 bs_forkoff; /* inode fork offset in bytes */
+ __u16 bs_projid_hi; /* high part of project id */
+ unsigned char bs_pad[10]; /* pad space, unused */
+ __u32 bs_dmevmask; /* DMIG event mask */
+ __u16 bs_dmstate; /* DMIG state info */
+ __u16 bs_aextents; /* attribute number of extents */
+} __compat_packed;
+
+struct compat_xfs_fsop_bulkreq {
+ compat_uptr_t lastip; /* last inode # pointer */
+ __s32 icount; /* count of entries in buffer */
+ compat_uptr_t ubuffer; /* user buffer for inode desc. */
+ compat_uptr_t ocount; /* output count pointer */
+};
+
+#define XFS_IOC_FSBULKSTAT_32 \
+ _IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
+#define XFS_IOC_FSBULKSTAT_SINGLE_32 \
+ _IOWR('X', 102, struct compat_xfs_fsop_bulkreq)
+#define XFS_IOC_FSINUMBERS_32 \
+ _IOWR('X', 103, struct compat_xfs_fsop_bulkreq)
+
+typedef struct compat_xfs_fsop_handlereq {
+ __u32 fd; /* fd for FD_TO_HANDLE */
+ compat_uptr_t path; /* user pathname */
+ __u32 oflags; /* open flags */
+ compat_uptr_t ihandle; /* user supplied handle */
+ __u32 ihandlen; /* user supplied length */
+ compat_uptr_t ohandle; /* user buffer for handle */
+ compat_uptr_t ohandlen; /* user buffer length */
+} compat_xfs_fsop_handlereq_t;
+
+#define XFS_IOC_PATH_TO_FSHANDLE_32 \
+ _IOWR('X', 104, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_PATH_TO_HANDLE_32 \
+ _IOWR('X', 105, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_FD_TO_HANDLE_32 \
+ _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_OPEN_BY_HANDLE_32 \
+ _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_READLINK_BY_HANDLE_32 \
+ _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
+
+/* The bstat field in the swapext struct needs translation */
+struct compat_xfs_swapext {
+ int64_t sx_version; /* version */
+ int64_t sx_fdtarget; /* fd of target file */
+ int64_t sx_fdtmp; /* fd of tmp file */
+ xfs_off_t sx_offset; /* offset into file */
+ xfs_off_t sx_length; /* length from offset */
+ char sx_pad[16]; /* pad space, unused */
+ struct compat_xfs_bstat sx_stat; /* stat of target b4 copy */
+} __compat_packed;
+
+#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext)
+
+typedef struct compat_xfs_fsop_attrlist_handlereq {
+ struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
+ struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
+ __u32 flags; /* which namespace to use */
+ __u32 buflen; /* length of buffer supplied */
+ compat_uptr_t buffer; /* returned names */
+} __compat_packed compat_xfs_fsop_attrlist_handlereq_t;
+
+/* Note: actually this is read/write */
+#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \
+ _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq)
+
+/* am_opcodes defined in xfs_fs.h */
+typedef struct compat_xfs_attr_multiop {
+ __u32 am_opcode;
+ __s32 am_error;
+ compat_uptr_t am_attrname;
+ compat_uptr_t am_attrvalue;
+ __u32 am_length;
+ __u32 am_flags;
+} compat_xfs_attr_multiop_t;
+
+typedef struct compat_xfs_fsop_attrmulti_handlereq {
+ struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
+ __u32 opcount;/* count of following multiop */
+ /* ptr to compat_xfs_attr_multiop */
+ compat_uptr_t ops; /* attr_multi data */
+} compat_xfs_fsop_attrmulti_handlereq_t;
+
+#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \
+ _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
+
+#ifdef BROKEN_X86_ALIGNMENT
+typedef struct compat_xfs_fsop_geom_v1 {
+ __u32 blocksize; /* filesystem (data) block size */
+ __u32 rtextsize; /* realtime extent size */
+ __u32 agblocks; /* fsblocks in an AG */
+ __u32 agcount; /* number of allocation groups */
+ __u32 logblocks; /* fsblocks in the log */
+ __u32 sectsize; /* (data) sector size, bytes */
+ __u32 inodesize; /* inode size in bytes */
+ __u32 imaxpct; /* max allowed inode space(%) */
+ __u64 datablocks; /* fsblocks in data subvolume */
+ __u64 rtblocks; /* fsblocks in realtime subvol */
+ __u64 rtextents; /* rt extents in realtime subvol*/
+ __u64 logstart; /* starting fsblock of the log */
+ unsigned char uuid[16]; /* unique id of the filesystem */
+ __u32 sunit; /* stripe unit, fsblocks */
+ __u32 swidth; /* stripe width, fsblocks */
+ __s32 version; /* structure version */
+ __u32 flags; /* superblock version flags */
+ __u32 logsectsize; /* log sector size, bytes */
+ __u32 rtsectsize; /* realtime sector size, bytes */
+ __u32 dirblocksize; /* directory block size, bytes */
+} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
+
+#define XFS_IOC_FSGEOMETRY_V1_32 \
+ _IOR('X', 100, struct compat_xfs_fsop_geom_v1)
+
+struct compat_xfs_inogrp {
+ __u64 xi_startino; /* starting inode number */
+ __s32 xi_alloccount; /* # bits set in allocmask */
+ __u64 xi_allocmask; /* mask of allocated inodes */
+} __attribute__((packed));
+
+/* These growfs input structures have padding on the end, so must translate */
+typedef struct compat_xfs_growfs_data {
+ __u64 newblocks; /* new data subvol size, fsblocks */
+ __u32 imaxpct; /* new inode space percentage limit */
+} __attribute__((packed)) compat_xfs_growfs_data_t;
+
+typedef struct compat_xfs_growfs_rt {
+ __u64 newblocks; /* new realtime size, fsblocks */
+ __u32 extsize; /* new realtime extent size, fsblocks */
+} __attribute__((packed)) compat_xfs_growfs_rt_t;
+
+#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data)
+#define XFS_IOC_FSGROWFSRT_32 _IOW('X', 112, struct compat_xfs_growfs_rt)
+
+#endif /* BROKEN_X86_ALIGNMENT */
+
+#endif /* __XFS_IOCTL32_H__ */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
new file mode 100644
index 000000000..07da03976
--- /dev/null
+++ b/fs/xfs/xfs_iomap.c
@@ -0,0 +1,1391 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2016-2018 Christoph Hellwig.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_inode_item.h"
+#include "xfs_iomap.h"
+#include "xfs_trace.h"
+#include "xfs_quota.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+#include "xfs_reflink.h"
+
+#define XFS_ALLOC_ALIGN(mp, off) \
+ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
+
+static int
+xfs_alert_fsblock_zero(
+ xfs_inode_t *ip,
+ xfs_bmbt_irec_t *imap)
+{
+ xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+ "Access to block zero in inode %llu "
+ "start_block: %llx start_off: %llx "
+ "blkcnt: %llx extent-state: %x",
+ (unsigned long long)ip->i_ino,
+ (unsigned long long)imap->br_startblock,
+ (unsigned long long)imap->br_startoff,
+ (unsigned long long)imap->br_blockcount,
+ imap->br_state);
+ return -EFSCORRUPTED;
+}
+
+int
+xfs_bmbt_to_iomap(
+ struct xfs_inode *ip,
+ struct iomap *iomap,
+ struct xfs_bmbt_irec *imap,
+ unsigned int mapping_flags,
+ u16 iomap_flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+
+ if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
+ return xfs_alert_fsblock_zero(ip, imap);
+
+ if (imap->br_startblock == HOLESTARTBLOCK) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ } else if (imap->br_startblock == DELAYSTARTBLOCK ||
+ isnullstartblock(imap->br_startblock)) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_DELALLOC;
+ } else {
+ iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock));
+ if (mapping_flags & IOMAP_DAX)
+ iomap->addr += target->bt_dax_part_off;
+
+ if (imap->br_state == XFS_EXT_UNWRITTEN)
+ iomap->type = IOMAP_UNWRITTEN;
+ else
+ iomap->type = IOMAP_MAPPED;
+
+ }
+ iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+ iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+ if (mapping_flags & IOMAP_DAX)
+ iomap->dax_dev = target->bt_daxdev;
+ else
+ iomap->bdev = target->bt_bdev;
+ iomap->flags = iomap_flags;
+
+ if (xfs_ipincount(ip) &&
+ (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ iomap->flags |= IOMAP_F_DIRTY;
+ return 0;
+}
+
+static void
+xfs_hole_to_iomap(
+ struct xfs_inode *ip,
+ struct iomap *iomap,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb);
+ iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb);
+ iomap->bdev = target->bt_bdev;
+ iomap->dax_dev = target->bt_daxdev;
+}
+
+static inline xfs_fileoff_t
+xfs_iomap_end_fsb(
+ struct xfs_mount *mp,
+ loff_t offset,
+ loff_t count)
+{
+ ASSERT(offset <= mp->m_super->s_maxbytes);
+ return min(XFS_B_TO_FSB(mp, offset + count),
+ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+}
+
+static xfs_extlen_t
+xfs_eof_alignment(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_extlen_t align = 0;
+
+ if (!XFS_IS_REALTIME_INODE(ip)) {
+ /*
+ * Round up the allocation request to a stripe unit
+ * (m_dalign) boundary if the file size is >= stripe unit
+ * size, and we are allocating past the allocation eof.
+ *
+ * If mounted with the "-o swalloc" option the alignment is
+ * increased from the stripe unit size to the stripe width.
+ */
+ if (mp->m_swidth && xfs_has_swalloc(mp))
+ align = mp->m_swidth;
+ else if (mp->m_dalign)
+ align = mp->m_dalign;
+
+ if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align))
+ align = 0;
+ }
+
+ return align;
+}
+
+/*
+ * Check if last_fsb is outside the last extent, and if so grow it to the next
+ * stripe unit boundary.
+ */
+xfs_fileoff_t
+xfs_iomap_eof_align_last_fsb(
+ struct xfs_inode *ip,
+ xfs_fileoff_t end_fsb)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ xfs_extlen_t extsz = xfs_get_extsz_hint(ip);
+ xfs_extlen_t align = xfs_eof_alignment(ip);
+ struct xfs_bmbt_irec irec;
+ struct xfs_iext_cursor icur;
+
+ ASSERT(!xfs_need_iread_extents(ifp));
+
+ /*
+ * Always round up the allocation request to the extent hint boundary.
+ */
+ if (extsz) {
+ if (align)
+ align = roundup_64(align, extsz);
+ else
+ align = extsz;
+ }
+
+ if (align) {
+ xfs_fileoff_t aligned_end_fsb = roundup_64(end_fsb, align);
+
+ xfs_iext_last(ifp, &icur);
+ if (!xfs_iext_get_extent(ifp, &icur, &irec) ||
+ aligned_end_fsb >= irec.br_startoff + irec.br_blockcount)
+ return aligned_end_fsb;
+ }
+
+ return end_fsb;
+}
+
+int
+xfs_iomap_write_direct(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t count_fsb,
+ unsigned int flags,
+ struct xfs_bmbt_irec *imap)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ xfs_filblks_t resaligned;
+ int nimaps;
+ unsigned int dblocks, rblocks;
+ bool force = false;
+ int error;
+ int bmapi_flags = XFS_BMAPI_PREALLOC;
+ int nr_exts = XFS_IEXT_ADD_NOSPLIT_CNT;
+
+ ASSERT(count_fsb > 0);
+
+ resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
+ xfs_get_extsz_hint(ip));
+ if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ rblocks = resaligned;
+ } else {
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+ rblocks = 0;
+ }
+
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
+ /*
+ * For DAX, we do not allocate unwritten extents, but instead we zero
+ * the block before we commit the transaction. Ideally we'd like to do
+ * this outside the transaction context, but if we commit and then crash
+ * we may not have zeroed the blocks and this will be exposed on
+ * recovery of the allocation. Hence we must zero before commit.
+ *
+ * Further, if we are mapping unwritten extents here, we need to zero
+ * and convert them to written so that we don't need an unwritten extent
+ * callback for DAX. This also means that we need to be able to dip into
+ * the reserve block pool for bmbt block allocation if there is no space
+ * left but we need to do unwritten extent conversion.
+ */
+ if (flags & IOMAP_DAX) {
+ bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
+ if (imap->br_state == XFS_EXT_UNWRITTEN) {
+ force = true;
+ nr_exts = XFS_IEXT_WRITE_UNWRITTEN_CNT;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+ }
+ }
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
+ rblocks, force, &tp);
+ if (error)
+ return error;
+
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, nr_exts);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * From this point onwards we overwrite the imap pointer that the
+ * caller gave to us.
+ */
+ nimaps = 1;
+ error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0,
+ imap, &nimaps);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * Complete the transaction
+ */
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_unlock;
+
+	/*
+	 * If the allocation found no mapping we ran out of space; otherwise
+	 * the mapping is returned to the caller through the imap argument.
+	 */
+ if (nimaps == 0) {
+ error = -ENOSPC;
+ goto out_unlock;
+ }
+
+ if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
+ error = xfs_alert_fsblock_zero(ip, imap);
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+}
+
+STATIC bool
+xfs_quota_need_throttle(
+ struct xfs_inode *ip,
+ xfs_dqtype_t type,
+ xfs_fsblock_t alloc_blocks)
+{
+ struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+
+ if (!dq || !xfs_this_quota_on(ip->i_mount, type))
+ return false;
+
+ /* no hi watermark, no throttle */
+ if (!dq->q_prealloc_hi_wmark)
+ return false;
+
+ /* under the lo watermark, no throttle */
+ if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark)
+ return false;
+
+ return true;
+}
+
+STATIC void
+xfs_quota_calc_throttle(
+ struct xfs_inode *ip,
+ xfs_dqtype_t type,
+ xfs_fsblock_t *qblocks,
+ int *qshift,
+ int64_t *qfreesp)
+{
+ struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+ int64_t freesp;
+ int shift = 0;
+
+ /* no dq, or over hi wmark, squash the prealloc completely */
+ if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) {
+ *qblocks = 0;
+ *qfreesp = 0;
+ return;
+ }
+
+ freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved;
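+	/*
+	 * Each low space threshold crossed adds two to the shift: it is 2
+	 * below the 5% threshold, 4 below 3% and 6 below 1%, cutting the
+	 * prealloc to 1/4, 1/16 and 1/64 of its computed size respectively.
+	 */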
+ if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
+ shift = 2;
+ if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
+ shift += 2;
+ if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT])
+ shift += 2;
+ }
+
+ if (freesp < *qfreesp)
+ *qfreesp = freesp;
+
+ /* only overwrite the throttle values if we are more aggressive */
+ if ((freesp >> shift) < (*qblocks >> *qshift)) {
+ *qblocks = freesp;
+ *qshift = shift;
+ }
+}
+
+/*
+ * If we don't have a user specified preallocation size, dynamically increase
+ * the preallocation size as the size of the file grows. Cap the maximum size
+ * at a single extent or less if the filesystem is near full. The closer the
+ * filesystem is to being full, the smaller the maximum preallocation.
+ */
+STATIC xfs_fsblock_t
+xfs_iomap_prealloc_size(
+ struct xfs_inode *ip,
+ int whichfork,
+ loff_t offset,
+ loff_t count,
+ struct xfs_iext_cursor *icur)
+{
+ struct xfs_iext_cursor ncur = *icur;
+ struct xfs_bmbt_irec prev, got;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ int64_t freesp;
+ xfs_fsblock_t qblocks;
+ xfs_fsblock_t alloc_blocks = 0;
+ xfs_extlen_t plen;
+ int shift = 0;
+ int qshift = 0;
+
+ /*
+ * As an exception we don't do any preallocation at all if the file is
+ * smaller than the minimum preallocation and we are using the default
+ * dynamic preallocation scheme, as it is likely this is the only write
+ * to the file that is going to be done.
+ */
+ if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks))
+ return 0;
+
+ /*
+ * Use the minimum preallocation size for small files or if we are
+ * writing right after a hole.
+ */
+ if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
+ !xfs_iext_prev_extent(ifp, &ncur, &prev) ||
+ prev.br_startoff + prev.br_blockcount < offset_fsb)
+ return mp->m_allocsize_blocks;
+
+ /*
+ * Take the size of the preceding data extents as the basis for the
+ * preallocation size. Note that we don't care if the previous extents
+ * are written or not.
+ */
+ plen = prev.br_blockcount;
+ while (xfs_iext_prev_extent(ifp, &ncur, &got)) {
+ if (plen > XFS_MAX_BMBT_EXTLEN / 2 ||
+ isnullstartblock(got.br_startblock) ||
+ got.br_startoff + got.br_blockcount != prev.br_startoff ||
+ got.br_startblock + got.br_blockcount != prev.br_startblock)
+ break;
+ plen += got.br_blockcount;
+ prev = got;
+ }
+
+ /*
+ * If the size of the extents is greater than half the maximum extent
+ * length, then use the current offset as the basis. This ensures that
+ * for large files the preallocation size always extends to
+	 * XFS_MAX_BMBT_EXTLEN rather than falling short due to things like stripe
+ * unit/width alignment of real extents.
+ */
+ alloc_blocks = plen * 2;
+ if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
+ alloc_blocks = XFS_B_TO_FSB(mp, offset);
+ qblocks = alloc_blocks;
+
+ /*
+	 * XFS_MAX_BMBT_EXTLEN is not a power of two value but we round the prealloc
+	 * down to the nearest power of two value after throttling. To prevent
+	 * the round down from unconditionally reducing the maximum supported
+	 * prealloc size, we round up first, apply appropriate throttling, round
+	 * down and cap the value to XFS_MAX_BMBT_EXTLEN.
+ */
+ alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN),
+ alloc_blocks);
+
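+	/*
+	 * Throttle against global free space. The shift starts at 2 once free
+	 * space drops below the 5% threshold and grows by one for each further
+	 * threshold crossed, so below 1% free the preallocation is cut to
+	 * 1/64th before the per-quota limits below are applied.
+	 */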
+ freesp = percpu_counter_read_positive(&mp->m_fdblocks);
+ if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
+ shift = 2;
+ if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
+ shift++;
+ if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
+ shift++;
+ if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
+ shift++;
+ if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
+ shift++;
+ }
+
+ /*
+ * Check each quota to cap the prealloc size, provide a shift value to
+ * throttle with and adjust amount of available space.
+ */
+ if (xfs_quota_need_throttle(ip, XFS_DQTYPE_USER, alloc_blocks))
+ xfs_quota_calc_throttle(ip, XFS_DQTYPE_USER, &qblocks, &qshift,
+ &freesp);
+ if (xfs_quota_need_throttle(ip, XFS_DQTYPE_GROUP, alloc_blocks))
+ xfs_quota_calc_throttle(ip, XFS_DQTYPE_GROUP, &qblocks, &qshift,
+ &freesp);
+ if (xfs_quota_need_throttle(ip, XFS_DQTYPE_PROJ, alloc_blocks))
+ xfs_quota_calc_throttle(ip, XFS_DQTYPE_PROJ, &qblocks, &qshift,
+ &freesp);
+
+ /*
+ * The final prealloc size is set to the minimum of free space available
+ * in each of the quotas and the overall filesystem.
+ *
+ * The shift throttle value is set to the maximum value as determined by
+ * the global low free space values and per-quota low free space values.
+ */
+ alloc_blocks = min(alloc_blocks, qblocks);
+ shift = max(shift, qshift);
+
+ if (shift)
+ alloc_blocks >>= shift;
+ /*
+ * rounddown_pow_of_two() returns an undefined result if we pass in
+ * alloc_blocks = 0.
+ */
+ if (alloc_blocks)
+ alloc_blocks = rounddown_pow_of_two(alloc_blocks);
+ if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
+ alloc_blocks = XFS_MAX_BMBT_EXTLEN;
+
+ /*
+ * If we are still trying to allocate more space than is
+ * available, squash the prealloc hard. This can happen if we
+ * have a large file on a small filesystem and the above
+	 * lowspace thresholds are smaller than XFS_MAX_BMBT_EXTLEN.
+ */
+ while (alloc_blocks && alloc_blocks >= freesp)
+ alloc_blocks >>= 4;
+ if (alloc_blocks < mp->m_allocsize_blocks)
+ alloc_blocks = mp->m_allocsize_blocks;
+ trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
+ mp->m_allocsize_blocks);
+ return alloc_blocks;
+}
+
+int
+xfs_iomap_write_unwritten(
+ xfs_inode_t *ip,
+ xfs_off_t offset,
+ xfs_off_t count,
+ bool update_isize)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb;
+ xfs_filblks_t count_fsb;
+ xfs_filblks_t numblks_fsb;
+ int nimaps;
+ xfs_trans_t *tp;
+ xfs_bmbt_irec_t imap;
+ struct inode *inode = VFS_I(ip);
+ xfs_fsize_t i_size;
+ uint resblks;
+ int error;
+
+ trace_xfs_unwritten_convert(ip, offset, count);
+
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+ count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
+
+ /*
+ * Reserve enough blocks in this transaction for two complete extent
+ * btree splits. We may be converting the middle part of an unwritten
+ * extent and in this case we will insert two new extents in the btree
+ * each of which could cause a full split.
+ *
+ * This reservation amount will be used in the first call to
+ * xfs_bmbt_split() to select an AG with enough space to satisfy the
+ * rest of the operation.
+ */
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+
+ /* Attach dquots so that bmbt splits are accounted correctly. */
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
+ do {
+ /*
+ * Set up a transaction to convert the range of extents
+ * from unwritten to real. Do allocations in a loop until
+ * we have covered the range passed in.
+ *
+		 * Note that we can't risk recursing back into the filesystem
+ * here as we might be asked to write out the same inode that we
+ * complete here and might deadlock on the iolock.
+ */
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks,
+ 0, true, &tp);
+ if (error)
+ return error;
+
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_WRITE_UNWRITTEN_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_WRITE_UNWRITTEN_CNT);
+ if (error)
+ goto error_on_bmapi_transaction;
+
+ /*
+ * Modify the unwritten extent state of the buffer.
+ */
+ nimaps = 1;
+ error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+ XFS_BMAPI_CONVERT, resblks, &imap,
+ &nimaps);
+ if (error)
+ goto error_on_bmapi_transaction;
+
+ /*
+ * Log the updated inode size as we go. We have to be careful
+ * to only log it up to the actual write offset if it is
+ * halfway into a block.
+ */
+ i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
+ if (i_size > offset + count)
+ i_size = offset + count;
+ if (update_isize && i_size > i_size_read(inode))
+ i_size_write(inode, i_size);
+ i_size = xfs_new_eof(ip, i_size);
+ if (i_size) {
+ ip->i_disk_size = i_size;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ }
+
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
+ return xfs_alert_fsblock_zero(ip, &imap);
+
+ if ((numblks_fsb = imap.br_blockcount) == 0) {
+ /*
+ * The numblks_fsb value should always get
+ * smaller, otherwise the loop is stuck.
+ */
+ ASSERT(imap.br_blockcount);
+ break;
+ }
+ offset_fsb += numblks_fsb;
+ count_fsb -= numblks_fsb;
+ } while (count_fsb > 0);
+
+ return 0;
+
+error_on_bmapi_transaction:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+static inline bool
+imap_needs_alloc(
+ struct inode *inode,
+ unsigned flags,
+ struct xfs_bmbt_irec *imap,
+ int nimaps)
+{
+ /* don't allocate blocks when just zeroing */
+ if (flags & IOMAP_ZERO)
+ return false;
+ if (!nimaps ||
+ imap->br_startblock == HOLESTARTBLOCK ||
+ imap->br_startblock == DELAYSTARTBLOCK)
+ return true;
+ /* we convert unwritten extents before copying the data for DAX */
+ if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN)
+ return true;
+ return false;
+}
+
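+/*
+ * Decide if a write to this range needs a COW fork mapping. Writes to a
+ * COW-enabled inode generally do, except when zeroing holes or unwritten
+ * extents, which carry no data that needs to be preserved.
+ */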
+static inline bool
+imap_needs_cow(
+ struct xfs_inode *ip,
+ unsigned int flags,
+ struct xfs_bmbt_irec *imap,
+ int nimaps)
+{
+ if (!xfs_is_cow_inode(ip))
+ return false;
+
+ /* when zeroing we don't have to COW holes or unwritten extents */
+ if (flags & IOMAP_ZERO) {
+ if (!nimaps ||
+ imap->br_startblock == HOLESTARTBLOCK ||
+ imap->br_state == XFS_EXT_UNWRITTEN)
+ return false;
+ }
+
+ return true;
+}
+
+static int
+xfs_ilock_for_iomap(
+ struct xfs_inode *ip,
+ unsigned flags,
+ unsigned *lockmode)
+{
+ unsigned int mode = *lockmode;
+ bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO);
+
+ /*
+ * COW writes may allocate delalloc space or convert unwritten COW
+ * extents, so we need to make sure to take the lock exclusively here.
+ */
+ if (xfs_is_cow_inode(ip) && is_write)
+ mode = XFS_ILOCK_EXCL;
+
+ /*
+	 * Extents not yet cached require exclusive access, don't block. This
+ * is an opencoded xfs_ilock_data_map_shared() call but with
+ * non-blocking behaviour.
+ */
+ if (xfs_need_iread_extents(&ip->i_df)) {
+ if (flags & IOMAP_NOWAIT)
+ return -EAGAIN;
+ mode = XFS_ILOCK_EXCL;
+ }
+
+relock:
+ if (flags & IOMAP_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, mode))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, mode);
+ }
+
+ /*
+ * The reflink iflag could have changed since the earlier unlocked
+	 * check, so if we got ILOCK_SHARED for a write but we're now a
+ * reflink inode we have to switch to ILOCK_EXCL and relock.
+ */
+ if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
+ xfs_iunlock(ip, mode);
+ mode = XFS_ILOCK_EXCL;
+ goto relock;
+ }
+
+ *lockmode = mode;
+ return 0;
+}
+
+/*
+ * Check that the imap we are going to return to the caller spans the entire
+ * range that the caller requested for the IO.
+ */
+static bool
+imap_spans_range(
+ struct xfs_bmbt_irec *imap,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ if (imap->br_startoff > offset_fsb)
+ return false;
+ if (imap->br_startoff + imap->br_blockcount < end_fsb)
+ return false;
+ return true;
+}
+
+static int
+xfs_direct_write_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap, cmap;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+ int nimaps = 1, error = 0;
+ bool shared = false;
+ u16 iomap_flags = 0;
+ unsigned int lockmode = XFS_ILOCK_SHARED;
+
+ ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ /*
+ * Writes that span EOF might trigger an IO size update on completion,
+	 * so consider them to be dirty for the purposes of O_DSYNC even if no
+	 * other metadata changes are pending or have been made here.
+ */
+ if (offset + length > i_size_read(inode))
+ iomap_flags |= IOMAP_F_DIRTY;
+
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
+
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+ &nimaps, 0);
+ if (error)
+ goto out_unlock;
+
+ if (imap_needs_cow(ip, flags, &imap, nimaps)) {
+ error = -EAGAIN;
+ if (flags & IOMAP_NOWAIT)
+ goto out_unlock;
+
+ /* may drop and re-acquire the ilock */
+ error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
+ &lockmode,
+ (flags & IOMAP_DIRECT) || IS_DAX(inode));
+ if (error)
+ goto out_unlock;
+ if (shared)
+ goto out_found_cow;
+ end_fsb = imap.br_startoff + imap.br_blockcount;
+ length = XFS_FSB_TO_B(mp, end_fsb) - offset;
+ }
+
+ if (imap_needs_alloc(inode, flags, &imap, nimaps))
+ goto allocate_blocks;
+
+ /*
+	 * NOWAIT and OVERWRITE I/O need to span the entire requested I/O with
+ * a single map so that we avoid partial IO failures due to the rest of
+ * the I/O range not covered by this map triggering an EAGAIN condition
+ * when it is subsequently mapped and aborting the I/O.
+ */
+ if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) {
+ error = -EAGAIN;
+ if (!imap_spans_range(&imap, offset_fsb, end_fsb))
+ goto out_unlock;
+ }
+
+ /*
+ * For overwrite only I/O, we cannot convert unwritten extents without
+ * requiring sub-block zeroing. This can only be done under an
+ * exclusive IOLOCK, hence return -EAGAIN if this is not a written
+ * extent to tell the caller to try again.
+ */
+ if (flags & IOMAP_OVERWRITE_ONLY) {
+ error = -EAGAIN;
+ if (imap.br_state != XFS_EXT_NORM &&
+ ((offset | length) & mp->m_blockmask))
+ goto out_unlock;
+ }
+
+ xfs_iunlock(ip, lockmode);
+ trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
+
+allocate_blocks:
+ error = -EAGAIN;
+ if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY))
+ goto out_unlock;
+
+ /*
+ * We cap the maximum length we map to a sane size to keep the chunks
+	 * of work done here somewhat symmetric with the work writeback does.
+ * This is a completely arbitrary number pulled out of thin air as a
+ * best guess for initial testing.
+ *
+	 * Note that the value needs to be less than 32 bits wide until the
+ * lower level functions are updated.
+ */
+ length = min_t(loff_t, length, 1024 * PAGE_SIZE);
+ end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+
+ if (offset + length > XFS_ISIZE(ip))
+ end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
+ else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
+ end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
+ xfs_iunlock(ip, lockmode);
+
+ error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
+ flags, &imap);
+ if (error)
+ return error;
+
+ trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
+ iomap_flags | IOMAP_F_NEW);
+
+out_found_cow:
+ xfs_iunlock(ip, lockmode);
+ length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
+ trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
+ if (imap.br_startblock != HOLESTARTBLOCK) {
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
+ if (error)
+ return error;
+ }
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
+
+out_unlock:
+ if (lockmode)
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
+
+const struct iomap_ops xfs_direct_write_iomap_ops = {
+ .iomap_begin = xfs_direct_write_iomap_begin,
+};
+
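+/*
+ * On DAX write completion, either cancel the COW reservation if nothing was
+ * written or remap the written range from the COW fork into the data fork.
+ */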
+static int
+xfs_dax_write_iomap_end(
+ struct inode *inode,
+ loff_t pos,
+ loff_t length,
+ ssize_t written,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ if (!xfs_is_cow_inode(ip))
+ return 0;
+
+ if (!written) {
+ xfs_reflink_cancel_cow_range(ip, pos, length, true);
+ return 0;
+ }
+
+ return xfs_reflink_end_cow(ip, pos, written);
+}
+
+const struct iomap_ops xfs_dax_write_iomap_ops = {
+ .iomap_begin = xfs_direct_write_iomap_begin,
+ .iomap_end = xfs_dax_write_iomap_end,
+};
+
+static int
+xfs_buffered_write_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t count,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+ struct xfs_bmbt_irec imap, cmap;
+ struct xfs_iext_cursor icur, ccur;
+ xfs_fsblock_t prealloc_blocks = 0;
+ bool eof = false, cow_eof = false, shared = false;
+ int allocfork = XFS_DATA_FORK;
+ int error = 0;
+ unsigned int lockmode = XFS_ILOCK_EXCL;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ /* we can't use delayed allocations when using extent size hints */
+ if (xfs_get_extsz_hint(ip))
+ return xfs_direct_write_iomap_begin(inode, offset, count,
+ flags, iomap, srcmap);
+
+ ASSERT(!XFS_IS_REALTIME_INODE(ip));
+
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ XFS_STATS_INC(mp, xs_blk_mapw);
+
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Search the data fork first to look up our source mapping. We
+ * always need the data fork map, as we have to return it to the
+ * iomap code so that the higher level write code can read data in to
+ * perform read-modify-write cycles for unaligned writes.
+ */
+ eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
+ if (eof)
+ imap.br_startoff = end_fsb; /* fake hole until the end */
+
+ /* We never need to allocate blocks for zeroing a hole. */
+ if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+ xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
+ goto out_unlock;
+ }
+
+ /*
+ * Search the COW fork extent list even if we did not find a data fork
+	 * extent. This serves two purposes: first, it implements the
+	 * speculative preallocation using cowextsize, so that we also unshare
+	 * blocks adjacent to shared blocks instead of just the shared blocks
+	 * themselves. Second, the lookup in the extent list is generally
+	 * faster than going out to the shared extent tree.
+ */
+ if (xfs_is_cow_inode(ip)) {
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+ cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+ &ccur, &cmap);
+ if (!cow_eof && cmap.br_startoff <= offset_fsb) {
+ trace_xfs_reflink_cow_found(ip, &cmap);
+ goto found_cow;
+ }
+ }
+
+ if (imap.br_startoff <= offset_fsb) {
+ /*
+ * For reflink files we may need a delalloc reservation when
+ * overwriting shared extents. This includes zeroing of
+ * existing extents that contain data.
+ */
+ if (!xfs_is_cow_inode(ip) ||
+ ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto found_imap;
+ }
+
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
+
+ /* Trim the mapping to the nearest shared extent boundary. */
+ error = xfs_bmap_trim_cow(ip, &imap, &shared);
+ if (error)
+ goto out_unlock;
+
+ /* Not shared? Just report the (potentially capped) extent. */
+ if (!shared) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto found_imap;
+ }
+
+ /*
+ * Fork all the shared blocks from our write offset until the
+ * end of the extent.
+ */
+ allocfork = XFS_COW_FORK;
+ end_fsb = imap.br_startoff + imap.br_blockcount;
+ } else {
+ /*
+ * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+		 * pages to keep the chunks of work done somewhat
+ * symmetric with the work writeback does. This is a completely
+ * arbitrary number pulled out of thin air.
+ *
+		 * Note that the value needs to be less than 32 bits wide until
+ * the lower level functions are updated.
+ */
+ count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+ end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+
+ if (xfs_is_always_cow_inode(ip))
+ allocfork = XFS_COW_FORK;
+ }
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error)
+ goto out_unlock;
+
+ if (eof && offset + count > XFS_ISIZE(ip)) {
+ /*
+ * Determine the initial size of the preallocation.
+ * We clean up any extra preallocation when the file is closed.
+ */
+ if (xfs_has_allocsize(mp))
+ prealloc_blocks = mp->m_allocsize_blocks;
+ else
+ prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork,
+ offset, count, &icur);
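+		/*
+		 * Convert the speculative block count into an aligned end
+		 * offset, clamp it to the maximum supported file size, and
+		 * turn it back into the number of blocks to reserve beyond
+		 * end_fsb.
+		 */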
+ if (prealloc_blocks) {
+ xfs_extlen_t align;
+ xfs_off_t end_offset;
+ xfs_fileoff_t p_end_fsb;
+
+ end_offset = XFS_ALLOC_ALIGN(mp, offset + count - 1);
+ p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
+ prealloc_blocks;
+
+ align = xfs_eof_alignment(ip);
+ if (align)
+ p_end_fsb = roundup_64(p_end_fsb, align);
+
+ p_end_fsb = min(p_end_fsb,
+ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+ ASSERT(p_end_fsb > offset_fsb);
+ prealloc_blocks = p_end_fsb - end_fsb;
+ }
+ }
+
+retry:
+ error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks,
+ allocfork == XFS_DATA_FORK ? &imap : &cmap,
+ allocfork == XFS_DATA_FORK ? &icur : &ccur,
+ allocfork == XFS_DATA_FORK ? eof : cow_eof);
+ switch (error) {
+ case 0:
+ break;
+ case -ENOSPC:
+ case -EDQUOT:
+ /* retry without any preallocation */
+ trace_xfs_delalloc_enospc(ip, offset, count);
+ if (prealloc_blocks) {
+ prealloc_blocks = 0;
+ goto retry;
+ }
+ fallthrough;
+ default:
+ goto out_unlock;
+ }
+
+ if (allocfork == XFS_COW_FORK) {
+ trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap);
+ goto found_cow;
+ }
+
+ /*
+ * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
+ * them out if the write happens to fail.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
+
+found_imap:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+
+found_cow:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (imap.br_startoff <= offset_fsb) {
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
+ if (error)
+ return error;
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
+ IOMAP_F_SHARED);
+ }
+
+ xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+static int
+xfs_buffered_write_iomap_end(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ ssize_t written,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t start_fsb;
+ xfs_fileoff_t end_fsb;
+ int error = 0;
+
+ if (iomap->type != IOMAP_DELALLOC)
+ return 0;
+
+ /*
+ * Behave as if the write failed if drop writes is enabled. Set the NEW
+ * flag to force delalloc cleanup.
+ */
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) {
+ iomap->flags |= IOMAP_F_NEW;
+ written = 0;
+ }
+
+ /*
+ * start_fsb refers to the first unused block after a short write. If
+ * nothing was written, round offset down to point at the first block in
+ * the range.
+ */
+ if (unlikely(!written))
+ start_fsb = XFS_B_TO_FSBT(mp, offset);
+ else
+ start_fsb = XFS_B_TO_FSB(mp, offset + written);
+ end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+ /*
+ * Trim delalloc blocks if they were allocated by this write and we
+ * didn't manage to write the whole range.
+ *
+ * We don't need to care about racing delalloc as we hold i_mutex
+ * across the reserve/allocate/unreserve calls. If there are delalloc
+ * blocks in the range, they are ours.
+ */
+ if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
+ truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
+ XFS_FSB_TO_B(mp, end_fsb) - 1);
+
+ error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+ end_fsb - start_fsb);
+ if (error && !xfs_is_shutdown(mp)) {
+ xfs_alert(mp, "%s: unable to clean up ino %lld",
+ __func__, ip->i_ino);
+ return error;
+ }
+ }
+
+ return 0;
+}
+
+const struct iomap_ops xfs_buffered_write_iomap_ops = {
+ .iomap_begin = xfs_buffered_write_iomap_begin,
+ .iomap_end = xfs_buffered_write_iomap_end,
+};
+
+static int
+xfs_read_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+ int nimaps = 1, error = 0;
+ bool shared = false;
+ unsigned int lockmode = XFS_ILOCK_SHARED;
+
+ ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+ &nimaps, 0);
+ if (!error && (flags & IOMAP_REPORT))
+ error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
+ xfs_iunlock(ip, lockmode);
+
+ if (error)
+ return error;
+ trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
+ shared ? IOMAP_F_SHARED : 0);
+}
+
+const struct iomap_ops xfs_read_iomap_ops = {
+ .iomap_begin = xfs_read_iomap_begin,
+};
+
+static int
+xfs_seek_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap, cmap;
+ int error = 0;
+ unsigned lockmode;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ lockmode = xfs_ilock_data_map_shared(ip);
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
+ /*
+ * If we found a data extent we are done.
+ */
+ if (imap.br_startoff <= offset_fsb)
+ goto done;
+ data_fsb = imap.br_startoff;
+ } else {
+ /*
+ * Fake a hole until the end of the file.
+ */
+ data_fsb = xfs_iomap_end_fsb(mp, offset, length);
+ }
+
+ /*
+ * If a COW fork extent covers the hole, report it - capped to the next
+ * data fork extent:
+ */
+ if (xfs_inode_has_cow_data(ip) &&
+ xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+ cow_fsb = cmap.br_startoff;
+ if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
+ if (data_fsb < cow_fsb + cmap.br_blockcount)
+ end_fsb = min(end_fsb, data_fsb);
+ xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
+ IOMAP_F_SHARED);
+ /*
+ * This is a COW extent, so we must probe the page cache
+		 * because there could be dirty page cache backed
+ * by this extent.
+ */
+ iomap->type = IOMAP_UNWRITTEN;
+ goto out_unlock;
+ }
+
+ /*
+ * Else report a hole, capped to the next found data or COW extent.
+ */
+ if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
+ imap.br_blockcount = cow_fsb - offset_fsb;
+ else
+ imap.br_blockcount = data_fsb - offset_fsb;
+ imap.br_startoff = offset_fsb;
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+done:
+ xfs_trim_extent(&imap, offset_fsb, end_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
+
+const struct iomap_ops xfs_seek_iomap_ops = {
+ .iomap_begin = xfs_seek_iomap_begin,
+};
+
+static int
+xfs_xattr_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ struct xfs_bmbt_irec imap;
+ int nimaps = 1, error = 0;
+ unsigned lockmode;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ lockmode = xfs_ilock_attr_map_shared(ip);
+
+	/* if there is no attribute fork or there are no extents, return ENOENT */
+ if (!xfs_inode_has_attr_fork(ip) || !ip->i_af.if_nextents) {
+ error = -ENOENT;
+ goto out_unlock;
+ }
+
+ ASSERT(ip->i_af.if_format != XFS_DINODE_FMT_LOCAL);
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+ &nimaps, XFS_BMAPI_ATTRFORK);
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+
+ if (error)
+ return error;
+ ASSERT(nimaps);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+}
+
+const struct iomap_ops xfs_xattr_iomap_ops = {
+ .iomap_begin = xfs_xattr_iomap_begin,
+};
+
+int
+xfs_zero_range(
+ struct xfs_inode *ip,
+ loff_t pos,
+ loff_t len,
+ bool *did_zero)
+{
+ struct inode *inode = VFS_I(ip);
+
+ if (IS_DAX(inode))
+ return dax_zero_range(inode, pos, len, did_zero,
+ &xfs_direct_write_iomap_ops);
+ return iomap_zero_range(inode, pos, len, did_zero,
+ &xfs_buffered_write_iomap_ops);
+}
+
+int
+xfs_truncate_page(
+ struct xfs_inode *ip,
+ loff_t pos,
+ bool *did_zero)
+{
+ struct inode *inode = VFS_I(ip);
+
+ if (IS_DAX(inode))
+ return dax_truncate_page(inode, pos, did_zero,
+ &xfs_direct_write_iomap_ops);
+ return iomap_truncate_page(inode, pos, did_zero,
+ &xfs_buffered_write_iomap_ops);
+}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
new file mode 100644
index 000000000..c782e8c04
--- /dev/null
+++ b/fs/xfs/xfs_iomap.h
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2003-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_IOMAP_H__
+#define __XFS_IOMAP_H__
+
+#include <linux/iomap.h>
+
+struct xfs_inode;
+struct xfs_bmbt_irec;
+
+int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t count_fsb, unsigned int flags,
+ struct xfs_bmbt_irec *imap);
+int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
+xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
+ xfs_fileoff_t end_fsb);
+
+int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
+ struct xfs_bmbt_irec *imap, unsigned int mapping_flags,
+ u16 iomap_flags);
+
+int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
+ bool *did_zero);
+int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
+
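+/*
+ * Expand an (offset_fsb, count_fsb) range so that both ends fall on extent
+ * size (extsz) boundaries. For example, with extsz = 4, offset_fsb = 5 and
+ * count_fsb = 2, the start is pulled back one block and the end rounded up
+ * one block, returning a count of 4 blocks.
+ */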
+static inline xfs_filblks_t
+xfs_aligned_fsb_count(
+ xfs_fileoff_t offset_fsb,
+ xfs_filblks_t count_fsb,
+ xfs_extlen_t extsz)
+{
+ if (extsz) {
+ xfs_extlen_t align;
+
+ div_u64_rem(offset_fsb, extsz, &align);
+ if (align)
+ count_fsb += align;
+ div_u64_rem(count_fsb, extsz, &align);
+ if (align)
+ count_fsb += extsz - align;
+ }
+
+ return count_fsb;
+}
+
+extern const struct iomap_ops xfs_buffered_write_iomap_ops;
+extern const struct iomap_ops xfs_direct_write_iomap_ops;
+extern const struct iomap_ops xfs_read_iomap_ops;
+extern const struct iomap_ops xfs_seek_iomap_ops;
+extern const struct iomap_ops xfs_xattr_iomap_ops;
+extern const struct iomap_ops xfs_dax_write_iomap_ops;
+
+#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
new file mode 100644
index 000000000..2e10e1c66
--- /dev/null
+++ b/fs/xfs/xfs_iops.c
@@ -0,0 +1,1334 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_acl.h"
+#include "xfs_quota.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_trans.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_symlink.h"
+#include "xfs_dir2.h"
+#include "xfs_iomap.h"
+#include "xfs_error.h"
+#include "xfs_ioctl.h"
+#include "xfs_xattr.h"
+
+#include <linux/posix_acl.h>
+#include <linux/security.h>
+#include <linux/iversion.h>
+#include <linux/fiemap.h>
+
+/*
+ * Directories have different lock order w.r.t. mmap_lock compared to regular
+ * files. This is due to readdir potentially triggering page faults on a user
+ * buffer inside filldir(), and this happens with the ilock on the directory
+ * held. For regular files, the lock order is the other way around - the
+ * mmap_lock is taken during the page fault, and then we lock the ilock to do
+ * block mapping. Hence we need a different class for the directory ilock so
+ * that lockdep can tell them apart.
+ */
+static struct lock_class_key xfs_nondir_ilock_class;
+static struct lock_class_key xfs_dir_ilock_class;
+
+static int
+xfs_initxattrs(
+ struct inode *inode,
+ const struct xattr *xattr_array,
+ void *fs_info)
+{
+ const struct xattr *xattr;
+ struct xfs_inode *ip = XFS_I(inode);
+ int error = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ struct xfs_da_args args = {
+ .dp = ip,
+ .attr_filter = XFS_ATTR_SECURE,
+ .name = xattr->name,
+ .namelen = strlen(xattr->name),
+ .value = xattr->value,
+ .valuelen = xattr->value_len,
+ };
+ error = xfs_attr_change(&args);
+ if (error < 0)
+ break;
+ }
+ return error;
+}
+
+/*
+ * Hook in SELinux. This is not quite correct yet; what we really need
+ * here (as we do for default ACLs) is a mechanism by which creation of
+ * these attrs can be journalled at inode creation time (along with the
+ * inode, of course, such that log replay can't cause these to be lost).
+ */
+int
+xfs_inode_init_security(
+ struct inode *inode,
+ struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &xfs_initxattrs, NULL);
+}
+
+static void
+xfs_dentry_to_name(
+ struct xfs_name *namep,
+ struct dentry *dentry)
+{
+ namep->name = dentry->d_name.name;
+ namep->len = dentry->d_name.len;
+ namep->type = XFS_DIR3_FT_UNKNOWN;
+}
+
+static int
+xfs_dentry_mode_to_name(
+ struct xfs_name *namep,
+ struct dentry *dentry,
+ int mode)
+{
+ namep->name = dentry->d_name.name;
+ namep->len = dentry->d_name.len;
+ namep->type = xfs_mode_to_ftype(mode);
+
+ if (unlikely(namep->type == XFS_DIR3_FT_UNKNOWN))
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+STATIC void
+xfs_cleanup_inode(
+ struct inode *dir,
+ struct inode *inode,
+ struct dentry *dentry)
+{
+ struct xfs_name teardown;
+
+	/*
+	 * Oh, the horror. If we can't add the ACL or we fail in
+	 * xfs_inode_init_security we must back out. ENOSPC can hit
+	 * here, among other things.
+	 */
+ xfs_dentry_to_name(&teardown, dentry);
+
+ xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
+}
+
+/*
+ * Check to see if we are likely to need an extended attribute to be added to
+ * the inode we are about to allocate. This allows the attribute fork to be
+ * created during the inode allocation, reducing the number of transactions we
+ * need to do in this fast path.
+ *
+ * The security checks are optimistic, but not guaranteed. The two LSMs that
+ * require xattrs to be added here (selinux and smack) are also the only two
+ * LSMs that add a sb->s_security structure to the superblock. Hence if security
+ * is enabled and sb->s_security is set, we have a pretty good idea that we are
+ * going to be asked to add a security xattr immediately after allocating the
+ * xfs inode and instantiating the VFS inode.
+ */
+static inline bool
+xfs_create_need_xattr(
+ struct inode *dir,
+ struct posix_acl *default_acl,
+ struct posix_acl *acl)
+{
+ if (acl)
+ return true;
+ if (default_acl)
+ return true;
+#if IS_ENABLED(CONFIG_SECURITY)
+ if (dir->i_sb->s_security)
+ return true;
+#endif
+ return false;
+}
+
+
+STATIC int
+xfs_generic_create(
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ dev_t rdev,
+ struct file *tmpfile) /* unnamed file */
+{
+ struct inode *inode;
+ struct xfs_inode *ip = NULL;
+ struct posix_acl *default_acl, *acl;
+ struct xfs_name name;
+ int error;
+
+ /*
+ * Irix uses Missed'em'V split, but doesn't want to see
+ * the upper 5 bits of (14bit) major.
+ */
+ if (S_ISCHR(mode) || S_ISBLK(mode)) {
+ if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
+ return -EINVAL;
+ } else {
+ rdev = 0;
+ }
+
+ error = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (error)
+ return error;
+
+ /* Verify mode is valid also for tmpfile case */
+ error = xfs_dentry_mode_to_name(&name, dentry, mode);
+ if (unlikely(error))
+ goto out_free_acl;
+
+ if (!tmpfile) {
+ error = xfs_create(mnt_userns, XFS_I(dir), &name, mode, rdev,
+ xfs_create_need_xattr(dir, default_acl, acl),
+ &ip);
+ } else {
+ error = xfs_create_tmpfile(mnt_userns, XFS_I(dir), mode, &ip);
+ }
+ if (unlikely(error))
+ goto out_free_acl;
+
+ inode = VFS_I(ip);
+
+ error = xfs_inode_init_security(inode, dir, &dentry->d_name);
+ if (unlikely(error))
+ goto out_cleanup_inode;
+
+ if (default_acl) {
+ error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ if (error)
+ goto out_cleanup_inode;
+ }
+ if (acl) {
+ error = __xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ if (error)
+ goto out_cleanup_inode;
+ }
+
+ xfs_setup_iops(ip);
+
+ if (tmpfile) {
+ /*
+ * The VFS requires that any inode fed to d_tmpfile must have
+ * nlink == 1 so that it can decrement the nlink in d_tmpfile.
+ * However, we created the temp file with nlink == 0 because
+ * we're not allowed to put an inode with nlink > 0 on the
+ * unlinked list. Therefore we have to set nlink to 1 so that
+ * d_tmpfile can immediately set it back to zero.
+ */
+ set_nlink(inode, 1);
+ d_tmpfile(tmpfile, inode);
+ } else
+ d_instantiate(dentry, inode);
+
+ xfs_finish_inode_setup(ip);
+
+ out_free_acl:
+ posix_acl_release(default_acl);
+ posix_acl_release(acl);
+ return error;
+
+ out_cleanup_inode:
+ xfs_finish_inode_setup(ip);
+ if (!tmpfile)
+ xfs_cleanup_inode(dir, inode, dentry);
+ xfs_irele(ip);
+ goto out_free_acl;
+}
+
+STATIC int
+xfs_vn_mknod(
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ dev_t rdev)
+{
+ return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, NULL);
+}
+
+STATIC int
+xfs_vn_create(
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ bool flags)
+{
+ return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, NULL);
+}
+
+STATIC int
+xfs_vn_mkdir(
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode)
+{
+ return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0,
+ NULL);
+}
+
+STATIC struct dentry *
+xfs_vn_lookup(
+ struct inode *dir,
+ struct dentry *dentry,
+ unsigned int flags)
+{
+ struct inode *inode;
+ struct xfs_inode *cip;
+ struct xfs_name name;
+ int error;
+
+ if (dentry->d_name.len >= MAXNAMELEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ xfs_dentry_to_name(&name, dentry);
+ error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
+ if (likely(!error))
+ inode = VFS_I(cip);
+ else if (likely(error == -ENOENT))
+ inode = NULL;
+ else
+ inode = ERR_PTR(error);
+ return d_splice_alias(inode, dentry);
+}
+
+STATIC struct dentry *
+xfs_vn_ci_lookup(
+ struct inode *dir,
+ struct dentry *dentry,
+ unsigned int flags)
+{
+ struct xfs_inode *ip;
+ struct xfs_name xname;
+ struct xfs_name ci_name;
+ struct qstr dname;
+ int error;
+
+ if (dentry->d_name.len >= MAXNAMELEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ xfs_dentry_to_name(&xname, dentry);
+ error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
+ if (unlikely(error)) {
+ if (unlikely(error != -ENOENT))
+ return ERR_PTR(error);
+ /*
+ * call d_add(dentry, NULL) here when d_drop_negative_children
+ * is called in xfs_vn_mknod (ie. allow negative dentries
+ * with CI filesystems).
+ */
+ return NULL;
+ }
+
+ /* if exact match, just splice and exit */
+ if (!ci_name.name)
+ return d_splice_alias(VFS_I(ip), dentry);
+
+ /* else case-insensitive match... */
+ dname.name = ci_name.name;
+ dname.len = ci_name.len;
+ dentry = d_add_ci(dentry, VFS_I(ip), &dname);
+ kmem_free(ci_name.name);
+ return dentry;
+}
+
+STATIC int
+xfs_vn_link(
+ struct dentry *old_dentry,
+ struct inode *dir,
+ struct dentry *dentry)
+{
+ struct inode *inode = d_inode(old_dentry);
+ struct xfs_name name;
+ int error;
+
+ error = xfs_dentry_mode_to_name(&name, dentry, inode->i_mode);
+ if (unlikely(error))
+ return error;
+
+ error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
+ if (unlikely(error))
+ return error;
+
+ ihold(inode);
+ d_instantiate(dentry, inode);
+ return 0;
+}
+
+STATIC int
+xfs_vn_unlink(
+ struct inode *dir,
+ struct dentry *dentry)
+{
+ struct xfs_name name;
+ int error;
+
+ xfs_dentry_to_name(&name, dentry);
+
+ error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry)));
+ if (error)
+ return error;
+
+ /*
+ * With unlink, the VFS makes the dentry "negative": no inode,
+ * but still hashed. This is incompatible with case-insensitive
+ * mode, so invalidate (unhash) the dentry in CI-mode.
+ */
+ if (xfs_has_asciici(XFS_M(dir->i_sb)))
+ d_invalidate(dentry);
+ return 0;
+}
+
+STATIC int
+xfs_vn_symlink(
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ const char *symname)
+{
+ struct inode *inode;
+ struct xfs_inode *cip = NULL;
+ struct xfs_name name;
+ int error;
+ umode_t mode;
+
+ mode = S_IFLNK |
+ (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
+ error = xfs_dentry_mode_to_name(&name, dentry, mode);
+ if (unlikely(error))
+ goto out;
+
+ error = xfs_symlink(mnt_userns, XFS_I(dir), &name, symname, mode, &cip);
+ if (unlikely(error))
+ goto out;
+
+ inode = VFS_I(cip);
+
+ error = xfs_inode_init_security(inode, dir, &dentry->d_name);
+ if (unlikely(error))
+ goto out_cleanup_inode;
+
+ xfs_setup_iops(cip);
+
+ d_instantiate(dentry, inode);
+ xfs_finish_inode_setup(cip);
+ return 0;
+
+ out_cleanup_inode:
+ xfs_finish_inode_setup(cip);
+ xfs_cleanup_inode(dir, inode, dentry);
+ xfs_irele(cip);
+ out:
+ return error;
+}
+
+STATIC int
+xfs_vn_rename(
+ struct user_namespace *mnt_userns,
+ struct inode *odir,
+ struct dentry *odentry,
+ struct inode *ndir,
+ struct dentry *ndentry,
+ unsigned int flags)
+{
+ struct inode *new_inode = d_inode(ndentry);
+ int omode = 0;
+ int error;
+ struct xfs_name oname;
+ struct xfs_name nname;
+
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
+ return -EINVAL;
+
+ /* if we are exchanging files, we need to set i_mode of both files */
+ if (flags & RENAME_EXCHANGE)
+ omode = d_inode(ndentry)->i_mode;
+
+ error = xfs_dentry_mode_to_name(&oname, odentry, omode);
+ if (omode && unlikely(error))
+ return error;
+
+ error = xfs_dentry_mode_to_name(&nname, ndentry,
+ d_inode(odentry)->i_mode);
+ if (unlikely(error))
+ return error;
+
+ return xfs_rename(mnt_userns, XFS_I(odir), &oname,
+ XFS_I(d_inode(odentry)), XFS_I(ndir), &nname,
+ new_inode ? XFS_I(new_inode) : NULL, flags);
+}
+
+/*
+ * Careful here - this function can get called recursively, so we need to be
+ * very careful about how much stack we use. The link buffer is kmalloc'd
+ * for this reason.
+ */
+STATIC const char *
+xfs_vn_get_link(
+ struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ char *link;
+ int error = -ENOMEM;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ link = kmalloc(XFS_SYMLINK_MAXLEN+1, GFP_KERNEL);
+ if (!link)
+ goto out_err;
+
+ error = xfs_readlink(XFS_I(d_inode(dentry)), link);
+ if (unlikely(error))
+ goto out_kfree;
+
+ set_delayed_call(done, kfree_link, link);
+ return link;
+
+ out_kfree:
+ kfree(link);
+ out_err:
+ return ERR_PTR(error);
+}
+
+static uint32_t
+xfs_stat_blksize(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * If the file blocks are being allocated from a realtime volume, then
+ * always return the realtime extent size.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip));
+
+ /*
+ * Allow large block sizes to be reported to userspace programs if the
+ * "largeio" mount option is used.
+ *
+ * If compatibility mode is specified, simply return the basic unit of
+ * caching so that we don't get inefficient read/modify/write I/O from
+ * user apps. Otherwise....
+ *
+ * If the underlying volume is a stripe, then return the stripe width in
+	 * bytes as the recommended I/O size. If it is not a stripe and we've
+	 * set a default buffered I/O size, return that; otherwise return the
+	 * compat default.
+ */
+ if (xfs_has_large_iosize(mp)) {
+ if (mp->m_swidth)
+ return XFS_FSB_TO_B(mp, mp->m_swidth);
+ if (xfs_has_allocsize(mp))
+ return 1U << mp->m_allocsize_log;
+ }
+
+ return PAGE_SIZE;
+}
+
+STATIC int
+xfs_vn_getattr(
+ struct user_namespace *mnt_userns,
+ const struct path *path,
+ struct kstat *stat,
+ u32 request_mask,
+ unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+
+ trace_xfs_getattr(ip);
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ stat->size = XFS_ISIZE(ip);
+ stat->dev = inode->i_sb->s_dev;
+ stat->mode = inode->i_mode;
+ stat->nlink = inode->i_nlink;
+ stat->uid = vfsuid_into_kuid(vfsuid);
+ stat->gid = vfsgid_into_kgid(vfsgid);
+ stat->ino = ip->i_ino;
+ stat->atime = inode->i_atime;
+ stat->mtime = inode->i_mtime;
+ stat->ctime = inode->i_ctime;
+ stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
+
+ if (xfs_has_v3inodes(mp)) {
+ if (request_mask & STATX_BTIME) {
+ stat->result_mask |= STATX_BTIME;
+ stat->btime = ip->i_crtime;
+ }
+ }
+
+ /*
+ * Note: If you add another clause to set an attribute flag, please
+ * update attributes_mask below.
+ */
+ if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (ip->i_diflags & XFS_DIFLAG_APPEND)
+ stat->attributes |= STATX_ATTR_APPEND;
+ if (ip->i_diflags & XFS_DIFLAG_NODUMP)
+ stat->attributes |= STATX_ATTR_NODUMP;
+
+ stat->attributes_mask |= (STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_APPEND |
+ STATX_ATTR_NODUMP);
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFBLK:
+ case S_IFCHR:
+ stat->blksize = BLKDEV_IOSIZE;
+ stat->rdev = inode->i_rdev;
+ break;
+ case S_IFREG:
+ if (request_mask & STATX_DIOALIGN) {
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct block_device *bdev = target->bt_bdev;
+
+ stat->result_mask |= STATX_DIOALIGN;
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ }
+ fallthrough;
+ default:
+ stat->blksize = xfs_stat_blksize(ip);
+ stat->rdev = 0;
+ break;
+ }
+
+ return 0;
+}
+
+static int
+xfs_vn_change_ok(
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ struct iattr *iattr)
+{
+ struct xfs_mount *mp = XFS_I(d_inode(dentry))->i_mount;
+
+ if (xfs_is_readonly(mp))
+ return -EROFS;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ return setattr_prepare(mnt_userns, dentry, iattr);
+}
+
+/*
+ * Set non-size attributes of an inode.
+ *
+ * Caution: The caller of this function is responsible for calling
+ * setattr_prepare() or otherwise verifying the change is fine.
+ */
+static int
+xfs_setattr_nonsize(
+ struct user_namespace *mnt_userns,
+ struct xfs_inode *ip,
+ struct iattr *iattr)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ int mask = iattr->ia_valid;
+ xfs_trans_t *tp;
+ int error;
+ kuid_t uid = GLOBAL_ROOT_UID;
+ kgid_t gid = GLOBAL_ROOT_GID;
+ struct xfs_dquot *udqp = NULL, *gdqp = NULL;
+ struct xfs_dquot *old_udqp = NULL, *old_gdqp = NULL;
+
+ ASSERT((mask & ATTR_SIZE) == 0);
+
+ /*
+	 * If disk quotas are on, we make sure that the dquots exist on disk
+	 * before we start any other transactions. Trying to do this later
+	 * is messy. We don't care to take a readlock to look at the ids
+	 * in the inode here, because we can't hold it across the trans_reserve.
+ * If the IDs do change before we take the ilock, we're covered
+ * because the i_*dquot fields will get updated anyway.
+ */
+ if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) {
+ uint qflags = 0;
+
+ if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
+ uid = from_vfsuid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsuid);
+ qflags |= XFS_QMOPT_UQUOTA;
+ } else {
+ uid = inode->i_uid;
+ }
+ if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
+ gid = from_vfsgid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsgid);
+ qflags |= XFS_QMOPT_GQUOTA;
+ } else {
+ gid = inode->i_gid;
+ }
+
+ /*
+ * We take a reference when we initialize udqp and gdqp,
+ * so it is important that we never blindly double trip on
+ * the same variable. See xfs_create() for an example.
+ */
+ ASSERT(udqp == NULL);
+ ASSERT(gdqp == NULL);
+ error = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_projid,
+ qflags, &udqp, &gdqp, NULL);
+ if (error)
+ return error;
+ }
+
+ error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL,
+ has_capability_noaudit(current, CAP_FOWNER), &tp);
+ if (error)
+ goto out_dqrele;
+
+ /*
+ * Register quota modifications in the transaction. Must be the owner
+ * or privileged. These IDs could have changed since we last looked at
+ * them. But, we're assured that if the ownership did change while we
+ * didn't have the inode locked, inode's dquot(s) would have changed
+ * also.
+ */
+ if (XFS_IS_UQUOTA_ON(mp) &&
+ i_uid_needs_update(mnt_userns, iattr, inode)) {
+ ASSERT(udqp);
+ old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp);
+ }
+ if (XFS_IS_GQUOTA_ON(mp) &&
+ i_gid_needs_update(mnt_userns, iattr, inode)) {
+ ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp));
+ ASSERT(gdqp);
+ old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp);
+ }
+
+ setattr_copy(mnt_userns, inode, iattr);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ XFS_STATS_INC(mp, xs_ig_attrchg);
+
+ if (xfs_has_wsync(mp))
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp);
+
+ /*
+ * Release any dquot(s) the inode had kept before chown.
+ */
+ xfs_qm_dqrele(old_udqp);
+ xfs_qm_dqrele(old_gdqp);
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+
+ if (error)
+ return error;
+
+ /*
+ * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
+ * update. We could avoid this with linked transactions
+ * and passing down the transaction pointer all the way
+ * to attr_set. No previous user of the generic
+ * Posix ACL code seems to care about this issue either.
+ */
+ if (mask & ATTR_MODE) {
+ error = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ if (error)
+ return error;
+ }
+
+ return 0;
+
+out_dqrele:
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ return error;
+}
+
+/*
+ * Truncate file. Must have write permission and not be a directory.
+ *
+ * Caution: The caller of this function is responsible for calling
+ * setattr_prepare() or otherwise verifying the change is fine.
+ */
+STATIC int
+xfs_setattr_size(
+ struct user_namespace *mnt_userns,
+ struct xfs_inode *ip,
+ struct iattr *iattr)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ xfs_off_t oldsize, newsize;
+ struct xfs_trans *tp;
+ int error;
+ uint lock_flags = 0;
+ bool did_zeroing = false;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+ ASSERT(S_ISREG(inode->i_mode));
+ ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
+ ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);
+
+ oldsize = inode->i_size;
+ newsize = iattr->ia_size;
+
+ /*
+ * Short circuit the truncate case for zero length files.
+ */
+ if (newsize == 0 && oldsize == 0 && ip->i_df.if_nextents == 0) {
+ if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME)))
+ return 0;
+
+ /*
+ * Use the regular setattr path to update the timestamps.
+ */
+ iattr->ia_valid &= ~ATTR_SIZE;
+ return xfs_setattr_nonsize(mnt_userns, ip, iattr);
+ }
+
+ /*
+ * Make sure that the dquots are attached to the inode.
+ */
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
+ /*
+ * Wait for all direct I/O to complete.
+ */
+ inode_dio_wait(inode);
+
+ /*
+ * File data changes must be complete before we start the transaction to
+ * modify the inode. This needs to be done before joining the inode to
+ * the transaction because the inode cannot be unlocked once it is a
+ * part of the transaction.
+ *
+ * Start with zeroing any data beyond EOF that we may expose on file
+ * extension, or zeroing out the rest of the block on a downward
+ * truncate.
+ */
+ if (newsize > oldsize) {
+ trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
+ error = xfs_zero_range(ip, oldsize, newsize - oldsize,
+ &did_zeroing);
+ } else {
+ /*
+ * iomap won't detect a dirty page over an unwritten block (or a
+ * cow block over a hole) and subsequently skips zeroing the
+ * newly post-EOF portion of the page. Flush the new EOF to
+ * convert the block before the pagecache truncate.
+ */
+ error = filemap_write_and_wait_range(inode->i_mapping, newsize,
+ newsize);
+ if (error)
+ return error;
+ error = xfs_truncate_page(ip, newsize, &did_zeroing);
+ }
+
+ if (error)
+ return error;
+
+ /*
+ * We've already locked out new page faults, so now we can safely remove
+ * pages from the page cache knowing they won't get refaulted until we
+	 * drop the XFS_MMAPLOCK_EXCL lock after the extent manipulations are
+ * complete. The truncate_setsize() call also cleans partial EOF page
+ * PTEs on extending truncates and hence ensures sub-page block size
+ * filesystems are correctly handled, too.
+ *
+ * We have to do all the page cache truncate work outside the
+ * transaction context as the "lock" order is page lock->log space
+ * reservation as defined by extent allocation in the writeback path.
+ * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but
+ * having already truncated the in-memory version of the file (i.e. made
+ * user visible changes). There's not much we can do about this, except
+ * to hope that the caller sees ENOMEM and retries the truncate
+ * operation.
+ *
+ * And we update in-core i_size and truncate page cache beyond newsize
+	 * before writing back the [i_disk_size, newsize] range, so we're
+ * guaranteed not to write stale data past the new EOF on truncate down.
+ */
+ truncate_setsize(inode, newsize);
+
+ /*
+ * We are going to log the inode size change in this transaction so
+ * any previous writes that are beyond the on disk EOF and the new
+ * EOF that have not been written out need to be written here. If we
+ * do not write the data out, we expose ourselves to the null files
+ * problem. Note that this includes any block zeroing we did above;
+ * otherwise those blocks may not be zeroed after a crash.
+ */
+ if (did_zeroing ||
+ (newsize > ip->i_disk_size && oldsize != ip->i_disk_size)) {
+ error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ ip->i_disk_size, newsize - 1);
+ if (error)
+ return error;
+ }
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ lock_flags |= XFS_ILOCK_EXCL;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /*
+ * Only change the c/mtime if we are changing the size or we are
+ * explicitly asked to change it. This handles the semantic difference
+ * between truncate() and ftruncate() as implemented in the VFS.
+ *
+ * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+ * special case where we need to update the times despite not having
+ * these flags set. For all other operations the VFS set these flags
+ * explicitly if it wants a timestamp update.
+ */
+ if (newsize != oldsize &&
+ !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
+ iattr->ia_ctime = iattr->ia_mtime =
+ current_time(inode);
+ iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
+ }
+
+ /*
+	 * The first thing we do is set the size to newsize permanently on
+ * disk. This way we don't have to worry about anyone ever being able
+ * to look at the data being freed even in the face of a crash.
+ * What we're getting around here is the case where we free a block, it
+ * is allocated to another file, it is written to, and then we crash.
+ * If the new data gets written to the file but the log buffers
+ * containing the free and reallocation don't, then we'd end up with
+ * garbage in the blocks being freed. As long as we make the new size
+ * permanent before actually freeing any blocks it doesn't matter if
+ * they get written to.
+ */
+ ip->i_disk_size = newsize;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ if (newsize <= oldsize) {
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * Truncated "down", so we're removing references to old data
+ * here - if we delay flushing for a long time, we expose
+ * ourselves unduly to the notorious NULL files problem. So,
+ * we mark this inode and flush it when the file is closed,
+ * and do not wait the usual (long) time for writeout.
+ */
+ xfs_iflags_set(ip, XFS_ITRUNCATED);
+
+ /* A truncate down always removes post-EOF blocks. */
+ xfs_inode_clear_eofblocks_tag(ip);
+ }
+
+ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
+ setattr_copy(mnt_userns, inode, iattr);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ XFS_STATS_INC(mp, xs_ig_attrchg);
+
+ if (xfs_has_wsync(mp))
+ xfs_trans_set_sync(tp);
+
+ error = xfs_trans_commit(tp);
+out_unlock:
+ if (lock_flags)
+ xfs_iunlock(ip, lock_flags);
+ return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+}
+
+int
+xfs_vn_setattr_size(
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ struct iattr *iattr)
+{
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ int error;
+
+ trace_xfs_setattr(ip);
+
+ error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
+ if (error)
+ return error;
+ return xfs_setattr_size(mnt_userns, ip, iattr);
+}
+
+STATIC int
+xfs_vn_setattr(
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ struct iattr *iattr)
+{
+ struct inode *inode = d_inode(dentry);
+ struct xfs_inode *ip = XFS_I(inode);
+ int error;
+
+ if (iattr->ia_valid & ATTR_SIZE) {
+ uint iolock;
+
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+
+ error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
+ if (error) {
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+ return error;
+ }
+
+ error = xfs_vn_setattr_size(mnt_userns, dentry, iattr);
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+ } else {
+ trace_xfs_setattr(ip);
+
+ error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
+ if (!error)
+ error = xfs_setattr_nonsize(mnt_userns, ip, iattr);
+ }
+
+ return error;
+}
+
+STATIC int
+xfs_vn_update_time(
+ struct inode *inode,
+ struct timespec64 *now,
+ int flags)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ int log_flags = XFS_ILOG_TIMESTAMP;
+ struct xfs_trans *tp;
+ int error;
+
+ trace_xfs_update_time(ip);
+
+ if (inode->i_sb->s_flags & SB_LAZYTIME) {
+ if (!((flags & S_VERSION) &&
+ inode_maybe_inc_iversion(inode, false)))
+ return generic_update_time(inode, now, flags);
+
+ /* Capture the iversion update that just occurred */
+ log_flags |= XFS_ILOG_CORE;
+ }
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (flags & S_CTIME)
+ inode->i_ctime = *now;
+ if (flags & S_MTIME)
+ inode->i_mtime = *now;
+ if (flags & S_ATIME)
+ inode->i_atime = *now;
+
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, log_flags);
+ return xfs_trans_commit(tp);
+}
+
+STATIC int
+xfs_vn_fiemap(
+ struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ u64 start,
+ u64 length)
+{
+ int error;
+
+ xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
+ if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
+ fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
+ error = iomap_fiemap(inode, fieinfo, start, length,
+ &xfs_xattr_iomap_ops);
+ } else {
+ error = iomap_fiemap(inode, fieinfo, start, length,
+ &xfs_read_iomap_ops);
+ }
+ xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
+
+ return error;
+}
+
+STATIC int
+xfs_vn_tmpfile(
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct file *file,
+ umode_t mode)
+{
+ int err = xfs_generic_create(mnt_userns, dir, file->f_path.dentry, mode, 0, file);
+
+ return finish_open_simple(file, err);
+}
+
+static const struct inode_operations xfs_inode_operations = {
+ .get_acl = xfs_get_acl,
+ .set_acl = xfs_set_acl,
+ .getattr = xfs_vn_getattr,
+ .setattr = xfs_vn_setattr,
+ .listxattr = xfs_vn_listxattr,
+ .fiemap = xfs_vn_fiemap,
+ .update_time = xfs_vn_update_time,
+ .fileattr_get = xfs_fileattr_get,
+ .fileattr_set = xfs_fileattr_set,
+};
+
+static const struct inode_operations xfs_dir_inode_operations = {
+ .create = xfs_vn_create,
+ .lookup = xfs_vn_lookup,
+ .link = xfs_vn_link,
+ .unlink = xfs_vn_unlink,
+ .symlink = xfs_vn_symlink,
+ .mkdir = xfs_vn_mkdir,
+ /*
+ * Yes, XFS uses the same method for rmdir and unlink.
+ *
+	 * There are some subtle differences deeper in the code,
+ * but we use S_ISDIR to check for those.
+ */
+ .rmdir = xfs_vn_unlink,
+ .mknod = xfs_vn_mknod,
+ .rename = xfs_vn_rename,
+ .get_acl = xfs_get_acl,
+ .set_acl = xfs_set_acl,
+ .getattr = xfs_vn_getattr,
+ .setattr = xfs_vn_setattr,
+ .listxattr = xfs_vn_listxattr,
+ .update_time = xfs_vn_update_time,
+ .tmpfile = xfs_vn_tmpfile,
+ .fileattr_get = xfs_fileattr_get,
+ .fileattr_set = xfs_fileattr_set,
+};
+
+static const struct inode_operations xfs_dir_ci_inode_operations = {
+ .create = xfs_vn_create,
+ .lookup = xfs_vn_ci_lookup,
+ .link = xfs_vn_link,
+ .unlink = xfs_vn_unlink,
+ .symlink = xfs_vn_symlink,
+ .mkdir = xfs_vn_mkdir,
+ /*
+ * Yes, XFS uses the same method for rmdir and unlink.
+ *
+	 * There are some subtle differences deeper in the code,
+ * but we use S_ISDIR to check for those.
+ */
+ .rmdir = xfs_vn_unlink,
+ .mknod = xfs_vn_mknod,
+ .rename = xfs_vn_rename,
+ .get_acl = xfs_get_acl,
+ .set_acl = xfs_set_acl,
+ .getattr = xfs_vn_getattr,
+ .setattr = xfs_vn_setattr,
+ .listxattr = xfs_vn_listxattr,
+ .update_time = xfs_vn_update_time,
+ .tmpfile = xfs_vn_tmpfile,
+ .fileattr_get = xfs_fileattr_get,
+ .fileattr_set = xfs_fileattr_set,
+};
+
+static const struct inode_operations xfs_symlink_inode_operations = {
+ .get_link = xfs_vn_get_link,
+ .getattr = xfs_vn_getattr,
+ .setattr = xfs_vn_setattr,
+ .listxattr = xfs_vn_listxattr,
+ .update_time = xfs_vn_update_time,
+};
+
+/* Figure out if this file actually supports DAX. */
+static bool
+xfs_inode_supports_dax(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ /* Only supported on regular files. */
+ if (!S_ISREG(VFS_I(ip)->i_mode))
+ return false;
+
+ /* Only supported on non-reflinked files. */
+ if (xfs_is_reflink_inode(ip))
+ return false;
+
+ /* Block size must match page size */
+ if (mp->m_sb.sb_blocksize != PAGE_SIZE)
+ return false;
+
+ /* Device has to support DAX too. */
+ return xfs_inode_buftarg(ip)->bt_daxdev != NULL;
+}
+
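+/*
+ * Decide whether to enable DAX for this file: "dax=never" overrides
+ * everything; the inode must be capable of DAX at all; and then either
+ * "dax=always" or the per-inode XFS_DIFLAG2_DAX flag turns it on.
+ */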
+static bool
+xfs_inode_should_enable_dax(
+ struct xfs_inode *ip)
+{
+ if (!IS_ENABLED(CONFIG_FS_DAX))
+ return false;
+ if (xfs_has_dax_never(ip->i_mount))
+ return false;
+ if (!xfs_inode_supports_dax(ip))
+ return false;
+ if (xfs_has_dax_always(ip->i_mount))
+ return true;
+ if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
+ return true;
+ return false;
+}
+
+void
+xfs_diflags_to_iflags(
+ struct xfs_inode *ip,
+ bool init)
+{
+ struct inode *inode = VFS_I(ip);
+ unsigned int xflags = xfs_ip2xflags(ip);
+ unsigned int flags = 0;
+
+ ASSERT(!(IS_DAX(inode) && init));
+
+ if (xflags & FS_XFLAG_IMMUTABLE)
+ flags |= S_IMMUTABLE;
+ if (xflags & FS_XFLAG_APPEND)
+ flags |= S_APPEND;
+ if (xflags & FS_XFLAG_SYNC)
+ flags |= S_SYNC;
+ if (xflags & FS_XFLAG_NOATIME)
+ flags |= S_NOATIME;
+ if (init && xfs_inode_should_enable_dax(ip))
+ flags |= S_DAX;
+
+ /*
+ * S_DAX can only be set during inode initialization and is never set by
+ * the VFS, so we cannot mask off S_DAX in i_flags.
+ */
+ inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME);
+ inode->i_flags |= flags;
+}
+
+/*
+ * Initialize the Linux inode.
+ *
+ * When reading existing inodes from disk this is called directly from xfs_iget,
+ * when creating a new inode it is called from xfs_init_new_inode after setting
+ * up the inode. These callers have different criteria for clearing XFS_INEW, so
+ * leave it up to the caller to deal with unlocking the inode appropriately.
+ */
+void
+xfs_setup_inode(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = &ip->i_vnode;
+ gfp_t gfp_mask;
+
+ inode->i_ino = ip->i_ino;
+ inode->i_state |= I_NEW;
+
+ inode_sb_list_add(inode);
+ /* make the inode look hashed for the writeback code */
+ inode_fake_hash(inode);
+
+ i_size_write(inode, ip->i_disk_size);
+ xfs_diflags_to_iflags(ip, true);
+
+ if (S_ISDIR(inode->i_mode)) {
+ /*
+ * We set the i_rwsem class here to avoid potential races with
+ * lockdep_annotate_inode_mutex_key() reinitialising the lock
+ * after a filehandle lookup has already found the inode in
+ * cache before it has been unlocked via unlock_new_inode().
+ */
+ lockdep_set_class(&inode->i_rwsem,
+ &inode->i_sb->s_type->i_mutex_dir_key);
+ lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
+ } else {
+ lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
+ }
+
+ /*
+ * Ensure all page cache allocations are done from GFP_NOFS context to
+ * prevent direct reclaim recursion back into the filesystem and blowing
+ * stacks or deadlocking.
+ */
+ gfp_mask = mapping_gfp_mask(inode->i_mapping);
+ mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));
+
+ /*
+ * If there is no attribute fork no ACL can exist on this inode,
+ * and it can't have any file capabilities attached to it either.
+ */
+ if (!xfs_inode_has_attr_fork(ip)) {
+ inode_has_no_xattr(inode);
+ cache_no_acl(inode);
+ }
+}
+
+void
+xfs_setup_iops(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = &ip->i_vnode;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_op = &xfs_inode_operations;
+ inode->i_fop = &xfs_file_operations;
+ if (IS_DAX(inode))
+ inode->i_mapping->a_ops = &xfs_dax_aops;
+ else
+ inode->i_mapping->a_ops = &xfs_address_space_operations;
+ break;
+ case S_IFDIR:
+ if (xfs_has_asciici(XFS_M(inode->i_sb)))
+ inode->i_op = &xfs_dir_ci_inode_operations;
+ else
+ inode->i_op = &xfs_dir_inode_operations;
+ inode->i_fop = &xfs_dir_file_operations;
+ break;
+ case S_IFLNK:
+ inode->i_op = &xfs_symlink_inode_operations;
+ break;
+ default:
+ inode->i_op = &xfs_inode_operations;
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ break;
+ }
+}
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
new file mode 100644
index 000000000..e570dcb5d
--- /dev/null
+++ b/fs/xfs/xfs_iops.h
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_IOPS_H__
+#define __XFS_IOPS_H__
+
+struct xfs_inode;
+
+extern const struct file_operations xfs_file_operations;
+extern const struct file_operations xfs_dir_file_operations;
+
+extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
+
+int xfs_vn_setattr_size(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *vap);
+
+int xfs_inode_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr);
+
+#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
new file mode 100644
index 000000000..a1c2bcf65
--- /dev/null
+++ b/fs/xfs/xfs_itable.c
@@ -0,0 +1,456 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_iwalk.h"
+#include "xfs_itable.h"
+#include "xfs_error.h"
+#include "xfs_icache.h"
+#include "xfs_health.h"
+#include "xfs_trans.h"
+
+/*
+ * Bulk Stat
+ * =========
+ *
+ * Use the inode walking functions to fill out struct xfs_bulkstat for every
+ * allocated inode, then pass the stat information to some externally provided
+ * iteration function.
+ */
+
+struct xfs_bstat_chunk {
+ bulkstat_one_fmt_pf formatter;
+ struct xfs_ibulk *breq;
+ struct xfs_bulkstat *buf;
+};
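+
+/*
+ * Illustrative sketch, not part of this change: a minimal formatter a
+ * caller might supply, copying each record to the user buffer and then
+ * advancing the output cursor with xfs_ibulk_advance().  The function name
+ * is hypothetical, and copy_to_user() is assumed to be reachable through
+ * the headers included above.
+ */
+static int __maybe_unused
+xfs_example_bulkstat_fmt(
+	struct xfs_ibulk		*breq,
+	const struct xfs_bulkstat	*bstat)
+{
+	/* Hand one record to userspace... */
+	if (copy_to_user(breq->ubuffer, bstat, sizeof(struct xfs_bulkstat)))
+		return -EFAULT;
+	/* ...then advance; -ECANCELED from here means the buffer is full. */
+	return xfs_ibulk_advance(breq, sizeof(struct xfs_bulkstat));
+}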
+
+/*
+ * Fill out the bulkstat info for a single inode and report it somewhere.
+ *
+ * bc->breq->lastino is effectively the inode cursor as we walk through the
+ * filesystem. Therefore, we update it any time we need to move the cursor
+ * forward, regardless of whether or not we're sending any bstat information
+ * back to userspace. If the inode is internal metadata or has been freed
+ * out from under us, we simply keep going.
+ *
+ * However, if any other type of error happens we want to stop right where we
+ * are so that userspace will call back with the exact number of the bad inode and
+ * we can send back an error code.
+ *
+ * Note that if the formatter tells us there's no space left in the buffer we
+ * move the cursor forward and abort the walk.
+ */
+STATIC int
+xfs_bulkstat_one_int(
+ struct xfs_mount *mp,
+ struct user_namespace *mnt_userns,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ struct xfs_bstat_chunk *bc)
+{
+ struct user_namespace *sb_userns = mp->m_super->s_user_ns;
+ struct xfs_inode *ip; /* incore inode pointer */
+ struct inode *inode;
+ struct xfs_bulkstat *buf = bc->buf;
+ xfs_extnum_t nextents;
+ int error = -EINVAL;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
+
+ if (xfs_internal_inum(mp, ino))
+ goto out_advance;
+
+ error = xfs_iget(mp, tp, ino,
+ (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
+ XFS_ILOCK_SHARED, &ip);
+ if (error == -ENOENT || error == -EINVAL)
+ goto out_advance;
+ if (error)
+ goto out;
+
+ ASSERT(ip != NULL);
+ ASSERT(ip->i_imap.im_blkno != 0);
+ inode = VFS_I(ip);
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+
+	/*
+	 * xfs_iget returns the following without needing further change.
+	 */
+ buf->bs_projectid = ip->i_projid;
+ buf->bs_ino = ino;
+ buf->bs_uid = from_kuid(sb_userns, vfsuid_into_kuid(vfsuid));
+ buf->bs_gid = from_kgid(sb_userns, vfsgid_into_kgid(vfsgid));
+ buf->bs_size = ip->i_disk_size;
+
+ buf->bs_nlink = inode->i_nlink;
+ buf->bs_atime = inode->i_atime.tv_sec;
+ buf->bs_atime_nsec = inode->i_atime.tv_nsec;
+ buf->bs_mtime = inode->i_mtime.tv_sec;
+ buf->bs_mtime_nsec = inode->i_mtime.tv_nsec;
+ buf->bs_ctime = inode->i_ctime.tv_sec;
+ buf->bs_ctime_nsec = inode->i_ctime.tv_nsec;
+ buf->bs_gen = inode->i_generation;
+ buf->bs_mode = inode->i_mode;
+
+ buf->bs_xflags = xfs_ip2xflags(ip);
+ buf->bs_extsize_blks = ip->i_extsize;
+
+ nextents = xfs_ifork_nextents(&ip->i_df);
+ if (!(bc->breq->flags & XFS_IBULK_NREXT64))
+ buf->bs_extents = min(nextents, XFS_MAX_EXTCNT_DATA_FORK_SMALL);
+ else
+ buf->bs_extents64 = nextents;
+
+ xfs_bulkstat_health(ip, buf);
+ buf->bs_aextents = xfs_ifork_nextents(&ip->i_af);
+ buf->bs_forkoff = xfs_inode_fork_boff(ip);
+ buf->bs_version = XFS_BULKSTAT_VERSION_V5;
+
+ if (xfs_has_v3inodes(mp)) {
+ buf->bs_btime = ip->i_crtime.tv_sec;
+ buf->bs_btime_nsec = ip->i_crtime.tv_nsec;
+ if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
+ buf->bs_cowextsize_blks = ip->i_cowextsize;
+ }
+
+ switch (ip->i_df.if_format) {
+ case XFS_DINODE_FMT_DEV:
+ buf->bs_rdev = sysv_encode_dev(inode->i_rdev);
+ buf->bs_blksize = BLKDEV_IOSIZE;
+ buf->bs_blocks = 0;
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ buf->bs_rdev = 0;
+ buf->bs_blksize = mp->m_sb.sb_blocksize;
+ buf->bs_blocks = 0;
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ buf->bs_rdev = 0;
+ buf->bs_blksize = mp->m_sb.sb_blocksize;
+ buf->bs_blocks = ip->i_nblocks + ip->i_delayed_blks;
+ break;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ xfs_irele(ip);
+
+ error = bc->formatter(bc->breq, buf);
+ if (error == -ECANCELED)
+ goto out_advance;
+ if (error)
+ goto out;
+
+out_advance:
+ /*
+ * Advance the cursor to the inode that comes after the one we just
+ * looked at. We want the caller to move along if the bulkstat
+ * information was copied successfully; if we tried to grab the inode
+ * but it's no longer allocated; or if it's internal metadata.
+ */
+ bc->breq->startino = ino + 1;
+out:
+ return error;
+}
+
+/* Bulkstat a single inode. */
+int
+xfs_bulkstat_one(
+ struct xfs_ibulk *breq,
+ bulkstat_one_fmt_pf formatter)
+{
+ struct xfs_bstat_chunk bc = {
+ .formatter = formatter,
+ .breq = breq,
+ };
+ struct xfs_trans *tp;
+ int error;
+
+ if (breq->mnt_userns != &init_user_ns) {
+ xfs_warn_ratelimited(breq->mp,
+ "bulkstat not supported inside of idmapped mounts.");
+ return -EINVAL;
+ }
+
+ ASSERT(breq->icount == 1);
+
+ bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
+ KM_MAYFAIL);
+ if (!bc.buf)
+ return -ENOMEM;
+
+ /*
+ * Grab an empty transaction so that we can use its recursive buffer
+ * locking abilities to detect cycles in the inobt without deadlocking.
+ */
+ error = xfs_trans_alloc_empty(breq->mp, &tp);
+ if (error)
+ goto out;
+
+ error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, tp,
+ breq->startino, &bc);
+ xfs_trans_cancel(tp);
+out:
+ kmem_free(bc.buf);
+
+ /*
+ * If we reported one inode to userspace then we abort because we hit
+ * the end of the buffer. Don't leak that back to userspace.
+ */
+ if (error == -ECANCELED)
+ error = 0;
+
+ return error;
+}
+
+static int
+xfs_bulkstat_iwalk(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ void *data)
+{
+ struct xfs_bstat_chunk *bc = data;
+ int error;
+
+ error = xfs_bulkstat_one_int(mp, bc->breq->mnt_userns, tp, ino, data);
+ /* bulkstat just skips over missing inodes */
+ if (error == -ENOENT || error == -EINVAL)
+ return 0;
+ return error;
+}
+
+/*
+ * Check the incoming lastino parameter.
+ *
+ * We allow any inode value that could map to physical space inside the
+ * filesystem because if there are no inodes there, bulkstat moves on to the
+ * next chunk. In other words, the magic agino value of zero takes us to the
+ * first chunk in the AG, and an agino value past the end of the AG takes us to
+ * the first chunk in the next AG.
+ *
+ * Therefore we can end early if the requested inode is beyond the end of the
+ * filesystem or doesn't map properly.
+ */
+static inline bool
+xfs_bulkstat_already_done(
+ struct xfs_mount *mp,
+ xfs_ino_t startino)
+{
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, startino);
+
+ return agno >= mp->m_sb.sb_agcount ||
+ startino != XFS_AGINO_TO_INO(mp, agno, agino);
+}
+
+/* Return stat information in bulk (by-inode) for the filesystem. */
+int
+xfs_bulkstat(
+ struct xfs_ibulk *breq,
+ bulkstat_one_fmt_pf formatter)
+{
+ struct xfs_bstat_chunk bc = {
+ .formatter = formatter,
+ .breq = breq,
+ };
+ struct xfs_trans *tp;
+ unsigned int iwalk_flags = 0;
+ int error;
+
+ if (breq->mnt_userns != &init_user_ns) {
+ xfs_warn_ratelimited(breq->mp,
+ "bulkstat not supported inside of idmapped mounts.");
+ return -EINVAL;
+ }
+ if (xfs_bulkstat_already_done(breq->mp, breq->startino))
+ return 0;
+
+ bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
+ KM_MAYFAIL);
+ if (!bc.buf)
+ return -ENOMEM;
+
+ /*
+ * Grab an empty transaction so that we can use its recursive buffer
+ * locking abilities to detect cycles in the inobt without deadlocking.
+ */
+ error = xfs_trans_alloc_empty(breq->mp, &tp);
+ if (error)
+ goto out;
+
+ if (breq->flags & XFS_IBULK_SAME_AG)
+ iwalk_flags |= XFS_IWALK_SAME_AG;
+
+ error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags,
+ xfs_bulkstat_iwalk, breq->icount, &bc);
+ xfs_trans_cancel(tp);
+out:
+ kmem_free(bc.buf);
+
+ /*
+ * We found some inodes, so clear the error status and return them.
+ * The lastino pointer will point directly at the inode that triggered
+ * any error that occurred, so on the next call the error will be
+ * triggered again and propagated to userspace as there will be no
+ * formatted inodes in the buffer.
+ */
+ if (breq->ocount > 0)
+ error = 0;
+
+ return error;
+}
+
+/* Convert bulkstat (v5) to bstat (v1). */
+void
+xfs_bulkstat_to_bstat(
+ struct xfs_mount *mp,
+ struct xfs_bstat *bs1,
+ const struct xfs_bulkstat *bstat)
+{
+ /* memset is needed here because of padding holes in the structure. */
+ memset(bs1, 0, sizeof(struct xfs_bstat));
+ bs1->bs_ino = bstat->bs_ino;
+ bs1->bs_mode = bstat->bs_mode;
+ bs1->bs_nlink = bstat->bs_nlink;
+ bs1->bs_uid = bstat->bs_uid;
+ bs1->bs_gid = bstat->bs_gid;
+ bs1->bs_rdev = bstat->bs_rdev;
+ bs1->bs_blksize = bstat->bs_blksize;
+ bs1->bs_size = bstat->bs_size;
+ bs1->bs_atime.tv_sec = bstat->bs_atime;
+ bs1->bs_mtime.tv_sec = bstat->bs_mtime;
+ bs1->bs_ctime.tv_sec = bstat->bs_ctime;
+ bs1->bs_atime.tv_nsec = bstat->bs_atime_nsec;
+ bs1->bs_mtime.tv_nsec = bstat->bs_mtime_nsec;
+ bs1->bs_ctime.tv_nsec = bstat->bs_ctime_nsec;
+ bs1->bs_blocks = bstat->bs_blocks;
+ bs1->bs_xflags = bstat->bs_xflags;
+ bs1->bs_extsize = XFS_FSB_TO_B(mp, bstat->bs_extsize_blks);
+ bs1->bs_extents = bstat->bs_extents;
+ bs1->bs_gen = bstat->bs_gen;
+ bs1->bs_projid_lo = bstat->bs_projectid & 0xFFFF;
+ bs1->bs_forkoff = bstat->bs_forkoff;
+ bs1->bs_projid_hi = bstat->bs_projectid >> 16;
+ bs1->bs_sick = bstat->bs_sick;
+ bs1->bs_checked = bstat->bs_checked;
+ bs1->bs_cowextsize = XFS_FSB_TO_B(mp, bstat->bs_cowextsize_blks);
+ bs1->bs_dmevmask = 0;
+ bs1->bs_dmstate = 0;
+ bs1->bs_aextents = bstat->bs_aextents;
+}
+
+struct xfs_inumbers_chunk {
+ inumbers_fmt_pf formatter;
+ struct xfs_ibulk *breq;
+};
+
+/*
+ * INUMBERS
+ * ========
+ * This is how we export inode btree records to userspace, so that XFS tools
+ * can figure out where inodes are allocated.
+ */
+
+/*
+ * Format the inode group structure and report it somewhere.
+ *
+ * Similar to xfs_bulkstat_one_int, lastino is the inode cursor as we walk
+ * through the filesystem so we move it forward unless there was a runtime
+ * error. If the formatter tells us the buffer is now full we also move the
+ * cursor forward and abort the walk.
+ */
+STATIC int
+xfs_inumbers_walk(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ const struct xfs_inobt_rec_incore *irec,
+ void *data)
+{
+ struct xfs_inumbers inogrp = {
+ .xi_startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino),
+ .xi_alloccount = irec->ir_count - irec->ir_freecount,
+ .xi_allocmask = ~irec->ir_free,
+ .xi_version = XFS_INUMBERS_VERSION_V5,
+ };
+ struct xfs_inumbers_chunk *ic = data;
+ int error;
+
+ error = ic->formatter(ic->breq, &inogrp);
+ if (error && error != -ECANCELED)
+ return error;
+
+ ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) +
+ XFS_INODES_PER_CHUNK;
+ return error;
+}
+
+/*
+ * Return inode number table for the filesystem.
+ */
+int
+xfs_inumbers(
+ struct xfs_ibulk *breq,
+ inumbers_fmt_pf formatter)
+{
+ struct xfs_inumbers_chunk ic = {
+ .formatter = formatter,
+ .breq = breq,
+ };
+ struct xfs_trans *tp;
+ int error = 0;
+
+ if (xfs_bulkstat_already_done(breq->mp, breq->startino))
+ return 0;
+
+ /*
+ * Grab an empty transaction so that we can use its recursive buffer
+ * locking abilities to detect cycles in the inobt without deadlocking.
+ */
+ error = xfs_trans_alloc_empty(breq->mp, &tp);
+ if (error)
+ goto out;
+
+ error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags,
+ xfs_inumbers_walk, breq->icount, &ic);
+ xfs_trans_cancel(tp);
+out:
+
+ /*
+ * We found some inode groups, so clear the error status and return
+ * them. The lastino pointer will point directly at the inode that
+ * triggered any error that occurred, so on the next call the error
+ * will be triggered again and propagated to userspace as there will be
+ * no formatted inode groups in the buffer.
+ */
+ if (breq->ocount > 0)
+ error = 0;
+
+ return error;
+}
+
+/* Convert an inumbers (v5) struct to an inogrp (v1) struct. */
+void
+xfs_inumbers_to_inogrp(
+ struct xfs_inogrp *ig1,
+ const struct xfs_inumbers *ig)
+{
+ /* memset is needed here because of padding holes in the structure. */
+ memset(ig1, 0, sizeof(struct xfs_inogrp));
+ ig1->xi_startino = ig->xi_startino;
+ ig1->xi_alloccount = ig->xi_alloccount;
+ ig1->xi_allocmask = ig->xi_allocmask;
+}
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
new file mode 100644
index 000000000..e2d0eba43
--- /dev/null
+++ b/fs/xfs/xfs_itable.h
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
+ */
+#ifndef __XFS_ITABLE_H__
+#define __XFS_ITABLE_H__
+
+/* In-memory representation of a userspace request for batch inode data. */
+struct xfs_ibulk {
+ struct xfs_mount *mp;
+ struct user_namespace *mnt_userns;
+ void __user *ubuffer; /* user output buffer */
+ xfs_ino_t startino; /* start with this inode */
+ unsigned int icount; /* number of elements in ubuffer */
+ unsigned int ocount; /* number of records returned */
+ unsigned int flags; /* see XFS_IBULK_FLAG_* */
+};
+
+/* Only iterate within the same AG as startino */
+#define XFS_IBULK_SAME_AG (1U << 0)
+
+/* Fill out the bs_extents64 field if set. */
+#define XFS_IBULK_NREXT64 (1U << 1)
+
+/*
+ * Advance the user buffer pointer by one record of the given size. If the
+ * buffer is now full, return the appropriate error code.
+ */
+static inline int
+xfs_ibulk_advance(
+ struct xfs_ibulk *breq,
+ size_t bytes)
+{
+ char __user *b = breq->ubuffer;
+
+ breq->ubuffer = b + bytes;
+ breq->ocount++;
+ return breq->ocount == breq->icount ? -ECANCELED : 0;
+}
+
+/*
+ * Return stat information in bulk (by-inode) for the filesystem.
+ */
+
+/*
+ * Return codes for the formatter function are 0 to continue iterating, and
+ * non-zero to stop iterating. Any non-zero value will be passed up to the
+ * bulkstat/inumbers caller. The special value -ECANCELED can be used to stop
+ * iteration, as neither bulkstat nor inumbers will ever generate that error
+ * code on their own.
+ */
+
+typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq,
+ const struct xfs_bulkstat *bstat);
+
+int xfs_bulkstat_one(struct xfs_ibulk *breq, bulkstat_one_fmt_pf formatter);
+int xfs_bulkstat(struct xfs_ibulk *breq, bulkstat_one_fmt_pf formatter);
+void xfs_bulkstat_to_bstat(struct xfs_mount *mp, struct xfs_bstat *bs1,
+ const struct xfs_bulkstat *bstat);
+
+typedef int (*inumbers_fmt_pf)(struct xfs_ibulk *breq,
+ const struct xfs_inumbers *igrp);
+
+int xfs_inumbers(struct xfs_ibulk *breq, inumbers_fmt_pf formatter);
+void xfs_inumbers_to_inogrp(struct xfs_inogrp *ig1,
+ const struct xfs_inumbers *ig);
+
+#endif /* __XFS_ITABLE_H__ */
diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c
new file mode 100644
index 000000000..43005ce8b
--- /dev/null
+++ b/fs/xfs/xfs_iunlink_item.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020-2022, Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_ag.h"
+#include "xfs_iunlink_item.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+
+struct kmem_cache *xfs_iunlink_cache;
+
+static inline struct xfs_iunlink_item *IUL_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_iunlink_item, item);
+}
+
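+/* Free the iunlink item and release the AG reference it holds. */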
+static void
+xfs_iunlink_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_iunlink_item *iup = IUL_ITEM(lip);
+
+ xfs_perag_put(iup->pag);
+ kmem_cache_free(xfs_iunlink_cache, IUL_ITEM(lip));
+}
+
+
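+/*
+ * Sort iunlink items by inode number so that precommit processing touches
+ * the inode cluster buffers in a consistent order.
+ */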
+static uint64_t
+xfs_iunlink_item_sort(
+ struct xfs_log_item *lip)
+{
+ return IUL_ITEM(lip)->ip->i_ino;
+}
+
+/*
+ * Look up the inode cluster buffer and log the on-disk unlinked inode change
+ * we need to make.
+ */
+static int
+xfs_iunlink_log_dinode(
+ struct xfs_trans *tp,
+ struct xfs_iunlink_item *iup)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *ip = iup->ip;
+ struct xfs_dinode *dip;
+ struct xfs_buf *ibp;
+ int offset;
+ int error;
+
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
+ if (error)
+ return error;
+ /*
+ * Don't log the unlinked field on stale buffers as this may be the
+ * transaction that frees the inode cluster and relogging the buffer
+ * here will incorrectly remove the stale state.
+ */
+ if (ibp->b_flags & XBF_STALE)
+ goto out;
+
+ dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
+
+ /* Make sure the old pointer isn't garbage. */
+ if (be32_to_cpu(dip->di_next_unlinked) != iup->old_agino) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+ sizeof(*dip), __this_address);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ trace_xfs_iunlink_update_dinode(mp, iup->pag->pag_agno,
+ XFS_INO_TO_AGINO(mp, ip->i_ino),
+ be32_to_cpu(dip->di_next_unlinked), iup->next_agino);
+
+ dip->di_next_unlinked = cpu_to_be32(iup->next_agino);
+ offset = ip->i_imap.im_boffset +
+ offsetof(struct xfs_dinode, di_next_unlinked);
+
+ xfs_dinode_calc_crc(mp, dip);
+ xfs_trans_inode_buf(tp, ibp);
+ xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+out:
+ xfs_trans_brelse(tp, ibp);
+ return error;
+}
+
+/*
+ * On precommit, we grab the inode cluster buffer for the inode number we were
+ * passed, then update the next unlinked field for that inode in the buffer and
+ * log the buffer. This ensures that the inode cluster buffer was logged in the
+ * correct order w.r.t. other inode cluster buffers. We can then remove the
+ * iunlink item from the transaction and release it as it is has now served it's
+ * purpose.
+ */
+static int
+xfs_iunlink_item_precommit(
+ struct xfs_trans *tp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_iunlink_item *iup = IUL_ITEM(lip);
+ int error;
+
+ error = xfs_iunlink_log_dinode(tp, iup);
+ list_del(&lip->li_trans);
+ xfs_iunlink_item_release(lip);
+ return error;
+}
+
+static const struct xfs_item_ops xfs_iunlink_item_ops = {
+ .iop_release = xfs_iunlink_item_release,
+ .iop_sort = xfs_iunlink_item_sort,
+ .iop_precommit = xfs_iunlink_item_precommit,
+};
+
+
+/*
+ * Create an iunlink log item for @ip, recording the change of its on-disk
+ * next-unlinked pointer to @next_agino.  The on-disk update itself is
+ * performed later from the precommit callback above.
+ *
+ * This joins the item to the transaction and marks it dirty so
+ * that we don't need a separate call to do this, nor does the
+ * caller need to know anything about the iunlink item.
+ */
+int
+xfs_iunlink_log_inode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ xfs_agino_t next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_iunlink_item *iup;
+
+ ASSERT(xfs_verify_agino_or_null(pag, next_agino));
+ ASSERT(xfs_verify_agino_or_null(pag, ip->i_next_unlinked));
+
+ /*
+ * Since we're updating a linked list, we should never find that the
+ * current pointer is the same as the new value, unless we're
+ * terminating the list.
+ */
+ if (ip->i_next_unlinked == next_agino) {
+ if (next_agino != NULLAGINO)
+ return -EFSCORRUPTED;
+ return 0;
+ }
+
+ iup = kmem_cache_zalloc(xfs_iunlink_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(mp, &iup->item, XFS_LI_IUNLINK,
+ &xfs_iunlink_item_ops);
+
+ iup->ip = ip;
+ iup->next_agino = next_agino;
+ iup->old_agino = ip->i_next_unlinked;
+
+ atomic_inc(&pag->pag_ref);
+ iup->pag = pag;
+
+ xfs_trans_add_item(tp, &iup->item);
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &iup->item.li_flags);
+ return 0;
+}
+
diff --git a/fs/xfs/xfs_iunlink_item.h b/fs/xfs/xfs_iunlink_item.h
new file mode 100644
index 000000000..c793cdcac
--- /dev/null
+++ b/fs/xfs/xfs_iunlink_item.h
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020-2022, Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef XFS_IUNLINK_ITEM_H
+#define XFS_IUNLINK_ITEM_H 1
+
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_perag;
+
+/* in memory log item structure */
+struct xfs_iunlink_item {
+ struct xfs_log_item item;
+ struct xfs_inode *ip;
+ struct xfs_perag *pag;
+ xfs_agino_t next_agino;
+ xfs_agino_t old_agino;
+};
+
+extern struct kmem_cache *xfs_iunlink_cache;
+
+int xfs_iunlink_log_inode(struct xfs_trans *tp, struct xfs_inode *ip,
+ struct xfs_perag *pag, xfs_agino_t next_agino);
+
+#endif /* XFS_IUNLINK_ITEM_H */
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
new file mode 100644
index 000000000..7558486f4
--- /dev/null
+++ b/fs/xfs/xfs_iwalk.c
@@ -0,0 +1,782 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_iwalk.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_health.h"
+#include "xfs_trans.h"
+#include "xfs_pwork.h"
+#include "xfs_ag.h"
+
+/*
+ * Walking Inodes in the Filesystem
+ * ================================
+ *
+ * This iterator function walks a subset of filesystem inodes in increasing
+ * order from @startino until there are no more inodes. For each allocated
+ * inode it finds, it calls a walk function with the relevant inode number and
+ * a pointer to caller-provided data. The walk function can return the usual
+ * negative error code to stop the iteration; 0 to continue the iteration; or
+ * -ECANCELED to stop the iteration. This return value is returned to the
+ * caller.
+ *
+ * Internally, we allow the walk function to do anything, which means that we
+ * cannot maintain the inobt cursor or our lock on the AGI buffer. We
+ * therefore cache the inobt records in kernel memory and only call the walk
+ * function when our memory buffer is full. @nr_recs is the number of records
+ * that we've cached, and @sz_recs is the size of our cache.
+ *
+ * It is the responsibility of the walk function to ensure it accesses
+ * allocated inodes, as the inobt records may be stale by the time they are
+ * acted upon.
+ */
+
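+/*
+ * Illustrative sketch, not part of this change: a trivial walk function
+ * that counts allocated inodes, and a caller driving it with an empty
+ * transaction in the same way the bulkstat code does.  The names
+ * xfs_example_count_fn and xfs_example_count_inodes are hypothetical.
+ */
+static int __maybe_unused
+xfs_example_count_fn(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_ino_t		ino,
+	void			*data)
+{
+	(*(uint64_t *)data)++;		/* one more allocated inode seen */
+	return 0;			/* keep walking */
+}
+
+static int __maybe_unused
+xfs_example_count_inodes(
+	struct xfs_mount	*mp,
+	uint64_t		*count)
+{
+	struct xfs_trans	*tp;
+	int			error;
+
+	*count = 0;
+	/* Empty transaction for recursive buffer locking, as bulkstat does. */
+	error = xfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		return error;
+	error = xfs_iwalk(mp, tp, 0, 0, xfs_example_count_fn, 0, count);
+	xfs_trans_cancel(tp);
+	return error;
+}
+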
+struct xfs_iwalk_ag {
+ /* parallel work control data; will be null if single threaded */
+ struct xfs_pwork pwork;
+
+ struct xfs_mount *mp;
+ struct xfs_trans *tp;
+ struct xfs_perag *pag;
+
+ /* Where do we start the traversal? */
+ xfs_ino_t startino;
+
+ /* What was the last inode number we saw when iterating the inobt? */
+ xfs_ino_t lastino;
+
+ /* Array of inobt records we cache. */
+ struct xfs_inobt_rec_incore *recs;
+
+ /* Number of entries allocated for the @recs array. */
+ unsigned int sz_recs;
+
+ /* Number of entries in the @recs array that are in use. */
+ unsigned int nr_recs;
+
+ /* Inode walk function and data pointer. */
+ xfs_iwalk_fn iwalk_fn;
+ xfs_inobt_walk_fn inobt_walk_fn;
+ void *data;
+
+ /*
+ * Make it look like the inodes up to startino are free so that
+ * bulkstat can start its inode iteration at the correct place without
+ * needing to special case everywhere.
+ */
+ unsigned int trim_start:1;
+
+ /* Skip empty inobt records? */
+ unsigned int skip_empty:1;
+
+ /* Drop the (hopefully empty) transaction when calling iwalk_fn. */
+ unsigned int drop_trans:1;
+};
+
+/*
+ * Loop over all clusters in a chunk for a given incore inode allocation btree
+ * record. Do a readahead if there are any allocated inodes in that cluster.
+ */
+STATIC void
+xfs_iwalk_ichunk_ra(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ struct xfs_inobt_rec_incore *irec)
+{
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agblock_t agbno;
+ struct blk_plug plug;
+ int i; /* inode chunk index */
+
+ agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
+
+ blk_start_plug(&plug);
+ for (i = 0; i < XFS_INODES_PER_CHUNK; i += igeo->inodes_per_cluster) {
+ xfs_inofree_t imask;
+
+ imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
+ if (imask & ~irec->ir_free) {
+ xfs_btree_reada_bufs(mp, pag->pag_agno, agbno,
+ igeo->blocks_per_cluster,
+ &xfs_inode_buf_ops);
+ }
+ agbno += igeo->blocks_per_cluster;
+ }
+ blk_finish_plug(&plug);
+}
+
+/*
+ * Set the bits in @irec's free mask that correspond to the inodes before
+ * @agino so that we skip them. This is how we restart an inode walk that was
+ * interrupted in the middle of an inode record.
+ */
+STATIC void
+xfs_iwalk_adjust_start(
+ xfs_agino_t agino, /* starting inode of chunk */
+ struct xfs_inobt_rec_incore *irec) /* btree record */
+{
+ int idx; /* index into inode chunk */
+ int i;
+
+ idx = agino - irec->ir_startino;
+
+ /*
+	 * The walk is starting in the middle of this inode chunk, so mark
+	 * all the inodes before our start point free because they are not
+	 * interesting to this walk.
+ */
+ for (i = 0; i < idx; i++) {
+ if (XFS_INOBT_MASK(i) & ~irec->ir_free)
+ irec->ir_freecount++;
+ }
+
+ irec->ir_free |= xfs_inobt_maskn(0, idx);
+}
+
+/* Allocate memory for a walk. */
+STATIC int
+xfs_iwalk_alloc(
+ struct xfs_iwalk_ag *iwag)
+{
+ size_t size;
+
+ ASSERT(iwag->recs == NULL);
+ iwag->nr_recs = 0;
+
+ /* Allocate a prefetch buffer for inobt records. */
+ size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
+ iwag->recs = kmem_alloc(size, KM_MAYFAIL);
+ if (iwag->recs == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Free memory we allocated for a walk. */
+STATIC void
+xfs_iwalk_free(
+ struct xfs_iwalk_ag *iwag)
+{
+ kmem_free(iwag->recs);
+ iwag->recs = NULL;
+}
+
+/* For each inuse inode in each cached inobt record, call our function. */
+STATIC int
+xfs_iwalk_ag_recs(
+ struct xfs_iwalk_ag *iwag)
+{
+ struct xfs_mount *mp = iwag->mp;
+ struct xfs_trans *tp = iwag->tp;
+ struct xfs_perag *pag = iwag->pag;
+ xfs_ino_t ino;
+ unsigned int i, j;
+ int error;
+
+ for (i = 0; i < iwag->nr_recs; i++) {
+ struct xfs_inobt_rec_incore *irec = &iwag->recs[i];
+
+ trace_xfs_iwalk_ag_rec(mp, pag->pag_agno, irec);
+
+ if (xfs_pwork_want_abort(&iwag->pwork))
+ return 0;
+
+ if (iwag->inobt_walk_fn) {
+ error = iwag->inobt_walk_fn(mp, tp, pag->pag_agno, irec,
+ iwag->data);
+ if (error)
+ return error;
+ }
+
+ if (!iwag->iwalk_fn)
+ continue;
+
+ for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
+ if (xfs_pwork_want_abort(&iwag->pwork))
+ return 0;
+
+ /* Skip if this inode is free */
+ if (XFS_INOBT_MASK(j) & irec->ir_free)
+ continue;
+
+ /* Otherwise call our function. */
+ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
+ irec->ir_startino + j);
+ error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
+ if (error)
+ return error;
+ }
+ }
+
+ return 0;
+}
+
+/* Delete cursor and let go of AGI. */
+static inline void
+xfs_iwalk_del_inobt(
+ struct xfs_trans *tp,
+ struct xfs_btree_cur **curpp,
+ struct xfs_buf **agi_bpp,
+ int error)
+{
+ if (*curpp) {
+ xfs_btree_del_cursor(*curpp, error);
+ *curpp = NULL;
+ }
+ if (*agi_bpp) {
+ xfs_trans_brelse(tp, *agi_bpp);
+ *agi_bpp = NULL;
+ }
+}
+
+/*
+ * Set ourselves up for walking inobt records starting from a given point in
+ * the filesystem.
+ *
+ * If caller passed in a nonzero start inode number, load the record from the
+ * inobt and make the record look like all the inodes before agino are free so
+ * that we skip them, and then move the cursor to the next inobt record. This
+ * is how we support starting an iwalk in the middle of an inode chunk.
+ *
+ * If the caller passed in a start number of zero, move the cursor to the first
+ * inobt record.
+ *
+ * The caller is responsible for cleaning up the cursor and buffer pointer
+ * regardless of the error status.
+ */
+STATIC int
+xfs_iwalk_ag_start(
+ struct xfs_iwalk_ag *iwag,
+ xfs_agino_t agino,
+ struct xfs_btree_cur **curpp,
+ struct xfs_buf **agi_bpp,
+ int *has_more)
+{
+ struct xfs_mount *mp = iwag->mp;
+ struct xfs_trans *tp = iwag->tp;
+ struct xfs_perag *pag = iwag->pag;
+ struct xfs_inobt_rec_incore *irec;
+ int error;
+
+ /* Set up a fresh cursor and empty the inobt cache. */
+ iwag->nr_recs = 0;
+ error = xfs_inobt_cur(mp, tp, pag, XFS_BTNUM_INO, curpp, agi_bpp);
+ if (error)
+ return error;
+
+ /* Starting at the beginning of the AG? That's easy! */
+ if (agino == 0)
+ return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more);
+
+ /*
+ * Otherwise, we have to grab the inobt record where we left off, stuff
+ * the record into our cache, and then see if there are more records.
+ * We require a lookup cache of at least two elements so that the
+ * caller doesn't have to deal with tearing down the cursor to walk the
+ * records.
+ */
+ error = xfs_inobt_lookup(*curpp, agino, XFS_LOOKUP_LE, has_more);
+ if (error)
+ return error;
+
+ /*
+ * If the LE lookup at @agino yields no records, jump ahead to the
+ * inobt cursor increment to see if there are more records to process.
+ */
+ if (!*has_more)
+ goto out_advance;
+
+ /* Get the record, should always work */
+ irec = &iwag->recs[iwag->nr_recs];
+ error = xfs_inobt_get_rec(*curpp, irec, has_more);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, *has_more != 1))
+ return -EFSCORRUPTED;
+
+ iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
+ irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
+
+ /*
+ * If the LE lookup yielded an inobt record before the cursor position,
+ * skip it and see if there's another one after it.
+ */
+ if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
+ goto out_advance;
+
+ /*
+ * If agino fell in the middle of the inode record, make it look like
+ * the inodes up to agino are free so that we don't return them again.
+ */
+ if (iwag->trim_start)
+ xfs_iwalk_adjust_start(agino, irec);
+
+ /*
+ * The prefetch calculation is supposed to give us a large enough inobt
+ * record cache that grab_ichunk can stage a partial first record and
+ * the loop body can cache a record without having to check for cache
+ * space until after it reads an inobt record.
+ */
+ iwag->nr_recs++;
+ ASSERT(iwag->nr_recs < iwag->sz_recs);
+
+out_advance:
+ return xfs_btree_increment(*curpp, 0, has_more);
+}
+
+/*
+ * The inobt record cache is full, so preserve the inobt cursor state and
+ * run callbacks on the cached inobt records. When we're done, restore the
+ * cursor state to wherever the cursor would have been had the cache not been
+ * full (and therefore we could've just incremented the cursor) if *@has_more
+ * is true. On exit, *@has_more will indicate whether or not the caller should
+ * try for more inode records.
+ */
+STATIC int
+xfs_iwalk_run_callbacks(
+ struct xfs_iwalk_ag *iwag,
+ struct xfs_btree_cur **curpp,
+ struct xfs_buf **agi_bpp,
+ int *has_more)
+{
+ struct xfs_mount *mp = iwag->mp;
+ struct xfs_inobt_rec_incore *irec;
+ xfs_agino_t next_agino;
+ int error;
+
+ next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1;
+
+ ASSERT(iwag->nr_recs > 0);
+
+ /* Delete cursor but remember the last record we cached... */
+ xfs_iwalk_del_inobt(iwag->tp, curpp, agi_bpp, 0);
+ irec = &iwag->recs[iwag->nr_recs - 1];
+ ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK);
+
+ if (iwag->drop_trans) {
+ xfs_trans_cancel(iwag->tp);
+ iwag->tp = NULL;
+ }
+
+ error = xfs_iwalk_ag_recs(iwag);
+ if (error)
+ return error;
+
+ /* ...empty the cache... */
+ iwag->nr_recs = 0;
+
+	if (!*has_more)
+ return 0;
+
+ if (iwag->drop_trans) {
+ error = xfs_trans_alloc_empty(mp, &iwag->tp);
+ if (error)
+ return error;
+ }
+
+ /* ...and recreate the cursor just past where we left off. */
+ error = xfs_inobt_cur(mp, iwag->tp, iwag->pag, XFS_BTNUM_INO, curpp,
+ agi_bpp);
+ if (error)
+ return error;
+
+ return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
+}
+
+/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
+STATIC int
+xfs_iwalk_ag(
+ struct xfs_iwalk_ag *iwag)
+{
+ struct xfs_mount *mp = iwag->mp;
+ struct xfs_perag *pag = iwag->pag;
+ struct xfs_buf *agi_bp = NULL;
+ struct xfs_btree_cur *cur = NULL;
+ xfs_agino_t agino;
+ int has_more;
+ int error = 0;
+
+ /* Set up our cursor at the right place in the inode btree. */
+ ASSERT(pag->pag_agno == XFS_INO_TO_AGNO(mp, iwag->startino));
+ agino = XFS_INO_TO_AGINO(mp, iwag->startino);
+ error = xfs_iwalk_ag_start(iwag, agino, &cur, &agi_bp, &has_more);
+
+ while (!error && has_more) {
+ struct xfs_inobt_rec_incore *irec;
+ xfs_ino_t rec_fsino;
+
+ cond_resched();
+ if (xfs_pwork_want_abort(&iwag->pwork))
+ goto out;
+
+ /* Fetch the inobt record. */
+ irec = &iwag->recs[iwag->nr_recs];
+ error = xfs_inobt_get_rec(cur, irec, &has_more);
+ if (error || !has_more)
+ break;
+
+ /* Make sure that we always move forward. */
+ rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino);
+ if (iwag->lastino != NULLFSINO &&
+ XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1;
+
+ /* No allocated inodes in this chunk; skip it. */
+ if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
+ error = xfs_btree_increment(cur, 0, &has_more);
+ if (error)
+ break;
+ continue;
+ }
+
+ /*
+ * Start readahead for this inode chunk in anticipation of
+ * walking the inodes.
+ */
+ if (iwag->iwalk_fn)
+ xfs_iwalk_ichunk_ra(mp, pag, irec);
+
+ /*
+ * If there's space in the buffer for more records, increment
+ * the btree cursor and grab more.
+ */
+ if (++iwag->nr_recs < iwag->sz_recs) {
+ error = xfs_btree_increment(cur, 0, &has_more);
+ if (error || !has_more)
+ break;
+ continue;
+ }
+
+ /*
+ * Otherwise, we need to save cursor state and run the callback
+ * function on the cached records. The run_callbacks function
+ * is supposed to return a cursor pointing to the record where
+ * we would be if we had been able to increment like above.
+ */
+ ASSERT(has_more);
+ error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);
+ }
+
+ if (iwag->nr_recs == 0 || error)
+ goto out;
+
+ /* Walk the unprocessed records in the cache. */
+ error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);
+
+out:
+ xfs_iwalk_del_inobt(iwag->tp, &cur, &agi_bp, error);
+ return error;
+}
+
+/*
+ * We experimentally determined that the reduction in ioctl call overhead
+ * diminishes when userspace asks for more than 2048 inodes, so we'll cap
+ * prefetch at this point.
+ */
+#define IWALK_MAX_INODE_PREFETCH (2048U)
+
+/*
+ * Given the number of inodes to prefetch, set the number of inobt records that
+ * we cache in memory, which controls the number of inodes we try to read
+ * ahead. Set the maximum if @inodes == 0.
+ */
+static inline unsigned int
+xfs_iwalk_prefetch(
+ unsigned int inodes)
+{
+ unsigned int inobt_records;
+
+ /*
+ * If the caller didn't tell us the number of inodes they wanted,
+ * assume the maximum prefetch possible for best performance.
+ * Otherwise, cap prefetch at that maximum so that we don't start an
+ * absurd amount of prefetch.
+ */
+ if (inodes == 0)
+ inodes = IWALK_MAX_INODE_PREFETCH;
+ inodes = min(inodes, IWALK_MAX_INODE_PREFETCH);
+
+ /* Round the inode count up to a full chunk. */
+ inodes = round_up(inodes, XFS_INODES_PER_CHUNK);
+
+ /*
+ * In order to convert the number of inodes to prefetch into an
+ * estimate of the number of inobt records to cache, we require a
+ * conversion factor that reflects our expectations of the average
+ * loading factor of an inode chunk. Based on data gathered, most
+ * (but not all) filesystems manage to keep the inode chunks totally
+ * full, so we'll underestimate slightly so that our readahead will
+ * still deliver the performance we want on aging filesystems:
+ *
+ * inobt = inodes / (INODES_PER_CHUNK * (4 / 5));
+ *
+ * The funny math is to avoid integer division.
+ */
+ inobt_records = (inodes * 5) / (4 * XFS_INODES_PER_CHUNK);
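+
+	/*
+	 * Worked example (illustrative numbers, not part of this change):
+	 * with XFS_INODES_PER_CHUNK == 64 and a 2048-inode request, the
+	 * expression above yields (2048 * 5) / (4 * 64) == 40 records.
+	 */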
+
+ /*
+ * Allocate enough space to prefetch at least two inobt records so that
+ * we can cache both the record where the iwalk started and the next
+ * record. This simplifies the AG inode walk loop setup code.
+ */
+ return max(inobt_records, 2U);
+}
+
+/*
+ * Walk all inodes in the filesystem starting from @startino. The @iwalk_fn
+ * will be called for each allocated inode, being passed the inode's number and
+ * @data. @max_prefetch controls how many inobt records' worth of inodes we
+ * try to readahead.
+ */
+int
+xfs_iwalk(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t startino,
+ unsigned int flags,
+ xfs_iwalk_fn iwalk_fn,
+ unsigned int inode_records,
+ void *data)
+{
+ struct xfs_iwalk_ag iwag = {
+ .mp = mp,
+ .tp = tp,
+ .iwalk_fn = iwalk_fn,
+ .data = data,
+ .startino = startino,
+ .sz_recs = xfs_iwalk_prefetch(inode_records),
+ .trim_start = 1,
+ .skip_empty = 1,
+ .pwork = XFS_PWORK_SINGLE_THREADED,
+ .lastino = NULLFSINO,
+ };
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
+ int error;
+
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
+
+ error = xfs_iwalk_alloc(&iwag);
+ if (error)
+ return error;
+
+ for_each_perag_from(mp, agno, pag) {
+ iwag.pag = pag;
+ error = xfs_iwalk_ag(&iwag);
+ if (error)
+ break;
+ iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+ if (flags & XFS_INOBT_WALK_SAME_AG)
+ break;
+ iwag.pag = NULL;
+ }
+
+ if (iwag.pag)
+ xfs_perag_put(pag);
+ xfs_iwalk_free(&iwag);
+ return error;
+}
+
+/* Run per-thread iwalk work. */
+static int
+xfs_iwalk_ag_work(
+ struct xfs_mount *mp,
+ struct xfs_pwork *pwork)
+{
+ struct xfs_iwalk_ag *iwag;
+ int error = 0;
+
+ iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
+ if (xfs_pwork_want_abort(pwork))
+ goto out;
+
+ error = xfs_iwalk_alloc(iwag);
+ if (error)
+ goto out;
+ /*
+ * Grab an empty transaction so that we can use its recursive buffer
+ * locking abilities to detect cycles in the inobt without deadlocking.
+ */
+ error = xfs_trans_alloc_empty(mp, &iwag->tp);
+ if (error)
+ goto out;
+ iwag->drop_trans = 1;
+
+ error = xfs_iwalk_ag(iwag);
+ if (iwag->tp)
+ xfs_trans_cancel(iwag->tp);
+ xfs_iwalk_free(iwag);
+out:
+ xfs_perag_put(iwag->pag);
+ kmem_free(iwag);
+ return error;
+}
+
+/*
+ * Walk all the inodes in the filesystem using multiple threads to process each
+ * AG.
+ */
+int
+xfs_iwalk_threaded(
+ struct xfs_mount *mp,
+ xfs_ino_t startino,
+ unsigned int flags,
+ xfs_iwalk_fn iwalk_fn,
+ unsigned int inode_records,
+ bool polled,
+ void *data)
+{
+ struct xfs_pwork_ctl pctl;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
+ int error;
+
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
+
+ error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk");
+ if (error)
+ return error;
+
+ for_each_perag_from(mp, agno, pag) {
+ struct xfs_iwalk_ag *iwag;
+
+ if (xfs_pwork_ctl_want_abort(&pctl))
+ break;
+
+ iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
+ iwag->mp = mp;
+
+ /*
+ * perag is being handed off to async work, so take another
+ * reference for the async work to release.
+ */
+ atomic_inc(&pag->pag_ref);
+ iwag->pag = pag;
+ iwag->iwalk_fn = iwalk_fn;
+ iwag->data = data;
+ iwag->startino = startino;
+ iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
+ iwag->lastino = NULLFSINO;
+ xfs_pwork_queue(&pctl, &iwag->pwork);
+ startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
+ if (flags & XFS_INOBT_WALK_SAME_AG)
+ break;
+ }
+ if (pag)
+ xfs_perag_put(pag);
+ if (polled)
+ xfs_pwork_poll(&pctl);
+ return xfs_pwork_destroy(&pctl);
+}
+
+/*
+ * Allow callers to cache up to a page's worth of inobt records. This reflects
+ * the existing inumbers prefetching behavior. Since the inobt walk does not
+ * itself do anything with the inobt records, we can set a fairly high limit
+ * here.
+ */
+#define MAX_INOBT_WALK_PREFETCH \
+ (PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore))
+
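+/*
+ * Worked example (illustrative numbers, not part of this change): with 4k
+ * pages and a 16-byte incore inobt record this comes to 256 cached records,
+ * or 256 * 64 == 16384 inodes' worth of inobt data per batch.
+ */
+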
+/*
+ * Given the number of records that the user wanted, set the number of inobt
+ * records that we buffer in memory. Set the maximum if @inobt_records == 0.
+ */
+static inline unsigned int
+xfs_inobt_walk_prefetch(
+ unsigned int inobt_records)
+{
+ /*
+ * If the caller didn't tell us the number of inobt records they
+ * wanted, assume the maximum prefetch possible for best performance.
+ */
+ if (inobt_records == 0)
+ inobt_records = MAX_INOBT_WALK_PREFETCH;
+
+ /*
+ * Allocate enough space to prefetch at least two inobt records so that
+ * we can cache both the record where the iwalk started and the next
+ * record. This simplifies the AG inode walk loop setup code.
+ */
+ inobt_records = max(inobt_records, 2U);
+
+ /*
+ * Cap prefetch at that maximum so that we don't use an absurd amount
+ * of memory.
+ */
+ return min_t(unsigned int, inobt_records, MAX_INOBT_WALK_PREFETCH);
+}
+
+/*
+ * Walk all inode btree records in the filesystem starting from @startino. The
+ * @inobt_walk_fn will be called for each btree record, being passed the incore
+ * record and @data. @max_prefetch controls how many inobt records we try to
+ * cache ahead of time.
+ */
+int
+xfs_inobt_walk(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t startino,
+ unsigned int flags,
+ xfs_inobt_walk_fn inobt_walk_fn,
+ unsigned int inobt_records,
+ void *data)
+{
+ struct xfs_iwalk_ag iwag = {
+ .mp = mp,
+ .tp = tp,
+ .inobt_walk_fn = inobt_walk_fn,
+ .data = data,
+ .startino = startino,
+ .sz_recs = xfs_inobt_walk_prefetch(inobt_records),
+ .pwork = XFS_PWORK_SINGLE_THREADED,
+ .lastino = NULLFSINO,
+ };
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
+ int error;
+
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL));
+
+ error = xfs_iwalk_alloc(&iwag);
+ if (error)
+ return error;
+
+ for_each_perag_from(mp, agno, pag) {
+ iwag.pag = pag;
+ error = xfs_iwalk_ag(&iwag);
+ if (error)
+ break;
+ iwag.startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
+ if (flags & XFS_INOBT_WALK_SAME_AG)
+ break;
+ iwag.pag = NULL;
+ }
+
+ if (iwag.pag)
+ xfs_perag_put(pag);
+ xfs_iwalk_free(&iwag);
+ return error;
+}
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
new file mode 100644
index 000000000..836990897
--- /dev/null
+++ b/fs/xfs/xfs_iwalk.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_IWALK_H__
+#define __XFS_IWALK_H__
+
+/*
+ * Return codes for the inode/inobt walk function are 0 to continue iterating,
+ * and non-zero to stop iterating. Any non-zero value will be passed up to the
+ * iwalk or inobt_walk caller. The special value -ECANCELED can be used to
+ * stop iteration, as neither iwalk nor inobt_walk will ever generate that
+ * error code on their own.
+ */
+
+/* Walk all inodes in the filesystem starting from @startino. */
+typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_ino_t ino, void *data);
+
+int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
+ unsigned int flags, xfs_iwalk_fn iwalk_fn,
+ unsigned int inode_records, void *data);
+int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
+ unsigned int flags, xfs_iwalk_fn iwalk_fn,
+ unsigned int inode_records, bool poll, void *data);
+
+/* Only iterate inodes within the same AG as @startino. */
+#define XFS_IWALK_SAME_AG (1U << 0)
+
+#define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG)
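+
+/*
+ * Minimal usage sketch for xfs_iwalk() with a hypothetical callback; the
+ * callback name and the caller below are illustrative only, not an in-tree
+ * consumer:
+ *
+ *	static int
+ *	count_one_inode(
+ *		struct xfs_mount	*mp,
+ *		struct xfs_trans	*tp,
+ *		xfs_ino_t		ino,
+ *		void			*data)
+ *	{
+ *		uint64_t		*count = data;
+ *
+ *		(*count)++;
+ *		return 0;		(0 == keep walking)
+ *	}
+ *
+ *	uint64_t	count = 0;
+ *	int		error = xfs_iwalk(mp, NULL, 0, 0, count_one_inode,
+ *					  0, &count);
+ */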
+
+/* Walk all inode btree records in the filesystem starting from @startino. */
+typedef int (*xfs_inobt_walk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ const struct xfs_inobt_rec_incore *irec,
+ void *data);
+
+int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_ino_t startino, unsigned int flags,
+ xfs_inobt_walk_fn inobt_walk_fn, unsigned int inobt_records,
+ void *data);
+
+/* Only iterate inobt records within the same AG as @startino. */
+#define XFS_INOBT_WALK_SAME_AG (XFS_IWALK_SAME_AG)
+
+#define XFS_INOBT_WALK_FLAGS_ALL (XFS_INOBT_WALK_SAME_AG)
+
+#endif /* __XFS_IWALK_H__ */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
new file mode 100644
index 000000000..f9878021e
--- /dev/null
+++ b/fs/xfs/xfs_linux.h
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_LINUX__
+#define __XFS_LINUX__
+
+#include <linux/types.h>
+#include <linux/uuid.h>
+
+/*
+ * Kernel specific type declarations for XFS
+ */
+
+typedef __s64 xfs_off_t; /* <file offset> type */
+typedef unsigned long long xfs_ino_t; /* <inode> type */
+typedef __s64 xfs_daddr_t; /* <disk address> type */
+typedef __u32 xfs_dev_t;
+typedef __u32 xfs_nlink_t;
+
+#include "xfs_types.h"
+
+#include "kmem.h"
+#include "mrlock.h"
+
+#include <linux/semaphore.h>
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/crc32c.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include <linux/swap.h>
+#include <linux/errno.h>
+#include <linux/sched/signal.h>
+#include <linux/bitops.h>
+#include <linux/major.h>
+#include <linux/pagemap.h>
+#include <linux/vfs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/proc_fs.h>
+#include <linux/sort.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/delay.h>
+#include <linux/log2.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/ctype.h>
+#include <linux/writeback.h>
+#include <linux/capability.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/list_sort.h>
+#include <linux/ratelimit.h>
+#include <linux/rhashtable.h>
+#include <linux/xattr.h>
+#include <linux/mnt_idmapping.h>
+
+#include <asm/page.h>
+#include <asm/div64.h>
+#include <asm/param.h>
+#include <linux/uaccess.h>
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+
+#include "xfs_fs.h"
+#include "xfs_stats.h"
+#include "xfs_sysctl.h"
+#include "xfs_iops.h"
+#include "xfs_aops.h"
+#include "xfs_super.h"
+#include "xfs_cksum.h"
+#include "xfs_buf.h"
+#include "xfs_message.h"
+
+#ifdef __BIG_ENDIAN
+#define XFS_NATIVE_HOST 1
+#else
+#undef XFS_NATIVE_HOST
+#endif
+
+#define irix_sgid_inherit xfs_params.sgid_inherit.val
+#define irix_symlink_mode xfs_params.symlink_mode.val
+#define xfs_panic_mask xfs_params.panic_mask.val
+#define xfs_error_level xfs_params.error_level.val
+#define xfs_syncd_centisecs xfs_params.syncd_timer.val
+#define xfs_stats_clear xfs_params.stats_clear.val
+#define xfs_inherit_sync xfs_params.inherit_sync.val
+#define xfs_inherit_nodump xfs_params.inherit_nodump.val
+#define xfs_inherit_noatime xfs_params.inherit_noatim.val
+#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val
+#define xfs_rotorstep xfs_params.rotorstep.val
+#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
+#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val
+#define xfs_blockgc_secs xfs_params.blockgc_timer.val
+
+#define current_cpu() (raw_smp_processor_id())
+#define current_set_flags_nested(sp, f) \
+ (*(sp) = current->flags, current->flags |= (f))
+#define current_restore_flags_nested(sp, f) \
+ (current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
+
+#define NBBY 8 /* number of bits per byte */
+
+/*
+ * Size of block device i/o is parameterized here.
+ * Currently the system supports page-sized i/o.
+ */
+#define BLKDEV_IOSHIFT PAGE_SHIFT
+#define BLKDEV_IOSIZE (1<<BLKDEV_IOSHIFT)
+/* number of BB's per block device block */
+#define BLKDEV_BB BTOBB(BLKDEV_IOSIZE)
+
+#define ENOATTR ENODATA /* Attribute not found */
+#define EWRONGFS EINVAL /* Mount with wrong filesystem type */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
+
+#define __return_address __builtin_return_address(0)
+
+/*
+ * Return the address of a label. Use barrier() so that the optimizer
+ * won't reorder code to refactor the error jumpouts into a single
+ * return, which throws off the reported address.
+ */
+#define __this_address ({ __label__ __here; __here: barrier(); &&__here; })
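+
+/*
+ * Sketch of how a verifier might use __this_address to capture the point of
+ * failure for later reporting (an illustrative fragment, not taken from any
+ * particular verifier):
+ *
+ *	xfs_failaddr_t		fa = NULL;
+ *
+ *	if (!uuid_equal(&sb->sb_uuid, &mp->m_sb.sb_uuid))
+ *		fa = __this_address;
+ *	...
+ *	return fa;
+ *
+ * The caller then reports the failing address, if any.
+ */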
+
+#define XFS_PROJID_DEFAULT 0
+
+#define howmany(x, y) (((x)+((y)-1))/(y))
+
+static inline void delay(long ticks)
+{
+ schedule_timeout_uninterruptible(ticks);
+}
+
+/*
+ * XFS wrapper structure for sysfs support. It depends on external data
+ * structures and is embedded in various internal data structures to implement
+ * the XFS sysfs object hierarchy. Define it here for broad access throughout
+ * the codebase.
+ */
+struct xfs_kobj {
+ struct kobject kobject;
+ struct completion complete;
+};
+
+struct xstats {
+ struct xfsstats __percpu *xs_stats;
+ struct xfs_kobj xs_kobj;
+};
+
+extern struct xstats xfsstats;
+
+static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev)
+{
+ return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev));
+}
+
+static inline xfs_dev_t linux_to_xfs_dev_t(dev_t dev)
+{
+ return sysv_encode_dev(dev);
+}
+
+/*
+ * Various platform dependent calls that don't fit anywhere else
+ */
+#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL)
+#define xfs_stack_trace() dump_stack()
+
+static inline uint64_t rounddown_64(uint64_t x, uint32_t y)
+{
+ do_div(x, y);
+ return x * y;
+}
+
+static inline uint64_t roundup_64(uint64_t x, uint32_t y)
+{
+ x += y - 1;
+ do_div(x, y);
+ return x * y;
+}
+
+static inline uint64_t howmany_64(uint64_t x, uint32_t y)
+{
+ x += y - 1;
+ do_div(x, y);
+ return x;
+}
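+
+/*
+ * Worked example for the 64-bit rounding helpers above (y == 64):
+ *
+ *	rounddown_64(1000, 64)	== 960
+ *	roundup_64(1000, 64)	== 1024
+ *	howmany_64(1000, 64)	== 16
+ */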
+
+int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
+ char *data, enum req_op op);
+
+#define ASSERT_ALWAYS(expr) \
+ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
+
+#ifdef DEBUG
+#define ASSERT(expr) \
+ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
+
+#else /* !DEBUG */
+
+#ifdef XFS_WARN
+
+#define ASSERT(expr) \
+ (likely(expr) ? (void)0 : asswarn(NULL, #expr, __FILE__, __LINE__))
+
+#else /* !DEBUG && !XFS_WARN */
+
+#define ASSERT(expr) ((void)0)
+
+#endif /* XFS_WARN */
+#endif /* DEBUG */
+
+#define XFS_IS_CORRUPT(mp, expr) \
+ (unlikely(expr) ? xfs_corruption_error(#expr, XFS_ERRLEVEL_LOW, (mp), \
+ NULL, 0, __FILE__, __LINE__, \
+ __this_address), \
+ true : false)
+
+#define STATIC static noinline
+
+#ifdef CONFIG_XFS_RT
+
+/*
+ * make sure we ignore the inode flag if the filesystem doesn't have a
+ * configured realtime device.
+ */
+#define XFS_IS_REALTIME_INODE(ip) \
+ (((ip)->i_diflags & XFS_DIFLAG_REALTIME) && \
+ (ip)->i_mount->m_rtdev_targp)
+#define XFS_IS_REALTIME_MOUNT(mp) ((mp)->m_rtdev_targp ? 1 : 0)
+#else
+#define XFS_IS_REALTIME_INODE(ip) (0)
+#define XFS_IS_REALTIME_MOUNT(mp) (0)
+#endif
+
+/*
+ * Starting in Linux 4.15, the %p (raw pointer value) printk modifier
+ * prints a hashed version of the pointer to avoid leaking kernel
+ * pointers into dmesg. If we're trying to debug the kernel we want the
+ * raw values, so override this behavior as best we can.
+ */
+#ifdef DEBUG
+# define PTR_FMT "%px"
+#else
+# define PTR_FMT "%p"
+#endif
+
+#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
new file mode 100644
index 000000000..f02a0dd52
--- /dev/null
+++ b/fs/xfs/xfs_log.c
@@ -0,0 +1,3907 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_trace.h"
+#include "xfs_sysfs.h"
+#include "xfs_sb.h"
+#include "xfs_health.h"
+
+struct kmem_cache *xfs_log_ticket_cache;
+
+/* Local miscellaneous function prototypes */
+STATIC struct xlog *
+xlog_alloc_log(
+ struct xfs_mount *mp,
+ struct xfs_buftarg *log_target,
+ xfs_daddr_t blk_offset,
+ int num_bblks);
+STATIC int
+xlog_space_left(
+ struct xlog *log,
+ atomic64_t *head);
+STATIC void
+xlog_dealloc_log(
+ struct xlog *log);
+
+/* local state machine functions */
+STATIC void xlog_state_done_syncing(
+ struct xlog_in_core *iclog);
+STATIC void xlog_state_do_callback(
+ struct xlog *log);
+STATIC int
+xlog_state_get_iclog_space(
+ struct xlog *log,
+ int len,
+ struct xlog_in_core **iclog,
+ struct xlog_ticket *ticket,
+ int *logoffsetp);
+STATIC void
+xlog_grant_push_ail(
+ struct xlog *log,
+ int need_bytes);
+STATIC void
+xlog_sync(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket);
+#if defined(DEBUG)
+STATIC void
+xlog_verify_grant_tail(
+ struct xlog *log);
+STATIC void
+xlog_verify_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ int count);
+STATIC void
+xlog_verify_tail_lsn(
+ struct xlog *log,
+ struct xlog_in_core *iclog);
+#else
+#define xlog_verify_grant_tail(a)
+#define xlog_verify_iclog(a,b,c)
+#define xlog_verify_tail_lsn(a,b)
+#endif
+
+STATIC int
+xlog_iclogs_empty(
+ struct xlog *log);
+
+static int
+xfs_log_cover(struct xfs_mount *);
+
+/*
+ * We need to make sure the buffer pointer returned is naturally aligned for the
+ * biggest basic data type we put into it. We have already accounted for this
+ * padding when sizing the buffer.
+ *
+ * However, this padding does not get written into the log, and hence we have to
+ * track the space used by the log vectors separately to prevent log space hangs
+ * due to inaccurate accounting (i.e. a leak) of the used log space through the
+ * CIL context ticket.
+ *
+ * We also add space for the xlog_op_header that describes this region in the
+ * log. This prepends the data region we return to the caller to copy their data
+ * into, so do all the static initialisation of the ophdr now. Because the ophdr
+ * is not 8 byte aligned, we have to be careful to ensure that we align the
+ * start of the buffer such that the region we return to the caller is 8 byte
+ * aligned and packed against the tail of the ophdr.
+ */
+void *
+xlog_prepare_iovec(
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp,
+ uint type)
+{
+ struct xfs_log_iovec *vec = *vecp;
+ struct xlog_op_header *oph;
+ uint32_t len;
+ void *buf;
+
+ if (vec) {
+ ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
+ vec++;
+ } else {
+ vec = &lv->lv_iovecp[0];
+ }
+
+ len = lv->lv_buf_len + sizeof(struct xlog_op_header);
+ if (!IS_ALIGNED(len, sizeof(uint64_t))) {
+ lv->lv_buf_len = round_up(len, sizeof(uint64_t)) -
+ sizeof(struct xlog_op_header);
+ }
+
+ vec->i_type = type;
+ vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+
+ oph = vec->i_addr;
+ oph->oh_clientid = XFS_TRANSACTION;
+ oph->oh_res2 = 0;
+ oph->oh_flags = 0;
+
+ buf = vec->i_addr + sizeof(struct xlog_op_header);
+ ASSERT(IS_ALIGNED((unsigned long)buf, sizeof(uint64_t)));
+
+ *vecp = vec;
+ return buf;
+}
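+
+/*
+ * Worked example of the padding above, assuming a 12-byte xlog_op_header:
+ * with lv_buf_len == 96 on entry, len == 108, which is not 8-byte aligned,
+ * so lv_buf_len is bumped to 100.  The ophdr then starts at lv_buf + 100 and
+ * the data region returned to the caller starts at lv_buf + 112, which is
+ * 8-byte aligned provided lv_buf itself is.
+ */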
+
+static void
+xlog_grant_sub_space(
+ struct xlog *log,
+ atomic64_t *head,
+ int bytes)
+{
+ int64_t head_val = atomic64_read(head);
+ int64_t new, old;
+
+ do {
+ int cycle, space;
+
+ xlog_crack_grant_head_val(head_val, &cycle, &space);
+
+ space -= bytes;
+ if (space < 0) {
+ space += log->l_logsize;
+ cycle--;
+ }
+
+ old = head_val;
+ new = xlog_assign_grant_head_val(cycle, space);
+ head_val = atomic64_cmpxchg(head, old, new);
+ } while (head_val != old);
+}
+
+static void
+xlog_grant_add_space(
+ struct xlog *log,
+ atomic64_t *head,
+ int bytes)
+{
+ int64_t head_val = atomic64_read(head);
+ int64_t new, old;
+
+ do {
+ int tmp;
+ int cycle, space;
+
+ xlog_crack_grant_head_val(head_val, &cycle, &space);
+
+ tmp = log->l_logsize - space;
+ if (tmp > bytes)
+ space += bytes;
+ else {
+ space = bytes - tmp;
+ cycle++;
+ }
+
+ old = head_val;
+ new = xlog_assign_grant_head_val(cycle, space);
+ head_val = atomic64_cmpxchg(head, old, new);
+ } while (head_val != old);
+}
+
+STATIC void
+xlog_grant_head_init(
+ struct xlog_grant_head *head)
+{
+ xlog_assign_grant_head(&head->grant, 1, 0);
+ INIT_LIST_HEAD(&head->waiters);
+ spin_lock_init(&head->lock);
+}
+
+STATIC void
+xlog_grant_head_wake_all(
+ struct xlog_grant_head *head)
+{
+ struct xlog_ticket *tic;
+
+ spin_lock(&head->lock);
+ list_for_each_entry(tic, &head->waiters, t_queue)
+ wake_up_process(tic->t_task);
+ spin_unlock(&head->lock);
+}
+
+static inline int
+xlog_ticket_reservation(
+ struct xlog *log,
+ struct xlog_grant_head *head,
+ struct xlog_ticket *tic)
+{
+ if (head == &log->l_write_head) {
+ ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
+ return tic->t_unit_res;
+ }
+
+ if (tic->t_flags & XLOG_TIC_PERM_RESERV)
+ return tic->t_unit_res * tic->t_cnt;
+
+ return tic->t_unit_res;
+}
+
+STATIC bool
+xlog_grant_head_wake(
+ struct xlog *log,
+ struct xlog_grant_head *head,
+ int *free_bytes)
+{
+ struct xlog_ticket *tic;
+ int need_bytes;
+ bool woken_task = false;
+
+ list_for_each_entry(tic, &head->waiters, t_queue) {
+
+ /*
+ * There is a chance that the size of the CIL checkpoints in
+ * progress at the last AIL push target calculation resulted in
+ * limiting the target to the log head (l_last_sync_lsn) at the
+ * time. This may not reflect where the log head is now as the
+ * CIL checkpoints may have completed.
+ *
+ * Hence when we are woken here, it may be the head of the
+ * log that has moved rather than the tail. As the tail didn't
+ * move, there still won't be space available for the
+ * reservation we require. However, if the AIL has already
+ * pushed to the target defined by the old log head location, we
+ * will hang here waiting for something else to update the AIL
+ * push target.
+ *
+ * Therefore, if there isn't space to wake the first waiter on
+ * the grant head, we need to push the AIL again to ensure the
+ * target reflects both the current log tail and log head
+ * position before we wait for the tail to move again.
+ */
+
+ need_bytes = xlog_ticket_reservation(log, head, tic);
+ if (*free_bytes < need_bytes) {
+ if (!woken_task)
+ xlog_grant_push_ail(log, need_bytes);
+ return false;
+ }
+
+ *free_bytes -= need_bytes;
+ trace_xfs_log_grant_wake_up(log, tic);
+ wake_up_process(tic->t_task);
+ woken_task = true;
+ }
+
+ return true;
+}
+
+STATIC int
+xlog_grant_head_wait(
+ struct xlog *log,
+ struct xlog_grant_head *head,
+ struct xlog_ticket *tic,
+ int need_bytes) __releases(&head->lock)
+ __acquires(&head->lock)
+{
+ list_add_tail(&tic->t_queue, &head->waiters);
+
+ do {
+ if (xlog_is_shutdown(log))
+ goto shutdown;
+ xlog_grant_push_ail(log, need_bytes);
+
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&head->lock);
+
+ XFS_STATS_INC(log->l_mp, xs_sleep_logspace);
+
+ trace_xfs_log_grant_sleep(log, tic);
+ schedule();
+ trace_xfs_log_grant_wake(log, tic);
+
+ spin_lock(&head->lock);
+ if (xlog_is_shutdown(log))
+ goto shutdown;
+ } while (xlog_space_left(log, &head->grant) < need_bytes);
+
+ list_del_init(&tic->t_queue);
+ return 0;
+shutdown:
+ list_del_init(&tic->t_queue);
+ return -EIO;
+}
+
+/*
+ * Atomically get the log space required for a log ticket.
+ *
+ * Once a ticket gets put onto head->waiters, it will only return after the
+ * needed reservation is satisfied.
+ *
+ * This function is structured so that it has a lock free fast path. This is
+ * necessary because every new transaction reservation will come through this
+ * path. Hence any lock will be globally hot if we take it unconditionally on
+ * every pass.
+ *
+ * As tickets are only ever moved on and off head->waiters under head->lock, we
+ * only need to take that lock if we are going to add the ticket to the queue
+ * and sleep. We can avoid taking the lock if the ticket was never added to
+ * head->waiters because the t_queue list head will be empty and we hold the
+ * only reference to it so it can safely be checked unlocked.
+ */
+STATIC int
+xlog_grant_head_check(
+ struct xlog *log,
+ struct xlog_grant_head *head,
+ struct xlog_ticket *tic,
+ int *need_bytes)
+{
+ int free_bytes;
+ int error = 0;
+
+ ASSERT(!xlog_in_recovery(log));
+
+ /*
+ * If there are other waiters on the queue then give them a chance at
+ * logspace before us. Wake up the first waiters, if we do not wake
+ * up all the waiters then go to sleep waiting for more free space,
+ * otherwise try to get some space for this transaction.
+ */
+ *need_bytes = xlog_ticket_reservation(log, head, tic);
+ free_bytes = xlog_space_left(log, &head->grant);
+ if (!list_empty_careful(&head->waiters)) {
+ spin_lock(&head->lock);
+ if (!xlog_grant_head_wake(log, head, &free_bytes) ||
+ free_bytes < *need_bytes) {
+ error = xlog_grant_head_wait(log, head, tic,
+ *need_bytes);
+ }
+ spin_unlock(&head->lock);
+ } else if (free_bytes < *need_bytes) {
+ spin_lock(&head->lock);
+ error = xlog_grant_head_wait(log, head, tic, *need_bytes);
+ spin_unlock(&head->lock);
+ }
+
+ return error;
+}
+
+bool
+xfs_log_writable(
+ struct xfs_mount *mp)
+{
+ /*
+ * Do not write to the log on norecovery mounts, if the data or log
+ * devices are read-only, or if the filesystem is shutdown. Read-only
+ * mounts allow internal writes for log recovery and unmount purposes,
+ * so don't restrict that case.
+ */
+ if (xfs_has_norecovery(mp))
+ return false;
+ if (xfs_readonly_buftarg(mp->m_ddev_targp))
+ return false;
+ if (xfs_readonly_buftarg(mp->m_log->l_targ))
+ return false;
+ if (xlog_is_shutdown(mp->m_log))
+ return false;
+ return true;
+}
+
+/*
+ * Replenish the byte reservation required by moving the grant write head.
+ */
+int
+xfs_log_regrant(
+ struct xfs_mount *mp,
+ struct xlog_ticket *tic)
+{
+ struct xlog *log = mp->m_log;
+ int need_bytes;
+ int error = 0;
+
+ if (xlog_is_shutdown(log))
+ return -EIO;
+
+ XFS_STATS_INC(mp, xs_try_logspace);
+
+ /*
+ * This is a new transaction on the ticket, so we need to change the
+ * transaction ID so that the next transaction has a different TID in
+ * the log. Just add one to the existing tid so that we can see chains
+ * of rolling transactions in the log easily.
+ */
+ tic->t_tid++;
+
+ xlog_grant_push_ail(log, tic->t_unit_res);
+
+ tic->t_curr_res = tic->t_unit_res;
+ if (tic->t_cnt > 0)
+ return 0;
+
+ trace_xfs_log_regrant(log, tic);
+
+ error = xlog_grant_head_check(log, &log->l_write_head, tic,
+ &need_bytes);
+ if (error)
+ goto out_error;
+
+ xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
+ trace_xfs_log_regrant_exit(log, tic);
+ xlog_verify_grant_tail(log);
+ return 0;
+
+out_error:
+ /*
+ * If we are failing, make sure the ticket doesn't have any current
+ * reservations. We don't want to add this back when the ticket/
+ * transaction gets cancelled.
+ */
+ tic->t_curr_res = 0;
+ tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
+ return error;
+}
+
+/*
+ * Reserve log space and return a ticket corresponding to the reservation.
+ *
+ * Each reservation is going to reserve extra space for a log record header.
+ * When writes happen to the on-disk log, we don't subtract the length of the
+ * log record header from any reservation. By wasting space in each
+ * reservation, we prevent over allocation problems.
+ */
+int
+xfs_log_reserve(
+ struct xfs_mount *mp,
+ int unit_bytes,
+ int cnt,
+ struct xlog_ticket **ticp,
+ bool permanent)
+{
+ struct xlog *log = mp->m_log;
+ struct xlog_ticket *tic;
+ int need_bytes;
+ int error = 0;
+
+ if (xlog_is_shutdown(log))
+ return -EIO;
+
+ XFS_STATS_INC(mp, xs_try_logspace);
+
+ ASSERT(*ticp == NULL);
+ tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent);
+ *ticp = tic;
+
+ xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
+ : tic->t_unit_res);
+
+ trace_xfs_log_reserve(log, tic);
+
+ error = xlog_grant_head_check(log, &log->l_reserve_head, tic,
+ &need_bytes);
+ if (error)
+ goto out_error;
+
+ xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes);
+ xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
+ trace_xfs_log_reserve_exit(log, tic);
+ xlog_verify_grant_tail(log);
+ return 0;
+
+out_error:
+ /*
+ * If we are failing, make sure the ticket doesn't have any current
+ * reservations. We don't want to add this back when the ticket/
+ * transaction gets cancelled.
+ */
+ tic->t_curr_res = 0;
+ tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
+ return error;
+}
+
+/*
+ * Run all the pending iclog callbacks and wake log force waiters and iclog
+ * space waiters so they can process the newly set shutdown state. We really
+ * don't care what order we process callbacks here because the log is shut down
+ * and so state cannot change on disk anymore. However, we cannot wake waiters
+ * until the callbacks have been processed because we may be in unmount and
+ * we must ensure that all AIL operations the callbacks perform have completed
+ * before we tear down the AIL.
+ *
+ * We avoid processing actively referenced iclogs so that we don't run callbacks
+ * while the iclog owner might still be preparing the iclog for IO submission.
+ * These will be caught by xlog_state_iclog_release() and call this function
+ * again to process any callbacks that may have been added to that iclog.
+ */
+static void
+xlog_state_shutdown_callbacks(
+ struct xlog *log)
+{
+ struct xlog_in_core *iclog;
+ LIST_HEAD(cb_list);
+
+ iclog = log->l_iclog;
+ do {
+ if (atomic_read(&iclog->ic_refcnt)) {
+ /* Reference holder will re-run iclog callbacks. */
+ continue;
+ }
+ list_splice_init(&iclog->ic_callbacks, &cb_list);
+ spin_unlock(&log->l_icloglock);
+
+ xlog_cil_process_committed(&cb_list);
+
+ spin_lock(&log->l_icloglock);
+ wake_up_all(&iclog->ic_write_wait);
+ wake_up_all(&iclog->ic_force_wait);
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+
+ wake_up_all(&log->l_flush_wait);
+}
+
+/*
+ * Flush iclog to disk if this is the last reference to the given iclog and
+ * it is in the WANT_SYNC state.
+ *
+ * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
+ * log tail is updated correctly. NEED_FUA indicates that the iclog will be
+ * written to stable storage, and implies that a commit record is contained
+ * within the iclog. We need to ensure that the log tail does not move beyond
+ * the tail that the first commit record in the iclog ordered against, otherwise
+ * correct recovery of that checkpoint becomes dependent on future operations
+ * performed on this iclog.
+ *
+ * Hence if NEED_FUA is set and the current iclog tail lsn is empty, write the
+ * current tail into iclog. Once the iclog tail is set, future operations must
+ * not modify it, otherwise they potentially violate ordering constraints for
+ * the checkpoint commit that wrote the initial tail lsn value. The tail lsn in
+ * the iclog will get zeroed on activation of the iclog after sync, so we
+ * always capture the tail lsn on the iclog on the first NEED_FUA release
+ * regardless of the number of active reference counts on this iclog.
+ */
+int
+xlog_state_release_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket)
+{
+ xfs_lsn_t tail_lsn;
+ bool last_ref;
+
+ lockdep_assert_held(&log->l_icloglock);
+
+ trace_xlog_iclog_release(iclog, _RET_IP_);
+ /*
+ * Grabbing the current log tail needs to be atomic w.r.t. the writing
+ * of the tail LSN into the iclog so we guarantee that the log tail does
+ * not move between the first time we know that the iclog needs to be
+ * made stable and when we eventually submit it.
+ */
+ if ((iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ (iclog->ic_flags & XLOG_ICL_NEED_FUA)) &&
+ !iclog->ic_header.h_tail_lsn) {
+ tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+ iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ }
+
+ last_ref = atomic_dec_and_test(&iclog->ic_refcnt);
+
+ if (xlog_is_shutdown(log)) {
+ /*
+ * If there are no more references to this iclog, process the
+ * pending iclog callbacks that were waiting on the release of
+ * this iclog.
+ */
+ if (last_ref)
+ xlog_state_shutdown_callbacks(log);
+ return -EIO;
+ }
+
+ if (!last_ref)
+ return 0;
+
+ if (iclog->ic_state != XLOG_STATE_WANT_SYNC) {
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ return 0;
+ }
+
+ iclog->ic_state = XLOG_STATE_SYNCING;
+ xlog_verify_tail_lsn(log, iclog);
+ trace_xlog_iclog_syncing(iclog, _RET_IP_);
+
+ spin_unlock(&log->l_icloglock);
+ xlog_sync(log, iclog, ticket);
+ spin_lock(&log->l_icloglock);
+ return 0;
+}
+
+/*
+ * Mount a log filesystem
+ *
+ * mp - ubiquitous xfs mount point structure
+ * log_target - buftarg of on-disk log device
+ * blk_offset - Start block # where block size is 512 bytes (BBSIZE)
+ * num_bblks - Number of BBSIZE blocks in on-disk log
+ *
+ * Return error or zero.
+ */
+int
+xfs_log_mount(
+ xfs_mount_t *mp,
+ xfs_buftarg_t *log_target,
+ xfs_daddr_t blk_offset,
+ int num_bblks)
+{
+ struct xlog *log;
+ bool fatal = xfs_has_crc(mp);
+ int error = 0;
+ int min_logfsbs;
+
+ if (!xfs_has_norecovery(mp)) {
+ xfs_notice(mp, "Mounting V%d Filesystem",
+ XFS_SB_VERSION_NUM(&mp->m_sb));
+ } else {
+ xfs_notice(mp,
+"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
+ XFS_SB_VERSION_NUM(&mp->m_sb));
+ ASSERT(xfs_is_readonly(mp));
+ }
+
+ log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
+ if (IS_ERR(log)) {
+ error = PTR_ERR(log);
+ goto out;
+ }
+ mp->m_log = log;
+
+ /*
+ * Validate the given log space and drop a critical message via syslog
+ * if the log size is too small, as that would lead to unexpected
+ * situations in the transaction log space reservation stage.
+ *
+ * Note: we can't just reject the mount if the validation fails. This
+ * would mean that people would have to downgrade their kernel just to
+ * remedy the situation as there is no way to grow the log (short of
+ * black magic surgery with xfs_db).
+ *
+ * We can, however, reject mounts for CRC format filesystems, as the
+ * mkfs binary being used to make the filesystem should never create a
+ * filesystem with a log that is too small.
+ */
+ min_logfsbs = xfs_log_calc_minimum_size(mp);
+
+ if (mp->m_sb.sb_logblocks < min_logfsbs) {
+ xfs_warn(mp,
+ "Log size %d blocks too small, minimum size is %d blocks",
+ mp->m_sb.sb_logblocks, min_logfsbs);
+ error = -EINVAL;
+ } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
+ xfs_warn(mp,
+ "Log size %d blocks too large, maximum size is %lld blocks",
+ mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
+ error = -EINVAL;
+ } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
+ xfs_warn(mp,
+ "log size %lld bytes too large, maximum size is %lld bytes",
+ XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
+ XFS_MAX_LOG_BYTES);
+ error = -EINVAL;
+ } else if (mp->m_sb.sb_logsunit > 1 &&
+ mp->m_sb.sb_logsunit % mp->m_sb.sb_blocksize) {
+ xfs_warn(mp,
+ "log stripe unit %u bytes must be a multiple of block size",
+ mp->m_sb.sb_logsunit);
+ error = -EINVAL;
+ fatal = true;
+ }
+ if (error) {
+ /*
+ * Log check errors are always fatal on v5; or whenever bad
+ * metadata leads to a crash.
+ */
+ if (fatal) {
+ xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!");
+ ASSERT(0);
+ goto out_free_log;
+ }
+ xfs_crit(mp, "Log size out of supported range.");
+ xfs_crit(mp,
+"Continuing onwards, but if log hangs are experienced then please report this message in the bug report.");
+ }
+
+ /*
+ * Initialize the AIL now we have a log.
+ */
+ error = xfs_trans_ail_init(mp);
+ if (error) {
+ xfs_warn(mp, "AIL initialisation failed: error %d", error);
+ goto out_free_log;
+ }
+ log->l_ailp = mp->m_ail;
+
+ /*
+ * skip log recovery on a norecovery mount. pretend it all
+ * just worked.
+ */
+ if (!xfs_has_norecovery(mp)) {
+ /*
+ * log recovery ignores readonly state and so we need to clear
+ * mount-based read only state so it can write to disk.
+ */
+ bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY,
+ &mp->m_opstate);
+ error = xlog_recover(log);
+ if (readonly)
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+ if (error) {
+ xfs_warn(mp, "log mount/recovery failed: error %d",
+ error);
+ xlog_recover_cancel(log);
+ goto out_destroy_ail;
+ }
+ }
+
+ error = xfs_sysfs_init(&log->l_kobj, &xfs_log_ktype, &mp->m_kobj,
+ "log");
+ if (error)
+ goto out_destroy_ail;
+
+ /* Normal transactions can now occur */
+ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
+
+ /*
+ * Now the log has been fully initialised and we know where our
+ * space grant counters are, we can initialise the permanent ticket
+ * needed for delayed logging to work.
+ */
+ xlog_cil_init_post_recovery(log);
+
+ return 0;
+
+out_destroy_ail:
+ xfs_trans_ail_destroy(mp);
+out_free_log:
+ xlog_dealloc_log(log);
+out:
+ return error;
+}
+
+/*
+ * Finish the recovery of the file system. This is separate from the
+ * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
+ * in the root and real-time bitmap inodes between calling xfs_log_mount() and
+ * here.
+ *
+ * If we finish recovery successfully, start the background log work. If we are
+ * not doing recovery, then we have a RO filesystem and we don't need to start
+ * it.
+ */
+int
+xfs_log_mount_finish(
+ struct xfs_mount *mp)
+{
+ struct xlog *log = mp->m_log;
+ bool readonly;
+ int error = 0;
+
+ if (xfs_has_norecovery(mp)) {
+ ASSERT(xfs_is_readonly(mp));
+ return 0;
+ }
+
+ /*
+ * log recovery ignores readonly state and so we need to clear
+ * mount-based read only state so it can write to disk.
+ */
+ readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+
+ /*
+ * During the second phase of log recovery, we need iget and
+ * iput to behave like they do for an active filesystem.
+ * xfs_fs_drop_inode needs to be able to prevent the deletion
+ * of inodes before we're done replaying log items on those
+ * inodes. Turn it off immediately after recovery finishes
+ * so that we don't leak the quota inodes if subsequent mount
+ * activities fail.
+ *
+ * We let all inodes involved in redo item processing end up on
+ * the LRU instead of being evicted immediately so that if we do
+ * something to an unlinked inode, the irele won't cause
+ * premature truncation and freeing of the inode, which results
+ * in log recovery failure. We have to evict the unreferenced
+ * lru inodes after clearing SB_ACTIVE because we don't
+ * otherwise clean up the lru if there's a subsequent failure in
+ * xfs_mountfs, which leads to us leaking the inodes if nothing
+ * else (e.g. quotacheck) references the inodes before the
+ * mount failure occurs.
+ */
+ mp->m_super->s_flags |= SB_ACTIVE;
+ xfs_log_work_queue(mp);
+ if (xlog_recovery_needed(log))
+ error = xlog_recover_finish(log);
+ mp->m_super->s_flags &= ~SB_ACTIVE;
+ evict_inodes(mp->m_super);
+
+ /*
+ * Drain the buffer LRU after log recovery. This is required for v4
+ * filesystems to avoid leaving around buffers with NULL verifier ops,
+ * but we do it unconditionally to make sure we're always in a clean
+ * cache state after mount.
+ *
+ * Don't push in the error case because the AIL may have pending intents
+ * that aren't removed until recovery is cancelled.
+ */
+ if (xlog_recovery_needed(log)) {
+ if (!error) {
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ xfs_ail_push_all_sync(mp->m_ail);
+ }
+ xfs_notice(mp, "Ending recovery (logdev: %s)",
+ mp->m_logname ? mp->m_logname : "internal");
+ } else {
+ xfs_info(mp, "Ending clean mount");
+ }
+ xfs_buftarg_drain(mp->m_ddev_targp);
+
+ clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
+ if (readonly)
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+
+ /* Make sure the log is dead if we're returning failure. */
+ ASSERT(!error || xlog_is_shutdown(log));
+
+ return error;
+}
+
+/*
+ * The mount has failed. Cancel the recovery if it hasn't completed and destroy
+ * the log.
+ */
+void
+xfs_log_mount_cancel(
+ struct xfs_mount *mp)
+{
+ xlog_recover_cancel(mp->m_log);
+ xfs_log_unmount(mp);
+}
+
+/*
+ * Flush out the iclog to disk ensuring that device caches are flushed and
+ * the iclog hits stable storage before any completion waiters are woken.
+ */
+static inline int
+xlog_force_iclog(
+ struct xlog_in_core *iclog)
+{
+ atomic_inc(&iclog->ic_refcnt);
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+ if (iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
+ return xlog_state_release_iclog(iclog->ic_log, iclog, NULL);
+}
+
+/*
+ * Wait for the iclog and all prior iclogs to be written to disk as required by the
+ * log force state machine. Waiting on ic_force_wait ensures iclog completions
+ * have been ordered and callbacks run before we are woken here, hence
+ * guaranteeing that all the iclogs up to this one are on stable storage.
+ */
+int
+xlog_wait_on_iclog(
+ struct xlog_in_core *iclog)
+ __releases(iclog->ic_log->l_icloglock)
+{
+ struct xlog *log = iclog->ic_log;
+
+ trace_xlog_iclog_wait_on(iclog, _RET_IP_);
+ if (!xlog_is_shutdown(log) &&
+ iclog->ic_state != XLOG_STATE_ACTIVE &&
+ iclog->ic_state != XLOG_STATE_DIRTY) {
+ XFS_STATS_INC(log->l_mp, xs_log_force_sleep);
+ xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
+ } else {
+ spin_unlock(&log->l_icloglock);
+ }
+
+ if (xlog_is_shutdown(log))
+ return -EIO;
+ return 0;
+}
+
+/*
+ * Write out an unmount record using the ticket provided. We have to account for
+ * the data space used in the unmount ticket as this write is not done from a
+ * transaction context that has already done the accounting for us.
+ */
+static int
+xlog_write_unmount_record(
+ struct xlog *log,
+ struct xlog_ticket *ticket)
+{
+ struct {
+ struct xlog_op_header ophdr;
+ struct xfs_unmount_log_format ulf;
+ } unmount_rec = {
+ .ophdr = {
+ .oh_clientid = XFS_LOG,
+ .oh_tid = cpu_to_be32(ticket->t_tid),
+ .oh_flags = XLOG_UNMOUNT_TRANS,
+ },
+ .ulf = {
+ .magic = XLOG_UNMOUNT_TYPE,
+ },
+ };
+ struct xfs_log_iovec reg = {
+ .i_addr = &unmount_rec,
+ .i_len = sizeof(unmount_rec),
+ .i_type = XLOG_REG_TYPE_UNMOUNT,
+ };
+ struct xfs_log_vec vec = {
+ .lv_niovecs = 1,
+ .lv_iovecp = &reg,
+ };
+ LIST_HEAD(lv_chain);
+ list_add(&vec.lv_list, &lv_chain);
+
+ BUILD_BUG_ON((sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_unmount_log_format)) !=
+ sizeof(unmount_rec));
+
+ /* account for space used by record data */
+ ticket->t_curr_res -= sizeof(unmount_rec);
+
+ return xlog_write(log, NULL, &lv_chain, ticket, reg.i_len);
+}
+
+/*
+ * Mark the filesystem clean by writing an unmount record to the head of the
+ * log.
+ */
+static void
+xlog_unmount_write(
+ struct xlog *log)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xlog_in_core *iclog;
+ struct xlog_ticket *tic = NULL;
+ int error;
+
+ error = xfs_log_reserve(mp, 600, 1, &tic, 0);
+ if (error)
+ goto out_err;
+
+ error = xlog_write_unmount_record(log, tic);
+ /*
+ * At this point, we're umounting anyway, so there's no point in
+ * transitioning log state to shutdown. Just continue...
+ */
+out_err:
+ if (error)
+ xfs_alert(mp, "%s: unmount record failed", __func__);
+
+ spin_lock(&log->l_icloglock);
+ iclog = log->l_iclog;
+ error = xlog_force_iclog(iclog);
+ xlog_wait_on_iclog(iclog);
+
+ if (tic) {
+ trace_xfs_log_umount_write(log, tic);
+ xfs_log_ticket_ungrant(log, tic);
+ }
+}
+
+static void
+xfs_log_unmount_verify_iclog(
+ struct xlog *log)
+{
+ struct xlog_in_core *iclog = log->l_iclog;
+
+ do {
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ ASSERT(iclog->ic_offset == 0);
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+}
+
+/*
+ * Unmount record used to have a string "Unmount filesystem--" in the
+ * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
+ * We just write the magic number now since that particular field isn't
+ * currently architecture converted and "Unmount" is a bit foo.
+ * As far as I know, there weren't any dependencies on the old behaviour.
+ */
+static void
+xfs_log_unmount_write(
+ struct xfs_mount *mp)
+{
+ struct xlog *log = mp->m_log;
+
+ if (!xfs_log_writable(mp))
+ return;
+
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ if (xlog_is_shutdown(log))
+ return;
+
+ /*
+ * If we think the summary counters are bad, avoid writing the unmount
+ * record to force log recovery at next mount, after which the summary
+ * counters will be recalculated. Refer to xlog_check_unmount_rec for
+ * more details.
+ */
+ if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
+ XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
+ xfs_alert(mp, "%s: will fix summary counters at next mount",
+ __func__);
+ return;
+ }
+
+ xfs_log_unmount_verify_iclog(log);
+ xlog_unmount_write(log);
+}
+
+/*
+ * Empty the log for unmount/freeze.
+ *
+ * To do this, we first need to shut down the background log work so it is not
+ * trying to cover the log as we clean up. We then need to unpin all objects in
+ * the log so we can then flush them out. Once they have completed their IO and
+ * run the callbacks removing themselves from the AIL, we can cover the log.
+ */
+int
+xfs_log_quiesce(
+ struct xfs_mount *mp)
+{
+ /*
+ * Clear log incompat features since we're quiescing the log. Report
+ * failures, though it's not fatal to have a higher log feature
+ * protection level than the log contents actually require.
+ */
+ if (xfs_clear_incompat_log_features(mp)) {
+ int error;
+
+ error = xfs_sync_sb(mp, false);
+ if (error)
+ xfs_warn(mp,
+ "Failed to clear log incompat features on quiesce");
+ }
+
+ cancel_delayed_work_sync(&mp->m_log->l_work);
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ /*
+ * The superblock buffer is uncached and while xfs_ail_push_all_sync()
+ * will push it, xfs_buftarg_wait() will not wait for it. Further,
+ * xfs_buf_iowait() cannot be used because it was pushed with the
+ * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
+ * the IO to complete.
+ */
+ xfs_ail_push_all_sync(mp->m_ail);
+ xfs_buftarg_wait(mp->m_ddev_targp);
+ xfs_buf_lock(mp->m_sb_bp);
+ xfs_buf_unlock(mp->m_sb_bp);
+
+ return xfs_log_cover(mp);
+}
+
+void
+xfs_log_clean(
+ struct xfs_mount *mp)
+{
+ xfs_log_quiesce(mp);
+ xfs_log_unmount_write(mp);
+}
+
+/*
+ * Shut down and release the AIL and Log.
+ *
+ * During unmount, we need to ensure we flush all the dirty metadata objects
+ * from the AIL so that the log is empty before we write the unmount record to
+ * the log. Once this is done, we can tear down the AIL and the log.
+ */
+void
+xfs_log_unmount(
+ struct xfs_mount *mp)
+{
+ xfs_log_clean(mp);
+
+ xfs_buftarg_drain(mp->m_ddev_targp);
+
+ xfs_trans_ail_destroy(mp);
+
+ xfs_sysfs_del(&mp->m_log->l_kobj);
+
+ xlog_dealloc_log(mp->m_log);
+}
+
+void
+xfs_log_item_init(
+ struct xfs_mount *mp,
+ struct xfs_log_item *item,
+ int type,
+ const struct xfs_item_ops *ops)
+{
+ item->li_log = mp->m_log;
+ item->li_ailp = mp->m_ail;
+ item->li_type = type;
+ item->li_ops = ops;
+ item->li_lv = NULL;
+
+ INIT_LIST_HEAD(&item->li_ail);
+ INIT_LIST_HEAD(&item->li_cil);
+ INIT_LIST_HEAD(&item->li_bio_list);
+ INIT_LIST_HEAD(&item->li_trans);
+}
+
+/*
+ * Wake up processes waiting for log space after we have moved the log tail.
+ */
+void
+xfs_log_space_wake(
+ struct xfs_mount *mp)
+{
+ struct xlog *log = mp->m_log;
+ int free_bytes;
+
+ if (xlog_is_shutdown(log))
+ return;
+
+ if (!list_empty_careful(&log->l_write_head.waiters)) {
+ ASSERT(!xlog_in_recovery(log));
+
+ spin_lock(&log->l_write_head.lock);
+ free_bytes = xlog_space_left(log, &log->l_write_head.grant);
+ xlog_grant_head_wake(log, &log->l_write_head, &free_bytes);
+ spin_unlock(&log->l_write_head.lock);
+ }
+
+ if (!list_empty_careful(&log->l_reserve_head.waiters)) {
+ ASSERT(!xlog_in_recovery(log));
+
+ spin_lock(&log->l_reserve_head.lock);
+ free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
+ xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes);
+ spin_unlock(&log->l_reserve_head.lock);
+ }
+}
+
+/*
+ * Determine if we have a transaction that has gone to disk that needs to be
+ * covered. To begin the transition to the idle state, the log first needs to
+ * be idle. That means the CIL, the AIL and the iclogs all need to be empty
+ * before we start attempting to cover the log.
+ *
+ * Only if we are then in a state where covering is needed is the caller
+ * informed that dummy transactions are required to move the log into the idle
+ * state.
+ *
+ * If there are any items in the AIL or CIL, then we do not want to attempt to
+ * cover the log as we may be in a situation where there isn't log space
+ * available to run a dummy transaction and this can lead to deadlocks when the
+ * tail of the log is pinned by an item that is modified in the CIL. Hence
+ * there's no point in running a dummy transaction at this point because we
+ * can't start trying to idle the log until both the CIL and AIL are empty.
+ */
+static bool
+xfs_log_need_covered(
+ struct xfs_mount *mp)
+{
+ struct xlog *log = mp->m_log;
+ bool needed = false;
+
+ if (!xlog_cil_empty(log))
+ return false;
+
+ spin_lock(&log->l_icloglock);
+ switch (log->l_covered_state) {
+ case XLOG_STATE_COVER_DONE:
+ case XLOG_STATE_COVER_DONE2:
+ case XLOG_STATE_COVER_IDLE:
+ break;
+ case XLOG_STATE_COVER_NEED:
+ case XLOG_STATE_COVER_NEED2:
+ if (xfs_ail_min_lsn(log->l_ailp))
+ break;
+ if (!xlog_iclogs_empty(log))
+ break;
+
+ needed = true;
+ if (log->l_covered_state == XLOG_STATE_COVER_NEED)
+ log->l_covered_state = XLOG_STATE_COVER_DONE;
+ else
+ log->l_covered_state = XLOG_STATE_COVER_DONE2;
+ break;
+ default:
+ needed = true;
+ break;
+ }
+ spin_unlock(&log->l_icloglock);
+ return needed;
+}
+
+/*
+ * Explicitly cover the log. This is similar to background log covering but
+ * intended for usage in quiesce codepaths. The caller is responsible to ensure
+ * the log is idle and suitable for covering. The CIL, iclog buffers and AIL
+ * must all be empty.
+ */
+static int
+xfs_log_cover(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+ bool need_covered;
+
+ ASSERT((xlog_cil_empty(mp->m_log) && xlog_iclogs_empty(mp->m_log) &&
+ !xfs_ail_min_lsn(mp->m_log->l_ailp)) ||
+ xlog_is_shutdown(mp->m_log));
+
+ if (!xfs_log_writable(mp))
+ return 0;
+
+ /*
+ * xfs_log_need_covered() is not idempotent because it progresses the
+ * state machine if the log requires covering. Therefore, we must call
+ * this function once and use the result until we've issued an sb sync.
+ * Do so first to make that abundantly clear.
+ *
+ * Fall into the covering sequence if the log needs covering or the
+ * mount has lazy superblock accounting to sync to disk. The sb sync
+ * used for covering accumulates the in-core counters, so covering
+ * handles this for us.
+ */
+ need_covered = xfs_log_need_covered(mp);
+ if (!need_covered && !xfs_has_lazysbcount(mp))
+ return 0;
+
+ /*
+ * To cover the log, commit the superblock twice (at most) in
+ * independent checkpoints. The first serves as a reference for the
+ * tail pointer. The sync transaction and AIL push empties the AIL and
+ * updates the in-core tail to the LSN of the first checkpoint. The
+ * second commit updates the on-disk tail with the in-core LSN,
+ * covering the log. Push the AIL one more time to leave it empty, as
+ * we found it.
+ */
+ do {
+ error = xfs_sync_sb(mp, true);
+ if (error)
+ break;
+ xfs_ail_push_all_sync(mp->m_ail);
+ } while (xfs_log_need_covered(mp));
+
+ return error;
+}
+
+/*
+ * We may be holding the log iclog lock upon entering this routine.
+ */
+xfs_lsn_t
+xlog_assign_tail_lsn_locked(
+ struct xfs_mount *mp)
+{
+ struct xlog *log = mp->m_log;
+ struct xfs_log_item *lip;
+ xfs_lsn_t tail_lsn;
+
+ assert_spin_locked(&mp->m_ail->ail_lock);
+
+ /*
+ * To make sure we always have a valid LSN for the log tail we keep
+ * track of the last LSN which was committed in log->l_last_sync_lsn,
+ * and use that when the AIL was empty.
+ */
+ lip = xfs_ail_min(mp->m_ail);
+ if (lip)
+ tail_lsn = lip->li_lsn;
+ else
+ tail_lsn = atomic64_read(&log->l_last_sync_lsn);
+ trace_xfs_log_assign_tail_lsn(log, tail_lsn);
+ atomic64_set(&log->l_tail_lsn, tail_lsn);
+ return tail_lsn;
+}
+
+xfs_lsn_t
+xlog_assign_tail_lsn(
+ struct xfs_mount *mp)
+{
+ xfs_lsn_t tail_lsn;
+
+ spin_lock(&mp->m_ail->ail_lock);
+ tail_lsn = xlog_assign_tail_lsn_locked(mp);
+ spin_unlock(&mp->m_ail->ail_lock);
+
+ return tail_lsn;
+}
+
+/*
+ * Return the space in the log between the tail and the head. The head
+ * is passed in the cycle/bytes formal parms. In the special case where
+ * the reserve head has wrapped past the tail, this calculation is no
+ * longer valid. In this case, just return 0 which means there is no space
+ * in the log. This works for all places where this function is called
+ * with the reserve head. Of course, if the write head were to ever
+ * wrap the tail, we should blow up. Rather than catch this case here,
+ * we depend on other ASSERTions in other parts of the code. XXXmiken
+ *
+ * If the reservation head is behind the tail, we have a problem. Warn about it,
+ * but then treat it as if the log is empty.
+ *
+ * If the log is shut down, the head and tail may be invalid or out of whack, so
+ * shortcut invalidity asserts in this case so that we don't trigger them
+ * falsely.
+ */
+STATIC int
+xlog_space_left(
+ struct xlog *log,
+ atomic64_t *head)
+{
+ int tail_bytes;
+ int tail_cycle;
+ int head_cycle;
+ int head_bytes;
+
+ xlog_crack_grant_head(head, &head_cycle, &head_bytes);
+ xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
+ tail_bytes = BBTOB(tail_bytes);
+ if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
+ return log->l_logsize - (head_bytes - tail_bytes);
+ if (tail_cycle + 1 < head_cycle)
+ return 0;
+
+ /* Ignore potential inconsistency when shutdown. */
+ if (xlog_is_shutdown(log))
+ return log->l_logsize;
+
+ if (tail_cycle < head_cycle) {
+ ASSERT(tail_cycle == (head_cycle - 1));
+ return tail_bytes - head_bytes;
+ }
+
+ /*
+ * The reservation head is behind the tail. In this case we just want to
+ * return the size of the log as the amount of space left.
+ */
+ xfs_alert(log->l_mp, "xlog_space_left: head behind tail");
+ xfs_alert(log->l_mp, " tail_cycle = %d, tail_bytes = %d",
+ tail_cycle, tail_bytes);
+ xfs_alert(log->l_mp, " GH cycle = %d, GH bytes = %d",
+ head_cycle, head_bytes);
+ ASSERT(0);
+ return log->l_logsize;
+}
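+
+/*
+ * Worked example for xlog_space_left(), using an illustrative l_logsize of
+ * 32768 bytes:
+ *
+ *	tail (cycle 3, 5120 bytes), head (cycle 3, 9000 bytes)
+ *		-> 32768 - (9000 - 5120) = 28888 bytes free
+ *	tail (cycle 3, 5120 bytes), head (cycle 4, 2000 bytes)
+ *		-> 5120 - 2000 = 3120 bytes free
+ */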
+
+
+static void
+xlog_ioend_work(
+ struct work_struct *work)
+{
+ struct xlog_in_core *iclog =
+ container_of(work, struct xlog_in_core, ic_end_io_work);
+ struct xlog *log = iclog->ic_log;
+ int error;
+
+ error = blk_status_to_errno(iclog->ic_bio.bi_status);
+#ifdef DEBUG
+ /* treat writes with injected CRC errors as failed */
+ if (iclog->ic_fail_crc)
+ error = -EIO;
+#endif
+
+ /*
+ * Race to shutdown the filesystem if we see an error.
+ */
+ if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
+ xfs_alert(log->l_mp, "log I/O error %d", error);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ }
+
+ xlog_state_done_syncing(iclog);
+ bio_uninit(&iclog->ic_bio);
+
+ /*
+ * Drop the lock to signal that we are done. Nothing references the
+ * iclog after this, so an unmount waiting on this lock can now tear it
+ * down safely. As such, it is unsafe to reference the iclog after the
+ * unlock as we could race with it being freed.
+ */
+ up(&iclog->ic_sema);
+}
+
+/*
+ * Return size of each in-core log record buffer.
+ *
+ * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
+ *
+ * If the filesystem blocksize is too large, we may need to choose a
+ * larger size since the directory code currently logs entire blocks.
+ */
+STATIC void
+xlog_get_iclog_buffer_size(
+ struct xfs_mount *mp,
+ struct xlog *log)
+{
+ if (mp->m_logbufs <= 0)
+ mp->m_logbufs = XLOG_MAX_ICLOGS;
+ if (mp->m_logbsize <= 0)
+ mp->m_logbsize = XLOG_BIG_RECORD_BSIZE;
+
+ log->l_iclog_bufs = mp->m_logbufs;
+ log->l_iclog_size = mp->m_logbsize;
+
+ /*
+ * # headers = size / 32k - one header holds cycles from 32k of data.
+ */
+ log->l_iclog_heads =
+ DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE);
+ log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT;
+}
+
+void
+xfs_log_work_queue(
+ struct xfs_mount *mp)
+{
+ queue_delayed_work(mp->m_sync_workqueue, &mp->m_log->l_work,
+ msecs_to_jiffies(xfs_syncd_centisecs * 10));
+}
+
+/*
+ * Clear the log incompat flags if we have the opportunity.
+ *
+ * This only happens if we're about to log the second dummy transaction as part
+ * of covering the log and we can get the log incompat feature usage lock.
+ */
+static inline void
+xlog_clear_incompat(
+ struct xlog *log)
+{
+ struct xfs_mount *mp = log->l_mp;
+
+ if (!xfs_sb_has_incompat_log_feature(&mp->m_sb,
+ XFS_SB_FEAT_INCOMPAT_LOG_ALL))
+ return;
+
+ if (log->l_covered_state != XLOG_STATE_COVER_DONE2)
+ return;
+
+ if (!down_write_trylock(&log->l_incompat_users))
+ return;
+
+ xfs_clear_incompat_log_features(mp);
+ up_write(&log->l_incompat_users);
+}
+
+/*
+ * Every sync period we need to unpin all items in the AIL and push them to
+ * disk. If there is nothing dirty, then we might need to cover the log to
+ * indicate that the filesystem is idle.
+ */
+static void
+xfs_log_worker(
+ struct work_struct *work)
+{
+ struct xlog *log = container_of(to_delayed_work(work),
+ struct xlog, l_work);
+ struct xfs_mount *mp = log->l_mp;
+
+ /* dgc: errors ignored - not fatal and nowhere to report them */
+ if (xfs_fs_writable(mp, SB_FREEZE_WRITE) && xfs_log_need_covered(mp)) {
+ /*
+ * Dump a transaction into the log that contains no real change.
+ * This is needed to stamp the current tail LSN into the log
+ * during the covering operation.
+ *
+ * We cannot use an inode here for this - that will push dirty
+ * state back up into the VFS and then periodic inode flushing
+ * will prevent log covering from making progress. Hence we
+ * synchronously log the superblock instead to ensure the
+ * superblock is immediately unpinned and can be written back.
+ */
+ xlog_clear_incompat(log);
+ xfs_sync_sb(mp, true);
+ } else
+ xfs_log_force(mp, 0);
+
+ /* start pushing all the metadata that is currently dirty */
+ xfs_ail_push_all(mp->m_ail);
+
+ /* queue us up again */
+ xfs_log_work_queue(mp);
+}
+
+/*
+ * This routine initializes some of the log structure for a given mount point.
+ * Its primary purpose is to fill in enough so that recovery can occur. However,
+ * some other stuff may be filled in too.
+ */
+STATIC struct xlog *
+xlog_alloc_log(
+ struct xfs_mount *mp,
+ struct xfs_buftarg *log_target,
+ xfs_daddr_t blk_offset,
+ int num_bblks)
+{
+ struct xlog *log;
+ xlog_rec_header_t *head;
+ xlog_in_core_t **iclogp;
+ xlog_in_core_t *iclog, *prev_iclog=NULL;
+ int i;
+ int error = -ENOMEM;
+ uint log2_size = 0;
+
+ log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL);
+ if (!log) {
+ xfs_warn(mp, "Log allocation failed: No memory!");
+ goto out;
+ }
+
+ log->l_mp = mp;
+ log->l_targ = log_target;
+ log->l_logsize = BBTOB(num_bblks);
+ log->l_logBBstart = blk_offset;
+ log->l_logBBsize = num_bblks;
+ log->l_covered_state = XLOG_STATE_COVER_IDLE;
+ set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
+ INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
+
+ log->l_prev_block = -1;
+ /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
+ xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
+ xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
+ log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
+
+ if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1)
+ log->l_iclog_roundoff = mp->m_sb.sb_logsunit;
+ else
+ log->l_iclog_roundoff = BBSIZE;
+
+ xlog_grant_head_init(&log->l_reserve_head);
+ xlog_grant_head_init(&log->l_write_head);
+
+ error = -EFSCORRUPTED;
+ if (xfs_has_sector(mp)) {
+ log2_size = mp->m_sb.sb_logsectlog;
+ if (log2_size < BBSHIFT) {
+ xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
+ log2_size, BBSHIFT);
+ goto out_free_log;
+ }
+
+ log2_size -= BBSHIFT;
+ if (log2_size > mp->m_sectbb_log) {
+ xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
+ log2_size, mp->m_sectbb_log);
+ goto out_free_log;
+ }
+
+ /* for larger sector sizes, must have v2 or external log */
+ if (log2_size && log->l_logBBstart > 0 &&
+ !xfs_has_logv2(mp)) {
+ xfs_warn(mp,
+ "log sector size (0x%x) invalid for configuration.",
+ log2_size);
+ goto out_free_log;
+ }
+ }
+ log->l_sectBBsize = 1 << log2_size;
+
+ init_rwsem(&log->l_incompat_users);
+
+ xlog_get_iclog_buffer_size(mp, log);
+
+ spin_lock_init(&log->l_icloglock);
+ init_waitqueue_head(&log->l_flush_wait);
+
+ iclogp = &log->l_iclog;
+ /*
+ * The amount of memory to allocate for the iclog structure is
+ * rather funky due to the way the structure is defined. It is
+ * done this way so that we can use different sizes for machines
+ * with different amounts of memory. See the definition of
+ * xlog_in_core_t in xfs_log_priv.h for details.
+ */
+ ASSERT(log->l_iclog_size >= 4096);
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
+ sizeof(struct bio_vec);
+
+ iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL);
+ if (!iclog)
+ goto out_free_iclog;
+
+ *iclogp = iclog;
+ iclog->ic_prev = prev_iclog;
+ prev_iclog = iclog;
+
+ iclog->ic_data = kvzalloc(log->l_iclog_size,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (!iclog->ic_data)
+ goto out_free_iclog;
+ head = &iclog->ic_header;
+ memset(head, 0, sizeof(xlog_rec_header_t));
+ head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
+ head->h_version = cpu_to_be32(
+ xfs_has_logv2(log->l_mp) ? 2 : 1);
+ head->h_size = cpu_to_be32(log->l_iclog_size);
+ /* new fields */
+ head->h_fmt = cpu_to_be32(XLOG_FMT);
+ memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
+
+ iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize;
+ iclog->ic_state = XLOG_STATE_ACTIVE;
+ iclog->ic_log = log;
+ atomic_set(&iclog->ic_refcnt, 0);
+ INIT_LIST_HEAD(&iclog->ic_callbacks);
+ iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize;
+
+ init_waitqueue_head(&iclog->ic_force_wait);
+ init_waitqueue_head(&iclog->ic_write_wait);
+ INIT_WORK(&iclog->ic_end_io_work, xlog_ioend_work);
+ sema_init(&iclog->ic_sema, 1);
+
+ iclogp = &iclog->ic_next;
+ }
+ *iclogp = log->l_iclog; /* complete ring */
+ log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
+
+ log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM |
+ WQ_HIGHPRI),
+ 0, mp->m_super->s_id);
+ if (!log->l_ioend_workqueue)
+ goto out_free_iclog;
+
+ error = xlog_cil_init(log);
+ if (error)
+ goto out_destroy_workqueue;
+ return log;
+
+out_destroy_workqueue:
+ destroy_workqueue(log->l_ioend_workqueue);
+out_free_iclog:
+ for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
+ prev_iclog = iclog->ic_next;
+ kmem_free(iclog->ic_data);
+ kmem_free(iclog);
+ if (prev_iclog == log->l_iclog)
+ break;
+ }
+out_free_log:
+ kmem_free(log);
+out:
+ return ERR_PTR(error);
+} /* xlog_alloc_log */
+
+/*
+ * Compute the LSN that we'd need to push the log tail towards in order to have
+ * (a) enough on-disk log space to log the number of bytes specified, (b) at
+ * least 25% of the log space free, and (c) at least 256 blocks free. If the
+ * log free space already meets all three thresholds, this function returns
+ * NULLCOMMITLSN.
+ */
+xfs_lsn_t
+xlog_grant_push_threshold(
+ struct xlog *log,
+ int need_bytes)
+{
+ xfs_lsn_t threshold_lsn = 0;
+ xfs_lsn_t last_sync_lsn;
+ int free_blocks;
+ int free_bytes;
+ int threshold_block;
+ int threshold_cycle;
+ int free_threshold;
+
+ ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
+
+ free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
+ free_blocks = BTOBBT(free_bytes);
+
+ /*
+ * Set the threshold for the minimum number of free blocks in the
+ * log to the maximum of what the caller needs, one quarter of the
+ * log, and 256 blocks.
+ */
+ free_threshold = BTOBB(need_bytes);
+ free_threshold = max(free_threshold, (log->l_logBBsize >> 2));
+ free_threshold = max(free_threshold, 256);
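+ /*
+ * Example: for a 1GB log, l_logBBsize is 2097152 basic blocks, so
+ * the quarter-log term (524288 blocks) will normally dominate both
+ * the caller's requirement and the 256 block floor.
+ */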
+ if (free_blocks >= free_threshold)
+ return NULLCOMMITLSN;
+
+ xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
+ &threshold_block);
+ threshold_block += free_threshold;
+ if (threshold_block >= log->l_logBBsize) {
+ threshold_block -= log->l_logBBsize;
+ threshold_cycle += 1;
+ }
+ threshold_lsn = xlog_assign_lsn(threshold_cycle,
+ threshold_block);
+ /*
+ * Don't pass in an lsn greater than the lsn of the last
+ * log record known to be on disk. Use a snapshot of the last sync lsn
+ * so that it doesn't change between the compare and the set.
+ */
+ last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
+ if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
+ threshold_lsn = last_sync_lsn;
+
+ return threshold_lsn;
+}
+
+/*
+ * Push the tail of the log if we need to do so to maintain the free log space
+ * thresholds set out by xlog_grant_push_threshold. We may need to adopt a
+ * policy which pushes on an lsn which is further along in the log once we
+ * reach the high water mark. In this manner, we would be creating a low water
+ * mark.
+ */
+STATIC void
+xlog_grant_push_ail(
+ struct xlog *log,
+ int need_bytes)
+{
+ xfs_lsn_t threshold_lsn;
+
+ threshold_lsn = xlog_grant_push_threshold(log, need_bytes);
+ if (threshold_lsn == NULLCOMMITLSN || xlog_is_shutdown(log))
+ return;
+
+ /*
+ * Get the transaction layer to kick the dirty buffers out to
+ * disk asynchronously. No point in trying to do this if
+ * the filesystem is shutting down.
+ */
+ xfs_ail_push(log->l_ailp, threshold_lsn);
+}
+
+/*
+ * Stamp cycle number in every block
+ */
+STATIC void
+xlog_pack_data(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ int roundoff)
+{
+ int i, j, k;
+ int size = iclog->ic_offset + roundoff;
+ __be32 cycle_lsn;
+ char *dp;
+
+ cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
+
+ dp = iclog->ic_datap;
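+ /*
+ * Save the first word of each basic block into the record header's
+ * cycle data array and overwrite it with the current cycle number so
+ * that recovery can detect torn writes.
+ */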
+ for (i = 0; i < BTOBB(size); i++) {
+ if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
+ break;
+ iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
+ *(__be32 *)dp = cycle_lsn;
+ dp += BBSIZE;
+ }
+
+ if (xfs_has_logv2(log->l_mp)) {
+ xlog_in_core_2_t *xhdr = iclog->ic_data;
+
+ for ( ; i < BTOBB(size); i++) {
+ j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
+ *(__be32 *)dp = cycle_lsn;
+ dp += BBSIZE;
+ }
+
+ for (i = 1; i < log->l_iclog_heads; i++)
+ xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
+ }
+}
+
+/*
+ * Calculate the checksum for a log buffer.
+ *
+ * This is a little more complicated than it should be because the various
+ * headers and the actual data are non-contiguous.
+ */
+__le32
+xlog_cksum(
+ struct xlog *log,
+ struct xlog_rec_header *rhead,
+ char *dp,
+ int size)
+{
+ uint32_t crc;
+
+ /* first generate the crc for the record header ... */
+ crc = xfs_start_cksum_update((char *)rhead,
+ sizeof(struct xlog_rec_header),
+ offsetof(struct xlog_rec_header, h_crc));
+
+ /* ... then for additional cycle data for v2 logs ... */
+ if (xfs_has_logv2(log->l_mp)) {
+ union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
+ int i;
+ int xheads;
+
+ xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE);
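+ /*
+ * Start at index 1: the cycle data in the first header is part of
+ * the record header and is already covered by the crc above.
+ */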
+
+ for (i = 1; i < xheads; i++) {
+ crc = crc32c(crc, &xhdr[i].hic_xheader,
+ sizeof(struct xlog_rec_ext_header));
+ }
+ }
+
+ /* ... and finally for the payload */
+ crc = crc32c(crc, dp, size);
+
+ return xfs_end_cksum(crc);
+}
+
+static void
+xlog_bio_end_io(
+ struct bio *bio)
+{
+ struct xlog_in_core *iclog = bio->bi_private;
+
+ queue_work(iclog->ic_log->l_ioend_workqueue,
+ &iclog->ic_end_io_work);
+}
+
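+/*
+ * Add the iclog data buffer to the bio one page at a time. The buffer may be
+ * vmalloc()d, so kmem_to_page() is used to look up each backing page.
+ */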
+static int
+xlog_map_iclog_data(
+ struct bio *bio,
+ void *data,
+ size_t count)
+{
+ do {
+ struct page *page = kmem_to_page(data);
+ unsigned int off = offset_in_page(data);
+ size_t len = min_t(size_t, count, PAGE_SIZE - off);
+
+ if (bio_add_page(bio, page, len, off) != len)
+ return -EIO;
+
+ data += len;
+ count -= len;
+ } while (count);
+
+ return 0;
+}
+
+STATIC void
+xlog_write_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ uint64_t bno,
+ unsigned int count)
+{
+ ASSERT(bno < log->l_logBBsize);
+ trace_xlog_iclog_write(iclog, _RET_IP_);
+
+ /*
+ * We lock the iclogbufs here so that we can serialise against I/O
+ * completion during unmount. We might be processing a shutdown
+ * triggered during unmount, and that can occur asynchronously to the
+ * unmount thread, and hence we need to ensure that it completes before
+ * tearing down the iclogbufs. Hence we need to hold the buffer lock
+ * across the log IO to achieve that.
+ */
+ down(&iclog->ic_sema);
+ if (xlog_is_shutdown(log)) {
+ /*
+ * It would seem logical to return EIO here, but we rely on
+ * the log state machine to propagate I/O errors instead of
+ * doing it here. We kick off the state machine and unlock
+ * the buffer manually; the code needs to be kept in sync
+ * with the I/O completion path.
+ */
+ xlog_state_done_syncing(iclog);
+ up(&iclog->ic_sema);
+ return;
+ }
+
+ /*
+ * We use REQ_SYNC | REQ_IDLE here to tell the block layer there are more
+ * IOs coming immediately after this one. This prevents the block layer
+ * writeback throttle from throttling log writes behind background
+ * metadata writeback and causing priority inversions.
+ */
+ bio_init(&iclog->ic_bio, log->l_targ->bt_bdev, iclog->ic_bvec,
+ howmany(count, PAGE_SIZE),
+ REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE);
+ iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
+ iclog->ic_bio.bi_end_io = xlog_bio_end_io;
+ iclog->ic_bio.bi_private = iclog;
+
+ if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
+ iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
+ /*
+ * For external log devices, we also need to flush the data
+ * device cache first to ensure all metadata writeback covered
+ * by the LSN in this iclog is on stable storage. This is slow,
+ * but it *must* complete before we issue the external log IO.
+ *
+ * If the flush fails, we cannot conclude that past metadata
+ * writeback from the log succeeded. Repeating the flush is
+ * not possible, hence we must shut down with log IO error to
+ * avoid shutdown re-entering this path and erroring out again.
+ */
+ if (log->l_targ != log->l_mp->m_ddev_targp &&
+ blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) {
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ return;
+ }
+ }
+ if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
+ iclog->ic_bio.bi_opf |= REQ_FUA;
+
+ iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
+
+ if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ return;
+ }
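+ /*
+ * Flush any CPU cache aliases for vmalloc()d iclog data before the
+ * device reads it from memory.
+ */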
+ if (is_vmalloc_addr(iclog->ic_data))
+ flush_kernel_vmap_range(iclog->ic_data, count);
+
+ /*
+ * If this log buffer would straddle the end of the log we will have
+ * to split it up into two bios, so that we can continue at the start.
+ */
+ if (bno + BTOBB(count) > log->l_logBBsize) {
+ struct bio *split;
+
+ split = bio_split(&iclog->ic_bio, log->l_logBBsize - bno,
+ GFP_NOIO, &fs_bio_set);
+ bio_chain(split, &iclog->ic_bio);
+ submit_bio(split);
+
+ /* restart at logical offset zero for the remainder */
+ iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart;
+ }
+
+ submit_bio(&iclog->ic_bio);
+}
+
+/*
+ * We need to bump cycle number for the part of the iclog that is
+ * written to the start of the log. Watch out for the header magic
+ * number case, though.
+ */
+static void
+xlog_split_iclog(
+ struct xlog *log,
+ void *data,
+ uint64_t bno,
+ unsigned int count)
+{
+ unsigned int split_offset = BBTOB(log->l_logBBsize - bno);
+ unsigned int i;
+
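+ /*
+ * Every basic block from split_offset onwards is written back at the
+ * physical start of the log and so belongs to the next cycle. Bump
+ * the stamped cycle number, skipping the header magic value.
+ */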
+ for (i = split_offset; i < count; i += BBSIZE) {
+ uint32_t cycle = get_unaligned_be32(data + i);
+
+ if (++cycle == XLOG_HEADER_MAGIC_NUM)
+ cycle++;
+ put_unaligned_be32(cycle, data + i);
+ }
+}
+
+static int
+xlog_calc_iclog_size(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ uint32_t *roundoff)
+{
+ uint32_t count_init, count;
+
+ /* Add for LR header */
+ count_init = log->l_iclog_hsize + iclog->ic_offset;
+ count = roundup(count_init, log->l_iclog_roundoff);
+
+ *roundoff = count - count_init;
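+ /*
+ * The roundoff pads the write out to l_iclog_roundoff (the log
+ * stripe unit where one is set, otherwise BBSIZE); e.g. with a 32k
+ * stripe unit and a 40960 byte count_init, count becomes 65536 and
+ * roundoff 24576. The caller accounts the padding back to the ticket
+ * or the grant heads.
+ */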
+
+ ASSERT(count >= count_init);
+ ASSERT(*roundoff < log->l_iclog_roundoff);
+ return count;
+}
+
+/*
+ * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
+ * fashion. Before this routine is called, the current iclog ptr in the log
+ * should already have been moved to point to the next available iclog. This
+ * allows further writes to continue while this code syncs out an iclog that
+ * is ready to go.
+ * Before an in-core log can be written out, the data section must be scanned
+ * to save away the 1st word of each BBSIZE block into the header. We replace
+ * it with the current cycle count. Each BBSIZE block is tagged with the
+ * cycle count because there is an implicit assumption that drives will
+ * guarantee that entire 512 byte blocks get written at once. In other words,
+ * we can't have part of a 512 byte block written and part not written. By
+ * tagging each block, we will know which blocks are valid when recovering
+ * after an unclean shutdown.
+ *
+ * This routine is single threaded on the iclog. No other thread can be in
+ * this routine with the same iclog. Changing the contents of the iclog can
+ * therefore be done without grabbing the state machine lock. Updating the global
+ * log will require grabbing the lock though.
+ *
+ * The entire log manager uses a logical block numbering scheme. Only
+ * xlog_write_iclog knows about the fact that the log may not start with
+ * block zero on a given device.
+ */
+STATIC void
+xlog_sync(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket)
+{
+ unsigned int count; /* byte count of bwrite */
+ unsigned int roundoff; /* roundoff to BB or stripe */
+ uint64_t bno;
+ unsigned int size;
+
+ ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
+ trace_xlog_iclog_sync(iclog, _RET_IP_);
+
+ count = xlog_calc_iclog_size(log, iclog, &roundoff);
+
+ /*
+ * If we have a ticket, account for the roundoff via the ticket
+ * reservation to avoid touching the hot grant heads needlessly.
+ * Otherwise, we have to move grant heads directly.
+ */
+ if (ticket) {
+ ticket->t_curr_res -= roundoff;
+ } else {
+ xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
+ xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
+ }
+
+ /* put cycle number in every block */
+ xlog_pack_data(log, iclog, roundoff);
+
+ /* real byte length */
+ size = iclog->ic_offset;
+ if (xfs_has_logv2(log->l_mp))
+ size += roundoff;
+ iclog->ic_header.h_len = cpu_to_be32(size);
+
+ XFS_STATS_INC(log->l_mp, xs_log_writes);
+ XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
+
+ bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn));
+
+ /* Do we need to split this write into 2 parts? */
+ if (bno + BTOBB(count) > log->l_logBBsize)
+ xlog_split_iclog(log, &iclog->ic_header, bno, count);
+
+ /* calculate the checksum */
+ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
+ iclog->ic_datap, size);
+ /*
+ * Intentionally corrupt the log record CRC based on the error injection
+ * frequency, if defined. This facilitates testing log recovery in the
+ * event of torn writes. Hence, set the IOABORT state to abort the log
+ * write on I/O completion and shutdown the fs. The subsequent mount
+ * detects the bad CRC and attempts to recover.
+ */
+#ifdef DEBUG
+ if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
+ iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
+ iclog->ic_fail_crc = true;
+ xfs_warn(log->l_mp,
+ "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
+ be64_to_cpu(iclog->ic_header.h_lsn));
+ }
+#endif
+ xlog_verify_iclog(log, iclog, count);
+ xlog_write_iclog(log, iclog, bno, count);
+}
+
+/*
+ * Deallocate a log structure
+ */
+STATIC void
+xlog_dealloc_log(
+ struct xlog *log)
+{
+ xlog_in_core_t *iclog, *next_iclog;
+ int i;
+
+ /*
+ * Cycle all the iclogbuf locks to make sure all log IO completion
+ * is done before we tear down these buffers.
+ */
+ iclog = log->l_iclog;
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ down(&iclog->ic_sema);
+ up(&iclog->ic_sema);
+ iclog = iclog->ic_next;
+ }
+
+ /*
+ * Destroy the CIL after waiting for iclog IO completion because an
+ * iclog EIO error will try to shut down the log, which accesses the
+ * CIL to wake up the waiters.
+ */
+ xlog_cil_destroy(log);
+
+ iclog = log->l_iclog;
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ next_iclog = iclog->ic_next;
+ kmem_free(iclog->ic_data);
+ kmem_free(iclog);
+ iclog = next_iclog;
+ }
+
+ log->l_mp->m_log = NULL;
+ destroy_workqueue(log->l_ioend_workqueue);
+ kmem_free(log);
+}
+
+/*
+ * Update counters atomically now that memcpy is done.
+ */
+static inline void
+xlog_state_finish_copy(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ int record_cnt,
+ int copy_bytes)
+{
+ lockdep_assert_held(&log->l_icloglock);
+
+ be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt);
+ iclog->ic_offset += copy_bytes;
+}
+
+/*
+ * print out info relating to regions written which consume
+ * the reservation
+ */
+void
+xlog_print_tic_res(
+ struct xfs_mount *mp,
+ struct xlog_ticket *ticket)
+{
+ xfs_warn(mp, "ticket reservation summary:");
+ xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res);
+ xfs_warn(mp, " current res = %d bytes", ticket->t_curr_res);
+ xfs_warn(mp, " original count = %d", ticket->t_ocnt);
+ xfs_warn(mp, " remaining count = %d", ticket->t_cnt);
+}
+
+/*
+ * Print a summary of the transaction.
+ */
+void
+xlog_print_trans(
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_log_item *lip;
+
+ /* dump core transaction and ticket info */
+ xfs_warn(mp, "transaction summary:");
+ xfs_warn(mp, " log res = %d", tp->t_log_res);
+ xfs_warn(mp, " log count = %d", tp->t_log_count);
+ xfs_warn(mp, " flags = 0x%x", tp->t_flags);
+
+ xlog_print_tic_res(mp, tp->t_ticket);
+
+ /* dump each log item */
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
+ struct xfs_log_vec *lv = lip->li_lv;
+ struct xfs_log_iovec *vec;
+ int i;
+
+ xfs_warn(mp, "log item: ");
+ xfs_warn(mp, " type = 0x%x", lip->li_type);
+ xfs_warn(mp, " flags = 0x%lx", lip->li_flags);
+ if (!lv)
+ continue;
+ xfs_warn(mp, " niovecs = %d", lv->lv_niovecs);
+ xfs_warn(mp, " size = %d", lv->lv_size);
+ xfs_warn(mp, " bytes = %d", lv->lv_bytes);
+ xfs_warn(mp, " buf len = %d", lv->lv_buf_len);
+
+ /* dump each iovec for the log item */
+ vec = lv->lv_iovecp;
+ for (i = 0; i < lv->lv_niovecs; i++) {
+ int dumplen = min(vec->i_len, 32);
+
+ xfs_warn(mp, " iovec[%d]", i);
+ xfs_warn(mp, " type = 0x%x", vec->i_type);
+ xfs_warn(mp, " len = %d", vec->i_len);
+ xfs_warn(mp, " first %d bytes of iovec[%d]:", dumplen, i);
+ xfs_hex_dump(vec->i_addr, dumplen);
+
+ vec++;
+ }
+ }
+}
+
+static inline void
+xlog_write_iovec(
+ struct xlog_in_core *iclog,
+ uint32_t *log_offset,
+ void *data,
+ uint32_t write_len,
+ int *bytes_left,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
+{
+ ASSERT(*log_offset < iclog->ic_log->l_iclog_size);
+ ASSERT(*log_offset % sizeof(int32_t) == 0);
+ ASSERT(write_len % sizeof(int32_t) == 0);
+
+ memcpy(iclog->ic_datap + *log_offset, data, write_len);
+ *log_offset += write_len;
+ *bytes_left -= write_len;
+ (*record_cnt)++;
+ *data_cnt += write_len;
+}
+
+/*
+ * Write log vectors into a single iclog which is guaranteed by the caller
+ * to have enough space to write the entire log vector into.
+ */
+static void
+xlog_write_full(
+ struct xfs_log_vec *lv,
+ struct xlog_ticket *ticket,
+ struct xlog_in_core *iclog,
+ uint32_t *log_offset,
+ uint32_t *len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
+{
+ int index;
+
+ ASSERT(*log_offset + *len <= iclog->ic_size ||
+ iclog->ic_state == XLOG_STATE_WANT_SYNC);
+
+ /*
+ * Ordered log vectors have no regions to write so this
+ * loop will naturally skip them.
+ */
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
+ struct xlog_op_header *ophdr = reg->i_addr;
+
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ xlog_write_iovec(iclog, log_offset, reg->i_addr,
+ reg->i_len, len, record_cnt, data_cnt);
+ }
+}
+
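+/*
+ * Finish off the current iclog, accounting the records and bytes copied so
+ * far to it, then grab space in the next available iclog for the remaining
+ * length of the write.
+ */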
+static int
+xlog_write_get_more_iclog_space(
+ struct xlog_ticket *ticket,
+ struct xlog_in_core **iclogp,
+ uint32_t *log_offset,
+ uint32_t len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
+{
+ struct xlog_in_core *iclog = *iclogp;
+ struct xlog *log = iclog->ic_log;
+ int error;
+
+ spin_lock(&log->l_icloglock);
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC);
+ xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+ error = xlog_state_release_iclog(log, iclog, ticket);
+ spin_unlock(&log->l_icloglock);
+ if (error)
+ return error;
+
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ log_offset);
+ if (error)
+ return error;
+ *record_cnt = 0;
+ *data_cnt = 0;
+ *iclogp = iclog;
+ return 0;
+}
+
+/*
+ * Write log vectors into a single iclog which is smaller than the current chain
+ * length. We write until we cannot fit a full record into the remaining space
+ * and then stop. We return the log vector that is to be written that cannot
+ * wholly fit in the iclog.
+ */
+static int
+xlog_write_partial(
+ struct xfs_log_vec *lv,
+ struct xlog_ticket *ticket,
+ struct xlog_in_core **iclogp,
+ uint32_t *log_offset,
+ uint32_t *len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
+{
+ struct xlog_in_core *iclog = *iclogp;
+ struct xlog_op_header *ophdr;
+ int index = 0;
+ uint32_t rlen;
+ int error;
+
+ /* walk the logvec, copying until we run out of space in the iclog */
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
+ uint32_t reg_offset = 0;
+
+ /*
+ * The first region of a continuation must have a non-zero
+ * length otherwise log recovery will just skip over it and
+ * start recovering from the next opheader it finds. Because we
+ * mark the next opheader as a continuation, recovery will then
+ * incorrectly add the continuation to the previous region and
+ * that breaks stuff.
+ *
+ * Hence if there isn't space for region data after the
+ * opheader, then we need to start afresh with a new iclog.
+ */
+ if (iclog->ic_size - *log_offset <=
+ sizeof(struct xlog_op_header)) {
+ error = xlog_write_get_more_iclog_space(ticket,
+ &iclog, log_offset, *len, record_cnt,
+ data_cnt);
+ if (error)
+ return error;
+ }
+
+ ophdr = reg->i_addr;
+ rlen = min_t(uint32_t, reg->i_len, iclog->ic_size - *log_offset);
+
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_len = cpu_to_be32(rlen - sizeof(struct xlog_op_header));
+ if (rlen != reg->i_len)
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+
+ xlog_write_iovec(iclog, log_offset, reg->i_addr,
+ rlen, len, record_cnt, data_cnt);
+
+ /* If we wrote the whole region, move to the next. */
+ if (rlen == reg->i_len)
+ continue;
+
+ /*
+ * We now have a partially written iovec, but it can span
+ * multiple iclogs so we loop here. First we release the iclog
+ * we currently have, then we get a new iclog and add a new
+ * opheader. Then we continue copying from where we were until
+ * we either complete the iovec or fill the iclog. If we
+ * complete the iovec, then we increment the index and go right
+ * back to the top of the outer loop. If we fill the iclog, we
+ * run the inner loop again.
+ *
+ * This is complicated by the tail of a region using all the
+ * space in an iclog and hence requiring us to release the iclog
+ * and get a new one before returning to the outer loop. We must
+ * always guarantee that we exit this inner loop with at least
+ * space for log transaction opheaders left in the current
+ * iclog, hence we cannot just terminate the loop at the end
+ * of the continuation. So we loop while there is no
+ * space left in the current iclog, and check for the end of the
+ * continuation after getting a new iclog.
+ */
+ do {
+ /*
+ * Ensure we include the continuation opheader in the
+ * space we need in the new iclog by adding that size
+ * to the length we require. This continuation opheader
+ * needs to be accounted to the ticket as the space it
+ * consumes hasn't been accounted to the lv we are
+ * writing.
+ */
+ error = xlog_write_get_more_iclog_space(ticket,
+ &iclog, log_offset,
+ *len + sizeof(struct xlog_op_header),
+ record_cnt, data_cnt);
+ if (error)
+ return error;
+
+ ophdr = iclog->ic_datap + *log_offset;
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_clientid = XFS_TRANSACTION;
+ ophdr->oh_res2 = 0;
+ ophdr->oh_flags = XLOG_WAS_CONT_TRANS;
+
+ ticket->t_curr_res -= sizeof(struct xlog_op_header);
+ *log_offset += sizeof(struct xlog_op_header);
+ *data_cnt += sizeof(struct xlog_op_header);
+
+ /*
+ * If rlen fits in the iclog, then end the region
+ * continuation. Otherwise we're going around again.
+ */
+ reg_offset += rlen;
+ rlen = reg->i_len - reg_offset;
+ if (rlen <= iclog->ic_size - *log_offset)
+ ophdr->oh_flags |= XLOG_END_TRANS;
+ else
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+
+ rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset);
+ ophdr->oh_len = cpu_to_be32(rlen);
+
+ xlog_write_iovec(iclog, log_offset,
+ reg->i_addr + reg_offset,
+ rlen, len, record_cnt, data_cnt);
+
+ } while (ophdr->oh_flags & XLOG_CONTINUE_TRANS);
+ }
+
+ /*
+ * No more iovecs remain in this logvec so return the next log vec to
+ * the caller so it can go back to fast path copying.
+ */
+ *iclogp = iclog;
+ return 0;
+}
+
+/*
+ * Write some region out to in-core log
+ *
+ * This will be called when writing externally provided regions or when
+ * writing out a commit record for a given transaction.
+ *
+ * General algorithm:
+ * 1. Find total length of this write. This may include adding to the
+ * lengths passed in.
+ * 2. Check whether we violate the ticket's reservation.
+ * 3. While writing to this iclog
+ * A. Reserve as much space in this iclog as we can get
+ * B. If this is first write, save away start lsn
+ * C. While writing this region:
+ * 1. If first write of transaction, write start record
+ * 2. Write log operation header (header per region)
+ * 3. Find out if we can fit entire region into this iclog
+ * 4. Potentially, verify destination memcpy ptr
+ * 5. Memcpy (partial) region
+ * 6. If partial copy, release iclog; otherwise, continue
+ * copying more regions into current iclog
+ * 4. Mark want sync bit (in simulation mode)
+ * 5. Release iclog for potential flush to on-disk log.
+ *
+ * ERRORS:
+ * 1. Panic if reservation is overrun. This should never happen since
+ * reservation amounts are generated internal to the filesystem.
+ * NOTES:
+ * 1. Tickets are single threaded data structures.
+ * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the
+ * syncing routine. When a single log_write region needs to span
+ * multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set
+ * on all log operation writes which don't contain the end of the
+ * region. The XLOG_END_TRANS bit is used for the in-core log
+ * operation which contains the end of the continued log_write region.
+ * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog,
+ * we don't really know exactly how much space will be used. As a result,
+ * we don't update ic_offset until the end when we know exactly how many
+ * bytes have been written out.
+ */
+int
+xlog_write(
+ struct xlog *log,
+ struct xfs_cil_ctx *ctx,
+ struct list_head *lv_chain,
+ struct xlog_ticket *ticket,
+ uint32_t len)
+
+{
+ struct xlog_in_core *iclog = NULL;
+ struct xfs_log_vec *lv;
+ uint32_t record_cnt = 0;
+ uint32_t data_cnt = 0;
+ int error = 0;
+ int log_offset;
+
+ if (ticket->t_curr_res < 0) {
+ xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
+ "ctx ticket reservation ran out. Need to up reservation");
+ xlog_print_tic_res(log->l_mp, ticket);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ }
+
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ &log_offset);
+ if (error)
+ return error;
+
+ ASSERT(log_offset <= iclog->ic_size - 1);
+
+ /*
+ * If we have a context pointer, pass it the first iclog we are
+ * writing to so it can record state needed for iclog write
+ * ordering.
+ */
+ if (ctx)
+ xlog_cil_set_ctx_write_state(ctx, iclog);
+
+ list_for_each_entry(lv, lv_chain, lv_list) {
+ /*
+ * If the entire log vec does not fit in the iclog, punt it to
+ * the partial copy loop which can handle this case.
+ */
+ if (lv->lv_niovecs &&
+ lv->lv_bytes > iclog->ic_size - log_offset) {
+ error = xlog_write_partial(lv, ticket, &iclog,
+ &log_offset, &len, &record_cnt,
+ &data_cnt);
+ if (error) {
+ /*
+ * We have no iclog to release, so just return
+ * the error immediately.
+ */
+ return error;
+ }
+ } else {
+ xlog_write_full(lv, ticket, iclog, &log_offset,
+ &len, &record_cnt, &data_cnt);
+ }
+ }
+ ASSERT(len == 0);
+
+ /*
+ * We've already been guaranteed that the last writes will fit inside
+ * the current iclog, and hence it will already have the space used by
+ * those writes accounted to it. Hence we do not need to update the
+ * iclog with the number of bytes written here.
+ */
+ spin_lock(&log->l_icloglock);
+ xlog_state_finish_copy(log, iclog, record_cnt, 0);
+ error = xlog_state_release_iclog(log, iclog, ticket);
+ spin_unlock(&log->l_icloglock);
+
+ return error;
+}
+
+static void
+xlog_state_activate_iclog(
+ struct xlog_in_core *iclog,
+ int *iclogs_changed)
+{
+ ASSERT(list_empty_careful(&iclog->ic_callbacks));
+ trace_xlog_iclog_activate(iclog, _RET_IP_);
+
+ /*
+ * If the number of ops in this iclog indicates it just contains the
+ * dummy transaction, we can change state into IDLE (the second time
+ * around). Otherwise we should change the state into NEED (a dummy
+ * record is still needed). We don't need to cover the dummy itself.
+ */
+ if (*iclogs_changed == 0 &&
+ iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) {
+ *iclogs_changed = 1;
+ } else {
+ /*
+ * We have two dirty iclogs so start over. This could also be
+ * num of ops indicating this is not the dummy going out.
+ */
+ *iclogs_changed = 2;
+ }
+
+ iclog->ic_state = XLOG_STATE_ACTIVE;
+ iclog->ic_offset = 0;
+ iclog->ic_header.h_num_logops = 0;
+ memset(iclog->ic_header.h_cycle_data, 0,
+ sizeof(iclog->ic_header.h_cycle_data));
+ iclog->ic_header.h_lsn = 0;
+ iclog->ic_header.h_tail_lsn = 0;
+}
+
+/*
+ * Loop through all iclogs and mark all iclogs currently marked DIRTY as
+ * ACTIVE after iclog I/O has completed.
+ */
+static void
+xlog_state_activate_iclogs(
+ struct xlog *log,
+ int *iclogs_changed)
+{
+ struct xlog_in_core *iclog = log->l_iclog;
+
+ do {
+ if (iclog->ic_state == XLOG_STATE_DIRTY)
+ xlog_state_activate_iclog(iclog, iclogs_changed);
+ /*
+ * The ordering of marking iclogs ACTIVE must be maintained, so
+ * an iclog doesn't become ACTIVE beyond one that is SYNCING.
+ */
+ else if (iclog->ic_state != XLOG_STATE_ACTIVE)
+ break;
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+}
+
+static int
+xlog_covered_state(
+ int prev_state,
+ int iclogs_changed)
+{
+ /*
+ * We go to NEED for any non-covering writes. We go to NEED2 if we just
+ * wrote the first covering record (DONE). We go to IDLE if we just
+ * wrote the second covering record (DONE2) and remain in IDLE until a
+ * non-covering write occurs.
+ */
+ switch (prev_state) {
+ case XLOG_STATE_COVER_IDLE:
+ if (iclogs_changed == 1)
+ return XLOG_STATE_COVER_IDLE;
+ fallthrough;
+ case XLOG_STATE_COVER_NEED:
+ case XLOG_STATE_COVER_NEED2:
+ break;
+ case XLOG_STATE_COVER_DONE:
+ if (iclogs_changed == 1)
+ return XLOG_STATE_COVER_NEED2;
+ break;
+ case XLOG_STATE_COVER_DONE2:
+ if (iclogs_changed == 1)
+ return XLOG_STATE_COVER_IDLE;
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ return XLOG_STATE_COVER_NEED;
+}
+
+STATIC void
+xlog_state_clean_iclog(
+ struct xlog *log,
+ struct xlog_in_core *dirty_iclog)
+{
+ int iclogs_changed = 0;
+
+ trace_xlog_iclog_clean(dirty_iclog, _RET_IP_);
+
+ dirty_iclog->ic_state = XLOG_STATE_DIRTY;
+
+ xlog_state_activate_iclogs(log, &iclogs_changed);
+ wake_up_all(&dirty_iclog->ic_force_wait);
+
+ if (iclogs_changed) {
+ log->l_covered_state = xlog_covered_state(log->l_covered_state,
+ iclogs_changed);
+ }
+}
+
+STATIC xfs_lsn_t
+xlog_get_lowest_lsn(
+ struct xlog *log)
+{
+ struct xlog_in_core *iclog = log->l_iclog;
+ xfs_lsn_t lowest_lsn = 0, lsn;
+
+ do {
+ if (iclog->ic_state == XLOG_STATE_ACTIVE ||
+ iclog->ic_state == XLOG_STATE_DIRTY)
+ continue;
+
+ lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0)
+ lowest_lsn = lsn;
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+
+ return lowest_lsn;
+}
+
+/*
+ * Completion of an iclog IO does not imply that a transaction has completed, as
+ * transactions can be large enough to span many iclogs. We cannot change the
+ * tail of the log half way through a transaction as this may be the only
+ * transaction in the log and moving the tail to point to the middle of it
+ * will prevent recovery from finding the start of the transaction. Hence we
+ * should only update the last_sync_lsn if this iclog contains transaction
+ * completion callbacks on it.
+ *
+ * We have to do this before we drop the icloglock to ensure we are the only one
+ * that can update it.
+ *
+ * If we are moving the last_sync_lsn forwards, we also need to ensure we kick
+ * the reservation grant head pushing. This is due to the fact that the push
+ * target is bound by the current last_sync_lsn value. Hence if we have a large
+ * amount of log space bound up in this committing transaction then the
+ * last_sync_lsn value may be the limiting factor preventing tail pushing from
+ * freeing space in the log. Hence once we've updated the last_sync_lsn we
+ * should push the AIL to ensure the push target (and hence the grant head) is
+ * no longer bound by the old log head location and can move forwards and make
+ * progress again.
+ */
+static void
+xlog_state_set_callback(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ xfs_lsn_t header_lsn)
+{
+ trace_xlog_iclog_callback(iclog, _RET_IP_);
+ iclog->ic_state = XLOG_STATE_CALLBACK;
+
+ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
+ header_lsn) <= 0);
+
+ if (list_empty_careful(&iclog->ic_callbacks))
+ return;
+
+ atomic64_set(&log->l_last_sync_lsn, header_lsn);
+ xlog_grant_push_ail(log, 0);
+}
+
+/*
+ * Return true if we need to stop processing, false to continue to the next
+ * iclog. The caller will need to run callbacks if the iclog is returned in the
+ * XLOG_STATE_CALLBACK state.
+ */
+static bool
+xlog_state_iodone_process_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog)
+{
+ xfs_lsn_t lowest_lsn;
+ xfs_lsn_t header_lsn;
+
+ switch (iclog->ic_state) {
+ case XLOG_STATE_ACTIVE:
+ case XLOG_STATE_DIRTY:
+ /*
+ * Skip all iclogs in the ACTIVE & DIRTY states:
+ */
+ return false;
+ case XLOG_STATE_DONE_SYNC:
+ /*
+ * Now that we have an iclog that is in the DONE_SYNC state, do
+ * one more check here to see if we have chased our tail around.
+ * If this is not the lowest lsn iclog, then we will leave it
+ * for another completion to process.
+ */
+ header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ lowest_lsn = xlog_get_lowest_lsn(log);
+ if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
+ return false;
+ xlog_state_set_callback(log, iclog, header_lsn);
+ return false;
+ default:
+ /*
+ * Can only perform callbacks in order. Since this iclog is not
+ * in the DONE_SYNC state, we skip the rest and just try to
+ * clean up.
+ */
+ return true;
+ }
+}
+
+/*
+ * Loop over all the iclogs, running attached callbacks on them. Return true if
+ * we ran any callbacks, indicating that we dropped the icloglock. We don't need
+ * to handle transient shutdown state here at all because
+ * xlog_state_shutdown_callbacks() will be run to do the necessary shutdown
+ * cleanup of the callbacks.
+ */
+static bool
+xlog_state_do_iclog_callbacks(
+ struct xlog *log)
+ __releases(&log->l_icloglock)
+ __acquires(&log->l_icloglock)
+{
+ struct xlog_in_core *first_iclog = log->l_iclog;
+ struct xlog_in_core *iclog = first_iclog;
+ bool ran_callback = false;
+
+ do {
+ LIST_HEAD(cb_list);
+
+ if (xlog_state_iodone_process_iclog(log, iclog))
+ break;
+ if (iclog->ic_state != XLOG_STATE_CALLBACK) {
+ iclog = iclog->ic_next;
+ continue;
+ }
+ list_splice_init(&iclog->ic_callbacks, &cb_list);
+ spin_unlock(&log->l_icloglock);
+
+ trace_xlog_iclog_callbacks_start(iclog, _RET_IP_);
+ xlog_cil_process_committed(&cb_list);
+ trace_xlog_iclog_callbacks_done(iclog, _RET_IP_);
+ ran_callback = true;
+
+ spin_lock(&log->l_icloglock);
+ xlog_state_clean_iclog(log, iclog);
+ iclog = iclog->ic_next;
+ } while (iclog != first_iclog);
+
+ return ran_callback;
+}
+
+
+/*
+ * Loop running iclog completion callbacks until there are no more iclogs in a
+ * state that can run callbacks.
+ */
+STATIC void
+xlog_state_do_callback(
+ struct xlog *log)
+{
+ int flushcnt = 0;
+ int repeats = 0;
+
+ spin_lock(&log->l_icloglock);
+ while (xlog_state_do_iclog_callbacks(log)) {
+ if (xlog_is_shutdown(log))
+ break;
+
+ if (++repeats > 5000) {
+ flushcnt += repeats;
+ repeats = 0;
+ xfs_warn(log->l_mp,
+ "%s: possible infinite loop (%d iterations)",
+ __func__, flushcnt);
+ }
+ }
+
+ if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE)
+ wake_up_all(&log->l_flush_wait);
+
+ spin_unlock(&log->l_icloglock);
+}
+
+
+/*
+ * Finish transitioning this iclog to the dirty state.
+ *
+ * Callbacks could take time, so they are done outside the scope of the
+ * global state machine log lock.
+ */
+STATIC void
+xlog_state_done_syncing(
+ struct xlog_in_core *iclog)
+{
+ struct xlog *log = iclog->ic_log;
+
+ spin_lock(&log->l_icloglock);
+ ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
+ trace_xlog_iclog_sync_done(iclog, _RET_IP_);
+
+ /*
+ * If we got an error, either on the first buffer, or in the case of
+ * split log writes, on the second, we shut down the file system and
+ * no iclogs should ever be attempted to be written to disk again.
+ */
+ if (!xlog_is_shutdown(log)) {
+ ASSERT(iclog->ic_state == XLOG_STATE_SYNCING);
+ iclog->ic_state = XLOG_STATE_DONE_SYNC;
+ }
+
+ /*
+ * Someone could be sleeping prior to writing out the next
+ * iclog buffer; we wake them all. One will get to do the
+ * I/O and the others will wait for the result.
+ */
+ wake_up_all(&iclog->ic_write_wait);
+ spin_unlock(&log->l_icloglock);
+ xlog_state_do_callback(log);
+}
+
+/*
+ * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
+ * sleep. We wait on the flush queue on the head iclog as that should be
+ * the first iclog to complete flushing. Hence if all iclogs are syncing,
+ * we will wait here and all new writes will sleep until a sync completes.
+ *
+ * The in-core logs are used in a circular fashion. They are not used
+ * out-of-order even when an iclog past the head is free.
+ *
+ * return:
+ * * log_offset where xlog_write() can start writing into the in-core
+ * log's data space.
+ * * in-core log pointer to which xlog_write() should write.
+ * * boolean indicating this is a continued write to an in-core log.
+ * If this is the last write, then the in-core log's offset field
+ * needs to be incremented, depending on the amount of data which
+ * is copied.
+ */
+STATIC int
+xlog_state_get_iclog_space(
+ struct xlog *log,
+ int len,
+ struct xlog_in_core **iclogp,
+ struct xlog_ticket *ticket,
+ int *logoffsetp)
+{
+ int log_offset;
+ xlog_rec_header_t *head;
+ xlog_in_core_t *iclog;
+
+restart:
+ spin_lock(&log->l_icloglock);
+ if (xlog_is_shutdown(log)) {
+ spin_unlock(&log->l_icloglock);
+ return -EIO;
+ }
+
+ iclog = log->l_iclog;
+ if (iclog->ic_state != XLOG_STATE_ACTIVE) {
+ XFS_STATS_INC(log->l_mp, xs_log_noiclogs);
+
+ /* Wait for log writes to have flushed */
+ xlog_wait(&log->l_flush_wait, &log->l_icloglock);
+ goto restart;
+ }
+
+ head = &iclog->ic_header;
+
+ atomic_inc(&iclog->ic_refcnt); /* prevents sync */
+ log_offset = iclog->ic_offset;
+
+ trace_xlog_iclog_get_space(iclog, _RET_IP_);
+
+ /* On the 1st write to an iclog, figure out lsn. This works
+ * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are
+ * committing to. If the offset is set, that's how many blocks
+ * must be written.
+ */
+ if (log_offset == 0) {
+ ticket->t_curr_res -= log->l_iclog_hsize;
+ head->h_cycle = cpu_to_be32(log->l_curr_cycle);
+ head->h_lsn = cpu_to_be64(
+ xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
+ ASSERT(log->l_curr_block >= 0);
+ }
+
+ /* If there is enough room to write everything, then do it. Otherwise,
+ * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC
+ * bit is on, so this will get flushed out. Don't update ic_offset
+ * until you know exactly how many bytes get copied. Therefore, wait
+ * until later to update ic_offset.
+ *
+ * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
+ * can fit into remaining data section.
+ */
+ if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
+ int error = 0;
+
+ xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
+
+ /*
+ * If we are the only one writing to this iclog, sync it to
+ * disk. We need to do an atomic compare and decrement here to
+ * avoid racing with concurrent atomic_dec_and_lock() calls in
+ * xlog_state_release_iclog() when there is more than one
+ * reference to the iclog.
+ */
+ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
+ error = xlog_state_release_iclog(log, iclog, ticket);
+ spin_unlock(&log->l_icloglock);
+ if (error)
+ return error;
+ goto restart;
+ }
+
+ /* Do we have enough room to write the full amount in the remainder
+ * of this iclog? Or must we continue a write on the next iclog and
+ * mark this iclog as completely taken? In the case where we switch
+ * iclogs (to mark it taken), this particular iclog will release/sync
+ * to disk in xlog_write().
+ */
+ if (len <= iclog->ic_size - iclog->ic_offset)
+ iclog->ic_offset += len;
+ else
+ xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
+ *iclogp = iclog;
+
+ ASSERT(iclog->ic_offset <= iclog->ic_size);
+ spin_unlock(&log->l_icloglock);
+
+ *logoffsetp = log_offset;
+ return 0;
+}
+
+/*
+ * The first cnt-1 times a ticket goes through here we don't need to move the
+ * grant write head because the permanent reservation has reserved cnt times the
+ * unit amount. Release part of the current permanent unit reservation and
+ * reset the current reservation to be one unit's worth. Also move the grant
+ * reservation head forward.
+ */
+void
+xfs_log_ticket_regrant(
+ struct xlog *log,
+ struct xlog_ticket *ticket)
+{
+ trace_xfs_log_ticket_regrant(log, ticket);
+
+ if (ticket->t_cnt > 0)
+ ticket->t_cnt--;
+
+ xlog_grant_sub_space(log, &log->l_reserve_head.grant,
+ ticket->t_curr_res);
+ xlog_grant_sub_space(log, &log->l_write_head.grant,
+ ticket->t_curr_res);
+ ticket->t_curr_res = ticket->t_unit_res;
+
+ trace_xfs_log_ticket_regrant_sub(log, ticket);
+
+ /* just return if we still have some of the pre-reserved space */
+ if (!ticket->t_cnt) {
+ xlog_grant_add_space(log, &log->l_reserve_head.grant,
+ ticket->t_unit_res);
+ trace_xfs_log_ticket_regrant_exit(log, ticket);
+
+ ticket->t_curr_res = ticket->t_unit_res;
+ }
+
+ xfs_log_ticket_put(ticket);
+}
+
+/*
+ * Give back the space left from a reservation.
+ *
+ * All the information we need to make a correct determination of space left
+ * is present. For non-permanent reservations, things are quite easy. The
+ * count should have been decremented to zero. We only need to deal with the
+ * space remaining in the current reservation part of the ticket. If the
+ * ticket contains a permanent reservation, there may be left over space which
+ * needs to be released. A count of N means that N-1 refills of the current
+ * reservation can be done before we need to ask for more space. The first
+ * one goes to fill up the first current reservation. Once we run out of
+ * space, the count will stay at zero and the only space remaining will be
+ * in the current reservation field.
+ */
+void
+xfs_log_ticket_ungrant(
+ struct xlog *log,
+ struct xlog_ticket *ticket)
+{
+ int bytes;
+
+ trace_xfs_log_ticket_ungrant(log, ticket);
+
+ if (ticket->t_cnt > 0)
+ ticket->t_cnt--;
+
+ trace_xfs_log_ticket_ungrant_sub(log, ticket);
+
+ /*
+ * If this is a permanent reservation ticket, we may be able to free
+ * up more space based on the remaining count.
+ */
+ bytes = ticket->t_curr_res;
+ if (ticket->t_cnt > 0) {
+ ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
+ bytes += ticket->t_unit_res*ticket->t_cnt;
+ }
+
+ xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
+ xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
+
+ trace_xfs_log_ticket_ungrant_exit(log, ticket);
+
+ xfs_log_space_wake(log->l_mp);
+ xfs_log_ticket_put(ticket);
+}
+
+/*
+ * This routine will mark the current iclog in the ring as WANT_SYNC and move
+ * the current iclog pointer to the next iclog in the ring.
+ */
+void
+xlog_state_switch_iclogs(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ int eventual_size)
+{
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ assert_spin_locked(&log->l_icloglock);
+ trace_xlog_iclog_switch(iclog, _RET_IP_);
+
+ if (!eventual_size)
+ eventual_size = iclog->ic_offset;
+ iclog->ic_state = XLOG_STATE_WANT_SYNC;
+ iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block);
+ log->l_prev_block = log->l_curr_block;
+ log->l_prev_cycle = log->l_curr_cycle;
+
+ /* roll log?: ic_offset changed later */
+ log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
+
+ /* Round up to next log-sunit */
+ if (log->l_iclog_roundoff > BBSIZE) {
+ uint32_t sunit_bb = BTOBB(log->l_iclog_roundoff);
+ log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
+ }
+
+ if (log->l_curr_block >= log->l_logBBsize) {
+ /*
+ * Rewind the current block before the cycle is bumped to make
+ * sure that the combined LSN never transiently moves forward
+ * when the log wraps to the next cycle. This is to support the
+ * unlocked sample of these fields from xlog_valid_lsn(). Most
+ * other cases should acquire l_icloglock.
+ */
+ log->l_curr_block -= log->l_logBBsize;
+ ASSERT(log->l_curr_block >= 0);
+ smp_wmb();
+ log->l_curr_cycle++;
+ if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
+ log->l_curr_cycle++;
+ }
+ ASSERT(iclog == log->l_iclog);
+ log->l_iclog = iclog->ic_next;
+}
+
+/*
+ * Force the iclog to disk and check if the iclog has been completed before
+ * xlog_force_iclog() returns. This can happen on synchronous (e.g.
+ * pmem) or fast async storage because we drop the icloglock to issue the IO.
+ * If completion has already occurred, tell the caller so that it can avoid an
+ * unnecessary wait on the iclog.
+ */
+static int
+xlog_force_and_check_iclog(
+ struct xlog_in_core *iclog,
+ bool *completed)
+{
+ xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ int error;
+
+ *completed = false;
+ error = xlog_force_iclog(iclog);
+ if (error)
+ return error;
+
+ /*
+ * If the iclog has already been completed and reused the header LSN
+ * will have been rewritten by completion
+ */
+ if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
+ *completed = true;
+ return 0;
+}
+
+/*
+ * Write out all data in the in-core log as of this exact moment in time.
+ *
+ * Data may be written to the in-core log during this call. However,
+ * we don't guarantee this data will be written out. A change from past
+ * implementation means this routine will *not* write out zero length LRs.
+ *
+ * Basically, we try and perform an intelligent scan of the in-core logs.
+ * If we determine there is no flushable data, we just return. There is no
+ * flushable data if:
+ *
+ * 1. the current iclog is active and has no data; the previous iclog
+ * is in the active or dirty state.
+ * 2. the current iclog is dirty, and the previous iclog is in the
+ * active or dirty state.
+ *
+ * We may sleep if:
+ *
+ * 1. the current iclog is not in the active nor dirty state.
+ * 2. the current iclog is dirty, and the previous iclog is not in the
+ * active nor dirty state.
+ * 3. the current iclog is active, and there is another thread writing
+ * to this particular iclog.
+ * 4. a) the current iclog is active and has no other writers
+ * b) when we return from flushing out this iclog, it is still
+ * not in the active nor dirty state.
+ */
+int
+xfs_log_force(
+ struct xfs_mount *mp,
+ uint flags)
+{
+ struct xlog *log = mp->m_log;
+ struct xlog_in_core *iclog;
+
+ XFS_STATS_INC(mp, xs_log_force);
+ trace_xfs_log_force(mp, 0, _RET_IP_);
+
+ xlog_cil_force(log);
+
+ spin_lock(&log->l_icloglock);
+ if (xlog_is_shutdown(log))
+ goto out_error;
+
+ iclog = log->l_iclog;
+ trace_xlog_iclog_force(iclog, _RET_IP_);
+
+ if (iclog->ic_state == XLOG_STATE_DIRTY ||
+ (iclog->ic_state == XLOG_STATE_ACTIVE &&
+ atomic_read(&iclog->ic_refcnt) == 0 && iclog->ic_offset == 0)) {
+ /*
+ * If the head is dirty or (active and empty), then we need to
+ * look at the previous iclog.
+ *
+ * If the previous iclog is active or dirty we are done. There
+ * is nothing to sync out. Otherwise, we attach ourselves to the
+ * previous iclog and go to sleep.
+ */
+ iclog = iclog->ic_prev;
+ } else if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+ if (atomic_read(&iclog->ic_refcnt) == 0) {
+ /* We have exclusive access to this iclog. */
+ bool completed;
+
+ if (xlog_force_and_check_iclog(iclog, &completed))
+ goto out_error;
+
+ if (completed)
+ goto out_unlock;
+ } else {
+ /*
+ * Someone else is still writing to this iclog, so we
+ * need to ensure that when they release the iclog it
+ * gets synced immediately as we may be waiting on it.
+ */
+ xlog_state_switch_iclogs(log, iclog, 0);
+ }
+ }
+
+ /*
+ * The iclog we are about to wait on may contain the checkpoint pushed
+ * by the above xlog_cil_force() call, but it may not have been pushed
+ * to disk yet. Like the ACTIVE case above, we need to make sure caches
+ * are flushed when this iclog is written.
+ */
+ if (iclog->ic_state == XLOG_STATE_WANT_SYNC)
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+
+ if (flags & XFS_LOG_SYNC)
+ return xlog_wait_on_iclog(iclog);
+out_unlock:
+ spin_unlock(&log->l_icloglock);
+ return 0;
+out_error:
+ spin_unlock(&log->l_icloglock);
+ return -EIO;
+}
+
+/*
+ * Force the log to a specific LSN.
+ *
+ * If an iclog with that lsn can be found:
+ * If it is in the DIRTY state, just return.
+ * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
+ * state and go to sleep or return.
+ * If it is in any other state, go to sleep or return.
+ *
+ * Synchronous forces are implemented with a wait queue. All callers trying
+ * to force a given lsn to disk must wait on the queue attached to the
+ * specific in-core log. When the given in-core log finally completes its write
+ * to disk, that thread will wake up all threads waiting on the queue.
+ */
+static int
+xlog_force_lsn(
+ struct xlog *log,
+ xfs_lsn_t lsn,
+ uint flags,
+ int *log_flushed,
+ bool already_slept)
+{
+ struct xlog_in_core *iclog;
+ bool completed;
+
+ spin_lock(&log->l_icloglock);
+ if (xlog_is_shutdown(log))
+ goto out_error;
+
+ iclog = log->l_iclog;
+ while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
+ trace_xlog_iclog_force_lsn(iclog, _RET_IP_);
+ iclog = iclog->ic_next;
+ if (iclog == log->l_iclog)
+ goto out_unlock;
+ }
+
+ switch (iclog->ic_state) {
+ case XLOG_STATE_ACTIVE:
+ /*
+ * We sleep here if we haven't already slept (e.g. this is the
+ * first time we've looked at the correct iclog buf) and the
+ * buffer before us is going to be sync'ed. The reason for this
+ * is that if we are doing sync transactions here, by waiting
+ * for the previous I/O to complete, we can allow a few more
+ * transactions into this iclog before we close it down.
+ *
+ * Otherwise, we mark the buffer WANT_SYNC, and bump up the
+ * refcnt so we can release the log (which drops the ref count).
+ * The state switch keeps new transaction commits from using
+ * this buffer. When the current commits finish writing into
+ * the buffer, the refcount will drop to zero and the buffer
+ * will go out then.
+ */
+ if (!already_slept &&
+ (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC ||
+ iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) {
+ xlog_wait(&iclog->ic_prev->ic_write_wait,
+ &log->l_icloglock);
+ return -EAGAIN;
+ }
+ if (xlog_force_and_check_iclog(iclog, &completed))
+ goto out_error;
+ if (log_flushed)
+ *log_flushed = 1;
+ if (completed)
+ goto out_unlock;
+ break;
+ case XLOG_STATE_WANT_SYNC:
+ /*
+ * This iclog may contain the checkpoint pushed by the
+ * xlog_cil_force_seq() call, but there are other writers still
+ * accessing it so it hasn't been pushed to disk yet. Like the
+ * ACTIVE case above, we need to make sure caches are flushed
+ * when this iclog is written.
+ */
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+ break;
+ default:
+ /*
+ * The entire checkpoint was written by the CIL force and is on
+ * its way to disk already. It will be stable when it
+ * completes, so we don't need to manipulate caches here at all.
+ * We just need to wait for completion if necessary.
+ */
+ break;
+ }
+
+ if (flags & XFS_LOG_SYNC)
+ return xlog_wait_on_iclog(iclog);
+out_unlock:
+ spin_unlock(&log->l_icloglock);
+ return 0;
+out_error:
+ spin_unlock(&log->l_icloglock);
+ return -EIO;
+}
+
+/*
+ * Force the log to a specific checkpoint sequence.
+ *
+ * First force the CIL so that all the required changes have been flushed to the
+ * iclogs. If the CIL force completed, it will return a commit LSN that indicates
+ * the iclog that needs to be flushed to stable storage. If the caller needs
+ * a synchronous log force, we will wait on the iclog with the LSN returned by
+ * xlog_cil_force_seq() to be completed.
+ */
+int
+xfs_log_force_seq(
+ struct xfs_mount *mp,
+ xfs_csn_t seq,
+ uint flags,
+ int *log_flushed)
+{
+ struct xlog *log = mp->m_log;
+ xfs_lsn_t lsn;
+ int ret;
+ ASSERT(seq != 0);
+
+ XFS_STATS_INC(mp, xs_log_force);
+ trace_xfs_log_force(mp, seq, _RET_IP_);
+
+ lsn = xlog_cil_force_seq(log, seq);
+ if (lsn == NULLCOMMITLSN)
+ return 0;
+
+ ret = xlog_force_lsn(log, lsn, flags, log_flushed, false);
+ if (ret == -EAGAIN) {
+ XFS_STATS_INC(mp, xs_log_force_sleep);
+ ret = xlog_force_lsn(log, lsn, flags, log_flushed, true);
+ }
+ return ret;
+}
+
+/*
+ * Free a used ticket when its refcount falls to zero.
+ */
+void
+xfs_log_ticket_put(
+ xlog_ticket_t *ticket)
+{
+ ASSERT(atomic_read(&ticket->t_ref) > 0);
+ if (atomic_dec_and_test(&ticket->t_ref))
+ kmem_cache_free(xfs_log_ticket_cache, ticket);
+}
+
+xlog_ticket_t *
+xfs_log_ticket_get(
+ xlog_ticket_t *ticket)
+{
+ ASSERT(atomic_read(&ticket->t_ref) > 0);
+ atomic_inc(&ticket->t_ref);
+ return ticket;
+}
+
+/*
+ * Figure out the total log space unit (in bytes) that would be
+ * required for a log ticket.
+ */
+static int
+xlog_calc_unit_res(
+ struct xlog *log,
+ int unit_bytes,
+ int *niclogs)
+{
+ int iclog_space;
+ uint num_headers;
+
+ /*
+ * Permanent reservations have up to 'cnt'-1 active log operations
+ * in the log. A unit in this case is the amount of space for one
+ * of these log operations. Normal reservations have a cnt of 1
+ * and their unit amount is the total amount of space required.
+ *
+ * The following lines of code account for non-transaction data
+ * which occupy space in the on-disk log.
+ *
+ * Normal form of a transaction is:
+ * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph>
+ * and then there are LR hdrs, split-recs and roundoff at end of syncs.
+ *
+ * We need to account for all the leadup data and trailer data
+ * around the transaction data.
+ * And then we need to account for the worst case in terms of using
+ * more space.
+ * The worst case will happen if:
+ * - the placement of the transaction happens to be such that the
+ * roundoff is at its maximum
+ * - the transaction data is synced before the commit record is synced
+ * i.e. <transaction-data><roundoff> | <commit-rec><roundoff>
+ * Therefore the commit record is in its own Log Record.
+ * This can happen as the commit record is called with its
+ * own region to xlog_write().
+ * This then means that in the worst case, roundoff can happen for
+ * the commit-rec as well.
+ * The commit-rec is smaller than padding in this scenario and so it is
+ * not added separately.
+ */
+
+ /* for trans header */
+ unit_bytes += sizeof(xlog_op_header_t);
+ unit_bytes += sizeof(xfs_trans_header_t);
+
+ /* for start-rec */
+ unit_bytes += sizeof(xlog_op_header_t);
+
+ /*
+ * for LR headers - the space for data in an iclog is the size minus
+ * the space used for the headers. If we use the iclog size, then we
+ * undercalculate the number of headers required.
+ *
+ * Furthermore - the addition of op headers for split-recs might
+ * increase the space required enough to require more log and op
+ * headers, so take that into account too.
+ *
+ * IMPORTANT: This reservation makes the assumption that if this
+ * transaction is the first in an iclog and hence has the LR headers
+ * accounted to it, then the remaining space in the iclog is
+ * exclusively for this transaction. i.e. if the transaction is larger
+ * than the iclog, it will be the only thing in that iclog.
+ * Fundamentally, this means we must pass the entire log vector to
+ * xlog_write to guarantee this.
+ */
+ iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+ num_headers = howmany(unit_bytes, iclog_space);
+
+ /* for split-recs - ophdrs added when data split over LRs */
+ unit_bytes += sizeof(xlog_op_header_t) * num_headers;
+
+ /* add extra header reservations if we overrun */
+ while (!num_headers ||
+ howmany(unit_bytes, iclog_space) > num_headers) {
+ unit_bytes += sizeof(xlog_op_header_t);
+ num_headers++;
+ }
+ unit_bytes += log->l_iclog_hsize * num_headers;
+
+ /* for commit-rec LR header - note: padding will subsume the ophdr */
+ unit_bytes += log->l_iclog_hsize;
+
+ /* roundoff padding for transaction data and one for commit record */
+ unit_bytes += 2 * log->l_iclog_roundoff;
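+ /*
+ * Illustrative worst case: a 256k reservation written into 32k
+ * iclogs needs on the order of 256k/32k = 8 LR headers on top of
+ * the op/transaction headers, plus the commit record LR header and
+ * the two lots of roundoff accounted above.
+ */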
+
+ if (niclogs)
+ *niclogs = num_headers;
+ return unit_bytes;
+}
+
+int
+xfs_log_calc_unit_res(
+ struct xfs_mount *mp,
+ int unit_bytes)
+{
+ return xlog_calc_unit_res(mp->m_log, unit_bytes, NULL);
+}
+
+/*
+ * Allocate and initialise a new log ticket.
+ */
+struct xlog_ticket *
+xlog_ticket_alloc(
+ struct xlog *log,
+ int unit_bytes,
+ int cnt,
+ bool permanent)
+{
+ struct xlog_ticket *tic;
+ int unit_res;
+
+ tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs);
+
+ atomic_set(&tic->t_ref, 1);
+ tic->t_task = current;
+ INIT_LIST_HEAD(&tic->t_queue);
+ tic->t_unit_res = unit_res;
+ tic->t_curr_res = unit_res;
+ tic->t_cnt = cnt;
+ tic->t_ocnt = cnt;
+ tic->t_tid = get_random_u32();
+ if (permanent)
+ tic->t_flags |= XLOG_TIC_PERM_RESERV;
+
+ return tic;
+}
+
+#if defined(DEBUG)
+/*
+ * Check to make sure the grant write head didn't just overlap the tail. If
+ * the cycles are the same, we can't be overlapping. Otherwise, make sure that
+ * the cycles differ by exactly one and check the byte count.
+ *
+ * This check is run unlocked, so it can give false positives. Rather than assert
+ * on failures, use a warn-once flag and a panic tag to allow the admin to
+ * determine if they want to panic the machine when such an error occurs. For
+ * debug kernels this will have the same effect as using an assert but, unlike
+ * an assert, it can be turned off at runtime.
+ */
+STATIC void
+xlog_verify_grant_tail(
+ struct xlog *log)
+{
+ int tail_cycle, tail_blocks;
+ int cycle, space;
+
+ xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space);
+ xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
+ if (tail_cycle != cycle) {
+ if (cycle - 1 != tail_cycle &&
+ !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) {
+ xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
+ "%s: cycle - 1 != tail_cycle", __func__);
+ }
+
+ if (space > BBTOB(tail_blocks) &&
+ !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) {
+ xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
+ "%s: space > BBTOB(tail_blocks)", __func__);
+ }
+ }
+}
+
+/* check if it will fit */
+STATIC void
+xlog_verify_tail_lsn(
+ struct xlog *log,
+ struct xlog_in_core *iclog)
+{
+ xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn);
+ int blocks;
+
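+ /*
+ * If the tail is in the same cycle as the previous write, the free
+ * space runs from the head around the physical end of the log back
+ * to the tail; otherwise the head has wrapped and the free space is
+ * just the distance from the head up to the tail block.
+ */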
+ if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
+ blocks =
+ log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
+ if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
+ xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
+ } else {
+ ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
+
+ if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
+ xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
+
+ blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
+ if (blocks < BTOBB(iclog->ic_offset) + 1)
+ xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
+ }
+}
+
+/*
+ * Perform a number of checks on the iclog before writing to disk.
+ *
+ * 1. Make sure the iclogs are still circular
+ * 2. Make sure we have a good magic number
+ * 3. Make sure we don't have magic numbers in the data
+ * 4. Check fields of each log operation header for:
+ * A. Valid client identifier
+ * B. tid ptr value falls in valid ptr space (user space code)
+ * C. Length in log record header is correct according to the
+ * individual operation headers within record.
+ * 5. When a bwrite will occur within 5 blocks of the front of the physical
+ * log, check the preceding blocks of the physical log to make sure all
+ * the cycle numbers agree with the current cycle number.
+ */
+STATIC void
+xlog_verify_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ int count)
+{
+ xlog_op_header_t *ophead;
+ xlog_in_core_t *icptr;
+ xlog_in_core_2_t *xhdr;
+ void *base_ptr, *ptr, *p;
+ ptrdiff_t field_offset;
+ uint8_t clientid;
+ int len, i, j, k, op_len;
+ int idx;
+
+ /* check validity of iclog pointers */
+ spin_lock(&log->l_icloglock);
+ icptr = log->l_iclog;
+ for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next)
+ ASSERT(icptr);
+
+ if (icptr != log->l_iclog)
+ xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
+ spin_unlock(&log->l_icloglock);
+
+ /* check log magic numbers */
+ if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
+ xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
+
+ base_ptr = ptr = &iclog->ic_header;
+ p = &iclog->ic_header;
+ for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) {
+ if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
+ xfs_emerg(log->l_mp, "%s: unexpected magic num",
+ __func__);
+ }
+
+ /* check fields */
+ len = be32_to_cpu(iclog->ic_header.h_num_logops);
+ base_ptr = ptr = iclog->ic_datap;
+ ophead = ptr;
+ xhdr = iclog->ic_data;
+ for (i = 0; i < len; i++) {
+ ophead = ptr;
+
+ /* clientid is only 1 byte */
+ p = &ophead->oh_clientid;
+ field_offset = p - base_ptr;
+ if (field_offset & 0x1ff) {
+ clientid = ophead->oh_clientid;
+ } else {
+ idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap);
+ if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
+ j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ clientid = xlog_get_client_id(
+ xhdr[j].hic_xheader.xh_cycle_data[k]);
+ } else {
+ clientid = xlog_get_client_id(
+ iclog->ic_header.h_cycle_data[idx]);
+ }
+ }
+ if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) {
+ xfs_warn(log->l_mp,
+ "%s: op %d invalid clientid %d op "PTR_FMT" offset 0x%lx",
+ __func__, i, clientid, ophead,
+ (unsigned long)field_offset);
+ }
+
+ /* check length */
+ p = &ophead->oh_len;
+ field_offset = p - base_ptr;
+ if (field_offset & 0x1ff) {
+ op_len = be32_to_cpu(ophead->oh_len);
+ } else {
+ idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap);
+ if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
+ j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]);
+ } else {
+ op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]);
+ }
+ }
+ ptr += sizeof(xlog_op_header_t) + op_len;
+ }
+}
+#endif
+
+/*
+ * Perform a forced shutdown on the log.
+ *
+ * This can be called from low level log code to trigger a shutdown, or from the
+ * high level mount shutdown code when the mount shuts down.
+ *
+ * Our main objectives here are to make sure that:
+ * a. if the shutdown was not due to a log IO error, flush the logs to
+ * disk. Anything modified after this is ignored.
+ * b. the log gets atomically marked 'XLOG_IO_ERROR' for all interested
+ * parties to find out. Nothing new gets queued after this is done.
+ * c. Tasks sleeping on log reservations, pinned objects and
+ * other resources get woken up.
+ * d. The mount is also marked as shut down so that log triggered shutdowns
+ * still behave the same as if they called xfs_forced_shutdown().
+ *
+ * Return true if the shutdown cause was a log IO error and we actually shut the
+ * log down.
+ */
+bool
+xlog_force_shutdown(
+ struct xlog *log,
+ uint32_t shutdown_flags)
+{
+ bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR);
+
+ if (!log)
+ return false;
+
+ /*
+ * Flush all the completed transactions to disk before marking the log
+ * being shut down. We need to do this first as shutting down the log
+ * before the force will prevent the log force from flushing the iclogs
+ * to disk.
+ *
+ * When we are in recovery, there are no transactions to flush, and
+ * we don't want to touch the log because we don't want to perturb the
+ * current head/tail for future recovery attempts. Hence we need to
+ * avoid a log force in this case.
+ *
+ * If we are shutting down due to a log IO error, then we must avoid
+ * trying to write the log as that may just result in more IO errors and
+ * an endless shutdown/force loop.
+ */
+ if (!log_error && !xlog_in_recovery(log))
+ xfs_log_force(log->l_mp, XFS_LOG_SYNC);
+
+ /*
+ * Atomically set the shutdown state. If the shutdown state is already
+ * set, then someone else is performing the shutdown and so we are done
+ * here. This should never happen because we should only ever get called
+ * once by the first shutdown caller.
+ *
+ * Much of the log state machine transitions assume that shutdown state
+ * cannot change once they hold the log->l_icloglock. Hence we need to
+ * hold that lock here, even though we use the atomic test_and_set_bit()
+ * operation to set the shutdown state.
+ */
+ spin_lock(&log->l_icloglock);
+ if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) {
+ spin_unlock(&log->l_icloglock);
+ return false;
+ }
+ spin_unlock(&log->l_icloglock);
+
+ /*
+ * If this log shutdown also sets the mount shutdown state, issue a
+ * shutdown warning message.
+ */
+ if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &log->l_mp->m_opstate)) {
+ xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+"Filesystem has been shut down due to log error (0x%x).",
+ shutdown_flags);
+ xfs_alert(log->l_mp,
+"Please unmount the filesystem and rectify the problem(s).");
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+ }
+
+ /*
+ * We don't want anybody waiting for log reservations after this. That
+ * means we have to wake up everybody queued up on reserveq as well as
+ * writeq. In addition, we make sure in xlog_{re}grant_log_space that
+ * we don't enqueue anything once the SHUTDOWN flag is set, and this
+ * action is protected by the grant locks.
+ */
+ xlog_grant_head_wake_all(&log->l_reserve_head);
+ xlog_grant_head_wake_all(&log->l_write_head);
+
+ /*
+ * Wake up everybody waiting on xfs_log_force. Wake the CIL push first
+ * as if the log writes were completed. The abort handling in the log
+ * item committed callback functions will do this again under lock to
+ * avoid races.
+ */
+ spin_lock(&log->l_cilp->xc_push_lock);
+ wake_up_all(&log->l_cilp->xc_start_wait);
+ wake_up_all(&log->l_cilp->xc_commit_wait);
+ spin_unlock(&log->l_cilp->xc_push_lock);
+
+ spin_lock(&log->l_icloglock);
+ xlog_state_shutdown_callbacks(log);
+ spin_unlock(&log->l_icloglock);
+
+ wake_up_var(&log->l_opstate);
+ return log_error;
+}
+
+STATIC int
+xlog_iclogs_empty(
+ struct xlog *log)
+{
+ xlog_in_core_t *iclog;
+
+ iclog = log->l_iclog;
+ do {
+ /* endianness does not matter here, zero is zero in
+ * any language.
+ */
+ if (iclog->ic_header.h_num_logops)
+ return 0;
+ iclog = iclog->ic_next;
+ } while (iclog != log->l_iclog);
+ return 1;
+}
+
+/*
+ * Verify that an LSN stamped into a piece of metadata is valid. This is
+ * intended for use in read verifiers on v5 superblocks.
+ */
+bool
+xfs_log_check_lsn(
+ struct xfs_mount *mp,
+ xfs_lsn_t lsn)
+{
+ struct xlog *log = mp->m_log;
+ bool valid;
+
+ /*
+ * norecovery mode skips mount-time log processing and unconditionally
+ * resets the in-core LSN. We can't validate in this mode, but
+ * modifications are not allowed anyway so just return true.
+ */
+ if (xfs_has_norecovery(mp))
+ return true;
+
+ /*
+ * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is
+ * handled by recovery and thus safe to ignore here.
+ */
+ if (lsn == NULLCOMMITLSN)
+ return true;
+
+ valid = xlog_valid_lsn(mp->m_log, lsn);
+
+ /* warn the user about what's gone wrong before verifier failure */
+ if (!valid) {
+ spin_lock(&log->l_icloglock);
+ xfs_warn(mp,
+"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). "
+"Please unmount and run xfs_repair (>= v4.3) to resolve.",
+ CYCLE_LSN(lsn), BLOCK_LSN(lsn),
+ log->l_curr_cycle, log->l_curr_block);
+ spin_unlock(&log->l_icloglock);
+ }
+
+ return valid;
+}
+
+/*
+ * Notify the log that we're about to start using a feature that is protected
+ * by a log incompat feature flag. This will prevent log covering from
+ * clearing those flags.
+ */
+void
+xlog_use_incompat_feat(
+ struct xlog *log)
+{
+ down_read(&log->l_incompat_users);
+}
+
+/* Notify the log that we've finished using log incompat features. */
+void
+xlog_drop_incompat_feat(
+ struct xlog *log)
+{
+ up_read(&log->l_incompat_users);
+}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
new file mode 100644
index 000000000..2728886c2
--- /dev/null
+++ b/fs/xfs/xfs_log.h
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_LOG_H__
+#define __XFS_LOG_H__
+
+struct xfs_cil_ctx;
+
+struct xfs_log_vec {
+ struct list_head lv_list; /* CIL lv chain ptrs */
+ uint32_t lv_order_id; /* chain ordering info */
+ int lv_niovecs; /* number of iovecs in lv */
+ struct xfs_log_iovec *lv_iovecp; /* iovec array */
+ struct xfs_log_item *lv_item; /* owner */
+ char *lv_buf; /* formatted buffer */
+ int lv_bytes; /* accounted space in buffer */
+ int lv_buf_len; /* aligned size of buffer */
+ int lv_size; /* size of allocated lv */
+};
+
+#define XFS_LOG_VEC_ORDERED (-1)
+
+/*
+ * Calculate the log iovec length for a given user buffer length. Intended to be
+ * used by ->iop_size implementations when sizing buffers of arbitrary
+ * alignments.
+ */
+static inline int
+xlog_calc_iovec_len(int len)
+{
+ return roundup(len, sizeof(uint32_t));
+}
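+
+/*
+ * For example, xlog_calc_iovec_len(17) returns 20: the 17 byte payload is
+ * rounded up to the next multiple of sizeof(uint32_t).
+ */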
+
+void *xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+ uint type);
+
+static inline void
+xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec,
+ int data_len)
+{
+ struct xlog_op_header *oph = vec->i_addr;
+ int len;
+
+ /*
+ * Always round up the length to the correct alignment so callers don't
+ * need to know anything about this log vec layout requirement. This
+ * means we have to zero the area that the data to be written does not cover.
+ * This is complicated by the fact that the payload region is offset into the
+ * logvec region by the opheader that tracks the payload.
+ */
+ len = xlog_calc_iovec_len(data_len);
+ if (len - data_len != 0) {
+ char *buf = vec->i_addr + sizeof(struct xlog_op_header);
+
+ memset(buf + data_len, 0, len - data_len);
+ }
+
+ /*
+ * The opheader tracks aligned payload length, whilst the logvec tracks
+ * the overall region length.
+ */
+ oph->oh_len = cpu_to_be32(len);
+
+ len += sizeof(struct xlog_op_header);
+ lv->lv_buf_len += len;
+ lv->lv_bytes += len;
+ vec->i_len = len;
+
+ /* Catch buffer overruns */
+ ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size);
+}
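+
+/*
+ * Accounting example (illustrative, assuming a 12 byte xlog_op_header):
+ * finishing an iovec with data_len = 17 rounds the payload up to len = 20,
+ * zeroes the 3 padding bytes after the data, stores 20 in oph->oh_len and
+ * then accounts 20 + 12 = 32 bytes to vec->i_len, lv->lv_buf_len and
+ * lv->lv_bytes.
+ */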
+
+/*
+ * Copy the amount of data requested by the caller into a new log iovec.
+ */
+static inline void *
+xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+ uint type, void *data, int len)
+{
+ void *buf;
+
+ buf = xlog_prepare_iovec(lv, vecp, type);
+ memcpy(buf, data, len);
+ xlog_finish_iovec(lv, *vecp, len);
+ return buf;
+}
+
+static inline void *
+xlog_copy_from_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+ const struct xfs_log_iovec *src)
+{
+ return xlog_copy_iovec(lv, vecp, src->i_type, src->i_addr, src->i_len);
+}
+
+/*
+ * By comparing each component, we don't have to worry about extra
+ * endian issues in treating two 32 bit numbers as one 64 bit number
+ */
+static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
+{
+ if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2))
+ return (CYCLE_LSN(lsn1)<CYCLE_LSN(lsn2))? -999 : 999;
+
+ if (BLOCK_LSN(lsn1) != BLOCK_LSN(lsn2))
+ return (BLOCK_LSN(lsn1)<BLOCK_LSN(lsn2))? -999 : 999;
+
+ return 0;
+}
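+
+/*
+ * For example, comparing an LSN of (cycle 12, block 100) against
+ * (cycle 12, block 200) returns a negative value: the cycles match, so the
+ * block numbers decide. Equal cycle and block compare as 0.
+ */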
+
+#define XFS_LSN_CMP(x,y) _lsn_cmp(x,y)
+
+/*
+ * Flags to xfs_log_force()
+ *
+ * XFS_LOG_SYNC: Synchronous force in-core log to disk
+ */
+#define XFS_LOG_SYNC 0x1
+
+/* Log manager interfaces */
+struct xfs_mount;
+struct xlog_in_core;
+struct xlog_ticket;
+struct xfs_log_item;
+struct xfs_item_ops;
+struct xfs_trans;
+struct xlog;
+
+int xfs_log_force(struct xfs_mount *mp, uint flags);
+int xfs_log_force_seq(struct xfs_mount *mp, xfs_csn_t seq, uint flags,
+ int *log_forced);
+int xfs_log_mount(struct xfs_mount *mp,
+ struct xfs_buftarg *log_target,
+ xfs_daddr_t start_block,
+ int num_bblocks);
+int xfs_log_mount_finish(struct xfs_mount *mp);
+void xfs_log_mount_cancel(struct xfs_mount *);
+xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
+xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
+void xfs_log_space_wake(struct xfs_mount *mp);
+int xfs_log_reserve(struct xfs_mount *mp, int length, int count,
+ struct xlog_ticket **ticket, bool permanent);
+int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
+void xfs_log_unmount(struct xfs_mount *mp);
+bool xfs_log_writable(struct xfs_mount *mp);
+
+struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
+void xfs_log_ticket_put(struct xlog_ticket *ticket);
+
+void xlog_cil_process_committed(struct list_head *list);
+bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
+
+void xfs_log_work_queue(struct xfs_mount *mp);
+int xfs_log_quiesce(struct xfs_mount *mp);
+void xfs_log_clean(struct xfs_mount *mp);
+bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
+
+xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
+bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags);
+
+void xlog_use_incompat_feat(struct xlog *log);
+void xlog_drop_incompat_feat(struct xlog *log);
+int xfs_attr_use_log_assist(struct xfs_mount *mp);
+
+#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 000000000..eccbfb99e
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,1897 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_extent_busy.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_trace.h"
+
+struct workqueue_struct *xfs_discard_wq;
+
+/*
+ * Allocate a new ticket. Failing to get a new ticket makes it really hard to
+ * recover, so we don't allow failure here. Also, we allocate in a context that
+ * we don't want to be issuing transactions from, so we need to tell the
+ * allocation code this as well.
+ *
+ * We don't reserve any space for the ticket - we are going to steal whatever
+ * space we require from transactions as they commit. To ensure we reserve all
+ * the space required, we need to set the current reservation of the ticket to
+ * zero so that we know to steal the initial transaction overhead from the
+ * first transaction commit.
+ */
+static struct xlog_ticket *
+xlog_cil_ticket_alloc(
+ struct xlog *log)
+{
+ struct xlog_ticket *tic;
+
+ tic = xlog_ticket_alloc(log, 0, 1, 0);
+
+ /*
+ * set the current reservation to zero so we know to steal the basic
+ * transaction overhead reservation from the first transaction commit.
+ */
+ tic->t_curr_res = 0;
+ tic->t_iclog_hdrs = 0;
+ return tic;
+}
+
+static inline void
+xlog_cil_set_iclog_hdr_count(struct xfs_cil *cil)
+{
+ struct xlog *log = cil->xc_log;
+
+ atomic_set(&cil->xc_iclog_hdrs,
+ (XLOG_CIL_BLOCKING_SPACE_LIMIT(log) /
+ (log->l_iclog_size - log->l_iclog_hsize)));
+}
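+
+/*
+ * Rough sizing example (illustrative, all numbers assumed): with an 8MB CIL
+ * blocking space limit, 32k iclogs and a 512 byte iclog header, roughly
+ * 8MB / 32256 bytes, i.e. around 260 iclog headers, are pre-accounted here
+ * for commits to steal from as the CIL fills.
+ */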
+
+/*
+ * Check if the current log item was first committed in this sequence.
+ * We can't rely on just the log item being in the CIL, we have to check
+ * the recorded commit sequence number.
+ *
+ * Note: for this to be used in a non-racy manner, it has to be called with
+ * CIL flushing locked out. As a result, it should only be used during the
+ * transaction commit process when deciding what to format into the item.
+ */
+static bool
+xlog_item_in_current_chkpt(
+ struct xfs_cil *cil,
+ struct xfs_log_item *lip)
+{
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
+ return false;
+
+ /*
+ * li_seq is written on the first commit of a log item to record the
+ * first checkpoint it is written to. Hence if it is different to the
+ * current sequence, we're in a new checkpoint.
+ */
+ return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
+}
+
+bool
+xfs_log_item_in_current_chkpt(
+ struct xfs_log_item *lip)
+{
+ return xlog_item_in_current_chkpt(lip->li_log->l_cilp, lip);
+}
+
+/*
+ * Unavoidable forward declaration - xlog_cil_push_work() calls
+ * xlog_cil_ctx_alloc() itself.
+ */
+static void xlog_cil_push_work(struct work_struct *work);
+
+static struct xfs_cil_ctx *
+xlog_cil_ctx_alloc(void)
+{
+ struct xfs_cil_ctx *ctx;
+
+ ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
+ INIT_LIST_HEAD(&ctx->committing);
+ INIT_LIST_HEAD(&ctx->busy_extents);
+ INIT_LIST_HEAD(&ctx->log_items);
+ INIT_LIST_HEAD(&ctx->lv_chain);
+ INIT_WORK(&ctx->push_work, xlog_cil_push_work);
+ return ctx;
+}
+
+/*
+ * Aggregate the CIL per cpu structures into global counts, lists, etc and
+ * clear the percpu state ready for the next context to use. This is called
+ * from the push code with the context lock held exclusively, hence nothing else
+ * will be accessing or modifying the per-cpu counters.
+ */
+static void
+xlog_cil_push_pcp_aggregate(
+ struct xfs_cil *cil,
+ struct xfs_cil_ctx *ctx)
+{
+ struct xlog_cil_pcp *cilpcp;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+
+ ctx->ticket->t_curr_res += cilpcp->space_reserved;
+ cilpcp->space_reserved = 0;
+
+ if (!list_empty(&cilpcp->busy_extents)) {
+ list_splice_init(&cilpcp->busy_extents,
+ &ctx->busy_extents);
+ }
+ if (!list_empty(&cilpcp->log_items))
+ list_splice_init(&cilpcp->log_items, &ctx->log_items);
+
+ /*
+ * We're in the middle of switching cil contexts. Reset the
+ * counter we use to detect when the current context is nearing
+ * full.
+ */
+ cilpcp->space_used = 0;
+ }
+}
+
+/*
+ * Aggregate the CIL per-cpu space used counters into the global atomic value.
+ * This is called when the per-cpu counter aggregation will first pass the soft
+ * limit threshold so we can switch to atomic counter aggregation for accurate
+ * detection of hard limit traversal.
+ */
+static void
+xlog_cil_insert_pcp_aggregate(
+ struct xfs_cil *cil,
+ struct xfs_cil_ctx *ctx)
+{
+ struct xlog_cil_pcp *cilpcp;
+ int cpu;
+ int count = 0;
+
+ /* Trigger atomic updates then aggregate only for the first caller */
+ if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags))
+ return;
+
+ for_each_online_cpu(cpu) {
+ int old, prev;
+
+ cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
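+ /*
+ * Zero the per-cpu counter atomically: keep retrying the cmpxchg()
+ * until the value it returns matches the value we sampled, i.e. the
+ * swap to zero succeeded without a concurrent update on this CPU
+ * being lost when folding the count into the global total.
+ */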
+ do {
+ old = cilpcp->space_used;
+ prev = cmpxchg(&cilpcp->space_used, old, 0);
+ } while (old != prev);
+ count += old;
+ }
+ atomic_add(count, &ctx->space_used);
+}
+
+static void
+xlog_cil_ctx_switch(
+ struct xfs_cil *cil,
+ struct xfs_cil_ctx *ctx)
+{
+ xlog_cil_set_iclog_hdr_count(cil);
+ set_bit(XLOG_CIL_EMPTY, &cil->xc_flags);
+ set_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags);
+ ctx->sequence = ++cil->xc_current_sequence;
+ ctx->cil = cil;
+ cil->xc_ctx = ctx;
+}
+
+/*
+ * After the first stage of log recovery is done, we know where the head and
+ * tail of the log are. We need this log initialisation done before we can
+ * initialise the first CIL checkpoint context.
+ *
+ * Here we allocate a log ticket to track space usage during a CIL push. This
+ * ticket is passed to xlog_write() directly so that we don't slowly leak log
+ * space by failing to account for space used by log headers and additional
+ * region headers for split regions.
+ */
+void
+xlog_cil_init_post_recovery(
+ struct xlog *log)
+{
+ log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
+ log->l_cilp->xc_ctx->sequence = 1;
+ xlog_cil_set_iclog_hdr_count(log->l_cilp);
+}
+
+static inline int
+xlog_cil_iovec_space(
+ uint niovecs)
+{
+ return round_up((sizeof(struct xfs_log_vec) +
+ niovecs * sizeof(struct xfs_log_iovec)),
+ sizeof(uint64_t));
+}
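+
+/*
+ * For example (illustrative, typical 64-bit build with a 64 byte
+ * struct xfs_log_vec and 16 byte struct xfs_log_iovec): three iovecs need
+ * round_up(64 + 3 * 16, 8) = 112 bytes of headroom ahead of the data area.
+ */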
+
+/*
+ * Allocate or pin log vector buffers for CIL insertion.
+ *
+ * The CIL currently uses disposable buffers for copying a snapshot of the
+ * modified items into the log during a push. The biggest problem with this is
+ * the requirement to allocate the disposable buffer during the commit if:
+ * a) it does not exist; or
+ * b) it is too small
+ *
+ * If we do this allocation within xlog_cil_insert_format_items(), it is done
+ * under the xc_ctx_lock, which means that a CIL push cannot occur during
+ * the memory allocation. This means that we have a potential deadlock situation
+ * under low memory conditions when we have lots of dirty metadata pinned in
+ * the CIL and we need a CIL commit to occur to free memory.
+ *
+ * To avoid this, we need to move the memory allocation outside the
+ * xc_ctx_lock, but because the log vector buffers are disposable, that opens
+ * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
+ * vector buffers between the check and the formatting of the item into the
+ * log vector buffer within the xc_ctx_lock.
+ *
+ * Because the log vector buffer needs to be unchanged during the CIL push
+ * process, we cannot share the buffer between the transaction commit (which
+ * modifies the buffer) and the CIL push context that is writing the changes
+ * into the log. This means skipping preallocation of buffer space is
+ * unreliable, but we most definitely do not want to be allocating and freeing
+ * buffers unnecessarily during commits when overwrites can be done safely.
+ *
+ * The simplest solution to this problem is to allocate a shadow buffer when a
+ * log item is committed for the second time, and then to only use this buffer
+ * if necessary. The buffer can remain attached to the log item until such time
+ * as it is needed, and this is the buffer that is reallocated to match the size of
+ * the incoming modification. Then during the formatting of the item we can swap
+ * the active buffer with the new one if we can't reuse the existing buffer. We
+ * don't free the old buffer as it may be reused on the next modification if
+ * its size is right, otherwise we'll free and reallocate it at that point.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and attaches the vector to the log item in preparation
+ * for the formatting step which occurs under the xc_ctx_lock.
+ *
+ * While this means the memory footprint goes up, it avoids the repeated
+ * alloc/free pattern that repeated modifications of an item would otherwise
+ * cause, and hence minimises the CPU overhead of such behaviour.
+ */
+static void
+xlog_cil_alloc_shadow_bufs(
+ struct xlog *log,
+ struct xfs_trans *tp)
+{
+ struct xfs_log_item *lip;
+
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
+ struct xfs_log_vec *lv;
+ int niovecs = 0;
+ int nbytes = 0;
+ int buf_size;
+ bool ordered = false;
+
+ /* Skip items which aren't dirty in this transaction. */
+ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
+ continue;
+
+ /* get number of vecs and size of data to be stored */
+ lip->li_ops->iop_size(lip, &niovecs, &nbytes);
+
+ /*
+ * Ordered items need to be tracked but we do not wish to write
+ * them. We need a logvec to track the object, but we do not
+ * need an iovec or buffer to be allocated for copying data.
+ */
+ if (niovecs == XFS_LOG_VEC_ORDERED) {
+ ordered = true;
+ niovecs = 0;
+ nbytes = 0;
+ }
+
+ /*
+ * We 64-bit align the length of each iovec so that the start of
+ * the next one is naturally aligned. We'll need to account for
+ * that slack space here.
+ *
+ * We also add the xlog_op_header to each region when
+ * formatting, but that's not accounted to the size of the item
+ * at this point. Hence we'll need an additional number of bytes
+ * for each vector to hold an opheader.
+ *
+ * Then round nbytes up to 64-bit alignment so that the initial
+ * buffer alignment is easy to calculate and verify.
+ */
+ nbytes += niovecs *
+ (sizeof(uint64_t) + sizeof(struct xlog_op_header));
+ nbytes = round_up(nbytes, sizeof(uint64_t));
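+ /*
+ * e.g. (illustrative, assuming a 12 byte xlog_op_header): a 100 byte
+ * item carried in 2 iovecs grows to 100 + 2 * (8 + 12) = 140 bytes
+ * and then rounds up to 144.
+ */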
+
+ /*
+ * The data buffer needs to start 64-bit aligned, so round up
+ * that space to ensure we can align it appropriately and not
+ * overrun the buffer.
+ */
+ buf_size = nbytes + xlog_cil_iovec_space(niovecs);
+
+ /*
+ * if we have no shadow buffer, or it is too small, we need to
+ * reallocate it.
+ */
+ if (!lip->li_lv_shadow ||
+ buf_size > lip->li_lv_shadow->lv_size) {
+ /*
+ * We free and allocate here as a realloc would copy
+ * unnecessary data. We don't use kvzalloc() for the
+ * same reason - we don't need to zero the data area in
+ * the buffer, only the log vector header and the iovec
+ * storage.
+ */
+ kmem_free(lip->li_lv_shadow);
+ lv = xlog_kvmalloc(buf_size);
+
+ memset(lv, 0, xlog_cil_iovec_space(niovecs));
+
+ INIT_LIST_HEAD(&lv->lv_list);
+ lv->lv_item = lip;
+ lv->lv_size = buf_size;
+ if (ordered)
+ lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ else
+ lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
+ lip->li_lv_shadow = lv;
+ } else {
+ /* same or smaller, optimise common overwrite case */
+ lv = lip->li_lv_shadow;
+ if (ordered)
+ lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ else
+ lv->lv_buf_len = 0;
+ lv->lv_bytes = 0;
+ }
+
+ /* Ensure the lv is set up according to ->iop_size */
+ lv->lv_niovecs = niovecs;
+
+ /* The allocated data region lies beyond the iovec region */
+ lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
+ }
+}
+
+/*
+ * Prepare the log item for insertion into the CIL. Calculate the difference in
+ * log space it will consume, and if it is a new item pin it as well.
+ */
+STATIC void
+xfs_cil_prepare_item(
+ struct xlog *log,
+ struct xfs_log_vec *lv,
+ struct xfs_log_vec *old_lv,
+ int *diff_len)
+{
+ /* Account for the new LV being passed in */
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
+ *diff_len += lv->lv_bytes;
+
+ /*
+ * If there is no old LV, this is the first time we've seen the item in
+ * this CIL context and so we need to pin it. If we are replacing the
+ * old_lv, then remove the space it accounts for and make it the shadow
+ * buffer for later freeing. In both cases we are now switching to the
+ * shadow buffer, so update the pointer to it appropriately.
+ */
+ if (!old_lv) {
+ if (lv->lv_item->li_ops->iop_pin)
+ lv->lv_item->li_ops->iop_pin(lv->lv_item);
+ lv->lv_item->li_lv_shadow = NULL;
+ } else if (old_lv != lv) {
+ ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
+
+ *diff_len -= old_lv->lv_bytes;
+ lv->lv_item->li_lv_shadow = old_lv;
+ }
+
+ /* attach new log vector to log item */
+ lv->lv_item->li_lv = lv;
+
+ /*
+ * If this is the first time the item is being committed to the
+ * CIL, store the sequence number on the log item so we can
+ * tell in future commits whether this is the first checkpoint
+ * the item is being committed into.
+ */
+ if (!lv->lv_item->li_seq)
+ lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
+}
+
+/*
+ * Format log items into flat buffers
+ *
+ * For delayed logging, we need to hold a formatted buffer containing all the
+ * changes on the log item. This enables us to relog the item in memory and
+ * write it out asynchronously without needing to relock the object that was
+ * modified at the time it gets written into the iclog.
+ *
+ * This function takes the prepared log vectors attached to each log item, and
+ * formats the changes into the log vector buffer. The buffer it uses is
+ * dependent on the current state of the vector in the CIL - the shadow lv is
+ * guaranteed to be large enough for the current modification, but we will only
+ * use that if we can't reuse the existing lv. If we can't reuse the existing
+ * lv, then simply swap it out for the shadow lv. We don't free it - that is
+ * done lazily either by the next modification or the freeing of the log item.
+ *
+ * We don't set up region headers during this process; we simply copy the
+ * regions into the flat buffer. We can do this because we still have to do a
+ * formatting step to write the regions into the iclog buffer. Writing the
+ * ophdrs during the iclog write means that we can support splitting large
+ * regions across iclog boundares without needing a change in the format of the
+ * item/region encapsulation.
+ *
+ * Hence what we need to do now is rewrite the vector array to point
+ * to the copied region inside the buffer we just allocated. This allows us to
+ * format the regions into the iclog as though they are being formatted
+ * directly out of the objects themselves.
+ */
+static void
+xlog_cil_insert_format_items(
+ struct xlog *log,
+ struct xfs_trans *tp,
+ int *diff_len)
+{
+ struct xfs_log_item *lip;
+
+ /* Bail out if we didn't find a log item. */
+ if (list_empty(&tp->t_items)) {
+ ASSERT(0);
+ return;
+ }
+
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
+ struct xfs_log_vec *lv;
+ struct xfs_log_vec *old_lv = NULL;
+ struct xfs_log_vec *shadow;
+ bool ordered = false;
+
+ /* Skip items which aren't dirty in this transaction. */
+ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
+ continue;
+
+ /*
+ * The formatting size information is already attached to
+ * the shadow lv on the log item.
+ */
+ shadow = lip->li_lv_shadow;
+ if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
+ ordered = true;
+
+ /* Skip items that do not have any vectors for writing */
+ if (!shadow->lv_niovecs && !ordered)
+ continue;
+
+ /* compare to existing item size */
+ old_lv = lip->li_lv;
+ if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
+ /* same or smaller, optimise common overwrite case */
+ lv = lip->li_lv;
+
+ if (ordered)
+ goto insert;
+
+ /*
+ * set the item up as though it is a new insertion so
+ * that the space reservation accounting is correct.
+ */
+ *diff_len -= lv->lv_bytes;
+
+ /* Ensure the lv is set up according to ->iop_size */
+ lv->lv_niovecs = shadow->lv_niovecs;
+
+ /* reset the lv buffer information for new formatting */
+ lv->lv_buf_len = 0;
+ lv->lv_bytes = 0;
+ lv->lv_buf = (char *)lv +
+ xlog_cil_iovec_space(lv->lv_niovecs);
+ } else {
+ /* switch to shadow buffer! */
+ lv = shadow;
+ lv->lv_item = lip;
+ if (ordered) {
+ /* track as an ordered logvec */
+ ASSERT(lip->li_lv == NULL);
+ goto insert;
+ }
+ }
+
+ ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
+ lip->li_ops->iop_format(lip, lv);
+insert:
+ xfs_cil_prepare_item(log, lv, old_lv, diff_len);
+ }
+}
+
+/*
+ * The use of lockless waitqueue_active() requires that the caller has
+ * serialised itself against the wakeup call in xlog_cil_push_work(). That
+ * can be done by either holding the push lock or the context lock.
+ */
+static inline bool
+xlog_cil_over_hard_limit(
+ struct xlog *log,
+ int32_t space_used)
+{
+ if (waitqueue_active(&log->l_cilp->xc_push_wait))
+ return true;
+ if (space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
+ return true;
+ return false;
+}
+
+/*
+ * Insert the log items into the CIL and calculate the difference in space
+ * consumed by the item. Add the space to the checkpoint ticket and calculate
+ * if the change requires additional log metadata. If it does, take that space
+ * as well. Remove the amount of space we added to the checkpoint ticket from
+ * the current transaction ticket so that the accounting works out correctly.
+ */
+static void
+xlog_cil_insert_items(
+ struct xlog *log,
+ struct xfs_trans *tp,
+ uint32_t released_space)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ struct xfs_cil_ctx *ctx = cil->xc_ctx;
+ struct xfs_log_item *lip;
+ int len = 0;
+ int iovhdr_res = 0, split_res = 0, ctx_res = 0;
+ int space_used;
+ int order;
+ struct xlog_cil_pcp *cilpcp;
+
+ ASSERT(tp);
+
+ /*
+ * We can do this safely because the context can't checkpoint until we
+ * are done so it doesn't matter exactly how we update the CIL.
+ */
+ xlog_cil_insert_format_items(log, tp, &len);
+
+ /*
+ * Subtract the space released by intent cancelation from the space we
+ * consumed so that we remove it from the CIL space and add it back to
+ * the current transaction reservation context.
+ */
+ len -= released_space;
+
+ /*
+ * Grab the per-cpu pointer for the CIL before we start any accounting.
+ * That ensures that we are running with pre-emption disabled and so we
+ * can't be scheduled away between split sample/update operations that
+ * are done without outside locking to serialise them.
+ */
+ cilpcp = get_cpu_ptr(cil->xc_pcp);
+
+ /*
+ * We need to take the CIL checkpoint unit reservation on the first
+ * commit into the CIL. Test the XLOG_CIL_EMPTY bit first so we don't
+ * unnecessarily do an atomic op in the fast path here. We can clear the
+ * XLOG_CIL_EMPTY bit as we are under the xc_ctx_lock here and that
+ * needs to be held exclusively to reset the XLOG_CIL_EMPTY bit.
+ */
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) &&
+ test_and_clear_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
+ ctx_res = ctx->ticket->t_unit_res;
+
+ /*
+ * Check if we need to steal iclog headers. atomic_read() is not a
+ * locked atomic operation, so we can check the value before we do any
+ * real atomic ops in the fast path. If we've already taken the CIL unit
+ * reservation from this commit, we've already got one iclog header
+ * space reserved so we have to account for that otherwise we risk
+ * overrunning the reservation on this ticket.
+ *
+ * If the CIL is already at the hard limit, we might need more header
+ * space than originally reserved. So steal more header space from every
+ * commit that occurs once we are over the hard limit to ensure the CIL
+ * push won't run out of reservation space.
+ *
+ * This can steal more than we need, but that's OK.
+ *
+ * The cil->xc_ctx_lock provides the serialisation necessary for safely
+ * calling xlog_cil_over_hard_limit() in this context.
+ */
+ space_used = atomic_read(&ctx->space_used) + cilpcp->space_used + len;
+ if (atomic_read(&cil->xc_iclog_hdrs) > 0 ||
+ xlog_cil_over_hard_limit(log, space_used)) {
+ split_res = log->l_iclog_hsize +
+ sizeof(struct xlog_op_header);
+ if (ctx_res)
+ ctx_res += split_res * (tp->t_ticket->t_iclog_hdrs - 1);
+ else
+ ctx_res = split_res * tp->t_ticket->t_iclog_hdrs;
+ atomic_sub(tp->t_ticket->t_iclog_hdrs, &cil->xc_iclog_hdrs);
+ }
+ cilpcp->space_reserved += ctx_res;
+
+ /*
+ * Accurately account when over the soft limit, otherwise fold the
+ * percpu count into the global count if over the per-cpu threshold.
+ */
+ if (!test_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) {
+ atomic_add(len, &ctx->space_used);
+ } else if (cilpcp->space_used + len >
+ (XLOG_CIL_SPACE_LIMIT(log) / num_online_cpus())) {
+ space_used = atomic_add_return(cilpcp->space_used + len,
+ &ctx->space_used);
+ cilpcp->space_used = 0;
+
+ /*
+ * If we just transitioned over the soft limit, we need to
+ * transition to the global atomic counter.
+ */
+ if (space_used >= XLOG_CIL_SPACE_LIMIT(log))
+ xlog_cil_insert_pcp_aggregate(cil, ctx);
+ } else {
+ cilpcp->space_used += len;
+ }
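+ /*
+ * e.g. (illustrative): with a 4MB per-context space limit and 8
+ * online CPUs, the per-cpu branch above folds a CPU's local count
+ * into ctx->space_used once that CPU's contribution would exceed
+ * 512k.
+ */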
+ /* attach the transaction to the CIL if it has any busy extents */
+ if (!list_empty(&tp->t_busy))
+ list_splice_init(&tp->t_busy, &cilpcp->busy_extents);
+
+ /*
+ * Now update the order of everything modified in the transaction
+ * and insert items into the CIL if they aren't already there.
+ * We do this here so we only need to take the CIL lock once during
+ * the transaction commit.
+ */
+ order = atomic_inc_return(&ctx->order_id);
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
+ /* Skip items which aren't dirty in this transaction. */
+ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
+ continue;
+
+ lip->li_order_id = order;
+ if (!list_empty(&lip->li_cil))
+ continue;
+ list_add_tail(&lip->li_cil, &cilpcp->log_items);
+ }
+ put_cpu_ptr(cilpcp);
+
+ /*
+ * If we've overrun the reservation, dump the tx details before we move
+ * the log items. Shutdown is imminent...
+ */
+ tp->t_ticket->t_curr_res -= ctx_res + len;
+ if (WARN_ON(tp->t_ticket->t_curr_res < 0)) {
+ xfs_warn(log->l_mp, "Transaction log reservation overrun:");
+ xfs_warn(log->l_mp,
+ " log items: %d bytes (iov hdrs: %d bytes)",
+ len, iovhdr_res);
+ xfs_warn(log->l_mp, " split region headers: %d bytes",
+ split_res);
+ xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res);
+ xlog_print_trans(tp);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ }
+}
+
+static void
+xlog_cil_free_logvec(
+ struct list_head *lv_chain)
+{
+ struct xfs_log_vec *lv;
+
+ while (!list_empty(lv_chain)) {
+ lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list);
+ list_del_init(&lv->lv_list);
+ kmem_free(lv);
+ }
+}
+
+static void
+xlog_discard_endio_work(
+ struct work_struct *work)
+{
+ struct xfs_cil_ctx *ctx =
+ container_of(work, struct xfs_cil_ctx, discard_endio_work);
+ struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
+
+ xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
+ kmem_free(ctx);
+}
+
+/*
+ * Queue up the actual completion to a thread to avoid IRQ-safe locking for
+ * pagb_lock. Note that we need an unbounded workqueue, otherwise we might
+ * get the execution delayed up to 30 seconds for weird reasons.
+ */
+static void
+xlog_discard_endio(
+ struct bio *bio)
+{
+ struct xfs_cil_ctx *ctx = bio->bi_private;
+
+ INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work);
+ queue_work(xfs_discard_wq, &ctx->discard_endio_work);
+ bio_put(bio);
+}
+
+static void
+xlog_discard_busy_extents(
+ struct xfs_mount *mp,
+ struct xfs_cil_ctx *ctx)
+{
+ struct list_head *list = &ctx->busy_extents;
+ struct xfs_extent_busy *busyp;
+ struct bio *bio = NULL;
+ struct blk_plug plug;
+ int error = 0;
+
+ ASSERT(xfs_has_discard(mp));
+
+ blk_start_plug(&plug);
+ list_for_each_entry(busyp, list, list) {
+ trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
+ busyp->length);
+
+ error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+ XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+ XFS_FSB_TO_BB(mp, busyp->length),
+ GFP_NOFS, &bio);
+ if (error && error != -EOPNOTSUPP) {
+ xfs_info(mp,
+ "discard failed for extent [0x%llx,%u], error %d",
+ (unsigned long long)busyp->bno,
+ busyp->length,
+ error);
+ break;
+ }
+ }
+
+ if (bio) {
+ bio->bi_private = ctx;
+ bio->bi_end_io = xlog_discard_endio;
+ submit_bio(bio);
+ } else {
+ xlog_discard_endio_work(&ctx->discard_endio_work);
+ }
+ blk_finish_plug(&plug);
+}
+
+/*
+ * Mark all items committed and clear busy extents. We free the log vector
+ * chains in a separate pass so that we unpin the log items as quickly as
+ * possible.
+ */
+static void
+xlog_cil_committed(
+ struct xfs_cil_ctx *ctx)
+{
+ struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
+ bool abort = xlog_is_shutdown(ctx->cil->xc_log);
+
+ /*
+ * If the I/O failed, we're aborting the commit and already shutdown.
+ * Wake any commit waiters before aborting the log items so we don't
+ * block async log pushers on callbacks. Async log pushers explicitly do
+ * not wait on log force completion because they may be holding locks
+ * required to unpin items.
+ */
+ if (abort) {
+ spin_lock(&ctx->cil->xc_push_lock);
+ wake_up_all(&ctx->cil->xc_start_wait);
+ wake_up_all(&ctx->cil->xc_commit_wait);
+ spin_unlock(&ctx->cil->xc_push_lock);
+ }
+
+ xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain,
+ ctx->start_lsn, abort);
+
+ xfs_extent_busy_sort(&ctx->busy_extents);
+ xfs_extent_busy_clear(mp, &ctx->busy_extents,
+ xfs_has_discard(mp) && !abort);
+
+ spin_lock(&ctx->cil->xc_push_lock);
+ list_del(&ctx->committing);
+ spin_unlock(&ctx->cil->xc_push_lock);
+
+ xlog_cil_free_logvec(&ctx->lv_chain);
+
+ if (!list_empty(&ctx->busy_extents))
+ xlog_discard_busy_extents(mp, ctx);
+ else
+ kmem_free(ctx);
+}
+
+void
+xlog_cil_process_committed(
+ struct list_head *list)
+{
+ struct xfs_cil_ctx *ctx;
+
+ while ((ctx = list_first_entry_or_null(list,
+ struct xfs_cil_ctx, iclog_entry))) {
+ list_del(&ctx->iclog_entry);
+ xlog_cil_committed(ctx);
+ }
+}
+
+/*
+ * Record the LSN of the iclog we were just granted space to start writing into.
+ * If the context doesn't have a start_lsn recorded, then this iclog will
+ * contain the start record for the checkpoint. Otherwise this write contains
+ * the commit record for the checkpoint.
+ */
+void
+xlog_cil_set_ctx_write_state(
+ struct xfs_cil_ctx *ctx,
+ struct xlog_in_core *iclog)
+{
+ struct xfs_cil *cil = ctx->cil;
+ xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+
+ ASSERT(!ctx->commit_lsn);
+ if (!ctx->start_lsn) {
+ spin_lock(&cil->xc_push_lock);
+ /*
+ * The LSN we need to pass to the log items on transaction
+ * commit is the LSN reported by the first log vector write, not
+ * the commit lsn. If we use the commit record lsn then we can
+ * move the grant write head beyond the tail LSN and overwrite
+ * it.
+ */
+ ctx->start_lsn = lsn;
+ wake_up_all(&cil->xc_start_wait);
+ spin_unlock(&cil->xc_push_lock);
+
+ /*
+ * Make sure the metadata we are about to overwrite in the log
+ * has been flushed to stable storage before this iclog is
+ * issued.
+ */
+ spin_lock(&cil->xc_log->l_icloglock);
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+ spin_unlock(&cil->xc_log->l_icloglock);
+ return;
+ }
+
+ /*
+ * Take a reference to the iclog for the context so that we still hold
+ * it when xlog_write is done and has released it. This means the
+ * context controls when the iclog is released for IO.
+ */
+ atomic_inc(&iclog->ic_refcnt);
+
+ /*
+ * xlog_state_get_iclog_space() guarantees there is enough space in the
+ * iclog for an entire commit record, so we can attach the context
+ * callbacks now. This needs to be done before we make the commit_lsn
+ * visible to waiters so that checkpoints with commit records in the
+ * same iclog order their IO completion callbacks in the same order that
+ * the commit records appear in the iclog.
+ */
+ spin_lock(&cil->xc_log->l_icloglock);
+ list_add_tail(&ctx->iclog_entry, &iclog->ic_callbacks);
+ spin_unlock(&cil->xc_log->l_icloglock);
+
+ /*
+ * Now we can record the commit LSN and wake anyone waiting for this
+ * sequence to have the ordered commit record assigned to a physical
+ * location in the log.
+ */
+ spin_lock(&cil->xc_push_lock);
+ ctx->commit_iclog = iclog;
+ ctx->commit_lsn = lsn;
+ wake_up_all(&cil->xc_commit_wait);
+ spin_unlock(&cil->xc_push_lock);
+}
+
+
+/*
+ * Ensure that the order of log writes follows checkpoint sequence order. This
+ * relies on the context LSN being zero until the log write has guaranteed the
+ * LSN that the log write will start at via xlog_state_get_iclog_space().
+ */
+enum _record_type {
+ _START_RECORD,
+ _COMMIT_RECORD,
+};
+
+static int
+xlog_cil_order_write(
+ struct xfs_cil *cil,
+ xfs_csn_t sequence,
+ enum _record_type record)
+{
+ struct xfs_cil_ctx *ctx;
+
+restart:
+ spin_lock(&cil->xc_push_lock);
+ list_for_each_entry(ctx, &cil->xc_committing, committing) {
+ /*
+ * Avoid getting stuck in this loop because we were woken by the
+ * shutdown, but then went back to sleep once already in the
+ * shutdown state.
+ */
+ if (xlog_is_shutdown(cil->xc_log)) {
+ spin_unlock(&cil->xc_push_lock);
+ return -EIO;
+ }
+
+ /*
+ * Higher sequences will wait for this one so skip them.
+ * Don't wait for our own sequence, either.
+ */
+ if (ctx->sequence >= sequence)
+ continue;
+
+ /* Wait until the LSN for the record has been recorded. */
+ switch (record) {
+ case _START_RECORD:
+ if (!ctx->start_lsn) {
+ xlog_wait(&cil->xc_start_wait, &cil->xc_push_lock);
+ goto restart;
+ }
+ break;
+ case _COMMIT_RECORD:
+ if (!ctx->commit_lsn) {
+ xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
+ goto restart;
+ }
+ break;
+ }
+ }
+ spin_unlock(&cil->xc_push_lock);
+ return 0;
+}
+
+/*
+ * Write out the log vector change now attached to the CIL context. This will
+ * write a start record that needs to be strictly ordered in ascending CIL
+ * sequence order so that log recovery will always use in-order start LSNs when
+ * replaying checkpoints.
+ */
+static int
+xlog_cil_write_chain(
+ struct xfs_cil_ctx *ctx,
+ uint32_t chain_len)
+{
+ struct xlog *log = ctx->cil->xc_log;
+ int error;
+
+ error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD);
+ if (error)
+ return error;
+ return xlog_write(log, ctx, &ctx->lv_chain, ctx->ticket, chain_len);
+}
+
+/*
+ * Write out the commit record of a checkpoint transaction to close off a
+ * running log write. These commit records are strictly ordered in ascending CIL
+ * sequence order so that log recovery will always replay the checkpoints in the
+ * correct order.
+ */
+static int
+xlog_cil_write_commit_record(
+ struct xfs_cil_ctx *ctx)
+{
+ struct xlog *log = ctx->cil->xc_log;
+ struct xlog_op_header ophdr = {
+ .oh_clientid = XFS_TRANSACTION,
+ .oh_tid = cpu_to_be32(ctx->ticket->t_tid),
+ .oh_flags = XLOG_COMMIT_TRANS,
+ };
+ struct xfs_log_iovec reg = {
+ .i_addr = &ophdr,
+ .i_len = sizeof(struct xlog_op_header),
+ .i_type = XLOG_REG_TYPE_COMMIT,
+ };
+ struct xfs_log_vec vec = {
+ .lv_niovecs = 1,
+ .lv_iovecp = &reg,
+ };
+ int error;
+ LIST_HEAD(lv_chain);
+ list_add(&vec.lv_list, &lv_chain);
+
+ if (xlog_is_shutdown(log))
+ return -EIO;
+
+ error = xlog_cil_order_write(ctx->cil, ctx->sequence, _COMMIT_RECORD);
+ if (error)
+ return error;
+
+ /* account for space used by record data */
+ ctx->ticket->t_curr_res -= reg.i_len;
+ error = xlog_write(log, ctx, &lv_chain, ctx->ticket, reg.i_len);
+ if (error)
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ return error;
+}
+
+struct xlog_cil_trans_hdr {
+ struct xlog_op_header oph[2];
+ struct xfs_trans_header thdr;
+ struct xfs_log_iovec lhdr[2];
+};
+
+/*
+ * Build a checkpoint transaction header to begin the journal transaction. We
+ * need to account for the space used by the transaction header here as it is
+ * not accounted for in xlog_write().
+ *
+ * This is the only place we write a transaction header, so we also build the
+ * log opheaders that indicate the start of a log transaction and wrap the
+ * transaction header. We keep the start record in its own log vector rather
+ * than compacting them into a single region as this ends up making the logic
+ * in xlog_write() for handling empty opheaders for start, commit and unmount
+ * records much simpler.
+ */
+static void
+xlog_cil_build_trans_hdr(
+ struct xfs_cil_ctx *ctx,
+ struct xlog_cil_trans_hdr *hdr,
+ struct xfs_log_vec *lvhdr,
+ int num_iovecs)
+{
+ struct xlog_ticket *tic = ctx->ticket;
+ __be32 tid = cpu_to_be32(tic->t_tid);
+
+ memset(hdr, 0, sizeof(*hdr));
+
+ /* Log start record */
+ hdr->oph[0].oh_tid = tid;
+ hdr->oph[0].oh_clientid = XFS_TRANSACTION;
+ hdr->oph[0].oh_flags = XLOG_START_TRANS;
+
+ /* log iovec region pointer */
+ hdr->lhdr[0].i_addr = &hdr->oph[0];
+ hdr->lhdr[0].i_len = sizeof(struct xlog_op_header);
+ hdr->lhdr[0].i_type = XLOG_REG_TYPE_LRHEADER;
+
+ /* log opheader */
+ hdr->oph[1].oh_tid = tid;
+ hdr->oph[1].oh_clientid = XFS_TRANSACTION;
+ hdr->oph[1].oh_len = cpu_to_be32(sizeof(struct xfs_trans_header));
+
+ /* transaction header in host byte order format */
+ hdr->thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
+ hdr->thdr.th_type = XFS_TRANS_CHECKPOINT;
+ hdr->thdr.th_tid = tic->t_tid;
+ hdr->thdr.th_num_items = num_iovecs;
+
+ /* log iovec region pointer */
+ hdr->lhdr[1].i_addr = &hdr->oph[1];
+ hdr->lhdr[1].i_len = sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_trans_header);
+ hdr->lhdr[1].i_type = XLOG_REG_TYPE_TRANSHDR;
+
+ lvhdr->lv_niovecs = 2;
+ lvhdr->lv_iovecp = &hdr->lhdr[0];
+ lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len;
+
+ tic->t_curr_res -= lvhdr->lv_bytes;
+}
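+
+/*
+ * Illustrative sizing (assuming a 12 byte xlog_op_header and a 16 byte
+ * xfs_trans_header): lhdr[0] carries just the 12 byte start-record opheader,
+ * lhdr[1] carries the second opheader plus the transaction header for a
+ * total of 28 bytes, so 12 + 28 = 40 bytes are charged to the CIL ticket
+ * here.
+ */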
+
+/*
+ * CIL item reordering compare function. We want to order in ascending ID order,
+ * but we want to leave items with the same ID in the order they were added to
+ * the list. This is important for operations like reflink where we log 4 order
+ * dependent intents in a single transaction when we overwrite an existing
+ * shared extent with a new shared extent. i.e. BUI(unmap), CUI(drop),
+ * CUI (inc), BUI(remap)...
+ */
+static int
+xlog_cil_order_cmp(
+ void *priv,
+ const struct list_head *a,
+ const struct list_head *b)
+{
+ struct xfs_log_vec *l1 = container_of(a, struct xfs_log_vec, lv_list);
+ struct xfs_log_vec *l2 = container_of(b, struct xfs_log_vec, lv_list);
+
+ return l1->lv_order_id > l2->lv_order_id;
+}
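+
+/*
+ * Returning (l1->lv_order_id > l2->lv_order_id) is sufficient here:
+ * list_sort() only needs to know whether the first element sorts after the
+ * second, and as a stable sort it keeps log vectors that share an order id
+ * in the order they were added to the CIL.
+ */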
+
+/*
+ * Pull all the log vectors off the items in the CIL, and remove the items from
+ * the CIL. We don't need the CIL lock here because it's only needed on the
+ * transaction commit side which is currently locked out by the flush lock.
+ *
+ * If a log item is marked with a whiteout, we do not need to write it to the
+ * journal and so we just move them to the whiteout list for the caller to
+ * dispose of appropriately.
+ */
+static void
+xlog_cil_build_lv_chain(
+ struct xfs_cil_ctx *ctx,
+ struct list_head *whiteouts,
+ uint32_t *num_iovecs,
+ uint32_t *num_bytes)
+{
+ while (!list_empty(&ctx->log_items)) {
+ struct xfs_log_item *item;
+ struct xfs_log_vec *lv;
+
+ item = list_first_entry(&ctx->log_items,
+ struct xfs_log_item, li_cil);
+
+ if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) {
+ list_move(&item->li_cil, whiteouts);
+ trace_xfs_cil_whiteout_skip(item);
+ continue;
+ }
+
+ lv = item->li_lv;
+ lv->lv_order_id = item->li_order_id;
+
+ /* we don't write ordered log vectors */
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
+ *num_bytes += lv->lv_bytes;
+ *num_iovecs += lv->lv_niovecs;
+ list_add_tail(&lv->lv_list, &ctx->lv_chain);
+
+ list_del_init(&item->li_cil);
+ item->li_order_id = 0;
+ item->li_lv = NULL;
+ }
+}
+
+static void
+xlog_cil_cleanup_whiteouts(
+ struct list_head *whiteouts)
+{
+ while (!list_empty(whiteouts)) {
+ struct xfs_log_item *item = list_first_entry(whiteouts,
+ struct xfs_log_item, li_cil);
+ list_del_init(&item->li_cil);
+ trace_xfs_cil_whiteout_unpin(item);
+ item->li_ops->iop_unpin(item, 1);
+ }
+}
+
+/*
+ * Push the Committed Item List to the log.
+ *
+ * If the current sequence is the same as xc_push_seq we need to do a flush. If
+ * xc_push_seq is less than the current sequence, then it has already been
+ * flushed and we don't need to do anything - the caller will wait for it to
+ * complete if necessary.
+ *
+ * xc_push_seq is checked unlocked against the sequence number for a match.
+ * Hence we can allow log forces to run racily and not issue pushes for the
+ * same sequence twice. If we get a race between multiple pushes for the same
+ * sequence they will block on the first one and then abort, hence avoiding
+ * needless pushes.
+ */
+static void
+xlog_cil_push_work(
+ struct work_struct *work)
+{
+ struct xfs_cil_ctx *ctx =
+ container_of(work, struct xfs_cil_ctx, push_work);
+ struct xfs_cil *cil = ctx->cil;
+ struct xlog *log = cil->xc_log;
+ struct xfs_cil_ctx *new_ctx;
+ int num_iovecs = 0;
+ int num_bytes = 0;
+ int error = 0;
+ struct xlog_cil_trans_hdr thdr;
+ struct xfs_log_vec lvhdr = {};
+ xfs_csn_t push_seq;
+ bool push_commit_stable;
+ LIST_HEAD(whiteouts);
+ struct xlog_ticket *ticket;
+
+ new_ctx = xlog_cil_ctx_alloc();
+ new_ctx->ticket = xlog_cil_ticket_alloc(log);
+
+ down_write(&cil->xc_ctx_lock);
+
+ spin_lock(&cil->xc_push_lock);
+ push_seq = cil->xc_push_seq;
+ ASSERT(push_seq <= ctx->sequence);
+ push_commit_stable = cil->xc_push_commit_stable;
+ cil->xc_push_commit_stable = false;
+
+ /*
+ * As we are about to switch to a new, empty CIL context, we no longer
+ * need to throttle tasks on CIL space overruns. Wake any waiters that
+ * the hard push throttle may have caught so they can start committing
+ * to the new context. The ctx->xc_push_lock provides the serialisation
+ * necessary for safely using the lockless waitqueue_active() check in
+ * this context.
+ */
+ if (waitqueue_active(&cil->xc_push_wait))
+ wake_up_all(&cil->xc_push_wait);
+
+ xlog_cil_push_pcp_aggregate(cil, ctx);
+
+ /*
+ * Check if we've anything to push. If there is nothing, then we don't
+ * move on to a new sequence number and so we have to be able to push
+ * this sequence again later.
+ */
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) {
+ cil->xc_push_seq = 0;
+ spin_unlock(&cil->xc_push_lock);
+ goto out_skip;
+ }
+
+ /* check for a previously pushed sequence */
+ if (push_seq < ctx->sequence) {
+ spin_unlock(&cil->xc_push_lock);
+ goto out_skip;
+ }
+
+ /*
+ * We are now going to push this context, so add it to the committing
+ * list before we do anything else. This ensures that anyone waiting on
+ * this push can easily detect the difference between a "push in
+ * progress" and "CIL is empty, nothing to do".
+ *
+ * IOWs, a wait loop can now check for:
+ * the current sequence not being found on the committing list;
+ * an empty CIL; and
+ * an unchanged sequence number
+ * to detect a push that had nothing to do and therefore does not need
+ * waiting on. If the CIL is not empty, we get put on the committing
+ * list before emptying the CIL and bumping the sequence number. Hence
+ * an empty CIL and an unchanged sequence number means we jumped out
+ * above after doing nothing.
+ *
+ * Hence the waiter will either find the commit sequence on the
+ * committing list or the sequence number will be unchanged and the CIL
+ * still dirty. In that latter case, the push has not yet started, and
+ * so the waiter will have to continue trying to check the CIL
+ * committing list until it is found. In extreme cases of delay, the
+ * sequence may fully commit between successive attempts by the waiter to
+ * find it on the committing list.
+ */
+ list_add(&ctx->committing, &cil->xc_committing);
+ spin_unlock(&cil->xc_push_lock);
+
+ xlog_cil_build_lv_chain(ctx, &whiteouts, &num_iovecs, &num_bytes);
+
+ /*
+ * Switch the contexts so we can drop the context lock and move out
+ * of a shared context. We can't just go straight to the commit record,
+ * though - we need to synchronise with previous and future commits so
+ * that the commit records are correctly ordered in the log to ensure
+ * that we process items during log IO completion in the correct order.
+ *
+ * For example, if we get an EFI in one checkpoint and the EFD in the
+ * next (e.g. due to log forces), we do not want the checkpoint with
+ * the EFD to be committed before the checkpoint with the EFI. Hence
+ * we must strictly order the commit records of the checkpoints so
+ * that: a) the checkpoint callbacks are attached to the iclogs in the
+ * correct order; and b) the checkpoints are replayed in correct order
+ * in log recovery.
+ *
+ * Hence we need to add this context to the committing context list so
+ * that higher sequences will wait for us to write out a commit record
+ * before they do.
+ *
+ * xfs_log_force_seq requires us to mirror the new sequence into the cil
+ * structure atomically with the addition of this sequence to the
+ * committing list. This also ensures that we can do unlocked checks
+ * against the current sequence in log forces without risking
+ * dereferencing a freed context pointer.
+ */
+ spin_lock(&cil->xc_push_lock);
+ xlog_cil_ctx_switch(cil, new_ctx);
+ spin_unlock(&cil->xc_push_lock);
+ up_write(&cil->xc_ctx_lock);
+
+ /*
+ * Sort the log vector chain before we add the transaction headers.
+ * This ensures we always have the transaction headers at the start
+ * of the chain.
+ */
+ list_sort(NULL, &ctx->lv_chain, xlog_cil_order_cmp);
+
+ /*
+ * Build a checkpoint transaction header and write it to the log to
+ * begin the transaction. We need to account for the space used by the
+ * transaction header here as it is not accounted for in xlog_write().
+ * Add the lvhdr to the head of the lv chain we pass to xlog_write() so
+ * it gets written into the iclog first.
+ */
+ xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs);
+ num_bytes += lvhdr.lv_bytes;
+ list_add(&lvhdr.lv_list, &ctx->lv_chain);
+
+ /*
+ * Take the lvhdr back off the lv_chain immediately after calling
+ * xlog_cil_write_chain() as it should not be passed to log IO
+ * completion.
+ */
+ error = xlog_cil_write_chain(ctx, num_bytes);
+ list_del(&lvhdr.lv_list);
+ if (error)
+ goto out_abort_free_ticket;
+
+ error = xlog_cil_write_commit_record(ctx);
+ if (error)
+ goto out_abort_free_ticket;
+
+ /*
+ * Grab the ticket from the ctx so we can ungrant it after releasing the
+ * commit_iclog. The ctx may be freed by the time we return from
+ * releasing the commit_iclog (i.e. checkpoint has been completed and
+ * callback run) so we can't reference the ctx after the call to
+ * xlog_state_release_iclog().
+ */
+ ticket = ctx->ticket;
+
+ /*
+ * If the checkpoint spans multiple iclogs, wait for all previous iclogs
+ * to complete before we submit the commit_iclog. We can't use state
+ * checks for this - ACTIVE can be either a past completed iclog or a
+ * future iclog being filled, while WANT_SYNC through SYNC_DONE can be a
+ * past or future iclog awaiting IO or ordered IO completion to be run.
+ * In the latter case, if it's a future iclog and we wait on it, then we
+ * will hang because it won't get processed through to ic_force_wait
+ * wakeup until this commit_iclog is written to disk. Hence we use the
+ * iclog header lsn and compare it to the commit lsn to determine if we
+ * need to wait on iclogs or not.
+ */
+ spin_lock(&log->l_icloglock);
+ if (ctx->start_lsn != ctx->commit_lsn) {
+ xfs_lsn_t plsn;
+
+ plsn = be64_to_cpu(ctx->commit_iclog->ic_prev->ic_header.h_lsn);
+ if (plsn && XFS_LSN_CMP(plsn, ctx->commit_lsn) < 0) {
+ /*
+ * Waiting on ic_force_wait orders the completion of
+ * iclogs older than ic_prev. Hence we only need to wait
+ * on the most recent older iclog here.
+ */
+ xlog_wait_on_iclog(ctx->commit_iclog->ic_prev);
+ spin_lock(&log->l_icloglock);
+ }
+
+ /*
+ * We need to issue a pre-flush so that the ordering for this
+ * checkpoint is correctly preserved down to stable storage.
+ */
+ ctx->commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+ }
+
+ /*
+ * The commit iclog must be written to stable storage to guarantee
+ * journal IO vs metadata writeback IO is correctly ordered on stable
+ * storage.
+ *
+ * If the push caller needs the commit to be immediately stable and the
+ * commit_iclog is not yet marked as XLOG_STATE_WANT_SYNC to indicate it
+ * will be written when released, switch its state to WANT_SYNC right
+ * now.
+ */
+ ctx->commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA;
+ if (push_commit_stable &&
+ ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(log, ctx->commit_iclog, 0);
+ ticket = ctx->ticket;
+ xlog_state_release_iclog(log, ctx->commit_iclog, ticket);
+
+ /* Not safe to reference ctx now! */
+
+ spin_unlock(&log->l_icloglock);
+ xlog_cil_cleanup_whiteouts(&whiteouts);
+ xfs_log_ticket_ungrant(log, ticket);
+ return;
+
+out_skip:
+ up_write(&cil->xc_ctx_lock);
+ xfs_log_ticket_put(new_ctx->ticket);
+ kmem_free(new_ctx);
+ return;
+
+out_abort_free_ticket:
+ ASSERT(xlog_is_shutdown(log));
+ xlog_cil_cleanup_whiteouts(&whiteouts);
+ if (!ctx->commit_iclog) {
+ xfs_log_ticket_ungrant(log, ctx->ticket);
+ xlog_cil_committed(ctx);
+ return;
+ }
+ spin_lock(&log->l_icloglock);
+ ticket = ctx->ticket;
+ xlog_state_release_iclog(log, ctx->commit_iclog, ticket);
+ /* Not safe to reference ctx now! */
+ spin_unlock(&log->l_icloglock);
+ xfs_log_ticket_ungrant(log, ticket);
+}
+
+/*
+ * We need to push CIL every so often so we don't cache more than we can fit in
+ * the log. The limit really is that a checkpoint can't be more than half the
+ * log (the current checkpoint is not allowed to overwrite the previous
+ * checkpoint), but commit latency and memory usage limit this to a smaller
+ * size.
+ */
+static void
+xlog_cil_push_background(
+ struct xlog *log) __releases(cil->xc_ctx_lock)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ int space_used = atomic_read(&cil->xc_ctx->space_used);
+
+ /*
+ * The cil won't be empty because we are called while holding the
+ * context lock so whatever we added to the CIL will still be there.
+ */
+ ASSERT(!test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
+
+ /*
+ * We are done if:
+ * - we haven't used up all the space available yet; or
+ * - we've already queued up a push; and
+ * - we're not over the hard limit; and
+ * - nothing has been over the hard limit.
+ *
+ * If so, we don't need to take the push lock as there's nothing to do.
+ */
+ if (space_used < XLOG_CIL_SPACE_LIMIT(log) ||
+ (cil->xc_push_seq == cil->xc_current_sequence &&
+ space_used < XLOG_CIL_BLOCKING_SPACE_LIMIT(log) &&
+ !waitqueue_active(&cil->xc_push_wait))) {
+ up_read(&cil->xc_ctx_lock);
+ return;
+ }
+
+ spin_lock(&cil->xc_push_lock);
+ if (cil->xc_push_seq < cil->xc_current_sequence) {
+ cil->xc_push_seq = cil->xc_current_sequence;
+ queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work);
+ }
+
+ /*
+ * Drop the context lock now, we can't hold that if we need to sleep
+ * because we are over the blocking threshold. The push_lock is still
+ * held, so blocking threshold sleep/wakeup is still correctly
+ * serialised here.
+ */
+ up_read(&cil->xc_ctx_lock);
+
+ /*
+ * If we are well over the space limit, throttle the work that is being
+ * done until the push work on this context has begun. Enforce the hard
+ * throttle on all transaction commits once it has been activated, even
+ * if the committing transactions have resulted in the space usage
+ * dipping back down under the hard limit.
+ *
+ * The cil->xc_push_lock provides the serialisation necessary for safely
+ * calling xlog_cil_over_hard_limit() in this context.
+ */
+ if (xlog_cil_over_hard_limit(log, space_used)) {
+ trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
+ ASSERT(space_used < log->l_logsize);
+ xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
+ return;
+ }
+
+ spin_unlock(&cil->xc_push_lock);
+}
+
+/*
+ * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence
+ * number that is passed. When it returns, the work will be queued for
+ * @push_seq, but it won't be completed.
+ *
+ * If the caller is performing a synchronous force, we will flush the workqueue
+ * to get previously queued work moving to minimise the wait time they will
+ * undergo waiting for all outstanding pushes to complete. The caller is
+ * expected to do the required waiting for push_seq to complete.
+ *
+ * If the caller is performing an async push, we need to ensure that the
+ * checkpoint is fully flushed out of the iclogs when we finish the push. If we
+ * don't do this, then the commit record may remain sitting in memory in an
+ * ACTIVE iclog. This then requires another full log force to push to disk,
+ * which defeats the purpose of having an async, non-blocking CIL force
+ * mechanism. Hence in this case we need to pass a flag to the push work to
+ * indicate it needs to flush the commit record itself.
+ */
+static void
+xlog_cil_push_now(
+ struct xlog *log,
+ xfs_lsn_t push_seq,
+ bool async)
+{
+ struct xfs_cil *cil = log->l_cilp;
+
+ if (!cil)
+ return;
+
+ ASSERT(push_seq && push_seq <= cil->xc_current_sequence);
+
+ /* start on any pending background push to minimise wait time on it */
+ if (!async)
+ flush_workqueue(cil->xc_push_wq);
+
+ spin_lock(&cil->xc_push_lock);
+
+ /*
+ * If this is an async flush request, we always need to set the
+ * xc_push_commit_stable flag even if something else has already queued
+ * a push. The flush caller is asking for the CIL to be on stable
+ * storage when the next push completes, so regardless of who has queued
+ * the push, the flush requires stable semantics from it.
+ */
+ cil->xc_push_commit_stable = async;
+
+ /*
+ * If the CIL is empty or we've already pushed the sequence then
+ * there's no more work that we need to do.
+ */
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) ||
+ push_seq <= cil->xc_push_seq) {
+ spin_unlock(&cil->xc_push_lock);
+ return;
+ }
+
+ cil->xc_push_seq = push_seq;
+ queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work);
+ spin_unlock(&cil->xc_push_lock);
+}
+
+bool
+xlog_cil_empty(
+ struct xlog *log)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ bool empty = false;
+
+ spin_lock(&cil->xc_push_lock);
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
+ empty = true;
+ spin_unlock(&cil->xc_push_lock);
+ return empty;
+}
+
+/*
+ * If there are intent done items in this transaction and the related intent was
+ * committed in the current (same) CIL checkpoint, we don't need to write either
+ * the intent or intent done item to the journal as the change will be
+ * journalled atomically within this checkpoint. As we cannot remove items from
+ * the CIL here, mark the related intent with a whiteout so that the CIL push
+ * can remove it rather than writing it to the journal. Then remove the intent
+ * done item from the current transaction and release it so it doesn't get put
+ * into the CIL at all.
+ */
+static uint32_t
+xlog_cil_process_intents(
+ struct xfs_cil *cil,
+ struct xfs_trans *tp)
+{
+ struct xfs_log_item *lip, *ilip, *next;
+ uint32_t len = 0;
+
+ list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+ if (!(lip->li_ops->flags & XFS_ITEM_INTENT_DONE))
+ continue;
+
+ ilip = lip->li_ops->iop_intent(lip);
+ if (!ilip || !xlog_item_in_current_chkpt(cil, ilip))
+ continue;
+ set_bit(XFS_LI_WHITEOUT, &ilip->li_flags);
+ trace_xfs_cil_whiteout_mark(ilip);
+ len += ilip->li_lv->lv_bytes;
+ kmem_free(ilip->li_lv);
+ ilip->li_lv = NULL;
+
+ xfs_trans_del_item(lip);
+ lip->li_ops->iop_release(lip);
+ }
+ return len;
+}
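+
+/*
+ * Illustrative sketch (hypothetical item type, not taken from this patch) of
+ * the ->iop_intent pairing the loop above relies on: an intent-done log item
+ * returns a pointer to the intent item it completes, or NULL if unpaired.
+ *
+ *	static struct xfs_log_item *
+ *	xfs_foo_done_item_intent(
+ *		struct xfs_log_item	*lip)
+ *	{
+ *		struct xfs_foo_done	*fdp = FOO_DONE_ITEM(lip);
+ *
+ *		return &fdp->fd_intent->fi_item;
+ *	}
+ *
+ * Only items whose ops set XFS_ITEM_INTENT_DONE are considered, and the
+ * returned intent is whited out only if it is still in the current checkpoint.
+ */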
+
+/*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * Called with the context lock already held in read mode to lock out
+ * background commit, returns without it held once background commits are
+ * allowed again.
+ */
+void
+xlog_cil_commit(
+ struct xlog *log,
+ struct xfs_trans *tp,
+ xfs_csn_t *commit_seq,
+ bool regrant)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ struct xfs_log_item *lip, *next;
+ uint32_t released_space = 0;
+
+ /*
+ * Do all necessary memory allocation before we lock the CIL.
+ * This ensures the allocation does not deadlock with a CIL
+ * push in memory reclaim (e.g. from kswapd).
+ */
+ xlog_cil_alloc_shadow_bufs(log, tp);
+
+ /* lock out background commit */
+ down_read(&cil->xc_ctx_lock);
+
+ if (tp->t_flags & XFS_TRANS_HAS_INTENT_DONE)
+ released_space = xlog_cil_process_intents(cil, tp);
+
+ xlog_cil_insert_items(log, tp, released_space);
+
+ if (regrant && !xlog_is_shutdown(log))
+ xfs_log_ticket_regrant(log, tp->t_ticket);
+ else
+ xfs_log_ticket_ungrant(log, tp->t_ticket);
+ tp->t_ticket = NULL;
+ xfs_trans_unreserve_and_mod_sb(tp);
+
+ /*
+ * Once all the items of the transaction have been copied to the CIL,
+ * the items can be unlocked and possibly freed.
+ *
+ * This needs to be done before we drop the CIL context lock because we
+ * have to update state in the log items and unlock them before they go
+ * to disk. If we don't, then the CIL checkpoint can race with us and
+ * we can run checkpoint completion before we've updated and unlocked
+ * the log items. This affects (at least) processing of stale buffers,
+ * inodes and EFIs.
+ */
+ trace_xfs_trans_commit_items(tp, _RET_IP_);
+ list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+ xfs_trans_del_item(lip);
+ if (lip->li_ops->iop_committing)
+ lip->li_ops->iop_committing(lip, cil->xc_ctx->sequence);
+ }
+ if (commit_seq)
+ *commit_seq = cil->xc_ctx->sequence;
+
+ /* xlog_cil_push_background() releases cil->xc_ctx_lock */
+ xlog_cil_push_background(log);
+}
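+
+/*
+ * Hedged usage sketch (simplified, not part of this patch): a transaction
+ * commit path hands the transaction to the CIL and, for synchronous commits,
+ * forces the returned checkpoint sequence. The xfs_log_force_seq() call is
+ * assumed from the comments earlier in this file; its exact signature may
+ * differ.
+ *
+ *	xfs_csn_t	commit_seq = 0;
+ *
+ *	xlog_cil_commit(log, tp, &commit_seq, regrant);
+ *	...
+ *	if (sync)
+ *		error = xfs_log_force_seq(mp, commit_seq, XFS_LOG_SYNC, NULL);
+ */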
+
+/*
+ * Flush the CIL to stable storage but don't wait for it to complete. This
+ * requires the CIL push to ensure the commit record for the push hits the disk,
+ * but otherwise is no different to a push done from a log force.
+ */
+void
+xlog_cil_flush(
+ struct xlog *log)
+{
+ xfs_csn_t seq = log->l_cilp->xc_current_sequence;
+
+ trace_xfs_log_force(log->l_mp, seq, _RET_IP_);
+ xlog_cil_push_now(log, seq, true);
+
+ /*
+ * If the CIL is empty, make sure that any previous checkpoint that may
+ * still be in an active iclog is pushed to stable storage.
+ */
+ if (test_bit(XLOG_CIL_EMPTY, &log->l_cilp->xc_flags))
+ xfs_log_force(log->l_mp, 0);
+}
+
+/*
+ * Conditionally push the CIL based on the sequence passed in.
+ *
+ * We only need to push if we haven't already pushed the sequence number given.
+ * Hence the only time we will trigger a push here is if the push sequence is
+ * the same as the current context.
+ *
+ * We return the current commit lsn to allow the callers to determine if an
+ * iclog flush is necessary following this call.
+ */
+xfs_lsn_t
+xlog_cil_force_seq(
+ struct xlog *log,
+ xfs_csn_t sequence)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ struct xfs_cil_ctx *ctx;
+ xfs_lsn_t commit_lsn = NULLCOMMITLSN;
+
+ ASSERT(sequence <= cil->xc_current_sequence);
+
+ if (!sequence)
+ sequence = cil->xc_current_sequence;
+ trace_xfs_log_force(log->l_mp, sequence, _RET_IP_);
+
+ /*
+ * check to see if we need to force out the current context.
+ * xlog_cil_push() handles racing pushes for the same sequence,
+ * so no need to deal with it here.
+ */
+restart:
+ xlog_cil_push_now(log, sequence, false);
+
+ /*
+ * See if we can find a previous sequence still committing.
+ * We need to wait for all previous sequence commits to complete
+ * before allowing the force of push_seq to go ahead. Hence block
+ * on commits for those as well.
+ */
+ spin_lock(&cil->xc_push_lock);
+ list_for_each_entry(ctx, &cil->xc_committing, committing) {
+ /*
+ * Avoid getting stuck in this loop because we were woken by the
+ * shutdown, but then went back to sleep once already in the
+ * shutdown state.
+ */
+ if (xlog_is_shutdown(log))
+ goto out_shutdown;
+ if (ctx->sequence > sequence)
+ continue;
+ if (!ctx->commit_lsn) {
+ /*
+ * It is still being pushed! Wait for the push to
+ * complete, then start again from the beginning.
+ */
+ XFS_STATS_INC(log->l_mp, xs_log_force_sleep);
+ xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
+ goto restart;
+ }
+ if (ctx->sequence != sequence)
+ continue;
+ /* found it! */
+ commit_lsn = ctx->commit_lsn;
+ }
+
+ /*
+ * The call to xlog_cil_push_now() executes the push in the background.
+ * Hence by the time we have got here, our sequence may not have been
+ * pushed yet. This is true if the current sequence still matches the
+ * push sequence after the above wait loop and the CIL still contains
+ * dirty objects. This is guaranteed by the push code first adding the
+ * context to the committing list before emptying the CIL.
+ *
+ * Hence if we don't find the context in the committing list and the
+ * current sequence number is unchanged then the CIL contents are
+ * significant. If the CIL is empty, it means there was nothing to push
+ * and that means there is nothing to wait for. If the CIL is not empty,
+ * it means we haven't yet started the push, because if it had started
+ * we would have found the context on the committing list.
+ */
+ if (sequence == cil->xc_current_sequence &&
+ !test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) {
+ spin_unlock(&cil->xc_push_lock);
+ goto restart;
+ }
+
+ spin_unlock(&cil->xc_push_lock);
+ return commit_lsn;
+
+ /*
+ * We detected a shutdown in progress. We need to trigger the log force
+ * to pass through its iclog state machine error handling, even though
+ * we are already in a shutdown state. Hence we can't return
+ * NULLCOMMITLSN here as that has special meaning to log forces (i.e.
+ * LSN is already stable), so we return a zero LSN instead.
+ */
+out_shutdown:
+ spin_unlock(&cil->xc_push_lock);
+ return 0;
+}
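+
+/*
+ * Hedged caller sketch (illustrative only) showing how the return value above
+ * is interpreted: NULLCOMMITLSN means there is nothing left to flush, a zero
+ * LSN means we raced with shutdown, and any other LSN identifies the commit
+ * record the caller still needs to force to stable storage.
+ *
+ *	lsn = xlog_cil_force_seq(log, seq);
+ *	if (lsn == NULLCOMMITLSN)
+ *		return 0;
+ *	... force the iclog containing lsn; a zero LSN is still run through
+ *	... the iclog force so shutdown errors are surfaced to the caller.
+ */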
+
+/*
+ * Move dead percpu state to the relevant CIL context structures.
+ *
+ * We have to lock the CIL context here to ensure that nothing is modifying
+ * the percpu state, either addition or removal. Both of these are done under
+ * the CIL context lock, so grabbing that exclusively here will ensure we can
+ * safely drain the cilpcp for the CPU that is dying.
+ */
+void
+xlog_cil_pcp_dead(
+ struct xlog *log,
+ unsigned int cpu)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+ struct xfs_cil_ctx *ctx;
+
+ down_write(&cil->xc_ctx_lock);
+ ctx = cil->xc_ctx;
+ if (ctx->ticket)
+ ctx->ticket->t_curr_res += cilpcp->space_reserved;
+ cilpcp->space_reserved = 0;
+
+ if (!list_empty(&cilpcp->log_items))
+ list_splice_init(&cilpcp->log_items, &ctx->log_items);
+ if (!list_empty(&cilpcp->busy_extents))
+ list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents);
+ atomic_add(cilpcp->space_used, &ctx->space_used);
+ cilpcp->space_used = 0;
+ up_write(&cil->xc_ctx_lock);
+}
+
+/*
+ * Perform initial CIL structure initialisation.
+ */
+int
+xlog_cil_init(
+ struct xlog *log)
+{
+ struct xfs_cil *cil;
+ struct xfs_cil_ctx *ctx;
+ struct xlog_cil_pcp *cilpcp;
+ int cpu;
+
+ cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
+ if (!cil)
+ return -ENOMEM;
+ /*
+ * Limit the CIL pipeline depth to 4 concurrent works to bound the
+ * concurrency the log spinlocks will be exposed to.
+ */
+ cil->xc_push_wq = alloc_workqueue("xfs-cil/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND),
+ 4, log->l_mp->m_super->s_id);
+ if (!cil->xc_push_wq)
+ goto out_destroy_cil;
+
+ cil->xc_log = log;
+ cil->xc_pcp = alloc_percpu(struct xlog_cil_pcp);
+ if (!cil->xc_pcp)
+ goto out_destroy_wq;
+
+ for_each_possible_cpu(cpu) {
+ cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+ INIT_LIST_HEAD(&cilpcp->busy_extents);
+ INIT_LIST_HEAD(&cilpcp->log_items);
+ }
+
+ INIT_LIST_HEAD(&cil->xc_committing);
+ spin_lock_init(&cil->xc_push_lock);
+ init_waitqueue_head(&cil->xc_push_wait);
+ init_rwsem(&cil->xc_ctx_lock);
+ init_waitqueue_head(&cil->xc_start_wait);
+ init_waitqueue_head(&cil->xc_commit_wait);
+ log->l_cilp = cil;
+
+ ctx = xlog_cil_ctx_alloc();
+ xlog_cil_ctx_switch(cil, ctx);
+ return 0;
+
+out_destroy_wq:
+ destroy_workqueue(cil->xc_push_wq);
+out_destroy_cil:
+ kmem_free(cil);
+ return -ENOMEM;
+}
+
+void
+xlog_cil_destroy(
+ struct xlog *log)
+{
+ struct xfs_cil *cil = log->l_cilp;
+
+ if (cil->xc_ctx) {
+ if (cil->xc_ctx->ticket)
+ xfs_log_ticket_put(cil->xc_ctx->ticket);
+ kmem_free(cil->xc_ctx);
+ }
+
+ ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
+ free_percpu(cil->xc_pcp);
+ destroy_workqueue(cil->xc_push_wq);
+ kmem_free(cil);
+}
+
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
new file mode 100644
index 000000000..1bd2963e8
--- /dev/null
+++ b/fs/xfs/xfs_log_priv.h
@@ -0,0 +1,713 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_LOG_PRIV_H__
+#define __XFS_LOG_PRIV_H__
+
+struct xfs_buf;
+struct xlog;
+struct xlog_ticket;
+struct xfs_mount;
+
+/*
+ * get client id from packed copy.
+ *
+ * this hack is here because the xlog_pack code copies four bytes
+ * of xlog_op_header containing the fields oh_clientid, oh_flags
+ * and oh_res2 into the packed copy.
+ *
+ * later on this four byte chunk is treated as an int and the
+ * client id is pulled out.
+ *
+ * this has endian issues, of course.
+ */
+static inline uint xlog_get_client_id(__be32 i)
+{
+ return be32_to_cpu(i) >> 24;
+}
+
+/*
+ * In core log state
+ */
+enum xlog_iclog_state {
+ XLOG_STATE_ACTIVE, /* Current IC log being written to */
+ XLOG_STATE_WANT_SYNC, /* Want to sync this iclog; no more writes */
+ XLOG_STATE_SYNCING, /* This IC log is syncing */
+ XLOG_STATE_DONE_SYNC, /* Done syncing to disk */
+ XLOG_STATE_CALLBACK, /* Callback functions now */
+ XLOG_STATE_DIRTY, /* Dirty IC log, not ready for ACTIVE status */
+};
+
+#define XLOG_STATE_STRINGS \
+ { XLOG_STATE_ACTIVE, "XLOG_STATE_ACTIVE" }, \
+ { XLOG_STATE_WANT_SYNC, "XLOG_STATE_WANT_SYNC" }, \
+ { XLOG_STATE_SYNCING, "XLOG_STATE_SYNCING" }, \
+ { XLOG_STATE_DONE_SYNC, "XLOG_STATE_DONE_SYNC" }, \
+ { XLOG_STATE_CALLBACK, "XLOG_STATE_CALLBACK" }, \
+ { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" }
+
+/*
+ * In core log flags
+ */
+#define XLOG_ICL_NEED_FLUSH (1u << 0) /* iclog needs REQ_PREFLUSH */
+#define XLOG_ICL_NEED_FUA (1u << 1) /* iclog needs REQ_FUA */
+
+#define XLOG_ICL_STRINGS \
+ { XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \
+ { XLOG_ICL_NEED_FUA, "XLOG_ICL_NEED_FUA" }
+
+
+/*
+ * Log ticket flags
+ */
+#define XLOG_TIC_PERM_RESERV (1u << 0) /* permanent reservation */
+
+#define XLOG_TIC_FLAGS \
+ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
+
+/*
+ * Below are states for covering allocation transactions.
+ * By covering, we mean changing the h_tail_lsn in the last on-disk
+ * log write such that no allocation transactions will be re-done during
+ * recovery after a system crash. Recovery starts at the last on-disk
+ * log write.
+ *
+ * These states are used to insert dummy log entries to cover
+ * space allocation transactions which can undo non-transactional changes
+ * after a crash. Writes to a file with space
+ * already allocated do not result in any transactions. Allocations
+ * might include space beyond the EOF. So if we just push the EOF a
+ * little, the last transaction for the file could contain the wrong
+ * size. If there is no file system activity, after an allocation
+ * transaction, and the system crashes, the allocation transaction
+ * will get replayed and the file will be truncated. This could
+ * be hours/days/... after the allocation occurred.
+ *
+ * The fix for this is to do two dummy transactions when the
+ * system is idle. We need two dummy transactions because the h_tail_lsn
+ * in the log record header needs to point beyond the last possible
+ * non-dummy transaction. The first dummy changes the h_tail_lsn to
+ * the first transaction before the dummy. The second dummy causes
+ * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn.
+ *
+ * These dummy transactions get committed when everything
+ * is idle (after there has been some activity).
+ *
+ * There are 5 states used to control this.
+ *
+ * IDLE -- no logging has been done on the file system or
+ * we are done covering previous transactions.
+ * NEED -- logging has occurred and we need a dummy transaction
+ * when the log becomes idle.
+ * DONE -- we were in the NEED state and have committed a dummy
+ * transaction.
+ * NEED2 -- we detected that a dummy transaction has gone to the
+ * on disk log with no other transactions.
+ * DONE2 -- we committed a dummy transaction when in the NEED2 state.
+ *
+ * There are two places where we switch states:
+ *
+ * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2.
+ * We commit the dummy transaction and switch to DONE or DONE2,
+ * respectively. In all other states, we don't do anything.
+ *
+ * 2.) When we finish writing the on-disk log (xlog_state_clean_log).
+ *
+ * No matter what state we are in, if this isn't the dummy
+ * transaction going out, the next state is NEED.
+ * So, if we aren't in the DONE or DONE2 states, the next state
+ * is NEED. We can't be finishing a write of the dummy record
+ * unless it was committed and the state switched to DONE or DONE2.
+ *
+ * If we are in the DONE state and this was a write of the
+ * dummy transaction, we move to NEED2.
+ *
+ * If we are in the DONE2 state and this was a write of the
+ * dummy transaction, we move to IDLE.
+ *
+ *
+ * Writing only one dummy transaction can get appended to
+ * one file space allocation. When this happens, the log recovery
+ * code replays the space allocation and a file could be truncated.
+ * This is why we have the NEED2 and DONE2 states before going idle.
+ */
+
+#define XLOG_STATE_COVER_IDLE 0
+#define XLOG_STATE_COVER_NEED 1
+#define XLOG_STATE_COVER_DONE 2
+#define XLOG_STATE_COVER_NEED2 3
+#define XLOG_STATE_COVER_DONE2 4
+
+#define XLOG_COVER_OPS 5
+
+typedef struct xlog_ticket {
+ struct list_head t_queue; /* reserve/write queue */
+ struct task_struct *t_task; /* task that owns this ticket */
+ xlog_tid_t t_tid; /* transaction identifier */
+ atomic_t t_ref; /* ticket reference count */
+ int t_curr_res; /* current reservation */
+ int t_unit_res; /* unit reservation */
+ char t_ocnt; /* original unit count */
+ char t_cnt; /* current unit count */
+ uint8_t t_flags; /* properties of reservation */
+ int t_iclog_hdrs; /* iclog hdrs in t_curr_res */
+} xlog_ticket_t;
+
+/*
+ * - A log record header is 512 bytes. There is plenty of room to grow the
+ * xlog_rec_header_t into the reserved space.
+ * - ic_data follows, so a write to disk can start at the beginning of
+ * the iclog.
+ * - ic_forcewait is used to implement synchronous forcing of the iclog to disk.
+ * - ic_next is the pointer to the next iclog in the ring.
+ * - ic_log is a pointer back to the global log structure.
+ * - ic_size is the full size of the log buffer, minus the cycle headers.
+ * - ic_offset is the current number of bytes written to in this iclog.
+ * - ic_refcnt is bumped when someone is writing to the log.
+ * - ic_state is the state of the iclog.
+ *
+ * Because of cacheline contention on large machines, we need to separate
+ * various resources onto different cachelines. To start with, make the
+ * structure cacheline aligned. The following fields can be contended on
+ * by independent processes:
+ *
+ * - ic_callbacks
+ * - ic_refcnt
+ * - fields protected by the global l_icloglock
+ *
+ * so we need to ensure that these fields are located in separate cachelines.
+ * We'll put all the read-only and l_icloglock fields in the first cacheline,
+ * and move everything else out to subsequent cachelines.
+ */
+typedef struct xlog_in_core {
+ wait_queue_head_t ic_force_wait;
+ wait_queue_head_t ic_write_wait;
+ struct xlog_in_core *ic_next;
+ struct xlog_in_core *ic_prev;
+ struct xlog *ic_log;
+ u32 ic_size;
+ u32 ic_offset;
+ enum xlog_iclog_state ic_state;
+ unsigned int ic_flags;
+ void *ic_datap; /* pointer to iclog data */
+ struct list_head ic_callbacks;
+
+ /* reference counts need their own cacheline */
+ atomic_t ic_refcnt ____cacheline_aligned_in_smp;
+ xlog_in_core_2_t *ic_data;
+#define ic_header ic_data->hic_header
+#ifdef DEBUG
+ bool ic_fail_crc : 1;
+#endif
+ struct semaphore ic_sema;
+ struct work_struct ic_end_io_work;
+ struct bio ic_bio;
+ struct bio_vec ic_bvec[];
+} xlog_in_core_t;
+
+/*
+ * The CIL context is used to aggregate per-transaction details as well be
+ * passed to the iclog for checkpoint post-commit processing. After being
+ * passed to the iclog, another context needs to be allocated for tracking the
+ * next set of transactions to be aggregated into a checkpoint.
+ */
+struct xfs_cil;
+
+struct xfs_cil_ctx {
+ struct xfs_cil *cil;
+ xfs_csn_t sequence; /* chkpt sequence # */
+ xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
+ xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
+ struct xlog_in_core *commit_iclog;
+ struct xlog_ticket *ticket; /* chkpt ticket */
+ atomic_t space_used; /* aggregate size of regions */
+ struct list_head busy_extents; /* busy extents in chkpt */
+ struct list_head log_items; /* log items in chkpt */
+ struct list_head lv_chain; /* logvecs being pushed */
+ struct list_head iclog_entry;
+ struct list_head committing; /* ctx committing list */
+ struct work_struct discard_endio_work;
+ struct work_struct push_work;
+ atomic_t order_id;
+};
+
+/*
+ * Per-cpu CIL tracking items
+ */
+struct xlog_cil_pcp {
+ int32_t space_used;
+ uint32_t space_reserved;
+ struct list_head busy_extents;
+ struct list_head log_items;
+};
+
+/*
+ * Committed Item List structure
+ *
+ * This structure is used to track log items that have been committed but not
+ * yet written into the log. It is used only when the delayed logging mount
+ * option is enabled.
+ *
+ * This structure tracks the list of committing checkpoint contexts so
+ * we can avoid the problem of having to hold out new transactions during a
+ * flush until we have the commit record LSN of the checkpoint. We can
+ * traverse the list of committing contexts in xlog_cil_force_seq() to find a
+ * sequence match and extract the commit LSN directly from there. If the
+ * checkpoint is still in the process of committing, we can block waiting for
+ * the commit LSN to be determined as well. This should make synchronous
+ * operations almost as efficient as the old logging methods.
+ */
+struct xfs_cil {
+ struct xlog *xc_log;
+ unsigned long xc_flags;
+ atomic_t xc_iclog_hdrs;
+ struct workqueue_struct *xc_push_wq;
+
+ struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp;
+ struct xfs_cil_ctx *xc_ctx;
+
+ spinlock_t xc_push_lock ____cacheline_aligned_in_smp;
+ xfs_csn_t xc_push_seq;
+ bool xc_push_commit_stable;
+ struct list_head xc_committing;
+ wait_queue_head_t xc_commit_wait;
+ wait_queue_head_t xc_start_wait;
+ xfs_csn_t xc_current_sequence;
+ wait_queue_head_t xc_push_wait; /* background push throttle */
+
+ void __percpu *xc_pcp; /* percpu CIL structures */
+#ifdef CONFIG_HOTPLUG_CPU
+ struct list_head xc_pcp_list;
+#endif
+} ____cacheline_aligned_in_smp;
+
+/* xc_flags bit values */
+#define XLOG_CIL_EMPTY 1
+#define XLOG_CIL_PCP_SPACE 2
+
+/*
+ * The amount of log space we allow the CIL to aggregate is difficult to size.
+ * Whatever we choose, we have to make sure we can get a reservation for the
+ * log space effectively, that it is large enough to capture sufficient
+ * relogging to reduce log buffer IO significantly, but it is not too large for
+ * the log or induces too much latency when writing out through the iclogs. We
+ * track both space consumed and the number of vectors in the checkpoint
+ * context, so we need to decide which to use for limiting.
+ *
+ * Every log buffer we write out during a push needs a header reserved, which
+ * is at least one sector and more for v2 logs. Hence we need a reservation of
+ * at least 512 bytes per 32k of log space just for the LR headers. That means
+ * 16KB of reservation per megabyte of delayed logging space we will consume,
+ * plus various headers. The number of headers will vary based on the number of
+ * io vectors, so limiting on a specific number of vectors is going to result
+ * in transactions of varying size. IOWs, it is more consistent to track and
+ * limit space consumed in the log rather than by the number of objects being
+ * logged in order to prevent checkpoint ticket overruns.
+ *
+ * Further, use of static reservations through the log grant mechanism is
+ * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
+ * grant) and a significant deadlock potential because regranting write space
+ * can block on log pushes. Hence if we have to regrant log space during a log
+ * push, we can deadlock.
+ *
+ * However, we can avoid this by use of a dynamic "reservation stealing"
+ * technique during transaction commit whereby unused reservation space in the
+ * transaction ticket is transferred to the CIL ctx commit ticket to cover the
+ * space needed by the checkpoint transaction. This means that we never need to
+ * specifically reserve space for the CIL checkpoint transaction, nor do we
+ * need to regrant space once the checkpoint completes. This also means the
+ * checkpoint transaction ticket is specific to the checkpoint context, rather
+ * than the CIL itself.
+ *
+ * With dynamic reservations, we can effectively make up arbitrary limits for
+ * the checkpoint size so long as they don't violate any other size rules.
+ * Recovery imposes a rule that no transaction exceed half the log, so we are
+ * limited by that. Furthermore, the log transaction reservation subsystem
+ * tries to keep 25% of the log free, so we need to keep below that limit or we
+ * risk running out of free log space to start any new transactions.
+ *
+ * In order to keep background CIL push efficient, we only need to ensure the
+ * CIL is large enough to maintain sufficient in-memory relogging to avoid
+ * repeated physical writes of frequently modified metadata. If we allow the CIL
+ * to grow to a substantial fraction of the log, then we may be pinning hundreds
+ * of megabytes of metadata in memory until the CIL flushes. This can cause
+ * issues when we are running low on memory - pinned memory cannot be reclaimed,
+ * and the CIL consumes a lot of memory. Hence we need to set an upper physical
+ * size limit for the CIL that limits the maximum amount of memory pinned by the
+ * CIL but does not limit performance by reducing relogging efficiency
+ * significantly.
+ *
+ * As such, the CIL push threshold ends up being the smaller of two thresholds:
+ * - a threshold large enough that it allows CIL to be pushed and progress to be
+ * made without excessive blocking of incoming transaction commits. This is
+ * defined to be 12.5% of the log space - half the 25% push threshold of the
+ * AIL.
+ * - small enough that it doesn't pin excessive amounts of memory but maintains
+ * close to peak relogging efficiency. This is defined to be 16x the iclog
+ * buffer window (32MB) as measurements have shown this to be roughly the
+ * point of diminishing performance increases under highly concurrent
+ * modification workloads.
+ *
+ * To prevent the CIL from overflowing upper commit size bounds, we introduce a
+ * new threshold at which we block committing transactions until the background
+ * CIL commit commences and switches to a new context. While this is not a hard
+ * limit, it forces the process committing a transaction to the CIL to block and
+ * yield the CPU, giving the CIL push work a chance to be scheduled and start
+ * work. This prevents a process running lots of transactions from overfilling
+ * the CIL because it is not yielding the CPU. We set the blocking limit at
+ * twice the background push space threshold so we keep in line with the AIL
+ * push thresholds.
+ *
+ * Note: this is not a -hard- limit as blocking is applied after the transaction
+ * is inserted into the CIL and the push has been triggered. It is largely a
+ * throttling mechanism that allows the CIL push to be scheduled and run. A hard
+ * limit will be difficult to implement without introducing global serialisation
+ * in the CIL commit fast path, and it's not at all clear that we actually need
+ * such hard limits given the ~7 years we've run without a hard limit before
+ * finding the first situation where a checkpoint size overflow actually
+ * occurred. Hence the simple throttle, and an ASSERT check to tell us that
+ * we've overrun the max size.
+ */
+#define XLOG_CIL_SPACE_LIMIT(log) \
+ min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
+
+#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \
+ (XLOG_CIL_SPACE_LIMIT(log) * 2)
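+
+/*
+ * Worked example (illustrative numbers only): per the sizing discussion
+ * above, the second min_t() term is the 16x iclog buffer window, roughly
+ * 32MB. For a hypothetical 2GB log, l_logsize >> 3 is 256MB, so background
+ * pushes start at ~32MB of CIL space and commits block at ~64MB. For a 64MB
+ * log the 12.5% term wins instead: pushes start at 8MB and the blocking
+ * limit is 16MB.
+ */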
+
+/*
+ * ticket grant locks, queues and accounting have their own cachelines
+ * as these are quite hot and can be operated on concurrently.
+ */
+struct xlog_grant_head {
+ spinlock_t lock ____cacheline_aligned_in_smp;
+ struct list_head waiters;
+ atomic64_t grant;
+};
+
+/*
+ * The reservation head lsn is not made up of a cycle number and block number.
+ * Instead, it uses a cycle number and byte number. Logs don't expect to
+ * overflow 31 bits worth of byte offset, so using a byte number will mean
+ * that round off problems won't occur when releasing partial reservations.
+ */
+struct xlog {
+ /* The following fields don't need locking */
+ struct xfs_mount *l_mp; /* mount point */
+ struct xfs_ail *l_ailp; /* AIL log is working with */
+ struct xfs_cil *l_cilp; /* CIL log is working with */
+ struct xfs_buftarg *l_targ; /* buftarg of log */
+ struct workqueue_struct *l_ioend_workqueue; /* for I/O completions */
+ struct delayed_work l_work; /* background flush work */
+ long l_opstate; /* operational state */
+ uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
+ struct list_head *l_buf_cancel_table;
+ int l_iclog_hsize; /* size of iclog header */
+ int l_iclog_heads; /* # of iclog header sectors */
+ uint l_sectBBsize; /* sector size in BBs (2^n) */
+ int l_iclog_size; /* size of log in bytes */
+ int l_iclog_bufs; /* number of iclog buffers */
+ xfs_daddr_t l_logBBstart; /* start block of log */
+ int l_logsize; /* size of log in bytes */
+ int l_logBBsize; /* size of log in BB chunks */
+
+ /* The following block of fields are changed while holding icloglock */
+ wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp;
+ /* waiting for iclog flush */
+ int l_covered_state;/* state of "covering disk
+ * log entries" */
+ xlog_in_core_t *l_iclog; /* head log queue */
+ spinlock_t l_icloglock; /* grab to change iclog state */
+ int l_curr_cycle; /* Cycle number of log writes */
+ int l_prev_cycle; /* Cycle number before last
+ * block increment */
+ int l_curr_block; /* current logical log block */
+ int l_prev_block; /* previous logical log block */
+
+ /*
+ * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
+ * read without needing to hold specific locks. To avoid operations
+ * contending with other hot objects, place each of them on a separate
+ * cacheline.
+ */
+ /* lsn of last LR on disk */
+ atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
+ /* lsn of 1st LR with unflushed buffers */
+ atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
+
+ struct xlog_grant_head l_reserve_head;
+ struct xlog_grant_head l_write_head;
+
+ struct xfs_kobj l_kobj;
+
+ /* log recovery lsn tracking (for buffer submission) */
+ xfs_lsn_t l_recovery_lsn;
+
+ uint32_t l_iclog_roundoff;/* padding roundoff */
+
+ /* Users of log incompat features should take a read lock. */
+ struct rw_semaphore l_incompat_users;
+};
+
+/*
+ * Bits for operational state
+ */
+#define XLOG_ACTIVE_RECOVERY 0 /* in the middle of recovery */
+#define XLOG_RECOVERY_NEEDED 1 /* log was recovered */
+#define XLOG_IO_ERROR 2 /* log hit an I/O error, and being
+ shutdown */
+#define XLOG_TAIL_WARN 3 /* log tail verify warning issued */
+
+static inline bool
+xlog_recovery_needed(struct xlog *log)
+{
+ return test_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
+}
+
+static inline bool
+xlog_in_recovery(struct xlog *log)
+{
+ return test_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
+}
+
+static inline bool
+xlog_is_shutdown(struct xlog *log)
+{
+ return test_bit(XLOG_IO_ERROR, &log->l_opstate);
+}
+
+/*
+ * Wait until the xlog_force_shutdown() has marked the log as shut down
+ * so xlog_is_shutdown() will always return true.
+ */
+static inline void
+xlog_shutdown_wait(
+ struct xlog *log)
+{
+ wait_var_event(&log->l_opstate, xlog_is_shutdown(log));
+}
+
+/* common routines */
+extern int
+xlog_recover(
+ struct xlog *log);
+extern int
+xlog_recover_finish(
+ struct xlog *log);
+extern void
+xlog_recover_cancel(struct xlog *);
+
+extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
+ char *dp, int size);
+
+extern struct kmem_cache *xfs_log_ticket_cache;
+struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes,
+ int count, bool permanent);
+
+void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
+void xlog_print_trans(struct xfs_trans *);
+int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx,
+ struct list_head *lv_chain, struct xlog_ticket *tic,
+ uint32_t len);
+void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
+void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
+
+void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog,
+ int eventual_size);
+int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket);
+
+/*
+ * When we crack an atomic LSN, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from. This should always
+ * be used to sample and crack LSNs that are stored and updated in atomic
+ * variables.
+ */
+static inline void
+xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
+{
+ xfs_lsn_t val = atomic64_read(lsn);
+
+ *cycle = CYCLE_LSN(val);
+ *block = BLOCK_LSN(val);
+}
+
+/*
+ * Calculate and assign a value to an atomic LSN variable from component pieces.
+ */
+static inline void
+xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
+{
+ atomic64_set(lsn, xlog_assign_lsn(cycle, block));
+}
+
+/*
+ * When we crack the grant head, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from.
+ */
+static inline void
+xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
+{
+ *cycle = val >> 32;
+ *space = val & 0xffffffff;
+}
+
+static inline void
+xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
+{
+ xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
+}
+
+static inline int64_t
+xlog_assign_grant_head_val(int cycle, int space)
+{
+ return ((int64_t)cycle << 32) | space;
+}
+
+static inline void
+xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
+{
+ atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
+}
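+
+/*
+ * Example (illustrative only): the assign/crack helpers above round-trip a
+ * cycle/space pair through a single 64 bit grant head value.
+ *
+ *	int64_t	val = xlog_assign_grant_head_val(5, 123456);
+ *	int	cycle, space;
+ *
+ *	xlog_crack_grant_head_val(val, &cycle, &space);
+ *	cycle == 5, space == 123456
+ */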
+
+/*
+ * Committed Item List interfaces
+ */
+int xlog_cil_init(struct xlog *log);
+void xlog_cil_init_post_recovery(struct xlog *log);
+void xlog_cil_destroy(struct xlog *log);
+bool xlog_cil_empty(struct xlog *log);
+void xlog_cil_commit(struct xlog *log, struct xfs_trans *tp,
+ xfs_csn_t *commit_seq, bool regrant);
+void xlog_cil_set_ctx_write_state(struct xfs_cil_ctx *ctx,
+ struct xlog_in_core *iclog);
+
+
+/*
+ * CIL force routines
+ */
+void xlog_cil_flush(struct xlog *log);
+xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence);
+
+static inline void
+xlog_cil_force(struct xlog *log)
+{
+ xlog_cil_force_seq(log, log->l_cilp->xc_current_sequence);
+}
+
+/*
+ * Wrapper function for waiting on a wait queue serialised against wakeups
+ * by a spinlock. This matches the semantics of all the wait queues used in the
+ * log code.
+ */
+static inline void
+xlog_wait(
+ struct wait_queue_head *wq,
+ struct spinlock *lock)
+ __releases(lock)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue_exclusive(wq, &wait);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(lock);
+ schedule();
+ remove_wait_queue(wq, &wait);
+}
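+
+/*
+ * Hedged sketch of the wakeup side xlog_wait() assumes: the waker updates the
+ * condition and issues the wakeup while holding the same spinlock the sleeper
+ * passed in, which closes the race between the condition check and the sleep.
+ *
+ *	spin_lock(&cil->xc_push_lock);
+ *	... update whatever state the sleeper is waiting on ...
+ *	if (waitqueue_active(&cil->xc_push_wait))
+ *		wake_up_all(&cil->xc_push_wait);
+ *	spin_unlock(&cil->xc_push_lock);
+ */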
+
+int xlog_wait_on_iclog(struct xlog_in_core *iclog);
+
+/*
+ * The LSN is valid so long as it is behind the current LSN. If it isn't, this
+ * means that the next log record that includes this metadata could have a
+ * smaller LSN. In turn, this means that the modification in the log would not
+ * replay.
+ */
+static inline bool
+xlog_valid_lsn(
+ struct xlog *log,
+ xfs_lsn_t lsn)
+{
+ int cur_cycle;
+ int cur_block;
+ bool valid = true;
+
+ /*
+ * First, sample the current lsn without locking to avoid added
+ * contention from metadata I/O. The current cycle and block are updated
+ * (in xlog_state_switch_iclogs()) and read here in a particular order
+ * to avoid false negatives (e.g., thinking the metadata LSN is valid
+ * when it is not).
+ *
+ * The current block is always rewound before the cycle is bumped in
+ * xlog_state_switch_iclogs() to ensure the current LSN is never seen in
+ * a transiently forward state. Instead, we can see the LSN in a
+ * transiently behind state if we happen to race with a cycle wrap.
+ */
+ cur_cycle = READ_ONCE(log->l_curr_cycle);
+ smp_rmb();
+ cur_block = READ_ONCE(log->l_curr_block);
+
+ if ((CYCLE_LSN(lsn) > cur_cycle) ||
+ (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) {
+ /*
+ * If the metadata LSN appears invalid, it's possible the check
+ * above raced with a wrap to the next log cycle. Grab the lock
+ * to check for sure.
+ */
+ spin_lock(&log->l_icloglock);
+ cur_cycle = log->l_curr_cycle;
+ cur_block = log->l_curr_block;
+ spin_unlock(&log->l_icloglock);
+
+ if ((CYCLE_LSN(lsn) > cur_cycle) ||
+ (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block))
+ valid = false;
+ }
+
+ return valid;
+}
+
+/*
+ * Log vector and shadow buffers can be large, so we need to use kvmalloc() here
+ * to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts
+ * to fall back to vmalloc, so we can't actually do anything useful with gfp
+ * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc()
+ * will do direct reclaim and compaction in the slow path, both of which are
+ * horrendously expensive. We just want kmalloc to fail fast and fall back to
+ * vmalloc if it can't get something straight away from the free lists or
+ * buddy allocator. Hence we have to open code kvmalloc ourselves here.
+ *
+ * This assumes that the caller uses memalloc_nofs_save task context here, so
+ * despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS
+ * allocations. This is actually the only way to make vmalloc() do GFP_NOFS
+ * allocations, so let's just all pretend this is a GFP_KERNEL context
+ * operation....
+ */
+static inline void *
+xlog_kvmalloc(
+ size_t buf_size)
+{
+ gfp_t flags = GFP_KERNEL;
+ void *p;
+
+ flags &= ~__GFP_DIRECT_RECLAIM;
+ flags |= __GFP_NOWARN | __GFP_NORETRY;
+ do {
+ p = kmalloc(buf_size, flags);
+ if (!p)
+ p = vmalloc(buf_size);
+ } while (!p);
+
+ return p;
+}
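+
+/*
+ * Usage note (illustrative, based on the fallback above): memory returned by
+ * xlog_kvmalloc() may come from either kmalloc() or vmalloc(), so it must be
+ * released with kvfree(), which handles both cases:
+ *
+ *	void *lv = xlog_kvmalloc(buf_size);
+ *	...
+ *	kvfree(lv);
+ */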
+
+/*
+ * CIL CPU dead notifier
+ */
+void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu);
+
+#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
new file mode 100644
index 000000000..322eb2ee6
--- /dev/null
+++ b/fs/xfs/xfs_log_recover.c
@@ -0,0 +1,3517 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+#include "xfs_trans_priv.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_error.h"
+#include "xfs_buf_item.h"
+#include "xfs_ag.h"
+#include "xfs_quota.h"
+#include "xfs_reflink.h"
+
+#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
+
+STATIC int
+xlog_find_zeroed(
+ struct xlog *,
+ xfs_daddr_t *);
+STATIC int
+xlog_clear_stale_blocks(
+ struct xlog *,
+ xfs_lsn_t);
+STATIC int
+xlog_do_recovery_pass(
+ struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
+
+/*
+ * Sector aligned buffer routines for buffer create/read/write/access
+ */
+
+/*
+ * Verify the log-relative block number and length in basic blocks are valid for
+ * an operation involving the given XFS log buffer. Returns true if the fields
+ * are valid, false otherwise.
+ */
+static inline bool
+xlog_verify_bno(
+ struct xlog *log,
+ xfs_daddr_t blk_no,
+ int bbcount)
+{
+ if (blk_no < 0 || blk_no >= log->l_logBBsize)
+ return false;
+ if (bbcount <= 0 || (blk_no + bbcount) > log->l_logBBsize)
+ return false;
+ return true;
+}
+
+/*
+ * Allocate a buffer to hold log data. The buffer needs to be able to map to
+ * a range of nbblks basic blocks at any valid offset within the log.
+ */
+static char *
+xlog_alloc_buffer(
+ struct xlog *log,
+ int nbblks)
+{
+ /*
+ * Pass log block 0 since we don't have an addr yet, buffer will be
+ * verified on read.
+ */
+ if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) {
+ xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
+ nbblks);
+ return NULL;
+ }
+
+ /*
+ * We do log I/O in units of log sectors (a power-of-2 multiple of the
+ * basic block size), so we round up the requested size to accommodate
+ * the basic blocks required for complete log sectors.
+ *
+ * In addition, the buffer may be used for a non-sector-aligned block
+ * offset, in which case an I/O of the requested size could extend
+ * beyond the end of the buffer. If the requested size is only 1 basic
+ * block it will never straddle a sector boundary, so this won't be an
+ * issue. Nor will this be a problem if the log I/O is done in basic
+ * blocks (sector size 1). But otherwise we extend the buffer by one
+ * extra log sector to ensure there's space to accommodate this
+ * possibility.
+ */
+ if (nbblks > 1 && log->l_sectBBsize > 1)
+ nbblks += log->l_sectBBsize;
+ nbblks = round_up(nbblks, log->l_sectBBsize);
+ return kvzalloc(BBTOB(nbblks), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+}
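+
+/*
+ * Worked example (illustrative): with 4k log sectors (l_sectBBsize == 8), a
+ * request for 3 basic blocks is first padded by one sector to 11 blocks to
+ * cover a non-sector-aligned start, then rounded up to 16 basic blocks, so
+ * an 8k buffer is allocated.
+ */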
+
+/*
+ * Return the address of the start of the given block number's data
+ * in a log buffer. The buffer covers a log sector-aligned region.
+ */
+static inline unsigned int
+xlog_align(
+ struct xlog *log,
+ xfs_daddr_t blk_no)
+{
+ return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1));
+}
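+
+/*
+ * Example (illustrative): with 4k log sectors (l_sectBBsize == 8), a read of
+ * block 13 is issued for the sector starting at block 8, and xlog_align()
+ * returns BBTOB(13 & 7) == BBTOB(5) == 2560, the byte offset of block 13's
+ * data within the sector-aligned buffer.
+ */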
+
+static int
+xlog_do_io(
+ struct xlog *log,
+ xfs_daddr_t blk_no,
+ unsigned int nbblks,
+ char *data,
+ enum req_op op)
+{
+ int error;
+
+ if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) {
+ xfs_warn(log->l_mp,
+ "Invalid log block/length (0x%llx, 0x%x) for buffer",
+ blk_no, nbblks);
+ return -EFSCORRUPTED;
+ }
+
+ blk_no = round_down(blk_no, log->l_sectBBsize);
+ nbblks = round_up(nbblks, log->l_sectBBsize);
+ ASSERT(nbblks > 0);
+
+ error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
+ BBTOB(nbblks), data, op);
+ if (error && !xlog_is_shutdown(log)) {
+ xfs_alert(log->l_mp,
+ "log recovery %s I/O error at daddr 0x%llx len %d error %d",
+ op == REQ_OP_WRITE ? "write" : "read",
+ blk_no, nbblks, error);
+ }
+ return error;
+}
+
+STATIC int
+xlog_bread_noalign(
+ struct xlog *log,
+ xfs_daddr_t blk_no,
+ int nbblks,
+ char *data)
+{
+ return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
+}
+
+STATIC int
+xlog_bread(
+ struct xlog *log,
+ xfs_daddr_t blk_no,
+ int nbblks,
+ char *data,
+ char **offset)
+{
+ int error;
+
+ error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
+ if (!error)
+ *offset = data + xlog_align(log, blk_no);
+ return error;
+}
+
+STATIC int
+xlog_bwrite(
+ struct xlog *log,
+ xfs_daddr_t blk_no,
+ int nbblks,
+ char *data)
+{
+ return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE);
+}
+
+#ifdef DEBUG
+/*
+ * dump debug superblock and log record information
+ */
+STATIC void
+xlog_header_check_dump(
+ xfs_mount_t *mp,
+ xlog_rec_header_t *head)
+{
+ xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d",
+ __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
+ xfs_debug(mp, " log : uuid = %pU, fmt = %d",
+ &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
+}
+#else
+#define xlog_header_check_dump(mp, head)
+#endif
+
+/*
+ * check log record header for recovery
+ */
+STATIC int
+xlog_header_check_recover(
+ xfs_mount_t *mp,
+ xlog_rec_header_t *head)
+{
+ ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
+
+ /*
+ * IRIX doesn't write the h_fmt field and leaves it zeroed
+ * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
+ * a dirty log created in IRIX.
+ */
+ if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) {
+ xfs_warn(mp,
+ "dirty log written in incompatible format - can't recover");
+ xlog_header_check_dump(mp, head);
+ return -EFSCORRUPTED;
+ }
+ if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
+ &head->h_fs_uuid))) {
+ xfs_warn(mp,
+ "dirty log entry has mismatched uuid - can't recover");
+ xlog_header_check_dump(mp, head);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+/*
+ * read the head block of the log and check the header
+ */
+STATIC int
+xlog_header_check_mount(
+ xfs_mount_t *mp,
+ xlog_rec_header_t *head)
+{
+ ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
+
+ if (uuid_is_null(&head->h_fs_uuid)) {
+ /*
+ * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
+ * h_fs_uuid is null, we assume this log was last mounted
+ * by IRIX and continue.
+ */
+ xfs_warn(mp, "null uuid in log - IRIX style log");
+ } else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
+ &head->h_fs_uuid))) {
+ xfs_warn(mp, "log has mismatched uuid - can't recover");
+ xlog_header_check_dump(mp, head);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+/*
+ * This routine finds (to an approximation) the first block in the physical
+ * log which contains the given cycle. It uses a binary search algorithm.
+ * Note that the algorithm can not be perfect because the disk will not
+ * necessarily be perfect.
+ */
+STATIC int
+xlog_find_cycle_start(
+ struct xlog *log,
+ char *buffer,
+ xfs_daddr_t first_blk,
+ xfs_daddr_t *last_blk,
+ uint cycle)
+{
+ char *offset;
+ xfs_daddr_t mid_blk;
+ xfs_daddr_t end_blk;
+ uint mid_cycle;
+ int error;
+
+ end_blk = *last_blk;
+ mid_blk = BLK_AVG(first_blk, end_blk);
+ while (mid_blk != first_blk && mid_blk != end_blk) {
+ error = xlog_bread(log, mid_blk, 1, buffer, &offset);
+ if (error)
+ return error;
+ mid_cycle = xlog_get_cycle(offset);
+ if (mid_cycle == cycle)
+ end_blk = mid_blk; /* last_half_cycle == mid_cycle */
+ else
+ first_blk = mid_blk; /* first_half_cycle == mid_cycle */
+ mid_blk = BLK_AVG(first_blk, end_blk);
+ }
+ ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
+ (mid_blk == end_blk && mid_blk-1 == first_blk));
+
+ *last_blk = end_blk;
+
+ return 0;
+}
+
+/*
+ * Check that a range of blocks does not contain stop_on_cycle_no.
+ * Fill in *new_blk with the block offset where such a block is
+ * found, or with -1 (an invalid block number) if there is no such
+ * block in the range. The scan needs to occur from front to back
+ * and the pointer into the region must be updated since a later
+ * routine will need to perform another test.
+ */
+STATIC int
+xlog_find_verify_cycle(
+ struct xlog *log,
+ xfs_daddr_t start_blk,
+ int nbblks,
+ uint stop_on_cycle_no,
+ xfs_daddr_t *new_blk)
+{
+ xfs_daddr_t i, j;
+ uint cycle;
+ char *buffer;
+ xfs_daddr_t bufblks;
+ char *buf = NULL;
+ int error = 0;
+
+ /*
+ * Greedily allocate a buffer big enough to handle the full
+ * range of basic blocks we'll be examining. If that fails,
+ * try a smaller size. We need to be able to read at least
+ * a log sector, or we're out of luck.
+ */
+ bufblks = 1 << ffs(nbblks);
+ while (bufblks > log->l_logBBsize)
+ bufblks >>= 1;
+ while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
+ bufblks >>= 1;
+ if (bufblks < log->l_sectBBsize)
+ return -ENOMEM;
+ }
+
+ for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
+ int bcount;
+
+ bcount = min(bufblks, (start_blk + nbblks - i));
+
+ error = xlog_bread(log, i, bcount, buffer, &buf);
+ if (error)
+ goto out;
+
+ for (j = 0; j < bcount; j++) {
+ cycle = xlog_get_cycle(buf);
+ if (cycle == stop_on_cycle_no) {
+ *new_blk = i+j;
+ goto out;
+ }
+
+ buf += BBSIZE;
+ }
+ }
+
+ *new_blk = -1;
+
+out:
+ kmem_free(buffer);
+ return error;
+}
+
+static inline int
+xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh)
+{
+ if (xfs_has_logv2(log->l_mp)) {
+ int h_size = be32_to_cpu(rh->h_size);
+
+ if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) &&
+ h_size > XLOG_HEADER_CYCLE_SIZE)
+ return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
+ }
+ return 1;
+}
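+
+/*
+ * Worked example (illustrative, assuming the usual 32k header cycle window):
+ * a v2 log record with an h_size of 256k carries
+ * DIV_ROUND_UP(262144, 32768) == 8 header blocks, while v1 logs and records
+ * no larger than XLOG_HEADER_CYCLE_SIZE always use a single header block.
+ */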
+
+/*
+ * Potentially backup over partial log record write.
+ *
+ * In the typical case, last_blk is the number of the block directly after
+ * a good log record. Therefore, we subtract one to get the block number
+ * of the last block in the given buffer. extra_bblks contains the number
+ * of blocks we would have read on a previous read. This happens when the
+ * last log record is split over the end of the physical log.
+ *
+ * extra_bblks is the number of blocks potentially verified on a previous
+ * call to this routine.
+ */
+STATIC int
+xlog_find_verify_log_record(
+ struct xlog *log,
+ xfs_daddr_t start_blk,
+ xfs_daddr_t *last_blk,
+ int extra_bblks)
+{
+ xfs_daddr_t i;
+ char *buffer;
+ char *offset = NULL;
+ xlog_rec_header_t *head = NULL;
+ int error = 0;
+ int smallmem = 0;
+ int num_blks = *last_blk - start_blk;
+ int xhdrs;
+
+ ASSERT(start_blk != 0 || *last_blk != start_blk);
+
+ buffer = xlog_alloc_buffer(log, num_blks);
+ if (!buffer) {
+ buffer = xlog_alloc_buffer(log, 1);
+ if (!buffer)
+ return -ENOMEM;
+ smallmem = 1;
+ } else {
+ error = xlog_bread(log, start_blk, num_blks, buffer, &offset);
+ if (error)
+ goto out;
+ offset += ((num_blks - 1) << BBSHIFT);
+ }
+
+ for (i = (*last_blk) - 1; i >= 0; i--) {
+ if (i < start_blk) {
+ /* valid log record not found */
+ xfs_warn(log->l_mp,
+ "Log inconsistent (didn't find previous header)");
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ if (smallmem) {
+ error = xlog_bread(log, i, 1, buffer, &offset);
+ if (error)
+ goto out;
+ }
+
+ head = (xlog_rec_header_t *)offset;
+
+ if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
+ break;
+
+ if (!smallmem)
+ offset -= BBSIZE;
+ }
+
+ /*
+ * We hit the beginning of the physical log & still no header. Return
+ * to caller. If caller can handle a return of 1, then this routine
+ * will be called again for the end of the physical log.
+ */
+ if (i == -1) {
+ error = 1;
+ goto out;
+ }
+
+ /*
+ * We have the final block of the good log (the first block
+ * of the log record _before_ the head), so we check the uuid.
+ */
+ if ((error = xlog_header_check_mount(log->l_mp, head)))
+ goto out;
+
+ /*
+ * We may have found a log record header before we expected one.
+ * last_blk will be the 1st block # with a given cycle #. We may end
+ * up reading an entire log record. In this case, we don't want to
+ * reset last_blk. Only when last_blk points in the middle of a log
+ * record do we update last_blk.
+ */
+ xhdrs = xlog_logrec_hblks(log, head);
+
+ if (*last_blk - i + extra_bblks !=
+ BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
+ *last_blk = i;
+
+out:
+ kmem_free(buffer);
+ return error;
+}
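+
+/*
+ * Worked example for the final check in xlog_find_verify_log_record(), using
+ * hypothetical numbers: if *last_blk is 120 and the header found at block 100
+ * describes h_len = 9728 bytes (19 basic blocks) plus one header block, then
+ * 120 - 100 + 0 == 19 + 1 and *last_blk is left alone; had *last_blk been
+ * 115, it would point into the middle of that record and be pulled back to
+ * block 100.
+ */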
+
+/*
+ * Head is defined to be the point of the log where the next log write
+ * could go. This means that incomplete LR writes at the end are
+ * eliminated when calculating the head. We aren't guaranteed that previous
+ * LR have complete transactions. We only know that a cycle number of
+ * current cycle number -1 won't be present in the log if we start writing
+ * from our current block number.
+ *
+ * last_blk contains the block number of the first block with a given
+ * cycle number.
+ *
+ * Return: zero if normal, non-zero if error.
+ */
+STATIC int
+xlog_find_head(
+ struct xlog *log,
+ xfs_daddr_t *return_head_blk)
+{
+ char *buffer;
+ char *offset;
+ xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
+ int num_scan_bblks;
+ uint first_half_cycle, last_half_cycle;
+ uint stop_on_cycle;
+ int error, log_bbnum = log->l_logBBsize;
+
+ /* Is the end of the log device zeroed? */
+ error = xlog_find_zeroed(log, &first_blk);
+ if (error < 0) {
+ xfs_warn(log->l_mp, "empty log check failed");
+ return error;
+ }
+ if (error == 1) {
+ *return_head_blk = first_blk;
+
+ /* Is the whole lot zeroed? */
+ if (!first_blk) {
+ /* Linux XFS shouldn't generate totally zeroed logs -
+ * mkfs etc write a dummy unmount record to a fresh
+ * log so we can store the uuid in there
+ */
+ xfs_warn(log->l_mp, "totally zeroed log");
+ }
+
+ return 0;
+ }
+
+ first_blk = 0; /* get cycle # of 1st block */
+ buffer = xlog_alloc_buffer(log, 1);
+ if (!buffer)
+ return -ENOMEM;
+
+ error = xlog_bread(log, 0, 1, buffer, &offset);
+ if (error)
+ goto out_free_buffer;
+
+ first_half_cycle = xlog_get_cycle(offset);
+
+ last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
+ error = xlog_bread(log, last_blk, 1, buffer, &offset);
+ if (error)
+ goto out_free_buffer;
+
+ last_half_cycle = xlog_get_cycle(offset);
+ ASSERT(last_half_cycle != 0);
+
+ /*
+ * If the 1st half cycle number is equal to the last half cycle number,
+ * then the entire log is stamped with the same cycle number. In this
+ * case, head_blk can't be set to zero (which makes sense). The below
+ * math doesn't work out properly with head_blk equal to zero. Instead,
+ * we set it to log_bbnum which is an invalid block number, but this
+ * value makes the math correct. If head_blk doesn't change through
+ * all the tests below, *head_blk is set to zero at the very end rather
+ * than log_bbnum. In a sense, log_bbnum and zero are the same block
+ * in a circular file.
+ */
+ if (first_half_cycle == last_half_cycle) {
+ /*
+ * In this case we believe that the entire log should have
+ * cycle number last_half_cycle. We need to scan backwards
+ * from the end verifying that there are no holes still
+ * containing last_half_cycle - 1. If we find such a hole,
+ * then the start of that hole will be the new head. The
+ * simple case looks like
+ * x | x ... | x - 1 | x
+ * Another case that fits this picture would be
+ * x | x + 1 | x ... | x
+ * In this case the head really is somewhere at the end of the
+ * log, as one of the latest writes at the beginning was
+ * incomplete.
+ * One more case is
+ * x | x + 1 | x ... | x - 1 | x
+ * This is really the combination of the above two cases, and
+ * the head has to end up at the start of the x-1 hole at the
+ * end of the log.
+ *
+ * In the 256k log case, we will read from the beginning to the
+ * end of the log and search for cycle numbers equal to x-1.
+ * We don't worry about the x+1 blocks that we encounter,
+ * because we know that they cannot be the head since the log
+ * started with x.
+ */
+ head_blk = log_bbnum;
+ stop_on_cycle = last_half_cycle - 1;
+ } else {
+ /*
+ * In this case we want to find the first block with cycle
+ * number matching last_half_cycle. We expect the log to be
+ * some variation on
+ * x + 1 ... | x ... | x
+ * The first block with cycle number x (last_half_cycle) will
+ * be where the new head belongs. First we do a binary search
+ * for the first occurrence of last_half_cycle. The binary
+ * search may not be totally accurate, so then we scan back
+ * from there looking for occurrences of last_half_cycle before
+ * us. If that backwards scan wraps around the beginning of
+ * the log, then we look for occurrences of last_half_cycle - 1
+ * at the end of the log. The cases we're looking for look
+ * like
+ * v binary search stopped here
+ * x + 1 ... | x | x + 1 | x ... | x
+ * ^ but we want to locate this spot
+ * or
+ * <---------> less than scan distance
+ * x + 1 ... | x ... | x - 1 | x
+ * ^ we want to locate this spot
+ */
+ stop_on_cycle = last_half_cycle;
+ error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk,
+ last_half_cycle);
+ if (error)
+ goto out_free_buffer;
+ }
+
+ /*
+ * Now validate the answer. Scan back some number of maximum possible
+ * blocks and make sure each one has the expected cycle number. The
+ * maximum is determined by the total possible amount of buffering
+ * in the in-core log. The following number can be made tighter if
+ * we actually look at the block size of the filesystem.
+ */
+ num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log));
+ if (head_blk >= num_scan_bblks) {
+ /*
+ * We are guaranteed that the entire check can be performed
+ * in one buffer.
+ */
+ start_blk = head_blk - num_scan_bblks;
+ if ((error = xlog_find_verify_cycle(log,
+ start_blk, num_scan_bblks,
+ stop_on_cycle, &new_blk)))
+ goto out_free_buffer;
+ if (new_blk != -1)
+ head_blk = new_blk;
+ } else { /* need to read 2 parts of log */
+ /*
+ * We are going to scan backwards in the log in two parts.
+ * First we scan the physical end of the log. In this part
+ * of the log, we are looking for blocks with cycle number
+ * last_half_cycle - 1.
+ * If we find one, then we know that the log starts there, as
+ * we've found a hole that didn't get written in going around
+ * the end of the physical log. The simple case for this is
+ * x + 1 ... | x ... | x - 1 | x
+ * <---------> less than scan distance
+ * If all of the blocks at the end of the log have cycle number
+ * last_half_cycle, then we check the blocks at the start of
+ * the log looking for occurrences of last_half_cycle. If we
+ * find one, then our current estimate for the location of the
+ * first occurrence of last_half_cycle is wrong and we move
+ * back to the hole we've found. This case looks like
+ * x + 1 ... | x | x + 1 | x ...
+ * ^ binary search stopped here
+ * Another case we need to handle that only occurs in 256k
+ * logs is
+ * x + 1 ... | x ... | x+1 | x ...
+ * ^ binary search stops here
+ * In a 256k log, the scan at the end of the log will see the
+ * x + 1 blocks. We need to skip past those since that is
+ * certainly not the head of the log. By searching for
+ * last_half_cycle-1 we accomplish that.
+ */
+ ASSERT(head_blk <= INT_MAX &&
+ (xfs_daddr_t) num_scan_bblks >= head_blk);
+ start_blk = log_bbnum - (num_scan_bblks - head_blk);
+ if ((error = xlog_find_verify_cycle(log, start_blk,
+ num_scan_bblks - (int)head_blk,
+ (stop_on_cycle - 1), &new_blk)))
+ goto out_free_buffer;
+ if (new_blk != -1) {
+ head_blk = new_blk;
+ goto validate_head;
+ }
+
+ /*
+ * Scan beginning of log now. The last part of the physical
+ * log is good. This scan needs to verify that it doesn't find
+ * the last_half_cycle.
+ */
+ start_blk = 0;
+ ASSERT(head_blk <= INT_MAX);
+ if ((error = xlog_find_verify_cycle(log,
+ start_blk, (int)head_blk,
+ stop_on_cycle, &new_blk)))
+ goto out_free_buffer;
+ if (new_blk != -1)
+ head_blk = new_blk;
+ }
+
+validate_head:
+ /*
+ * Now we need to make sure head_blk is not pointing to a block in
+ * the middle of a log record.
+ */
+ num_scan_bblks = XLOG_REC_SHIFT(log);
+ if (head_blk >= num_scan_bblks) {
+ start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
+
+ /* start ptr at last block ptr before head_blk */
+ error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
+ if (error == 1)
+ error = -EIO;
+ if (error)
+ goto out_free_buffer;
+ } else {
+ start_blk = 0;
+ ASSERT(head_blk <= INT_MAX);
+ error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
+ if (error < 0)
+ goto out_free_buffer;
+ if (error == 1) {
+ /* We hit the beginning of the log during our search */
+ start_blk = log_bbnum - (num_scan_bblks - head_blk);
+ new_blk = log_bbnum;
+ ASSERT(start_blk <= INT_MAX &&
+ (xfs_daddr_t) log_bbnum-start_blk >= 0);
+ ASSERT(head_blk <= INT_MAX);
+ error = xlog_find_verify_log_record(log, start_blk,
+ &new_blk, (int)head_blk);
+ if (error == 1)
+ error = -EIO;
+ if (error)
+ goto out_free_buffer;
+ if (new_blk != log_bbnum)
+ head_blk = new_blk;
+ } else if (error)
+ goto out_free_buffer;
+ }
+
+ kmem_free(buffer);
+ if (head_blk == log_bbnum)
+ *return_head_blk = 0;
+ else
+ *return_head_blk = head_blk;
+ /*
+ * When returning here, we have a good block number. A bad block
+ * would mean that during a previous crash we didn't have a clean
+ * break from cycle number N to cycle number N-1; in that case, we
+ * need to find the first block with cycle number N-1.
+ */
+ return 0;
+
+out_free_buffer:
+ kmem_free(buffer);
+ if (error)
+ xfs_warn(log->l_mp, "failed to find log head");
+ return error;
+}
+
+/*
+ * Seek backwards in the log for log record headers.
+ *
+ * Given a starting log block, walk backwards until we find the provided number
+ * of records or hit the provided tail block. The return value is the number of
+ * records encountered or a negative error code. The log block and buffer
+ * pointer of the last record seen are returned in rblk and rhead respectively.
+ */
+STATIC int
+xlog_rseek_logrec_hdr(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk,
+ int count,
+ char *buffer,
+ xfs_daddr_t *rblk,
+ struct xlog_rec_header **rhead,
+ bool *wrapped)
+{
+ int i;
+ int error;
+ int found = 0;
+ char *offset = NULL;
+ xfs_daddr_t end_blk;
+
+ *wrapped = false;
+
+ /*
+ * Walk backwards from the head block until we hit the tail or the first
+ * block in the log.
+ */
+ end_blk = head_blk > tail_blk ? tail_blk : 0;
+ for (i = (int) head_blk - 1; i >= end_blk; i--) {
+ error = xlog_bread(log, i, 1, buffer, &offset);
+ if (error)
+ goto out_error;
+
+ if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
+ }
+ }
+
+ /*
+ * If we haven't hit the tail block or the log record header count,
+ * start looking again from the end of the physical log. Note that
+ * callers can pass head == tail if the tail is not yet known.
+ */
+ if (tail_blk >= head_blk && found != count) {
+ for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
+ error = xlog_bread(log, i, 1, buffer, &offset);
+ if (error)
+ goto out_error;
+
+ if (*(__be32 *)offset ==
+ cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *wrapped = true;
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
+ }
+ }
+ }
+
+ return found;
+
+out_error:
+ return error;
+}
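+
+/*
+ * Worked example for xlog_rseek_logrec_hdr() with a hypothetical 1000-block
+ * log, head_blk = 10 and tail_blk = 900: the first loop scans blocks 9 down
+ * to 0; if the requested number of headers has not been found by then, the
+ * second loop resumes at block 999 and walks down to block 900, setting
+ * *wrapped on any header found there.
+ */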
+
+/*
+ * Seek forward in the log for log record headers.
+ *
+ * Given head and tail blocks, walk forward from the tail block until we find
+ * the provided number of records or hit the head block. The return value is the
+ * number of records encountered or a negative error code. The log block and
+ * buffer pointer of the last record seen are returned in rblk and rhead
+ * respectively.
+ */
+STATIC int
+xlog_seek_logrec_hdr(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk,
+ int count,
+ char *buffer,
+ xfs_daddr_t *rblk,
+ struct xlog_rec_header **rhead,
+ bool *wrapped)
+{
+ int i;
+ int error;
+ int found = 0;
+ char *offset = NULL;
+ xfs_daddr_t end_blk;
+
+ *wrapped = false;
+
+ /*
+ * Walk forward from the tail block until we hit the head or the last
+ * block in the log.
+ */
+ end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
+ for (i = (int) tail_blk; i <= end_blk; i++) {
+ error = xlog_bread(log, i, 1, buffer, &offset);
+ if (error)
+ goto out_error;
+
+ if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
+ }
+ }
+
+ /*
+ * If we haven't hit the head block or the log record header count,
+ * start looking again from the start of the physical log.
+ */
+ if (tail_blk > head_blk && found != count) {
+ for (i = 0; i < (int) head_blk; i++) {
+ error = xlog_bread(log, i, 1, buffer, &offset);
+ if (error)
+ goto out_error;
+
+ if (*(__be32 *)offset ==
+ cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *wrapped = true;
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
+ }
+ }
+ }
+
+ return found;
+
+out_error:
+ return error;
+}
+
+/*
+ * Calculate distance from head to tail (i.e., unused space in the log).
+ */
+static inline int
+xlog_tail_distance(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk)
+{
+ if (head_blk < tail_blk)
+ return tail_blk - head_blk;
+
+ return tail_blk + (log->l_logBBsize - head_blk);
+}
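+
+/*
+ * Worked example for xlog_tail_distance() in a hypothetical 1000-block log:
+ * head 200 / tail 700 gives 700 - 200 = 500 free blocks, and head 700 /
+ * tail 200 gives 200 + (1000 - 700) = 500 as well once the log wraps.
+ */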
+
+/*
+ * Verify the log tail. This is particularly important when torn or incomplete
+ * writes have been detected near the front of the log and the head has been
+ * walked back accordingly.
+ *
+ * We also have to handle the case where the tail was pinned and the head
+ * blocked behind the tail right before a crash. If the tail had been pushed
+ * immediately prior to the crash and the subsequent checkpoint was only
+ * partially written, it's possible it overwrote the last referenced tail in the
+ * log with garbage. This is not a coherency problem because the tail must have
+ * been pushed before it can be overwritten, but appears as log corruption to
+ * recovery because we have no way to know the tail was updated if the
+ * subsequent checkpoint didn't write successfully.
+ *
+ * Therefore, CRC check the log from tail to head. If a failure occurs and the
+ * offending record is within max iclog bufs from the head, walk the tail
+ * forward and retry until a valid tail is found or corruption is detected out
+ * of the range of a possible overwrite.
+ */
+STATIC int
+xlog_verify_tail(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t *tail_blk,
+ int hsize)
+{
+ struct xlog_rec_header *thead;
+ char *buffer;
+ xfs_daddr_t first_bad;
+ int error = 0;
+ bool wrapped;
+ xfs_daddr_t tmp_tail;
+ xfs_daddr_t orig_tail = *tail_blk;
+
+ buffer = xlog_alloc_buffer(log, 1);
+ if (!buffer)
+ return -ENOMEM;
+
+ /*
+ * Make sure the tail points to a record (returns positive count on
+ * success).
+ */
+ error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer,
+ &tmp_tail, &thead, &wrapped);
+ if (error < 0)
+ goto out;
+ if (*tail_blk != tmp_tail)
+ *tail_blk = tmp_tail;
+
+ /*
+ * Run a CRC check from the tail to the head. We can't just check
+ * MAX_ICLOGS records past the tail because the tail may point to stale
+ * blocks cleared during the search for the head/tail. These blocks are
+ * overwritten with zero-length records and thus record count is not a
+ * reliable indicator of the iclog state before a crash.
+ */
+ first_bad = 0;
+ error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
+ while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
+ int tail_distance;
+
+ /*
+ * Is corruption within range of the head? If so, retry from
+ * the next record. Otherwise return an error.
+ */
+ tail_distance = xlog_tail_distance(log, head_blk, first_bad);
+ if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
+ break;
+
+ /* skip to the next record; returns positive count on success */
+ error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2,
+ buffer, &tmp_tail, &thead, &wrapped);
+ if (error < 0)
+ goto out;
+
+ *tail_blk = tmp_tail;
+ first_bad = 0;
+ error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
+ }
+
+ if (!error && *tail_blk != orig_tail)
+ xfs_warn(log->l_mp,
+ "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
+ orig_tail, *tail_blk);
+out:
+ kmem_free(buffer);
+ return error;
+}
+
+/*
+ * Detect and trim torn writes from the head of the log.
+ *
+ * Storage without sector atomicity guarantees can result in torn writes in the
+ * log in the event of a crash. Our only means to detect this scenario is via
+ * CRC verification. While we can't always be certain that CRC verification
+ * failure is due to a torn write vs. an unrelated corruption, we do know that
+ * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
+ * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
+ * the log and treat failures in this range as torn writes as a matter of
+ * policy. In the event of CRC failure, the head is walked back to the last good
+ * record in the log and the tail is updated from that record and verified.
+ */
+STATIC int
+xlog_verify_head(
+ struct xlog *log,
+ xfs_daddr_t *head_blk, /* in/out: unverified head */
+ xfs_daddr_t *tail_blk, /* out: tail block */
+ char *buffer,
+ xfs_daddr_t *rhead_blk, /* start blk of last record */
+ struct xlog_rec_header **rhead, /* ptr to last record */
+ bool *wrapped) /* last rec. wraps phys. log */
+{
+ struct xlog_rec_header *tmp_rhead;
+ char *tmp_buffer;
+ xfs_daddr_t first_bad;
+ xfs_daddr_t tmp_rhead_blk;
+ int found;
+ int error;
+ bool tmp_wrapped;
+
+ /*
+ * Check the head of the log for torn writes. Search backwards from the
+ * head until we hit the tail or the maximum number of log record I/Os
+ * that could have been in flight at one time. Use a temporary buffer so
+ * we don't trash the rhead/buffer pointers from the caller.
+ */
+ tmp_buffer = xlog_alloc_buffer(log, 1);
+ if (!tmp_buffer)
+ return -ENOMEM;
+ error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
+ XLOG_MAX_ICLOGS, tmp_buffer,
+ &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
+ kmem_free(tmp_buffer);
+ if (error < 0)
+ return error;
+
+ /*
+ * Now run a CRC verification pass over the records starting at the
+ * block found above to the current head. If a CRC failure occurs, the
+ * log block of the first bad record is saved in first_bad.
+ */
+ error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
+ if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
+ /*
+ * We've hit a potential torn write. Reset the error and warn
+ * about it.
+ */
+ error = 0;
+ xfs_warn(log->l_mp,
+"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
+ first_bad, *head_blk);
+
+ /*
+ * Get the header block and buffer pointer for the last good
+ * record before the bad record.
+ *
+ * Note that xlog_find_tail() clears the blocks at the new head
+ * (i.e., the records with invalid CRC) if the cycle number
+ * matches the current cycle.
+ */
+ found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
+ buffer, rhead_blk, rhead, wrapped);
+ if (found < 0)
+ return found;
+ if (found == 0) /* XXX: right thing to do here? */
+ return -EIO;
+
+ /*
+ * Reset the head block to the starting block of the first bad
+ * log record and set the tail block based on the last good
+ * record.
+ *
+ * Bail out if the updated head/tail match as this indicates
+ * possible corruption outside of the acceptable
+ * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
+ */
+ *head_blk = first_bad;
+ *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
+ if (*head_blk == *tail_blk) {
+ ASSERT(0);
+ return 0;
+ }
+ }
+ if (error)
+ return error;
+
+ return xlog_verify_tail(log, *head_blk, tail_blk,
+ be32_to_cpu((*rhead)->h_size));
+}
+
+/*
+ * We need to make sure we handle log wrapping properly, so we can't use the
+ * calculated logbno directly. Make sure it wraps to the correct bno inside the
+ * log.
+ *
+ * The log is limited to 32 bit sizes, so we use the appropriate modulus
+ * operation here and cast it back to a 64 bit daddr on return.
+ */
+static inline xfs_daddr_t
+xlog_wrap_logbno(
+ struct xlog *log,
+ xfs_daddr_t bno)
+{
+ int mod;
+
+ div_s64_rem(bno, log->l_logBBsize, &mod);
+ return mod;
+}
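+
+/*
+ * Worked example for xlog_wrap_logbno() with a hypothetical 32768-block log:
+ * a computed block number of 32770 wraps to 2, while 100 is returned
+ * unchanged since it already lies inside the log.
+ */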
+
+/*
+ * Check whether the head of the log points to an unmount record. In other
+ * words, determine whether the log is clean. If so, update the in-core state
+ * appropriately.
+ */
+static int
+xlog_check_unmount_rec(
+ struct xlog *log,
+ xfs_daddr_t *head_blk,
+ xfs_daddr_t *tail_blk,
+ struct xlog_rec_header *rhead,
+ xfs_daddr_t rhead_blk,
+ char *buffer,
+ bool *clean)
+{
+ struct xlog_op_header *op_head;
+ xfs_daddr_t umount_data_blk;
+ xfs_daddr_t after_umount_blk;
+ int hblks;
+ int error;
+ char *offset;
+
+ *clean = false;
+
+ /*
+ * Look for unmount record. If we find it, then we know there was a
+ * clean unmount. Since the block after the unmount record could wrap
+ * past the end of the physical log, we convert it back to a log block
+ * before comparing to the head_blk.
+ *
+ * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
+ * below. We won't want to clear the unmount record if there is one, so
+ * we pass the lsn of the unmount record rather than the block after it.
+ */
+ hblks = xlog_logrec_hblks(log, rhead);
+ after_umount_blk = xlog_wrap_logbno(log,
+ rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
+
+ if (*head_blk == after_umount_blk &&
+ be32_to_cpu(rhead->h_num_logops) == 1) {
+ umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks);
+ error = xlog_bread(log, umount_data_blk, 1, buffer, &offset);
+ if (error)
+ return error;
+
+ op_head = (struct xlog_op_header *)offset;
+ if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
+ /*
+ * Set tail and last sync so that newly written log
+ * records will point recovery to after the current
+ * unmount record.
+ */
+ xlog_assign_atomic_lsn(&log->l_tail_lsn,
+ log->l_curr_cycle, after_umount_blk);
+ xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
+ log->l_curr_cycle, after_umount_blk);
+ *tail_blk = after_umount_blk;
+
+ *clean = true;
+ }
+ }
+
+ return 0;
+}
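+
+/*
+ * Worked example for xlog_check_unmount_rec() with hypothetical numbers:
+ * rhead_blk = 50, one header block and h_len = 512 bytes give
+ * after_umount_blk = 52 and umount_data_blk = 51. The log is considered
+ * clean only if *head_blk is 52, the record holds exactly one log operation
+ * and that op header carries XLOG_UNMOUNT_TRANS.
+ */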
+
+static void
+xlog_set_state(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ struct xlog_rec_header *rhead,
+ xfs_daddr_t rhead_blk,
+ bool bump_cycle)
+{
+ /*
+ * Reset log values according to the state of the log when we
+ * crashed. In the case where head_blk == 0, we bump curr_cycle
+ * by one because the next write starts a new cycle rather than
+ * continuing the cycle of the last good log record. At this
+ * point we have guaranteed that all partial log records have been
+ * accounted for. Therefore, we know that the last good log record
+ * written was complete and ended exactly on the end boundary
+ * of the physical log.
+ */
+ log->l_prev_block = rhead_blk;
+ log->l_curr_block = (int)head_blk;
+ log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
+ if (bump_cycle)
+ log->l_curr_cycle++;
+ atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
+ atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
+ xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
+ BBTOB(log->l_curr_block));
+ xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
+ BBTOB(log->l_curr_block));
+}
+
+/*
+ * Find the sync block number or the tail of the log.
+ *
+ * This will be the block number of the last record to have its
+ * associated buffers synced to disk. Every log record header has
+ * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
+ * to get a sync block number. The only concern is to figure out which
+ * log record header to believe.
+ *
+ * The following algorithm uses the log record header with the largest
+ * lsn. The entire log record does not need to be valid. We only care
+ * that the header is valid.
+ *
+ * We could speed up search by using current head_blk buffer, but it is not
+ * available.
+ */
+STATIC int
+xlog_find_tail(
+ struct xlog *log,
+ xfs_daddr_t *head_blk,
+ xfs_daddr_t *tail_blk)
+{
+ xlog_rec_header_t *rhead;
+ char *offset = NULL;
+ char *buffer;
+ int error;
+ xfs_daddr_t rhead_blk;
+ xfs_lsn_t tail_lsn;
+ bool wrapped = false;
+ bool clean = false;
+
+ /*
+ * Find previous log record
+ */
+ if ((error = xlog_find_head(log, head_blk)))
+ return error;
+ ASSERT(*head_blk < INT_MAX);
+
+ buffer = xlog_alloc_buffer(log, 1);
+ if (!buffer)
+ return -ENOMEM;
+ if (*head_blk == 0) { /* special case */
+ error = xlog_bread(log, 0, 1, buffer, &offset);
+ if (error)
+ goto done;
+
+ if (xlog_get_cycle(offset) == 0) {
+ *tail_blk = 0;
+ /* leave all other log inited values alone */
+ goto done;
+ }
+ }
+
+ /*
+ * Search backwards through the log looking for the log record header
+ * block. This wraps all the way back around to the head so something is
+ * seriously wrong if we can't find it.
+ */
+ error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
+ &rhead_blk, &rhead, &wrapped);
+ if (error < 0)
+ goto done;
+ if (!error) {
+ xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
+
+ /*
+ * Set the log state based on the current head record.
+ */
+ xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
+ tail_lsn = atomic64_read(&log->l_tail_lsn);
+
+ /*
+ * Look for an unmount record at the head of the log. This sets the log
+ * state to determine whether recovery is necessary.
+ */
+ error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
+ rhead_blk, buffer, &clean);
+ if (error)
+ goto done;
+
+ /*
+ * Verify the log head if the log is not clean (e.g., we have anything
+ * but an unmount record at the head). This uses CRC verification to
+ * detect and trim torn writes. If discovered, CRC failures are
+ * considered torn writes and the log head is trimmed accordingly.
+ *
+ * Note that we can only run CRC verification when the log is dirty
+ * because there's no guarantee that the log data behind an unmount
+ * record is compatible with the current architecture.
+ */
+ if (!clean) {
+ xfs_daddr_t orig_head = *head_blk;
+
+ error = xlog_verify_head(log, head_blk, tail_blk, buffer,
+ &rhead_blk, &rhead, &wrapped);
+ if (error)
+ goto done;
+
+ /* update in-core state again if the head changed */
+ if (*head_blk != orig_head) {
+ xlog_set_state(log, *head_blk, rhead, rhead_blk,
+ wrapped);
+ tail_lsn = atomic64_read(&log->l_tail_lsn);
+ error = xlog_check_unmount_rec(log, head_blk, tail_blk,
+ rhead, rhead_blk, buffer,
+ &clean);
+ if (error)
+ goto done;
+ }
+ }
+
+ /*
+ * Note that the unmount was clean. If the unmount was not clean, we
+ * need to know this to rebuild the superblock counters from the perag
+ * headers if we have a filesystem using non-persistent counters.
+ */
+ if (clean)
+ set_bit(XFS_OPSTATE_CLEAN, &log->l_mp->m_opstate);
+
+ /*
+ * Make sure that there are no blocks in front of the head
+ * with the same cycle number as the head. This can happen
+ * because we allow multiple outstanding log writes concurrently,
+ * and the later writes might make it out before earlier ones.
+ *
+ * We use the lsn from before modifying it so that we'll never
+ * overwrite the unmount record after a clean unmount.
+ *
+ * Do this only if we are going to recover the filesystem
+ *
+ * NOTE: This used to say "if (!readonly)"
+ * However on Linux, we can & do recover a read-only filesystem.
+ * We only skip recovery if NORECOVERY is specified on mount,
+ * in which case we would not be here.
+ *
+ * But... if the -device- itself is readonly, just skip this.
+ * We can't recover this device anyway, so it won't matter.
+ */
+ if (!xfs_readonly_buftarg(log->l_targ))
+ error = xlog_clear_stale_blocks(log, tail_lsn);
+
+done:
+ kmem_free(buffer);
+
+ if (error)
+ xfs_warn(log->l_mp, "failed to locate log tail");
+ return error;
+}
+
+/*
+ * Is the log zeroed at all?
+ *
+ * The last binary search should be changed to perform an X block read
+ * once X becomes small enough. You can then search linearly through
+ * the X blocks. This will cut down on the number of reads we need to do.
+ *
+ * If the log is partially zeroed, this routine will pass back the blkno
+ * of the first block with cycle number 0. It won't have a complete LR
+ * preceding it.
+ *
+ * Return:
+ * 0 => the log is completely written to
+ * 1 => use *blk_no as the first block of the log
+ * <0 => error has occurred
+ */
+STATIC int
+xlog_find_zeroed(
+ struct xlog *log,
+ xfs_daddr_t *blk_no)
+{
+ char *buffer;
+ char *offset;
+ uint first_cycle, last_cycle;
+ xfs_daddr_t new_blk, last_blk, start_blk;
+ xfs_daddr_t num_scan_bblks;
+ int error, log_bbnum = log->l_logBBsize;
+
+ *blk_no = 0;
+
+ /* check totally zeroed log */
+ buffer = xlog_alloc_buffer(log, 1);
+ if (!buffer)
+ return -ENOMEM;
+ error = xlog_bread(log, 0, 1, buffer, &offset);
+ if (error)
+ goto out_free_buffer;
+
+ first_cycle = xlog_get_cycle(offset);
+ if (first_cycle == 0) { /* completely zeroed log */
+ *blk_no = 0;
+ kmem_free(buffer);
+ return 1;
+ }
+
+ /* check partially zeroed log */
+ error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset);
+ if (error)
+ goto out_free_buffer;
+
+ last_cycle = xlog_get_cycle(offset);
+ if (last_cycle != 0) { /* log completely written to */
+ kmem_free(buffer);
+ return 0;
+ }
+
+ /* we have a partially zeroed log */
+ last_blk = log_bbnum-1;
+ error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0);
+ if (error)
+ goto out_free_buffer;
+
+ /*
+ * Validate the answer. Because there is no way to guarantee that
+ * the entire log is made up of log records which are the same size,
+ * we scan over the defined maximum blocks. At this point, the maximum
+ * is not chosen to mean anything special. XXXmiken
+ */
+ num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
+ ASSERT(num_scan_bblks <= INT_MAX);
+
+ if (last_blk < num_scan_bblks)
+ num_scan_bblks = last_blk;
+ start_blk = last_blk - num_scan_bblks;
+
+ /*
+ * We search for any instances of cycle number 0 that occur before
+ * our current estimate of the head. What we're trying to detect is
+ * 1 ... | 0 | 1 | 0...
+ * ^ binary search ends here
+ */
+ if ((error = xlog_find_verify_cycle(log, start_blk,
+ (int)num_scan_bblks, 0, &new_blk)))
+ goto out_free_buffer;
+ if (new_blk != -1)
+ last_blk = new_blk;
+
+ /*
+ * Potentially back up over a partial log record write. We don't need
+ * to search the end of the log because we know it is zero.
+ */
+ error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
+ if (error == 1)
+ error = -EIO;
+ if (error)
+ goto out_free_buffer;
+
+ *blk_no = last_blk;
+out_free_buffer:
+ kmem_free(buffer);
+ if (error)
+ return error;
+ return 1;
+}
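+
+/*
+ * Worked example for xlog_find_zeroed() with a hypothetical 1000-block log
+ * where blocks 0-599 carry cycle 1 and blocks 600-999 are still zero: block 0
+ * is non-zero, the last block is zero, so the cycle-start search converges on
+ * block 600; assuming that block does not sit in the middle of a log record,
+ * the routine returns 1 with *blk_no = 600.
+ */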
+
+/*
+ * These are simple subroutines used by xlog_clear_stale_blocks() below
+ * to initialize a buffer full of empty log record headers and write
+ * them into the log.
+ */
+STATIC void
+xlog_add_record(
+ struct xlog *log,
+ char *buf,
+ int cycle,
+ int block,
+ int tail_cycle,
+ int tail_block)
+{
+ xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
+
+ memset(buf, 0, BBSIZE);
+ recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
+ recp->h_cycle = cpu_to_be32(cycle);
+ recp->h_version = cpu_to_be32(
+ xfs_has_logv2(log->l_mp) ? 2 : 1);
+ recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
+ recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
+ recp->h_fmt = cpu_to_be32(XLOG_FMT);
+ memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
+}
+
+STATIC int
+xlog_write_log_records(
+ struct xlog *log,
+ int cycle,
+ int start_block,
+ int blocks,
+ int tail_cycle,
+ int tail_block)
+{
+ char *offset;
+ char *buffer;
+ int balign, ealign;
+ int sectbb = log->l_sectBBsize;
+ int end_block = start_block + blocks;
+ int bufblks;
+ int error = 0;
+ int i, j = 0;
+
+ /*
+ * Greedily allocate a buffer big enough to handle the full
+ * range of basic blocks to be written. If that fails, try
+ * a smaller size. We need to be able to write at least a
+ * log sector, or we're out of luck.
+ */
+ bufblks = 1 << ffs(blocks);
+ while (bufblks > log->l_logBBsize)
+ bufblks >>= 1;
+ while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
+ bufblks >>= 1;
+ if (bufblks < sectbb)
+ return -ENOMEM;
+ }
+
+ /* We may need to do a read at the start to fill in part of
+ * the buffer in the starting sector not covered by the first
+ * write below.
+ */
+ balign = round_down(start_block, sectbb);
+ if (balign != start_block) {
+ error = xlog_bread_noalign(log, start_block, 1, buffer);
+ if (error)
+ goto out_free_buffer;
+
+ j = start_block - balign;
+ }
+
+ for (i = start_block; i < end_block; i += bufblks) {
+ int bcount, endcount;
+
+ bcount = min(bufblks, end_block - start_block);
+ endcount = bcount - j;
+
+ /* We may need to do a read at the end to fill in part of
+ * the buffer in the final sector not covered by the write.
+ * If this is the same sector as the above read, skip it.
+ */
+ ealign = round_down(end_block, sectbb);
+ if (j == 0 && (start_block + endcount > ealign)) {
+ error = xlog_bread_noalign(log, ealign, sectbb,
+ buffer + BBTOB(ealign - start_block));
+ if (error)
+ break;
+
+ }
+
+ offset = buffer + xlog_align(log, start_block);
+ for (; j < endcount; j++) {
+ xlog_add_record(log, offset, cycle, i+j,
+ tail_cycle, tail_block);
+ offset += BBSIZE;
+ }
+ error = xlog_bwrite(log, start_block, endcount, buffer);
+ if (error)
+ break;
+ start_block += endcount;
+ j = 0;
+ }
+
+out_free_buffer:
+ kmem_free(buffer);
+ return error;
+}
+
+/*
+ * This routine is called to blow away any incomplete log writes out
+ * in front of the log head. We do this so that we won't become confused
+ * if we come up, write only a little bit more, and then crash again.
+ * If we leave the partial log records out there, this situation could
+ * cause us to think those partial writes are valid blocks since they
+ * have the current cycle number. We get rid of them by overwriting them
+ * with empty log records with the old cycle number rather than the
+ * current one.
+ *
+ * The tail lsn is passed in rather than taken from
+ * the log so that we will not write over the unmount record after a
+ * clean unmount in a 512 block log. Doing so would leave the log without
+ * any valid log records in it until a new one was written. If we crashed
+ * during that time we would not be able to recover.
+ */
+STATIC int
+xlog_clear_stale_blocks(
+ struct xlog *log,
+ xfs_lsn_t tail_lsn)
+{
+ int tail_cycle, head_cycle;
+ int tail_block, head_block;
+ int tail_distance, max_distance;
+ int distance;
+ int error;
+
+ tail_cycle = CYCLE_LSN(tail_lsn);
+ tail_block = BLOCK_LSN(tail_lsn);
+ head_cycle = log->l_curr_cycle;
+ head_block = log->l_curr_block;
+
+ /*
+ * Figure out the distance between the new head of the log
+ * and the tail. We want to write over any blocks beyond the
+ * head that we may have written just before the crash, but
+ * we don't want to overwrite the tail of the log.
+ */
+ if (head_cycle == tail_cycle) {
+ /*
+ * The tail is behind the head in the physical log,
+ * so the distance from the head to the tail is the
+ * distance from the head to the end of the log plus
+ * the distance from the beginning of the log to the
+ * tail.
+ */
+ if (XFS_IS_CORRUPT(log->l_mp,
+ head_block < tail_block ||
+ head_block >= log->l_logBBsize))
+ return -EFSCORRUPTED;
+ tail_distance = tail_block + (log->l_logBBsize - head_block);
+ } else {
+ /*
+ * The head is behind the tail in the physical log,
+ * so the distance from the head to the tail is just
+ * the tail block minus the head block.
+ */
+ if (XFS_IS_CORRUPT(log->l_mp,
+ head_block >= tail_block ||
+ head_cycle != tail_cycle + 1))
+ return -EFSCORRUPTED;
+ tail_distance = tail_block - head_block;
+ }
+
+ /*
+ * If the head is right up against the tail, we can't clear
+ * anything.
+ */
+ if (tail_distance <= 0) {
+ ASSERT(tail_distance == 0);
+ return 0;
+ }
+
+ max_distance = XLOG_TOTAL_REC_SHIFT(log);
+ /*
+ * Take the smaller of the maximum amount of outstanding I/O
+ * we could have and the distance to the tail to clear out.
+ * We take the smaller so that we don't overwrite the tail and
+ * we don't waste all day writing from the head to the tail
+ * for no reason.
+ */
+ max_distance = min(max_distance, tail_distance);
+
+ if ((head_block + max_distance) <= log->l_logBBsize) {
+ /*
+ * We can stomp all the blocks we need to without
+ * wrapping around the end of the log. Just do it
+ * in a single write. Use the cycle number of the
+ * current cycle minus one so that the log will look like:
+ * n ... | n - 1 ...
+ */
+ error = xlog_write_log_records(log, (head_cycle - 1),
+ head_block, max_distance, tail_cycle,
+ tail_block);
+ if (error)
+ return error;
+ } else {
+ /*
+ * We need to wrap around the end of the physical log in
+ * order to clear all the blocks. Do it in two separate
+ * I/Os. The first write should be from the head to the
+ * end of the physical log, and it should use the current
+ * cycle number minus one just like above.
+ */
+ distance = log->l_logBBsize - head_block;
+ error = xlog_write_log_records(log, (head_cycle - 1),
+ head_block, distance, tail_cycle,
+ tail_block);
+
+ if (error)
+ return error;
+
+ /*
+ * Now write the blocks at the start of the physical log.
+ * This writes the remainder of the blocks we want to clear.
+ * It uses the current cycle number since we're now on the
+ * same cycle as the head so that we get:
+ * n ... n ... | n - 1 ...
+ * ^^^^^ blocks we're writing
+ */
+ distance = max_distance - (log->l_logBBsize - head_block);
+ error = xlog_write_log_records(log, head_cycle, 0, distance,
+ tail_cycle, tail_block);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
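+
+/*
+ * Worked example for the wrap case in xlog_clear_stale_blocks(), using a
+ * hypothetical 1000-block log with head block 900 and tail block 100 on the
+ * same cycle n: tail_distance is 100 + (1000 - 900) = 200, and assuming
+ * max_distance is not clamped any lower, 900 + 200 exceeds the log size, so
+ * the first write stamps blocks 900-999 with cycle n - 1 and the second
+ * stamps blocks 0-99 with cycle n.
+ */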
+
+/*
+ * Release the recovered intent item in the AIL that matches the given intent
+ * type and intent id.
+ */
+void
+xlog_recover_release_intent(
+ struct xlog *log,
+ unsigned short intent_type,
+ uint64_t intent_id)
+{
+ struct xfs_ail_cursor cur;
+ struct xfs_log_item *lip;
+ struct xfs_ail *ailp = log->l_ailp;
+
+ spin_lock(&ailp->ail_lock);
+ for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL;
+ lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
+ if (lip->li_type != intent_type)
+ continue;
+ if (!lip->li_ops->iop_match(lip, intent_id))
+ continue;
+
+ spin_unlock(&ailp->ail_lock);
+ lip->li_ops->iop_release(lip);
+ spin_lock(&ailp->ail_lock);
+ break;
+ }
+
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->ail_lock);
+}
+
+int
+xlog_recover_iget(
+ struct xfs_mount *mp,
+ xfs_ino_t ino,
+ struct xfs_inode **ipp)
+{
+ int error;
+
+ error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
+ if (error)
+ return error;
+
+ error = xfs_qm_dqattach(*ipp);
+ if (error) {
+ xfs_irele(*ipp);
+ return error;
+ }
+
+ if (VFS_I(*ipp)->i_nlink == 0)
+ xfs_iflags_set(*ipp, XFS_IRECOVERY);
+
+ return 0;
+}
+
+/******************************************************************************
+ *
+ * Log recover routines
+ *
+ ******************************************************************************
+ */
+static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
+ &xlog_buf_item_ops,
+ &xlog_inode_item_ops,
+ &xlog_dquot_item_ops,
+ &xlog_quotaoff_item_ops,
+ &xlog_icreate_item_ops,
+ &xlog_efi_item_ops,
+ &xlog_efd_item_ops,
+ &xlog_rui_item_ops,
+ &xlog_rud_item_ops,
+ &xlog_cui_item_ops,
+ &xlog_cud_item_ops,
+ &xlog_bui_item_ops,
+ &xlog_bud_item_ops,
+ &xlog_attri_item_ops,
+ &xlog_attrd_item_ops,
+};
+
+static const struct xlog_recover_item_ops *
+xlog_find_item_ops(
+ struct xlog_recover_item *item)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++)
+ if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type)
+ return xlog_recover_item_ops[i];
+
+ return NULL;
+}
+
+/*
+ * Sort the log items in the transaction.
+ *
+ * The ordering constraints are defined by the inode allocation and unlink
+ * behaviour. The rules are:
+ *
+ * 1. Every item is only logged once in a given transaction. Hence it
+ * represents the last logged state of the item. Hence ordering is
+ * dependent on the order in which operations need to be performed so
+ * required initial conditions are always met.
+ *
+ * 2. Cancelled buffers are recorded in pass 1 in a separate table and
+ * there's nothing to replay from them so we can simply cull them
+ * from the transaction. However, we can't do that until after we've
+ * replayed all the other items because they may be dependent on the
+ * cancelled buffer and replaying the cancelled buffer can remove it
+ * from the cancelled buffer table. Hence they have to be done last.
+ *
+ * 3. Inode allocation buffers must be replayed before inode items that
+ * read the buffer and replay changes into it. For filesystems using the
+ * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
+ * treated the same as inode allocation buffers as they create and
+ * initialise the buffers directly.
+ *
+ * 4. Inode unlink buffers must be replayed after inode items are replayed.
+ * This ensures that inodes are completely flushed to the inode buffer
+ * in a "free" state before we remove the unlinked inode list pointer.
+ *
+ * Hence the ordering needs to be inode allocation buffers first, inode items
+ * second, inode unlink buffers third and cancelled buffers last.
+ *
+ * But there's a problem with that - we can't tell an inode allocation buffer
+ * apart from a regular buffer, so we can't separate them. We can, however,
+ * tell an inode unlink buffer from the others, and so we can separate them out
+ * from all the other buffers and move them to last.
+ *
+ * Hence, 4 lists, in order from head to tail:
+ * - buffer_list for all buffers except cancelled/inode unlink buffers
+ * - item_list for all non-buffer items
+ * - inode_buffer_list for inode unlink buffers
+ * - cancel_list for the cancelled buffers
+ *
+ * Note that we add objects to the tail of the lists so that first-to-last
+ * ordering is preserved within the lists. Adding objects to the head of the
+ * list means when we traverse from the head we walk them in last-to-first
+ * order. For cancelled buffers and inode unlink buffers this doesn't matter,
+ * but for all other items there may be specific ordering that we need to
+ * preserve.
+ */
+STATIC int
+xlog_recover_reorder_trans(
+ struct xlog *log,
+ struct xlog_recover *trans,
+ int pass)
+{
+ struct xlog_recover_item *item, *n;
+ int error = 0;
+ LIST_HEAD(sort_list);
+ LIST_HEAD(cancel_list);
+ LIST_HEAD(buffer_list);
+ LIST_HEAD(inode_buffer_list);
+ LIST_HEAD(item_list);
+
+ list_splice_init(&trans->r_itemq, &sort_list);
+ list_for_each_entry_safe(item, n, &sort_list, ri_list) {
+ enum xlog_recover_reorder fate = XLOG_REORDER_ITEM_LIST;
+
+ item->ri_ops = xlog_find_item_ops(item);
+ if (!item->ri_ops) {
+ xfs_warn(log->l_mp,
+ "%s: unrecognized type of log operation (%d)",
+ __func__, ITEM_TYPE(item));
+ ASSERT(0);
+ /*
+ * Return the remaining items to the transaction item
+ * list so they can be freed in the caller.
+ */
+ if (!list_empty(&sort_list))
+ list_splice_init(&sort_list, &trans->r_itemq);
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ if (item->ri_ops->reorder)
+ fate = item->ri_ops->reorder(item);
+
+ switch (fate) {
+ case XLOG_REORDER_BUFFER_LIST:
+ list_move_tail(&item->ri_list, &buffer_list);
+ break;
+ case XLOG_REORDER_CANCEL_LIST:
+ trace_xfs_log_recover_item_reorder_head(log,
+ trans, item, pass);
+ list_move(&item->ri_list, &cancel_list);
+ break;
+ case XLOG_REORDER_INODE_BUFFER_LIST:
+ list_move(&item->ri_list, &inode_buffer_list);
+ break;
+ case XLOG_REORDER_ITEM_LIST:
+ trace_xfs_log_recover_item_reorder_tail(log,
+ trans, item, pass);
+ list_move_tail(&item->ri_list, &item_list);
+ break;
+ }
+ }
+
+ ASSERT(list_empty(&sort_list));
+ if (!list_empty(&buffer_list))
+ list_splice(&buffer_list, &trans->r_itemq);
+ if (!list_empty(&item_list))
+ list_splice_tail(&item_list, &trans->r_itemq);
+ if (!list_empty(&inode_buffer_list))
+ list_splice_tail(&inode_buffer_list, &trans->r_itemq);
+ if (!list_empty(&cancel_list))
+ list_splice_tail(&cancel_list, &trans->r_itemq);
+ return error;
+}
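+
+/*
+ * Hypothetical example of the reordering above: a transaction containing a
+ * regular buffer B, an inode item I, an inode unlink buffer U and a cancelled
+ * buffer C is re-queued as B, I, U, C, so allocation buffers replay before
+ * inode items, unlink buffers after them and cancelled buffers last of all.
+ */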
+
+void
+xlog_buf_readahead(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len,
+ const struct xfs_buf_ops *ops)
+{
+ if (!xlog_is_buffer_cancelled(log, blkno, len))
+ xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops);
+}
+
+STATIC int
+xlog_recover_items_pass2(
+ struct xlog *log,
+ struct xlog_recover *trans,
+ struct list_head *buffer_list,
+ struct list_head *item_list)
+{
+ struct xlog_recover_item *item;
+ int error = 0;
+
+ list_for_each_entry(item, item_list, ri_list) {
+ trace_xfs_log_recover_item_recover(log, trans, item,
+ XLOG_RECOVER_PASS2);
+
+ if (item->ri_ops->commit_pass2)
+ error = item->ri_ops->commit_pass2(log, buffer_list,
+ item, trans->r_lsn);
+ if (error)
+ return error;
+ }
+
+ return error;
+}
+
+/*
+ * Perform the transaction.
+ *
+ * If the transaction modifies a buffer or inode, do it now. Otherwise,
+ * EFIs and EFDs get queued up by adding entries into the AIL for them.
+ */
+STATIC int
+xlog_recover_commit_trans(
+ struct xlog *log,
+ struct xlog_recover *trans,
+ int pass,
+ struct list_head *buffer_list)
+{
+ int error = 0;
+ int items_queued = 0;
+ struct xlog_recover_item *item;
+ struct xlog_recover_item *next;
+ LIST_HEAD (ra_list);
+ LIST_HEAD (done_list);
+
+ #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
+
+ hlist_del_init(&trans->r_list);
+
+ error = xlog_recover_reorder_trans(log, trans, pass);
+ if (error)
+ return error;
+
+ list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
+ trace_xfs_log_recover_item_recover(log, trans, item, pass);
+
+ switch (pass) {
+ case XLOG_RECOVER_PASS1:
+ if (item->ri_ops->commit_pass1)
+ error = item->ri_ops->commit_pass1(log, item);
+ break;
+ case XLOG_RECOVER_PASS2:
+ if (item->ri_ops->ra_pass2)
+ item->ri_ops->ra_pass2(log, item);
+ list_move_tail(&item->ri_list, &ra_list);
+ items_queued++;
+ if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
+ error = xlog_recover_items_pass2(log, trans,
+ buffer_list, &ra_list);
+ list_splice_tail_init(&ra_list, &done_list);
+ items_queued = 0;
+ }
+
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ if (error)
+ goto out;
+ }
+
+out:
+ if (!list_empty(&ra_list)) {
+ if (!error)
+ error = xlog_recover_items_pass2(log, trans,
+ buffer_list, &ra_list);
+ list_splice_tail_init(&ra_list, &done_list);
+ }
+
+ if (!list_empty(&done_list))
+ list_splice_init(&done_list, &trans->r_itemq);
+
+ return error;
+}
+
+STATIC void
+xlog_recover_add_item(
+ struct list_head *head)
+{
+ struct xlog_recover_item *item;
+
+ item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
+ INIT_LIST_HEAD(&item->ri_list);
+ list_add_tail(&item->ri_list, head);
+}
+
+STATIC int
+xlog_recover_add_to_cont_trans(
+ struct xlog *log,
+ struct xlog_recover *trans,
+ char *dp,
+ int len)
+{
+ struct xlog_recover_item *item;
+ char *ptr, *old_ptr;
+ int old_len;
+
+ /*
+ * If the transaction is empty, the header was split across this and the
+ * previous record. Copy the rest of the header.
+ */
+ if (list_empty(&trans->r_itemq)) {
+ ASSERT(len <= sizeof(struct xfs_trans_header));
+ if (len > sizeof(struct xfs_trans_header)) {
+ xfs_warn(log->l_mp, "%s: bad header length", __func__);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_add_item(&trans->r_itemq);
+ ptr = (char *)&trans->r_theader +
+ sizeof(struct xfs_trans_header) - len;
+ memcpy(ptr, dp, len);
+ return 0;
+ }
+
+ /* take the tail entry */
+ item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
+ ri_list);
+
+ old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
+ old_len = item->ri_buf[item->ri_cnt-1].i_len;
+
+ ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
+ memcpy(&ptr[old_len], dp, len);
+ item->ri_buf[item->ri_cnt-1].i_len += len;
+ item->ri_buf[item->ri_cnt-1].i_addr = ptr;
+ trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
+ return 0;
+}
+
+/*
+ * The next region to add is the start of a new region. It could be
+ * a whole region or it could be the first part of a new region. Because
+ * of this, the assumption here is that the type and size fields of all
+ * format structures fit into the first 32 bits of the structure.
+ *
+ * This works because all regions must be 32 bit aligned. Therefore, we
+ * either have both fields or we have neither field. In the case we have
+ * neither field, the data part of the region is zero length. We only have
+ * a log_op_header and can throw away the header since a new one will appear
+ * later. If we have at least 4 bytes, then we can determine how many regions
+ * will appear in the current log item.
+ */
+STATIC int
+xlog_recover_add_to_trans(
+ struct xlog *log,
+ struct xlog_recover *trans,
+ char *dp,
+ int len)
+{
+ struct xfs_inode_log_format *in_f; /* any will do */
+ struct xlog_recover_item *item;
+ char *ptr;
+
+ if (!len)
+ return 0;
+ if (list_empty(&trans->r_itemq)) {
+ /* we need to catch log corruptions here */
+ if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
+ xfs_warn(log->l_mp, "%s: bad header magic number",
+ __func__);
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ if (len > sizeof(struct xfs_trans_header)) {
+ xfs_warn(log->l_mp, "%s: bad header length", __func__);
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * The transaction header can be arbitrarily split across op
+ * records. If we don't have the whole thing here, copy what we
+ * do have and handle the rest in the next record.
+ */
+ if (len == sizeof(struct xfs_trans_header))
+ xlog_recover_add_item(&trans->r_itemq);
+ memcpy(&trans->r_theader, dp, len);
+ return 0;
+ }
+
+ ptr = kmem_alloc(len, 0);
+ memcpy(ptr, dp, len);
+ in_f = (struct xfs_inode_log_format *)ptr;
+
+ /* take the tail entry */
+ item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
+ ri_list);
+ if (item->ri_total != 0 &&
+ item->ri_total == item->ri_cnt) {
+ /* tail item is in use, get a new one */
+ xlog_recover_add_item(&trans->r_itemq);
+ item = list_entry(trans->r_itemq.prev,
+ struct xlog_recover_item, ri_list);
+ }
+
+ if (item->ri_total == 0) { /* first region to be added */
+ if (in_f->ilf_size == 0 ||
+ in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
+ xfs_warn(log->l_mp,
+ "bad number of regions (%d) in inode log format",
+ in_f->ilf_size);
+ ASSERT(0);
+ kmem_free(ptr);
+ return -EFSCORRUPTED;
+ }
+
+ item->ri_total = in_f->ilf_size;
+ item->ri_buf =
+ kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+ 0);
+ }
+
+ if (item->ri_total <= item->ri_cnt) {
+ xfs_warn(log->l_mp,
+ "log item region count (%d) overflowed size (%d)",
+ item->ri_cnt, item->ri_total);
+ ASSERT(0);
+ kmem_free(ptr);
+ return -EFSCORRUPTED;
+ }
+
+ /* Description region is ri_buf[0] */
+ item->ri_buf[item->ri_cnt].i_addr = ptr;
+ item->ri_buf[item->ri_cnt].i_len = len;
+ item->ri_cnt++;
+ trace_xfs_log_recover_item_add(log, trans, item, 0);
+ return 0;
+}
+
+/*
+ * Free up any resources allocated by the transaction
+ *
+ * Remember that EFIs, EFDs, and IUNLINKs are handled later.
+ */
+STATIC void
+xlog_recover_free_trans(
+ struct xlog_recover *trans)
+{
+ struct xlog_recover_item *item, *n;
+ int i;
+
+ hlist_del_init(&trans->r_list);
+
+ list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
+ /* Free the regions in the item. */
+ list_del(&item->ri_list);
+ for (i = 0; i < item->ri_cnt; i++)
+ kmem_free(item->ri_buf[i].i_addr);
+ /* Free the item itself */
+ kmem_free(item->ri_buf);
+ kmem_free(item);
+ }
+ /* Free the transaction recover structure */
+ kmem_free(trans);
+}
+
+/*
+ * On error or completion, trans is freed.
+ */
+STATIC int
+xlog_recovery_process_trans(
+ struct xlog *log,
+ struct xlog_recover *trans,
+ char *dp,
+ unsigned int len,
+ unsigned int flags,
+ int pass,
+ struct list_head *buffer_list)
+{
+ int error = 0;
+ bool freeit = false;
+
+ /* mask off ophdr transaction container flags */
+ flags &= ~XLOG_END_TRANS;
+ if (flags & XLOG_WAS_CONT_TRANS)
+ flags &= ~XLOG_CONTINUE_TRANS;
+
+ /*
+ * Callees must not free the trans structure. We'll decide if we need to
+ * free it or not based on the operation being done and its result.
+ */
+ switch (flags) {
+ /* expected flag values */
+ case 0:
+ case XLOG_CONTINUE_TRANS:
+ error = xlog_recover_add_to_trans(log, trans, dp, len);
+ break;
+ case XLOG_WAS_CONT_TRANS:
+ error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
+ break;
+ case XLOG_COMMIT_TRANS:
+ error = xlog_recover_commit_trans(log, trans, pass,
+ buffer_list);
+ /* success or fail, we are now done with this transaction. */
+ freeit = true;
+ break;
+
+ /* unexpected flag values */
+ case XLOG_UNMOUNT_TRANS:
+ /* just skip trans */
+ xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
+ freeit = true;
+ break;
+ case XLOG_START_TRANS:
+ default:
+ xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ break;
+ }
+ if (error || freeit)
+ xlog_recover_free_trans(trans);
+ return error;
+}
+
+/*
+ * Lookup the transaction recovery structure associated with the ID in the
+ * current ophdr. If the transaction doesn't exist and the start flag is set in
+ * the ophdr, then allocate a new transaction for future ID matches to find.
+ * Either way, return what we found during the lookup - an existing transaction
+ * or nothing.
+ */
+STATIC struct xlog_recover *
+xlog_recover_ophdr_to_trans(
+ struct hlist_head rhash[],
+ struct xlog_rec_header *rhead,
+ struct xlog_op_header *ohead)
+{
+ struct xlog_recover *trans;
+ xlog_tid_t tid;
+ struct hlist_head *rhp;
+
+ tid = be32_to_cpu(ohead->oh_tid);
+ rhp = &rhash[XLOG_RHASH(tid)];
+ hlist_for_each_entry(trans, rhp, r_list) {
+ if (trans->r_log_tid == tid)
+ return trans;
+ }
+
+ /*
+ * skip over non-start transaction headers - we could be
+ * processing slack space before the next transaction starts
+ */
+ if (!(ohead->oh_flags & XLOG_START_TRANS))
+ return NULL;
+
+ ASSERT(be32_to_cpu(ohead->oh_len) == 0);
+
+ /*
+ * This is a new transaction so allocate a new recovery container to
+ * hold the recovery ops that will follow.
+ */
+ trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
+ trans->r_log_tid = tid;
+ trans->r_lsn = be64_to_cpu(rhead->h_lsn);
+ INIT_LIST_HEAD(&trans->r_itemq);
+ INIT_HLIST_NODE(&trans->r_list);
+ hlist_add_head(&trans->r_list, rhp);
+
+ /*
+ * Nothing more to do for this ophdr. Items to be added to this new
+ * transaction will be in subsequent ophdr containers.
+ */
+ return NULL;
+}
+
+STATIC int
+xlog_recover_process_ophdr(
+ struct xlog *log,
+ struct hlist_head rhash[],
+ struct xlog_rec_header *rhead,
+ struct xlog_op_header *ohead,
+ char *dp,
+ char *end,
+ int pass,
+ struct list_head *buffer_list)
+{
+ struct xlog_recover *trans;
+ unsigned int len;
+ int error;
+
+ /* Do we understand who wrote this op? */
+ if (ohead->oh_clientid != XFS_TRANSACTION &&
+ ohead->oh_clientid != XFS_LOG) {
+ xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
+ __func__, ohead->oh_clientid);
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Check the ophdr contains all the data it is supposed to contain.
+ */
+ len = be32_to_cpu(ohead->oh_len);
+ if (dp + len > end) {
+ xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
+ WARN_ON(1);
+ return -EFSCORRUPTED;
+ }
+
+ trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
+ if (!trans) {
+ /* nothing to do, so skip over this ophdr */
+ return 0;
+ }
+
+ /*
+ * The recovered buffer queue is drained only once we know that all
+ * recovery items for the current LSN have been processed. This is
+ * required because:
+ *
+ * - Buffer write submission updates the metadata LSN of the buffer.
+ * - Log recovery skips items with a metadata LSN >= the current LSN of
+ * the recovery item.
+ * - Separate recovery items against the same metadata buffer can share
+ * a current LSN. I.e., consider that the LSN of a recovery item is
+ * defined as the starting LSN of the first record in which its
+ * transaction appears, that a record can hold multiple transactions,
+ * and/or that a transaction can span multiple records.
+ *
+ * In other words, we are allowed to submit a buffer from log recovery
+ * once per current LSN. Otherwise, we may incorrectly skip recovery
+ * items and cause corruption.
+ *
+ * We don't know up front whether buffers are updated multiple times per
+ * LSN. Therefore, track the current LSN of each commit log record as it
+ * is processed and drain the queue when it changes. Use commit records
+ * because they are ordered correctly by the logging code.
+ */
+ if (log->l_recovery_lsn != trans->r_lsn &&
+ ohead->oh_flags & XLOG_COMMIT_TRANS) {
+ error = xfs_buf_delwri_submit(buffer_list);
+ if (error)
+ return error;
+ log->l_recovery_lsn = trans->r_lsn;
+ }
+
+ return xlog_recovery_process_trans(log, trans, dp, len,
+ ohead->oh_flags, pass, buffer_list);
+}
+
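The delwri-drain logic above boils down to flushing the queued buffers whenever a commit record is seen for a different LSN than the one currently being tracked. The following standalone sketch shows that "flush when the key changes" shape; the printf stands in for the buffer submission and nothing here is kernel API.

#include <stdint.h>
#include <stdio.h>

struct recovery_state {
	uint64_t	cur_lsn;	/* LSN of the last commit seen */
};

static void flush_pending(void)
{
	/* stand-in for xfs_buf_delwri_submit() */
	printf("draining delwri queue\n");
}

static void on_commit_record(struct recovery_state *rs, uint64_t lsn)
{
	if (rs->cur_lsn != lsn) {
		flush_pending();	/* all items for the old LSN are done */
		rs->cur_lsn = lsn;
	}
}

int main(void)
{
	struct recovery_state rs = { 0 };

	on_commit_record(&rs, 100);	/* first LSN: drains the (empty) queue */
	on_commit_record(&rs, 100);	/* same LSN: no drain */
	on_commit_record(&rs, 200);	/* LSN changed: drain before proceeding */
	return 0;
}
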
+/*
+ * There are two valid states of the r_state field. 0 indicates that the
+ * transaction structure is in a normal state. We have either seen the
+ * start of the transaction or the last operation we added was not a partial
+ * operation. If the last operation we added to the transaction was a
+ * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
+ *
+ * NOTE: skip LRs with 0 data length.
+ */
+STATIC int
+xlog_recover_process_data(
+ struct xlog *log,
+ struct hlist_head rhash[],
+ struct xlog_rec_header *rhead,
+ char *dp,
+ int pass,
+ struct list_head *buffer_list)
+{
+ struct xlog_op_header *ohead;
+ char *end;
+ int num_logops;
+ int error;
+
+ end = dp + be32_to_cpu(rhead->h_len);
+ num_logops = be32_to_cpu(rhead->h_num_logops);
+
+ /* check the log format matches our own - else we can't recover */
+ if (xlog_header_check_recover(log->l_mp, rhead))
+ return -EIO;
+
+ trace_xfs_log_recover_record(log, rhead, pass);
+ while ((dp < end) && num_logops) {
+
+ ohead = (struct xlog_op_header *)dp;
+ dp += sizeof(*ohead);
+ ASSERT(dp <= end);
+
+ /* errors will abort recovery */
+ error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
+ dp, end, pass, buffer_list);
+ if (error)
+ return error;
+
+ dp += be32_to_cpu(ohead->oh_len);
+ num_logops--;
+ }
+ return 0;
+}
+
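The loop above walks op headers packed back to back inside one record, bounded both by the record end pointer and by the logop count, and each header declares how much payload follows it. A simplified userspace sketch of that walk is shown below; the struct is a stand-in for xlog_op_header with most fields omitted, and ntohl() plays the role of be32_to_cpu().

#include <stdint.h>
#include <stddef.h>
#include <arpa/inet.h>		/* ntohl() as a stand-in for be32_to_cpu() */

struct op_header {
	uint32_t	oh_len;		/* big-endian payload length */
	/* the real header has more fields; omitted here */
};

/* returns the number of ops consumed, or -1 if a header overruns the record */
int
walk_ops(char *dp, char *end, int num_logops)
{
	int consumed = 0;

	while (dp < end && num_logops) {
		struct op_header *oh = (struct op_header *)dp;
		uint32_t len = ntohl(oh->oh_len);

		dp += sizeof(*oh);
		if (dp + len > end)
			return -1;	/* header claims more data than exists */
		/* ... process len bytes of payload at dp ... */
		dp += len;
		num_logops--;
		consumed++;
	}
	return consumed;
}
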
+/* Take all the collected deferred ops and finish them in order. */
+static int
+xlog_finish_defer_ops(
+ struct xfs_mount *mp,
+ struct list_head *capture_list)
+{
+ struct xfs_defer_capture *dfc, *next;
+ struct xfs_trans *tp;
+ int error = 0;
+
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
+ struct xfs_trans_res resv;
+ struct xfs_defer_resources dres;
+
+ /*
+ * Create a new transaction reservation from the captured
+ * information. Set logcount to 1 to force the new transaction
+ * to regrant every roll so that we can make forward progress
+ * in recovery no matter how full the log might be.
+ */
+ resv.tr_logres = dfc->dfc_logres;
+ resv.tr_logcount = 1;
+ resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+
+ error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
+ dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
+ if (error) {
+ xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR);
+ return error;
+ }
+
+ /*
+ * Transfer to this new transaction all the dfops we captured
+ * from recovering a single intent item.
+ */
+ list_del_init(&dfc->dfc_list);
+ xfs_defer_ops_continue(dfc, tp, &dres);
+ error = xfs_trans_commit(tp);
+ xfs_defer_resources_rele(&dres);
+ if (error)
+ return error;
+ }
+
+ ASSERT(list_empty(capture_list));
+ return 0;
+}
+
+/* Release all the captured defer ops and capture structures in this list. */
+static void
+xlog_abort_defer_ops(
+ struct xfs_mount *mp,
+ struct list_head *capture_list)
+{
+ struct xfs_defer_capture *dfc;
+ struct xfs_defer_capture *next;
+
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
+ list_del_init(&dfc->dfc_list);
+ xfs_defer_ops_capture_free(mp, dfc);
+ }
+}
+
+/*
+ * When this is called, all of the log intent items which did not have
+ * corresponding log done items should be in the AIL. What we do now is update
+ * the data structures associated with each one.
+ *
+ * Since we process the log intent items in normal transactions, they will be
+ * removed at some point after the commit. This prevents us from just walking
+ * down the list processing each one. We'll use a flag in the intent item to
+ * skip those that we've already processed and use the AIL iteration mechanism's
+ * generation count to try to speed this up at least a bit.
+ *
+ * When we start, we know that the intents are the only things in the AIL. As we
+ * process them, however, other items are added to the AIL. Hence we know we
+ * have started recovery on all the pending intents when we find a non-intent

+ * item in the AIL.
+ */
+STATIC int
+xlog_recover_process_intents(
+ struct xlog *log)
+{
+ LIST_HEAD(capture_list);
+ struct xfs_ail_cursor cur;
+ struct xfs_log_item *lip;
+ struct xfs_ail *ailp;
+ int error = 0;
+#if defined(DEBUG) || defined(XFS_WARN)
+ xfs_lsn_t last_lsn;
+#endif
+
+ ailp = log->l_ailp;
+ spin_lock(&ailp->ail_lock);
+#if defined(DEBUG) || defined(XFS_WARN)
+ last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
+#endif
+ for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ lip != NULL;
+ lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
+ const struct xfs_item_ops *ops;
+
+ if (!xlog_item_is_intent(lip))
+ break;
+
+ /*
+ * We should never see a redo item with an LSN higher than
+ * the last transaction we found in the log at the start
+ * of recovery.
+ */
+ ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
+
+ /*
+ * NOTE: If your intent processing routine can create more
+ * deferred ops, you /must/ attach them to the capture list in
+ * the recover routine or else those subsequent intents will be
+ * replayed in the wrong order!
+ *
+ * The recovery function can free the log item, so we must not
+ * access lip after it returns.
+ */
+ spin_unlock(&ailp->ail_lock);
+ ops = lip->li_ops;
+ error = ops->iop_recover(lip, &capture_list);
+ spin_lock(&ailp->ail_lock);
+ if (error) {
+ trace_xlog_intent_recovery_failed(log->l_mp, error,
+ ops->iop_recover);
+ break;
+ }
+ }
+
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->ail_lock);
+ if (error)
+ goto err;
+
+ error = xlog_finish_defer_ops(log->l_mp, &capture_list);
+ if (error)
+ goto err;
+
+ return 0;
+err:
+ xlog_abort_defer_ops(log->l_mp, &capture_list);
+ return error;
+}
+
+/*
+ * A cancel occurs when the mount has failed and we're bailing out. Release all
+ * pending log intent items that we haven't started recovery on so they don't
+ * pin the AIL.
+ */
+STATIC void
+xlog_recover_cancel_intents(
+ struct xlog *log)
+{
+ struct xfs_log_item *lip;
+ struct xfs_ail_cursor cur;
+ struct xfs_ail *ailp;
+
+ ailp = log->l_ailp;
+ spin_lock(&ailp->ail_lock);
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ while (lip != NULL) {
+ if (!xlog_item_is_intent(lip))
+ break;
+
+ spin_unlock(&ailp->ail_lock);
+ lip->li_ops->iop_release(lip);
+ spin_lock(&ailp->ail_lock);
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ }
+
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->ail_lock);
+}
+
+/*
+ * This routine performs a transaction to null out a bad inode pointer
+ * in an agi unlinked inode hash bucket.
+ */
+STATIC void
+xlog_recover_clear_agi_bucket(
+ struct xfs_perag *pag,
+ int bucket)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_trans *tp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ int offset;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
+ if (error)
+ goto out_error;
+
+ error = xfs_read_agi(pag, tp, &agibp);
+ if (error)
+ goto out_abort;
+
+ agi = agibp->b_addr;
+ agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+ offset = offsetof(xfs_agi_t, agi_unlinked) +
+ (sizeof(xfs_agino_t) * bucket);
+ xfs_trans_log_buf(tp, agibp, offset,
+ (offset + sizeof(xfs_agino_t) - 1));
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_error;
+ return;
+
+out_abort:
+ xfs_trans_cancel(tp);
+out_error:
+ xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__,
+ pag->pag_agno);
+ return;
+}
+
+static int
+xlog_recover_iunlink_bucket(
+ struct xfs_perag *pag,
+ struct xfs_agi *agi,
+ int bucket)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_inode *prev_ip = NULL;
+ struct xfs_inode *ip;
+ xfs_agino_t prev_agino, agino;
+ int error = 0;
+
+ agino = be32_to_cpu(agi->agi_unlinked[bucket]);
+ while (agino != NULLAGINO) {
+ error = xfs_iget(mp, NULL,
+ XFS_AGINO_TO_INO(mp, pag->pag_agno, agino),
+ 0, 0, &ip);
+ if (error)
+ break;
+
+ ASSERT(VFS_I(ip)->i_nlink == 0);
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ xfs_iflags_clear(ip, XFS_IRECOVERY);
+ agino = ip->i_next_unlinked;
+
+ if (prev_ip) {
+ ip->i_prev_unlinked = prev_agino;
+ xfs_irele(prev_ip);
+
+ /*
+ * Ensure the inode is removed from the unlinked list
+ * before we continue so that it won't race with
+ * building the in-memory list here. This could be
+ * serialised with the agibp lock, but that just
+ * serialises via lockstepping and it's much simpler
+ * just to flush the inodegc queue and wait for it to
+ * complete.
+ */
+ xfs_inodegc_flush(mp);
+ }
+
+ prev_agino = agino;
+ prev_ip = ip;
+ }
+
+ if (prev_ip) {
+ ip->i_prev_unlinked = prev_agino;
+ xfs_irele(prev_ip);
+ }
+ xfs_inodegc_flush(mp);
+ return error;
+}
+
+/*
+ * Recover AGI unlinked lists
+ *
+ * This is called during recovery to process any inodes which we unlinked but
+ * not freed when the system crashed. These inodes will be on the lists in the
+ * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
+ * any inodes found on the lists. Each inode is removed from the lists when it
+ * has been fully truncated and is freed. The freeing of the inode and its
+ * removal from the list must be atomic.
+ *
+ * If everything we touch in the agi processing loop is already in memory, this
+ * loop can hold the cpu for a long time. It runs without lock contention,
+ * memory allocation contention, the need to wait for IO, etc, and so will run
+ * until we either run out of inodes to process, run low on memory or run out
+ * of log space.
+ *
+ * This behaviour is bad for latency on single CPU and non-preemptible kernels,
+ * and can prevent other filesystem work (such as CIL pushes) from running. This
+ * can lead to deadlocks if the recovery process runs out of log reservation
+ * space. Hence we need to yield the CPU when there is other kernel work
+ * scheduled on this CPU to ensure other scheduled work can run without undue
+ * latency.
+ */
+static void
+xlog_recover_iunlink_ag(
+ struct xfs_perag *pag)
+{
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ int bucket;
+ int error;
+
+ error = xfs_read_agi(pag, NULL, &agibp);
+ if (error) {
+ /*
+ * AGI is b0rked. Don't process it.
+ *
+ * We should probably mark the filesystem as corrupt after we've
+ * recovered all the ag's we can....
+ */
+ return;
+ }
+
+ /*
+ * Unlock the buffer so that it can be acquired in the normal course of
+ * the transaction to truncate and free each inode. Because we are not
+ * racing with anyone else here for the AGI buffer, we don't even need
+ * to hold it locked to read the initial unlinked bucket entries out of
+ * the buffer. We keep a buffer reference, though, so that it stays pinned
+ * in memory while we need the buffer.
+ */
+ agi = agibp->b_addr;
+ xfs_buf_unlock(agibp);
+
+ for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
+ error = xlog_recover_iunlink_bucket(pag, agi, bucket);
+ if (error) {
+ /*
+ * Bucket is unrecoverable, so only a repair scan can
+ * free the remaining unlinked inodes. Just empty the
+ * bucket and leave the remaining inodes on it
+ * unreferenced and unfreeable.
+ */
+ xfs_inodegc_flush(pag->pag_mount);
+ xlog_recover_clear_agi_bucket(pag, bucket);
+ }
+ }
+
+ xfs_buf_rele(agibp);
+}
+
+static void
+xlog_recover_process_iunlinks(
+ struct xlog *log)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ for_each_perag(log->l_mp, agno, pag)
+ xlog_recover_iunlink_ag(pag);
+
+ /*
+ * Flush the pending unlinked inodes to ensure that the inactivations
+ * are fully completed on disk and the incore inodes can be reclaimed
+ * before we signal that recovery is complete.
+ */
+ xfs_inodegc_flush(log->l_mp);
+}
+
+STATIC void
+xlog_unpack_data(
+ struct xlog_rec_header *rhead,
+ char *dp,
+ struct xlog *log)
+{
+ int i, j, k;
+
+ for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
+ i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
+ *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
+ dp += BBSIZE;
+ }
+
+ if (xfs_has_logv2(log->l_mp)) {
+ xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
+ for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
+ j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
+ dp += BBSIZE;
+ }
+ }
+}
+
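The index arithmetic above maps each basic block of the record body back to the header slot that stashed its first word: with XLOG_HEADER_CYCLE_SIZE / BBSIZE cycle words per header (64, assuming the usual 32k header window and 512-byte basic blocks), block i is found in extended header i / 64 at slot i % 64. A tiny standalone illustration of that mapping, using assumed constant values and no kernel structures:

#include <stdio.h>

#define BBSIZE			512
#define HEADER_CYCLE_SIZE	(32 * 1024)	/* assumed XLOG_HEADER_CYCLE_SIZE */
#define CYCLE_WORDS		(HEADER_CYCLE_SIZE / BBSIZE)

int main(void)
{
	int i;

	for (i = 0; i < 200; i += 63) {
		int j = i / CYCLE_WORDS;	/* which extended header */
		int k = i % CYCLE_WORDS;	/* which cycle-data slot in it */

		printf("block %3d -> header %d, slot %2d\n", i, j, k);
	}
	return 0;
}
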
+/*
+ * CRC check, unpack and process a log record.
+ */
+STATIC int
+xlog_recover_process(
+ struct xlog *log,
+ struct hlist_head rhash[],
+ struct xlog_rec_header *rhead,
+ char *dp,
+ int pass,
+ struct list_head *buffer_list)
+{
+ __le32 old_crc = rhead->h_crc;
+ __le32 crc;
+
+ crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
+
+ /*
+ * Nothing else to do if this is a CRC verification pass. Just return
+ * if this is a record with a non-zero crc. Unfortunately, mkfs always
+ * sets old_crc to 0 so we must consider this valid even on v5 supers.
+ * Otherwise, return EFSBADCRC on failure so the callers up the stack
+ * know precisely what failed.
+ */
+ if (pass == XLOG_RECOVER_CRCPASS) {
+ if (old_crc && crc != old_crc)
+ return -EFSBADCRC;
+ return 0;
+ }
+
+ /*
+ * We're in the normal recovery path. Issue a warning if and only if the
+ * CRC in the header is non-zero. This is an advisory warning and the
+ * zero CRC check prevents warnings from being emitted when upgrading
+ * the kernel from one that does not add CRCs by default.
+ */
+ if (crc != old_crc) {
+ if (old_crc || xfs_has_crc(log->l_mp)) {
+ xfs_alert(log->l_mp,
+ "log record CRC mismatch: found 0x%x, expected 0x%x.",
+ le32_to_cpu(old_crc),
+ le32_to_cpu(crc));
+ xfs_hex_dump(dp, 32);
+ }
+
+ /*
+ * If the filesystem is CRC enabled, this mismatch becomes a
+ * fatal log corruption failure.
+ */
+ if (xfs_has_crc(log->l_mp)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+ return -EFSCORRUPTED;
+ }
+ }
+
+ xlog_unpack_data(rhead, dp, log);
+
+ return xlog_recover_process_data(log, rhash, rhead, dp, pass,
+ buffer_list);
+}
+
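The CRC handling above implements a small policy: a dedicated CRC pass fails only on a non-zero, mismatching CRC, while the normal pass warns on any mismatch but treats it as corruption only when the filesystem has CRCs enabled. A compact sketch of that decision follows, with plain ints standing in for the kernel types and purely illustrative error values.

#include <stdio.h>

#define EFSBADCRC	74	/* illustrative values only */
#define EFSCORRUPTED	117

enum pass { CRC_PASS, RECOVER_PASS };

int
crc_policy(enum pass pass, unsigned int found, unsigned int expected,
	   int fs_has_crc)
{
	if (pass == CRC_PASS)
		return (expected && found != expected) ? -EFSBADCRC : 0;

	if (found != expected) {
		if (expected || fs_has_crc)
			fprintf(stderr, "log record CRC mismatch\n");
		if (fs_has_crc)
			return -EFSCORRUPTED;	/* fatal on CRC filesystems */
	}
	return 0;			/* advisory only on older filesystems */
}
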
+STATIC int
+xlog_valid_rec_header(
+ struct xlog *log,
+ struct xlog_rec_header *rhead,
+ xfs_daddr_t blkno,
+ int bufsize)
+{
+ int hlen;
+
+ if (XFS_IS_CORRUPT(log->l_mp,
+ rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)))
+ return -EFSCORRUPTED;
+ if (XFS_IS_CORRUPT(log->l_mp,
+ (!rhead->h_version ||
+ (be32_to_cpu(rhead->h_version) &
+ (~XLOG_VERSION_OKBITS))))) {
+ xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
+ __func__, be32_to_cpu(rhead->h_version));
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * LR body must have data (or it wouldn't have been written)
+ * and h_len must not be greater than LR buffer size.
+ */
+ hlen = be32_to_cpu(rhead->h_len);
+ if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize))
+ return -EFSCORRUPTED;
+
+ if (XFS_IS_CORRUPT(log->l_mp,
+ blkno > log->l_logBBsize || blkno > INT_MAX))
+ return -EFSCORRUPTED;
+ return 0;
+}
+
+/*
+ * Read the log from tail to head and process the log records found.
+ * Handle the two cases where the tail and head are in the same cycle
+ * and where the active portion of the log wraps around the end of
+ * the physical log separately. The pass parameter is passed through
+ * to the routines called to process the data and is not looked at
+ * here.
+ */
+STATIC int
+xlog_do_recovery_pass(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk,
+ int pass,
+ xfs_daddr_t *first_bad) /* out: first bad log rec */
+{
+ xlog_rec_header_t *rhead;
+ xfs_daddr_t blk_no, rblk_no;
+ xfs_daddr_t rhead_blk;
+ char *offset;
+ char *hbp, *dbp;
+ int error = 0, h_size, h_len;
+ int error2 = 0;
+ int bblks, split_bblks;
+ int hblks, split_hblks, wrapped_hblks;
+ int i;
+ struct hlist_head rhash[XLOG_RHASH_SIZE];
+ LIST_HEAD (buffer_list);
+
+ ASSERT(head_blk != tail_blk);
+ blk_no = rhead_blk = tail_blk;
+
+ for (i = 0; i < XLOG_RHASH_SIZE; i++)
+ INIT_HLIST_HEAD(&rhash[i]);
+
+ /*
+ * Read the header of the tail block and get the iclog buffer size from
+ * h_size. Use this to tell how many sectors make up the log header.
+ */
+ if (xfs_has_logv2(log->l_mp)) {
+ /*
+ * When using variable length iclogs, read first sector of
+ * iclog header and extract the header size from it. Get a
+ * new hbp that is the correct size.
+ */
+ hbp = xlog_alloc_buffer(log, 1);
+ if (!hbp)
+ return -ENOMEM;
+
+ error = xlog_bread(log, tail_blk, 1, hbp, &offset);
+ if (error)
+ goto bread_err1;
+
+ rhead = (xlog_rec_header_t *)offset;
+
+ /*
+ * xfsprogs has a bug where record length is based on lsunit but
+ * h_size (iclog size) is hardcoded to 32k. Now that we
+ * unconditionally CRC verify the unmount record, this means the
+ * log buffer can be too small for the record and cause an
+ * overrun.
+ *
+ * Detect this condition here. Use lsunit for the buffer size as
+ * long as this looks like the mkfs case. Otherwise, return an
+ * error to avoid a buffer overrun.
+ */
+ h_size = be32_to_cpu(rhead->h_size);
+ h_len = be32_to_cpu(rhead->h_len);
+ if (h_len > h_size && h_len <= log->l_mp->m_logbsize &&
+ rhead->h_num_logops == cpu_to_be32(1)) {
+ xfs_warn(log->l_mp,
+ "invalid iclog size (%d bytes), using lsunit (%d bytes)",
+ h_size, log->l_mp->m_logbsize);
+ h_size = log->l_mp->m_logbsize;
+ }
+
+ error = xlog_valid_rec_header(log, rhead, tail_blk, h_size);
+ if (error)
+ goto bread_err1;
+
+ hblks = xlog_logrec_hblks(log, rhead);
+ if (hblks != 1) {
+ kmem_free(hbp);
+ hbp = xlog_alloc_buffer(log, hblks);
+ }
+ } else {
+ ASSERT(log->l_sectBBsize == 1);
+ hblks = 1;
+ hbp = xlog_alloc_buffer(log, 1);
+ h_size = XLOG_BIG_RECORD_BSIZE;
+ }
+
+ if (!hbp)
+ return -ENOMEM;
+ dbp = xlog_alloc_buffer(log, BTOBB(h_size));
+ if (!dbp) {
+ kmem_free(hbp);
+ return -ENOMEM;
+ }
+
+ memset(rhash, 0, sizeof(rhash));
+ if (tail_blk > head_blk) {
+ /*
+ * Perform recovery around the end of the physical log.
+ * When the head is not on the same cycle number as the tail,
+ * we can't do a sequential recovery.
+ */
+ while (blk_no < log->l_logBBsize) {
+ /*
+ * Check for header wrapping around physical end-of-log
+ */
+ offset = hbp;
+ split_hblks = 0;
+ wrapped_hblks = 0;
+ if (blk_no + hblks <= log->l_logBBsize) {
+ /* Read header in one read */
+ error = xlog_bread(log, blk_no, hblks, hbp,
+ &offset);
+ if (error)
+ goto bread_err2;
+ } else {
+ /* This LR is split across physical log end */
+ if (blk_no != log->l_logBBsize) {
+ /* some data before physical log end */
+ ASSERT(blk_no <= INT_MAX);
+ split_hblks = log->l_logBBsize - (int)blk_no;
+ ASSERT(split_hblks > 0);
+ error = xlog_bread(log, blk_no,
+ split_hblks, hbp,
+ &offset);
+ if (error)
+ goto bread_err2;
+ }
+
+ /*
+ * Note: this black magic still works with
+ * large sector sizes (non-512) only because:
+ * - we increased the buffer size originally
+ * by 1 sector giving us enough extra space
+ * for the second read;
+ * - the log start is guaranteed to be sector
+ * aligned;
+ * - we read the log end (LR header start)
+ * _first_, then the log start (LR header end)
+ * - order is important.
+ */
+ wrapped_hblks = hblks - split_hblks;
+ error = xlog_bread_noalign(log, 0,
+ wrapped_hblks,
+ offset + BBTOB(split_hblks));
+ if (error)
+ goto bread_err2;
+ }
+ rhead = (xlog_rec_header_t *)offset;
+ error = xlog_valid_rec_header(log, rhead,
+ split_hblks ? blk_no : 0, h_size);
+ if (error)
+ goto bread_err2;
+
+ bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
+ blk_no += hblks;
+
+ /*
+ * Read the log record data in multiple reads if it
+ * wraps around the end of the log. Note that if the
+ * header already wrapped, blk_no could point past the
+ * end of the log. The record data is contiguous in
+ * that case.
+ */
+ if (blk_no + bblks <= log->l_logBBsize ||
+ blk_no >= log->l_logBBsize) {
+ rblk_no = xlog_wrap_logbno(log, blk_no);
+ error = xlog_bread(log, rblk_no, bblks, dbp,
+ &offset);
+ if (error)
+ goto bread_err2;
+ } else {
+ /* This log record is split across the
+ * physical end of log */
+ offset = dbp;
+ split_bblks = 0;
+ if (blk_no != log->l_logBBsize) {
+ /* some data is before the physical
+ * end of log */
+ ASSERT(!wrapped_hblks);
+ ASSERT(blk_no <= INT_MAX);
+ split_bblks =
+ log->l_logBBsize - (int)blk_no;
+ ASSERT(split_bblks > 0);
+ error = xlog_bread(log, blk_no,
+ split_bblks, dbp,
+ &offset);
+ if (error)
+ goto bread_err2;
+ }
+
+ /*
+ * Note: this black magic still works with
+ * large sector sizes (non-512) only because:
+ * - we increased the buffer size originally
+ * by 1 sector giving us enough extra space
+ * for the second read;
+ * - the log start is guaranteed to be sector
+ * aligned;
+ * - we read the log end (LR header start)
+ * _first_, then the log start (LR header end)
+ * - order is important.
+ */
+ error = xlog_bread_noalign(log, 0,
+ bblks - split_bblks,
+ offset + BBTOB(split_bblks));
+ if (error)
+ goto bread_err2;
+ }
+
+ error = xlog_recover_process(log, rhash, rhead, offset,
+ pass, &buffer_list);
+ if (error)
+ goto bread_err2;
+
+ blk_no += bblks;
+ rhead_blk = blk_no;
+ }
+
+ ASSERT(blk_no >= log->l_logBBsize);
+ blk_no -= log->l_logBBsize;
+ rhead_blk = blk_no;
+ }
+
+ /* read first part of physical log */
+ while (blk_no < head_blk) {
+ error = xlog_bread(log, blk_no, hblks, hbp, &offset);
+ if (error)
+ goto bread_err2;
+
+ rhead = (xlog_rec_header_t *)offset;
+ error = xlog_valid_rec_header(log, rhead, blk_no, h_size);
+ if (error)
+ goto bread_err2;
+
+ /* blocks in data section */
+ bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
+ error = xlog_bread(log, blk_no+hblks, bblks, dbp,
+ &offset);
+ if (error)
+ goto bread_err2;
+
+ error = xlog_recover_process(log, rhash, rhead, offset, pass,
+ &buffer_list);
+ if (error)
+ goto bread_err2;
+
+ blk_no += bblks + hblks;
+ rhead_blk = blk_no;
+ }
+
+ bread_err2:
+ kmem_free(dbp);
+ bread_err1:
+ kmem_free(hbp);
+
+ /*
+ * Submit buffers that have been added from the last record processed,
+ * regardless of error status.
+ */
+ if (!list_empty(&buffer_list))
+ error2 = xfs_buf_delwri_submit(&buffer_list);
+
+ if (error && first_bad)
+ *first_bad = rhead_blk;
+
+ /*
+ * Transactions are freed at commit time but transactions without commit
+ * records on disk are never committed. Free any that may be left in the
+ * hash table.
+ */
+ for (i = 0; i < XLOG_RHASH_SIZE; i++) {
+ struct hlist_node *tmp;
+ struct xlog_recover *trans;
+
+ hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
+ xlog_recover_free_trans(trans);
+ }
+
+ return error ? error : error2;
+}
+
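Most of the complexity in the pass above comes from the log being a circular buffer: a record header or body that starts near the physical end may wrap to block 0, so reads are split and block numbers are reduced modulo the log size (the role of xlog_wrap_logbno()). A small standalone sketch of that arithmetic, assuming only a log size in basic blocks:

#include <stdio.h>

typedef unsigned long long blkno_t;	/* stand-in for xfs_daddr_t */

/* reduce a block number that may have run past the physical end of the log */
blkno_t wrap_logbno(blkno_t bno, blkno_t log_size)
{
	return bno >= log_size ? bno - log_size : bno;
}

/* split a read of 'len' blocks starting at 'bno' around the physical end */
void split_read(blkno_t bno, int len, blkno_t log_size)
{
	if (bno + len <= log_size || bno >= log_size) {
		printf("single read at %llu, %d blocks\n",
		       wrap_logbno(bno, log_size), len);
	} else {
		int first = (int)(log_size - bno);

		printf("split read: %d blocks at %llu, then %d blocks at 0\n",
		       first, bno, len - first);
	}
}

int main(void)
{
	split_read(1000, 8, 1024);	/* fits before the end */
	split_read(1020, 8, 1024);	/* wraps: 4 blocks, then 4 at block 0 */
	split_read(1030, 8, 1024);	/* start already wrapped past the end */
	return 0;
}
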
+/*
+ * Do the recovery of the log. We actually do this in two phases.
+ * The two passes are necessary in order to implement the function
+ * of cancelling a record written into the log. The first pass
+ * determines those things which have been cancelled, and the
+ * second pass replays log items normally except for those which
+ * have been cancelled. The handling of the replay and cancellations
+ * takes place in the log item type specific routines.
+ *
+ * The table of items which have cancel records in the log is allocated
+ * and freed at this level, since only here do we know when all of
+ * the log recovery has been completed.
+ */
+STATIC int
+xlog_do_log_recovery(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk)
+{
+ int error;
+
+ ASSERT(head_blk != tail_blk);
+
+ /*
+ * First do a pass to find all of the cancelled buf log items.
+ * Store them in the buf_cancel_table for use in the second pass.
+ */
+ error = xlog_alloc_buf_cancel_table(log);
+ if (error)
+ return error;
+
+ error = xlog_do_recovery_pass(log, head_blk, tail_blk,
+ XLOG_RECOVER_PASS1, NULL);
+ if (error != 0)
+ goto out_cancel;
+
+ /*
+ * Then do a second pass to actually recover the items in the log.
+ * When it is complete free the table of buf cancel items.
+ */
+ error = xlog_do_recovery_pass(log, head_blk, tail_blk,
+ XLOG_RECOVER_PASS2, NULL);
+ if (!error)
+ xlog_check_buf_cancel_table(log);
+out_cancel:
+ xlog_free_buf_cancel_table(log);
+ return error;
+}
+
+/*
+ * Do the actual recovery
+ */
+STATIC int
+xlog_do_recover(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_buf *bp = mp->m_sb_bp;
+ struct xfs_sb *sbp = &mp->m_sb;
+ int error;
+
+ trace_xfs_log_recover(log, head_blk, tail_blk);
+
+ /*
+ * First replay the images in the log.
+ */
+ error = xlog_do_log_recovery(log, head_blk, tail_blk);
+ if (error)
+ return error;
+
+ if (xlog_is_shutdown(log))
+ return -EIO;
+
+ /*
+ * We now update the tail_lsn since much of the recovery has completed
+ * and there may be space available to use. If there were no extent frees
+ * or iunlinks, we can free up the entire log and set the tail_lsn to
+ * be the last_sync_lsn. This was set in xlog_find_tail to be the
+ * lsn of the last known good LR on disk. If there are extent frees
+ * or iunlinks they will have some entries in the AIL; so we look at
+ * the AIL to determine how to set the tail_lsn.
+ */
+ xlog_assign_tail_lsn(mp);
+
+ /*
+ * Now that we've finished replaying all buffer and inode updates,
+ * re-read the superblock and reverify it.
+ */
+ xfs_buf_lock(bp);
+ xfs_buf_hold(bp);
+ error = _xfs_buf_read(bp, XBF_READ);
+ if (error) {
+ if (!xlog_is_shutdown(log)) {
+ xfs_buf_ioerror_alert(bp, __this_address);
+ ASSERT(0);
+ }
+ xfs_buf_relse(bp);
+ return error;
+ }
+
+ /* Convert superblock from on-disk format */
+ xfs_sb_from_disk(sbp, bp->b_addr);
+ xfs_buf_relse(bp);
+
+ /* re-initialise in-core superblock and geometry structures */
+ mp->m_features |= xfs_sb_version_to_features(sbp);
+ xfs_reinit_percpu_counters(mp);
+ error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks,
+ &mp->m_maxagi);
+ if (error) {
+ xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
+ return error;
+ }
+ mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
+
+ /* Normal transactions can now occur */
+ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
+ return 0;
+}
+
+/*
+ * Perform recovery and re-initialize some log variables in xlog_find_tail.
+ *
+ * Return error or zero.
+ */
+int
+xlog_recover(
+ struct xlog *log)
+{
+ xfs_daddr_t head_blk, tail_blk;
+ int error;
+
+ /* find the tail of the log */
+ error = xlog_find_tail(log, &head_blk, &tail_blk);
+ if (error)
+ return error;
+
+ /*
+ * The superblock was read before the log was available and thus the LSN
+ * could not be verified. Check the superblock LSN against the current
+ * LSN now that it's known.
+ */
+ if (xfs_has_crc(log->l_mp) &&
+ !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
+ return -EINVAL;
+
+ if (tail_blk != head_blk) {
+ /* There used to be a comment here:
+ *
+ * disallow recovery on read-only mounts. note -- mount
+ * checks for ENOSPC and turns it into an intelligent
+ * error message.
+ * ...but this is no longer true. Now, unless you specify
+ * NORECOVERY (in which case this function would never be
+ * called), we just go ahead and recover. We do this all
+ * under the vfs layer, so we can get away with it unless
+ * the device itself is read-only, in which case we fail.
+ */
+ if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
+ return error;
+ }
+
+ /*
+ * Version 5 superblock log feature mask validation. We know the
+ * log is dirty so check if there are any unknown log features
+ * in what we need to recover. If there are unknown features
+ * (e.g. unsupported transactions), then simply reject the
+ * attempt at recovery before touching anything.
+ */
+ if (xfs_sb_is_v5(&log->l_mp->m_sb) &&
+ xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
+ XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
+ xfs_warn(log->l_mp,
+"Superblock has unknown incompatible log features (0x%x) enabled.",
+ (log->l_mp->m_sb.sb_features_log_incompat &
+ XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+ xfs_warn(log->l_mp,
+"The log can not be fully and/or safely recovered by this kernel.");
+ xfs_warn(log->l_mp,
+"Please recover the log on a kernel that supports the unknown features.");
+ return -EINVAL;
+ }
+
+ /*
+ * Delay log recovery if the debug hook is set. This is debug
+ * instrumentation to coordinate simulation of I/O failures with
+ * log recovery.
+ */
+ if (xfs_globals.log_recovery_delay) {
+ xfs_notice(log->l_mp,
+ "Delaying log recovery for %d seconds.",
+ xfs_globals.log_recovery_delay);
+ msleep(xfs_globals.log_recovery_delay * 1000);
+ }
+
+ xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
+ log->l_mp->m_logname ? log->l_mp->m_logname
+ : "internal");
+
+ error = xlog_do_recover(log, head_blk, tail_blk);
+ set_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
+ }
+ return error;
+}
+
+/*
+ * In the first part of recovery we replay inodes and buffers and build up the
+ * list of intents which need to be processed. Here we process the intents and
+ * clean up the on-disk unlinked inode lists. This is separated from the first
+ * part of recovery so that the root and real-time bitmap inodes can be read in
+ * from disk in between the two stages. This is necessary so that we can free
+ * space in the real-time portion of the file system.
+ */
+int
+xlog_recover_finish(
+ struct xlog *log)
+{
+ int error;
+
+ error = xlog_recover_process_intents(log);
+ if (error) {
+ /*
+ * Cancel all the unprocessed intent items now so that we don't
+ * leave them pinned in the AIL. This can cause the AIL to
+ * livelock on the pinned item if anyone tries to push the AIL
+ * (inode reclaim does this) before we get around to
+ * xfs_log_mount_cancel.
+ */
+ xlog_recover_cancel_intents(log);
+ xfs_alert(log->l_mp, "Failed to recover intents");
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ return error;
+ }
+
+ /*
+ * Sync the log to get all the intents out of the AIL. This isn't
+ * absolutely necessary, but it helps in case the unlink transactions
+ * would have problems pushing the intents out of the way.
+ */
+ xfs_log_force(log->l_mp, XFS_LOG_SYNC);
+
+ /*
+ * Now that we've recovered the log and all the intents, we can clear
+ * the log incompat feature bits in the superblock because there's no
+ * longer anything to protect. We rely on the AIL push to write out the
+ * updated superblock after everything else.
+ */
+ if (xfs_clear_incompat_log_features(log->l_mp)) {
+ error = xfs_sync_sb(log->l_mp, false);
+ if (error < 0) {
+ xfs_alert(log->l_mp,
+ "Failed to clear log incompat features on recovery");
+ return error;
+ }
+ }
+
+ xlog_recover_process_iunlinks(log);
+
+ /*
+ * Recover any CoW staging blocks that are still referenced by the
+ * ondisk refcount metadata. During mount there cannot be any live
+ * staging extents as we have not permitted any user modifications.
+ * Therefore, it is safe to free them all right now, even on a
+ * read-only mount.
+ */
+ error = xfs_reflink_recover_cow(log->l_mp);
+ if (error) {
+ xfs_alert(log->l_mp,
+ "Failed to recover leftover CoW staging extents, err %d.",
+ error);
+ /*
+ * If we get an error here, make sure the log is shut down
+ * but return zero so that any log items committed since the
+ * end of intents processing can be pushed through the CIL
+ * and AIL.
+ */
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ }
+
+ return 0;
+}
+
+void
+xlog_recover_cancel(
+ struct xlog *log)
+{
+ if (xlog_recovery_needed(log))
+ xlog_recover_cancel_intents(log);
+}
+
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
new file mode 100644
index 000000000..8f495cc23
--- /dev/null
+++ b/fs/xfs/xfs_message.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_error.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+
+/*
+ * XFS logging functions
+ */
+static void
+__xfs_printk(
+ const char *level,
+ const struct xfs_mount *mp,
+ struct va_format *vaf)
+{
+ if (mp && mp->m_super) {
+ printk("%sXFS (%s): %pV\n", level, mp->m_super->s_id, vaf);
+ return;
+ }
+ printk("%sXFS: %pV\n", level, vaf);
+}
+
+void
+xfs_printk_level(
+ const char *kern_level,
+ const struct xfs_mount *mp,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ int level;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ __xfs_printk(kern_level, mp, &vaf);
+
+ va_end(args);
+
+ if (!kstrtoint(kern_level, 0, &level) &&
+ level <= LOGLEVEL_ERR &&
+ xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+}
+
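The helper above funnels every message through one varargs sink that prepends the log level and the filesystem identifier. Outside the kernel the same shape is just a vfprintf wrapper; a minimal sketch is shown below, where fsname plays the role of mp->m_super->s_id and the level strings are ordinary prefixes rather than KERN_* constants.

#include <stdarg.h>
#include <stdio.h>

void
xfs_style_printk(const char *level, const char *fsname, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	if (fsname)
		fprintf(stderr, "%sXFS (%s): ", level, fsname);
	else
		fprintf(stderr, "%sXFS: ", level);
	vfprintf(stderr, fmt, args);
	fputc('\n', stderr);
	va_end(args);
}

int main(void)
{
	xfs_style_printk("warning: ", "sda1", "bad clientid 0x%x", 0x42);
	xfs_style_printk("alert: ", NULL, "log record CRC mismatch");
	return 0;
}
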
+void
+_xfs_alert_tag(
+ const struct xfs_mount *mp,
+ uint32_t panic_tag,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ int do_panic = 0;
+
+ if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
+ xfs_alert(mp, "Transforming an alert into a BUG.");
+ do_panic = 1;
+ }
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ __xfs_printk(KERN_ALERT, mp, &vaf);
+ va_end(args);
+
+ BUG_ON(do_panic);
+}
+
+void
+asswarn(
+ struct xfs_mount *mp,
+ char *expr,
+ char *file,
+ int line)
+{
+ xfs_warn(mp, "Assertion failed: %s, file: %s, line: %d",
+ expr, file, line);
+ WARN_ON(1);
+}
+
+void
+assfail(
+ struct xfs_mount *mp,
+ char *expr,
+ char *file,
+ int line)
+{
+ xfs_emerg(mp, "Assertion failed: %s, file: %s, line: %d",
+ expr, file, line);
+ if (xfs_globals.bug_on_assert)
+ BUG();
+ else
+ WARN_ON(1);
+}
+
+void
+xfs_hex_dump(const void *p, int length)
+{
+ print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1);
+}
+
+void
+xfs_buf_alert_ratelimited(
+ struct xfs_buf *bp,
+ const char *rlmsg,
+ const char *fmt,
+ ...)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct va_format vaf;
+ va_list args;
+
+ /* use the more aggressive per-target rate limit for buffers */
+ if (!___ratelimit(&bp->b_target->bt_ioerror_rl, rlmsg))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ __xfs_printk(KERN_ALERT, mp, &vaf);
+ va_end(args);
+}
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
new file mode 100644
index 000000000..cc323775a
--- /dev/null
+++ b/fs/xfs/xfs_message.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __XFS_MESSAGE_H
+#define __XFS_MESSAGE_H 1
+
+#include <linux/once_lite.h>
+
+struct xfs_mount;
+
+extern __printf(3, 4)
+void xfs_printk_level(const char *kern_level, const struct xfs_mount *mp,
+ const char *fmt, ...);
+
+#define xfs_printk_index_wrap(kern_level, mp, fmt, ...) \
+({ \
+ printk_index_subsys_emit("%sXFS%s: ", kern_level, fmt); \
+ xfs_printk_level(kern_level, mp, fmt, ##__VA_ARGS__); \
+})
+#define xfs_emerg(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_EMERG, mp, fmt, ##__VA_ARGS__)
+#define xfs_alert(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_ALERT, mp, fmt, ##__VA_ARGS__)
+#define xfs_crit(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_CRIT, mp, fmt, ##__VA_ARGS__)
+#define xfs_err(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_ERR, mp, fmt, ##__VA_ARGS__)
+#define xfs_warn(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_WARNING, mp, fmt, ##__VA_ARGS__)
+#define xfs_notice(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_NOTICE, mp, fmt, ##__VA_ARGS__)
+#define xfs_info(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_INFO, mp, fmt, ##__VA_ARGS__)
+#ifdef DEBUG
+#define xfs_debug(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_DEBUG, mp, fmt, ##__VA_ARGS__)
+#else
+#define xfs_debug(mp, fmt, ...) do {} while (0)
+#endif
+
+#define xfs_alert_tag(mp, tag, fmt, ...) \
+({ \
+ printk_index_subsys_emit("%sXFS%s: ", KERN_ALERT, fmt); \
+ _xfs_alert_tag(mp, tag, fmt, ##__VA_ARGS__); \
+})
+
+extern __printf(3, 4)
+void _xfs_alert_tag(const struct xfs_mount *mp, uint32_t tag,
+ const char *fmt, ...);
+
+#define xfs_printk_ratelimited(func, dev, fmt, ...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ if (__ratelimit(&_rs)) \
+ func(dev, fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define xfs_printk_once(func, dev, fmt, ...) \
+ DO_ONCE_LITE(func, dev, fmt, ##__VA_ARGS__)
+
+#define xfs_emerg_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__)
+#define xfs_alert_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_alert, dev, fmt, ##__VA_ARGS__)
+#define xfs_crit_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_crit, dev, fmt, ##__VA_ARGS__)
+#define xfs_err_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_err, dev, fmt, ##__VA_ARGS__)
+#define xfs_warn_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_warn, dev, fmt, ##__VA_ARGS__)
+#define xfs_notice_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_notice, dev, fmt, ##__VA_ARGS__)
+#define xfs_info_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_info, dev, fmt, ##__VA_ARGS__)
+#define xfs_debug_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__)
+
+#define xfs_warn_mount(mp, warntag, fmt, ...) \
+do { \
+ if (xfs_should_warn((mp), (warntag))) \
+ xfs_warn((mp), (fmt), ##__VA_ARGS__); \
+} while (0)
+
+#define xfs_warn_once(dev, fmt, ...) \
+ xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__)
+#define xfs_notice_once(dev, fmt, ...) \
+ xfs_printk_once(xfs_notice, dev, fmt, ##__VA_ARGS__)
+#define xfs_info_once(dev, fmt, ...) \
+ xfs_printk_once(xfs_info, dev, fmt, ##__VA_ARGS__)
+
+void assfail(struct xfs_mount *mp, char *expr, char *f, int l);
+void asswarn(struct xfs_mount *mp, char *expr, char *f, int l);
+
+extern void xfs_hex_dump(const void *p, int length);
+
+void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg,
+ const char *fmt, ...);
+
+#endif /* __XFS_MESSAGE_H */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
new file mode 100644
index 000000000..e8bb3c2e8
--- /dev/null
+++ b/fs/xfs/xfs_mount.c
@@ -0,0 +1,1389 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_fsops.h"
+#include "xfs_icache.h"
+#include "xfs_sysfs.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_reflink.h"
+#include "xfs_extent_busy.h"
+#include "xfs_health.h"
+#include "xfs_trace.h"
+#include "xfs_ag.h"
+
+static DEFINE_MUTEX(xfs_uuid_table_mutex);
+static int xfs_uuid_table_size;
+static uuid_t *xfs_uuid_table;
+
+void
+xfs_uuid_table_free(void)
+{
+ if (xfs_uuid_table_size == 0)
+ return;
+ kmem_free(xfs_uuid_table);
+ xfs_uuid_table = NULL;
+ xfs_uuid_table_size = 0;
+}
+
+/*
+ * See if the UUID is unique among mounted XFS filesystems.
+ * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
+ */
+STATIC int
+xfs_uuid_mount(
+ struct xfs_mount *mp)
+{
+ uuid_t *uuid = &mp->m_sb.sb_uuid;
+ int hole, i;
+
+ /* Publish UUID in struct super_block */
+ uuid_copy(&mp->m_super->s_uuid, uuid);
+
+ if (xfs_has_nouuid(mp))
+ return 0;
+
+ if (uuid_is_null(uuid)) {
+ xfs_warn(mp, "Filesystem has null UUID - can't mount");
+ return -EINVAL;
+ }
+
+ mutex_lock(&xfs_uuid_table_mutex);
+ for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
+ if (uuid_is_null(&xfs_uuid_table[i])) {
+ hole = i;
+ continue;
+ }
+ if (uuid_equal(uuid, &xfs_uuid_table[i]))
+ goto out_duplicate;
+ }
+
+ if (hole < 0) {
+ xfs_uuid_table = krealloc(xfs_uuid_table,
+ (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
+ GFP_KERNEL | __GFP_NOFAIL);
+ hole = xfs_uuid_table_size++;
+ }
+ xfs_uuid_table[hole] = *uuid;
+ mutex_unlock(&xfs_uuid_table_mutex);
+
+ return 0;
+
+ out_duplicate:
+ mutex_unlock(&xfs_uuid_table_mutex);
+ xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
+ return -EINVAL;
+}
+
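The UUID table above is a simple grow-only array: scan for a duplicate, remember the first zeroed slot as a hole, and only reallocate when no hole exists. The sketch below shows the same bookkeeping in userspace with 16-byte ids and memcmp/memset in place of the uuid helpers; locking is omitted and the names are illustrative.

#include <stdlib.h>
#include <string.h>

#define ID_LEN	16

static unsigned char (*id_table)[ID_LEN];
static int id_table_size;

/* returns 0 on success, -1 if the id is already registered */
int id_register(const unsigned char id[ID_LEN])
{
	static const unsigned char nil[ID_LEN];
	int i, hole = -1;

	for (i = 0; i < id_table_size; i++) {
		if (!memcmp(id_table[i], nil, ID_LEN)) {
			hole = i;		/* remember a free slot */
			continue;
		}
		if (!memcmp(id_table[i], id, ID_LEN))
			return -1;		/* duplicate id */
	}

	if (hole < 0) {
		/* the kernel uses __GFP_NOFAIL; a real caller would check */
		id_table = realloc(id_table,
				   (id_table_size + 1) * sizeof(*id_table));
		hole = id_table_size++;
	}
	memcpy(id_table[hole], id, ID_LEN);
	return 0;
}
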
+STATIC void
+xfs_uuid_unmount(
+ struct xfs_mount *mp)
+{
+ uuid_t *uuid = &mp->m_sb.sb_uuid;
+ int i;
+
+ if (xfs_has_nouuid(mp))
+ return;
+
+ mutex_lock(&xfs_uuid_table_mutex);
+ for (i = 0; i < xfs_uuid_table_size; i++) {
+ if (uuid_is_null(&xfs_uuid_table[i]))
+ continue;
+ if (!uuid_equal(uuid, &xfs_uuid_table[i]))
+ continue;
+ memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
+ break;
+ }
+ ASSERT(i < xfs_uuid_table_size);
+ mutex_unlock(&xfs_uuid_table_mutex);
+}
+
+/*
+ * Check size of device based on the (data/realtime) block count.
+ * Note: this check is used by the growfs code as well as mount.
+ */
+int
+xfs_sb_validate_fsb_count(
+ xfs_sb_t *sbp,
+ uint64_t nblocks)
+{
+ ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
+ ASSERT(sbp->sb_blocklog >= BBSHIFT);
+
+ /* Limited by ULONG_MAX of page cache index */
+ if (nblocks >> (PAGE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
+ return -EFBIG;
+ return 0;
+}
+
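The check above bounds the filesystem by what the page cache can index: the number of pages needed (blocks shifted down by PAGE_SHIFT - blocklog) must fit in an unsigned long. The small program below evaluates the same condition for a few geometries; a PAGE_SHIFT of 12 and a 32-bit unsigned long are assumed for the worked numbers.

#include <stdio.h>
#include <stdint.h>

/* same test as above: does nblocks fit in the page cache index space? */
int fsb_count_ok(uint64_t nblocks, int blocklog, int page_shift,
		 uint64_t ulong_max)
{
	return (nblocks >> (page_shift - blocklog)) <= ulong_max;
}

int main(void)
{
	uint64_t max32 = 0xffffffffULL;		/* ULONG_MAX on 32-bit */

	/* 4k blocks, 4k pages: one page per block */
	printf("%d\n", fsb_count_ok(1ULL << 30, 12, 12, max32));   /* 1 */
	/* 1k blocks, 4k pages: four blocks per page */
	printf("%d\n", fsb_count_ok(1ULL << 33, 10, 12, max32));   /* 1 */
	/* too many blocks to index with a 32-bit page index */
	printf("%d\n", fsb_count_ok(1ULL << 45, 12, 12, max32));   /* 0 */
	return 0;
}
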
+/*
+ * xfs_readsb
+ *
+ * Does the initial read of the superblock.
+ */
+int
+xfs_readsb(
+ struct xfs_mount *mp,
+ int flags)
+{
+ unsigned int sector_size;
+ struct xfs_buf *bp;
+ struct xfs_sb *sbp = &mp->m_sb;
+ int error;
+ int loud = !(flags & XFS_MFSI_QUIET);
+ const struct xfs_buf_ops *buf_ops;
+
+ ASSERT(mp->m_sb_bp == NULL);
+ ASSERT(mp->m_ddev_targp != NULL);
+
+ /*
+ * For the initial read, we must guess at the sector
+ * size based on the block device. It's enough to
+ * get the sb_sectsize out of the superblock and
+ * then reread with the proper length.
+ * We don't verify it yet, because it may not be complete.
+ */
+ sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
+ buf_ops = NULL;
+
+ /*
+ * Allocate a (locked) buffer to hold the superblock. This will be kept
+ * around at all times to optimize access to the superblock. Therefore,
+ * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
+ * elevated.
+ */
+reread:
+ error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
+ BTOBB(sector_size), XBF_NO_IOACCT, &bp,
+ buf_ops);
+ if (error) {
+ if (loud)
+ xfs_warn(mp, "SB validate failed with error %d.", error);
+ /* bad CRC means corrupted metadata */
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
+ return error;
+ }
+
+ /*
+ * Initialize the mount structure from the superblock.
+ */
+ xfs_sb_from_disk(sbp, bp->b_addr);
+
+ /*
+ * If we haven't validated the superblock, do so now before we try
+ * to check the sector size and reread the superblock appropriately.
+ */
+ if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+ if (loud)
+ xfs_warn(mp, "Invalid superblock magic number");
+ error = -EINVAL;
+ goto release_buf;
+ }
+
+ /*
+ * We must be able to do sector-sized and sector-aligned IO.
+ */
+ if (sector_size > sbp->sb_sectsize) {
+ if (loud)
+ xfs_warn(mp, "device supports %u byte sectors (not %u)",
+ sector_size, sbp->sb_sectsize);
+ error = -ENOSYS;
+ goto release_buf;
+ }
+
+ if (buf_ops == NULL) {
+ /*
+ * Re-read the superblock so the buffer is correctly sized,
+ * and properly verified.
+ */
+ xfs_buf_relse(bp);
+ sector_size = sbp->sb_sectsize;
+ buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops;
+ goto reread;
+ }
+
+ mp->m_features |= xfs_sb_version_to_features(sbp);
+ xfs_reinit_percpu_counters(mp);
+
+ /* no need to be quiet anymore, so reset the buf ops */
+ bp->b_ops = &xfs_sb_buf_ops;
+
+ mp->m_sb_bp = bp;
+ xfs_buf_unlock(bp);
+ return 0;
+
+release_buf:
+ xfs_buf_relse(bp);
+ return error;
+}
+
+/*
+ * If the sunit/swidth change would move the precomputed root inode value, we
+ * must reject the ondisk change because repair will stumble over that.
+ * However, we allow the mount to proceed because we never rejected this
+ * combination before. Returns true to update the sb, false otherwise.
+ */
+static inline int
+xfs_check_new_dalign(
+ struct xfs_mount *mp,
+ int new_dalign,
+ bool *update_sb)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+ xfs_ino_t calc_ino;
+
+ calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign);
+ trace_xfs_check_new_dalign(mp, new_dalign, calc_ino);
+
+ if (sbp->sb_rootino == calc_ino) {
+ *update_sb = true;
+ return 0;
+ }
+
+ xfs_warn(mp,
+"Cannot change stripe alignment; would require moving root inode.");
+
+ /*
+ * XXX: Next time we add a new incompat feature, this should start
+ * returning -EINVAL to fail the mount. Until then, spit out a warning
+ * that we're ignoring the administrator's instructions.
+ */
+ xfs_warn(mp, "Skipping superblock stripe alignment update.");
+ *update_sb = false;
+ return 0;
+}
+
+/*
+ * If we were provided with new sunit/swidth values as mount options, make sure
+ * that they pass basic alignment and superblock feature checks, and convert
+ * them into the same units (FSB) that everything else expects. This step
+ * /must/ be done before computing the inode geometry.
+ */
+STATIC int
+xfs_validate_new_dalign(
+ struct xfs_mount *mp)
+{
+ if (mp->m_dalign == 0)
+ return 0;
+
+ /*
+ * If the stripe unit and stripe width are not multiples
+ * of the fs blocksize, the requested alignment is invalid.
+ */
+ if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
+ (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+ xfs_warn(mp,
+ "alignment check failed: sunit/swidth vs. blocksize(%d)",
+ mp->m_sb.sb_blocksize);
+ return -EINVAL;
+ }
+
+ /*
+ * Convert the stripe unit and width to FSBs.
+ */
+ mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
+ if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
+ xfs_warn(mp,
+ "alignment check failed: sunit/swidth vs. agsize(%d)",
+ mp->m_sb.sb_agblocks);
+ return -EINVAL;
+ }
+
+ if (!mp->m_dalign) {
+ xfs_warn(mp,
+ "alignment check failed: sunit(%d) less than bsize(%d)",
+ mp->m_dalign, mp->m_sb.sb_blocksize);
+ return -EINVAL;
+ }
+
+ mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
+
+ if (!xfs_has_dalign(mp)) {
+ xfs_warn(mp,
+"cannot change alignment: superblock does not support data alignment");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Update alignment values based on mount options and sb values. */
+STATIC int
+xfs_update_alignment(
+ struct xfs_mount *mp)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+
+ if (mp->m_dalign) {
+ bool update_sb;
+ int error;
+
+ if (sbp->sb_unit == mp->m_dalign &&
+ sbp->sb_width == mp->m_swidth)
+ return 0;
+
+ error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb);
+ if (error || !update_sb)
+ return error;
+
+ sbp->sb_unit = mp->m_dalign;
+ sbp->sb_width = mp->m_swidth;
+ mp->m_update_sb = true;
+ } else if (!xfs_has_noalign(mp) && xfs_has_dalign(mp)) {
+ mp->m_dalign = sbp->sb_unit;
+ mp->m_swidth = sbp->sb_width;
+ }
+
+ return 0;
+}
+
+/*
+ * precalculate the low space thresholds for dynamic speculative preallocation.
+ */
+void
+xfs_set_low_space_thresholds(
+ struct xfs_mount *mp)
+{
+ uint64_t dblocks = mp->m_sb.sb_dblocks;
+ uint64_t rtexts = mp->m_sb.sb_rextents;
+ int i;
+
+ do_div(dblocks, 100);
+ do_div(rtexts, 100);
+
+ for (i = 0; i < XFS_LOWSP_MAX; i++) {
+ mp->m_low_space[i] = dblocks * (i + 1);
+ mp->m_low_rtexts[i] = rtexts * (i + 1);
+ }
+}
+
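The thresholds above are simply 1% through 5% of the data (and realtime) block counts, computed once at mount time. A worked standalone version follows; XFS_LOWSP_MAX is assumed to be 5, matching the five percentage steps.

#include <stdio.h>
#include <stdint.h>

#define LOWSP_MAX	5	/* assumed value of XFS_LOWSP_MAX */

int main(void)
{
	uint64_t dblocks = 26214400;	/* e.g. a 100GiB fs with 4k blocks */
	uint64_t pct = dblocks / 100;
	uint64_t low_space[LOWSP_MAX];
	int i;

	for (i = 0; i < LOWSP_MAX; i++) {
		low_space[i] = pct * (i + 1);	/* 1% .. 5% of the fs */
		printf("threshold %d: %llu blocks\n", i + 1,
		       (unsigned long long)low_space[i]);
	}
	return 0;
}
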
+/*
+ * Check that the data (and log if separate) is an ok size.
+ */
+STATIC int
+xfs_check_sizes(
+ struct xfs_mount *mp)
+{
+ struct xfs_buf *bp;
+ xfs_daddr_t d;
+ int error;
+
+ d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+ if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
+ xfs_warn(mp, "filesystem size mismatch detected");
+ return -EFBIG;
+ }
+ error = xfs_buf_read_uncached(mp->m_ddev_targp,
+ d - XFS_FSS_TO_BB(mp, 1),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+ if (error) {
+ xfs_warn(mp, "last sector read failed");
+ return error;
+ }
+ xfs_buf_relse(bp);
+
+ if (mp->m_logdev_targp == mp->m_ddev_targp)
+ return 0;
+
+ d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
+ if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
+ xfs_warn(mp, "log size mismatch detected");
+ return -EFBIG;
+ }
+ error = xfs_buf_read_uncached(mp->m_logdev_targp,
+ d - XFS_FSB_TO_BB(mp, 1),
+ XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+ if (error) {
+ xfs_warn(mp, "log device read failed");
+ return error;
+ }
+ xfs_buf_relse(bp);
+ return 0;
+}
+
+/*
+ * Clear the quotaflags in memory and in the superblock.
+ */
+int
+xfs_mount_reset_sbqflags(
+ struct xfs_mount *mp)
+{
+ mp->m_qflags = 0;
+
+ /* It is OK to look at sb_qflags in the mount path without m_sb_lock. */
+ if (mp->m_sb.sb_qflags == 0)
+ return 0;
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_qflags = 0;
+ spin_unlock(&mp->m_sb_lock);
+
+ if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
+ return 0;
+
+ return xfs_sync_sb(mp, false);
+}
+
+uint64_t
+xfs_default_resblks(xfs_mount_t *mp)
+{
+ uint64_t resblks;
+
+ /*
+ * We default to 5% or 8192 fsbs of space reserved, whichever is
+ * smaller. This is intended to cover concurrent allocation
+ * transactions when we initially hit enospc. These each require a 4
+ * block reservation. Hence by default we cover roughly 2000 concurrent
+ * allocation reservations.
+ */
+ resblks = mp->m_sb.sb_dblocks;
+ do_div(resblks, 20);
+ resblks = min_t(uint64_t, resblks, 8192);
+ return resblks;
+}
+
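As the comment says, the default reserve pool is the smaller of 5% of the data blocks and 8192 blocks, so the cap takes over once the filesystem exceeds 163,840 data blocks (640MiB with 4k blocks). A quick standalone check of that cutover:

#include <stdio.h>
#include <stdint.h>

static uint64_t default_resblks(uint64_t dblocks)
{
	uint64_t resblks = dblocks / 20;	/* 5% of the data blocks */

	return resblks < 8192 ? resblks : 8192;	/* capped at 8192 blocks */
}

int main(void)
{
	printf("%llu\n", (unsigned long long)default_resblks(100000));   /* 5000 */
	printf("%llu\n", (unsigned long long)default_resblks(163840));   /* 8192 */
	printf("%llu\n", (unsigned long long)default_resblks(26214400)); /* 8192 */
	return 0;
}
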
+/* Ensure the summary counts are correct. */
+STATIC int
+xfs_check_summary_counts(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+
+ /*
+ * The AG0 superblock verifier rejects in-progress filesystems,
+ * so we should never see the flag set this far into mounting.
+ */
+ if (mp->m_sb.sb_inprogress) {
+ xfs_err(mp, "sb_inprogress set after log recovery??");
+ WARN_ON(1);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Now the log is mounted, we know if it was an unclean shutdown or
+ * not. If it was, then with the first phase of recovery completed, we
+ * have consistent AG blocks on disk. We have not recovered EFIs yet,
+ * but they are recovered transactionally in the second recovery phase
+ * later.
+ *
+ * If the log was clean when we mounted, we can check the summary
+ * counters. If any of them are obviously incorrect, we can recompute
+ * them from the AGF headers in the next step.
+ */
+ if (xfs_is_clean(mp) &&
+ (mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks ||
+ !xfs_verify_icount(mp, mp->m_sb.sb_icount) ||
+ mp->m_sb.sb_ifree > mp->m_sb.sb_icount))
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
+
+ /*
+ * We can safely re-initialise incore superblock counters from the
+ * per-ag data. These may not be correct if the filesystem was not
+ * cleanly unmounted, so we waited for recovery to finish before doing
+ * this.
+ *
+ * If the filesystem was cleanly unmounted or the previous check did
+ * not flag anything weird, then we can trust the values in the
+ * superblock to be correct and we don't need to do anything here.
+ * Otherwise, recalculate the summary counters.
+ */
+ if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) ||
+ xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) {
+ error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Older kernels misused sb_frextents to reflect both incore
+ * reservations made by running transactions and the actual count of
+ * free rt extents in the ondisk metadata. Transactions committed
+ * during runtime can therefore contain a superblock update that
+ * undercounts the number of free rt extents tracked in the rt bitmap.
+ * A clean unmount record will have the correct frextents value since
+ * there can be no other transactions running at that point.
+ *
+ * If we're mounting the rt volume after recovering the log, recompute
+ * frextents from the rtbitmap file to fix the inconsistency.
+ */
+ if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
+ error = xfs_rtalloc_reinit_frextents(mp);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Flush and reclaim dirty inodes in preparation for unmount. Inodes and
+ * internal inode structures can be sitting in the CIL and AIL at this point,
+ * so we need to unpin them, write them back and/or reclaim them before unmount
+ * can proceed. In other words, callers are required to have inactivated all
+ * inodes.
+ *
+ * An inode cluster that has been freed can have its buffer still pinned in
+ * memory because the transaction is still sitting in an iclog. The stale inodes
+ * on that buffer will be pinned to the buffer until the transaction hits the
+ * disk and the callbacks run. Pushing the AIL will skip the stale inodes and
+ * may never see the pinned buffer, so nothing will push out the iclog and
+ * unpin the buffer.
+ *
+ * Hence we need to force the log to unpin everything first. However, log
+ * forces don't wait for the discards they issue to complete, so we have to
+ * explicitly wait for them to complete here as well.
+ *
+ * Then we can tell the world we are unmounting so that error handling knows
+ * that the filesystem is going away and we should error out anything that we
+ * have been retrying in the background. This will prevent never-ending
+ * retries in AIL pushing from hanging the unmount.
+ *
+ * Finally, we can push the AIL to clean all the remaining dirty objects, then
+ * reclaim the remaining inodes that are still in memory at this point in time.
+ */
+static void
+xfs_unmount_flush_inodes(
+ struct xfs_mount *mp)
+{
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ xfs_extent_busy_wait_all(mp);
+ flush_workqueue(xfs_discard_wq);
+
+ set_bit(XFS_OPSTATE_UNMOUNTING, &mp->m_opstate);
+
+ xfs_ail_push_all_sync(mp->m_ail);
+ xfs_inodegc_stop(mp);
+ cancel_delayed_work_sync(&mp->m_reclaim_work);
+ xfs_reclaim_inodes(mp);
+ xfs_health_unmount(mp);
+}
+
+static void
+xfs_mount_setup_inode_geom(
+ struct xfs_mount *mp)
+{
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+
+ igeo->attr_fork_offset = xfs_bmap_compute_attr_offset(mp);
+ ASSERT(igeo->attr_fork_offset < XFS_LITINO(mp));
+
+ xfs_ialloc_setup_geometry(mp);
+}
+
+/* Compute maximum possible height for per-AG btree types for this fs. */
+static inline void
+xfs_agbtree_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ unsigned int levels;
+
+ levels = max(mp->m_alloc_maxlevels, M_IGEO(mp)->inobt_maxlevels);
+ levels = max(levels, mp->m_rmap_maxlevels);
+ mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
+}
+
+/*
+ * This function does the following on an initial mount of a file system:
+ * - reads the superblock from disk and init the mount struct
+ * - if we're a 32-bit kernel, do a size check on the superblock
+ * so we don't mount terabyte filesystems
+ * - init mount struct realtime fields
+ * - allocate inode hash table for fs
+ * - init directory manager
+ * - perform recovery and init the log manager
+ */
+int
+xfs_mountfs(
+ struct xfs_mount *mp)
+{
+ struct xfs_sb *sbp = &(mp->m_sb);
+ struct xfs_inode *rip;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ uint64_t resblks;
+ uint quotamount = 0;
+ uint quotaflags = 0;
+ int error = 0;
+
+ xfs_sb_mount_common(mp, sbp);
+
+ /*
+ * Check for mismatched features2 values. Older kernels read & wrote
+ * into the wrong sb offset for sb_features2 on some platforms due to
+ * xfs_sb_t not being 64bit size aligned when sb_features2 was added,
+ * which made older superblock reading/writing routines swap it as a
+ * 64-bit value.
+ *
+ * For backwards compatibility, we make both slots equal.
+ *
+ * If we detect a mismatched field, we OR the set bits into the existing
+ * features2 field in case it has already been modified; we don't want
+ * to lose any features. We then update the bad location with the ORed
+ * value so that older kernels will see any features2 flags. The
+ * superblock writeback code ensures the new sb_features2 is copied to
+ * sb_bad_features2 before it is logged or written to disk.
+ */
+ if (xfs_sb_has_mismatched_features2(sbp)) {
+ xfs_warn(mp, "correcting sb_features alignment problem");
+ sbp->sb_features2 |= sbp->sb_bad_features2;
+ mp->m_update_sb = true;
+ }
+
+
+ /* always use v2 inodes by default now */
+ if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
+ mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
+ mp->m_features |= XFS_FEAT_NLINK;
+ mp->m_update_sb = true;
+ }
+
+ /*
+ * If we were given new sunit/swidth options, do some basic validation
+ * checks and convert the incore dalign and swidth values to the
+ * same units (FSB) that everything else uses. This /must/ happen
+ * before computing the inode geometry.
+ */
+ error = xfs_validate_new_dalign(mp);
+ if (error)
+ goto out;
+
+ xfs_alloc_compute_maxlevels(mp);
+ xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
+ xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
+ xfs_mount_setup_inode_geom(mp);
+ xfs_rmapbt_compute_maxlevels(mp);
+ xfs_refcountbt_compute_maxlevels(mp);
+
+ xfs_agbtree_compute_maxlevels(mp);
+
+ /*
+ * Check if sb_agblocks is aligned at the stripe boundary. If sb_agblocks
+ * is NOT aligned, turn off m_dalign since allocator alignment is within
+ * an AG; therefore the AG has to be aligned at the stripe boundary. Note that
+ * we must compute the free space and rmap btree geometry before doing
+ * this.
+ */
+ error = xfs_update_alignment(mp);
+ if (error)
+ goto out;
+
+ /* enable fail_at_unmount as default */
+ mp->m_fail_unmount = true;
+
+ error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
+ NULL, mp->m_super->s_id);
+ if (error)
+ goto out;
+
+ error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
+ &mp->m_kobj, "stats");
+ if (error)
+ goto out_remove_sysfs;
+
+ error = xfs_error_sysfs_init(mp);
+ if (error)
+ goto out_del_stats;
+
+ error = xfs_errortag_init(mp);
+ if (error)
+ goto out_remove_error_sysfs;
+
+ error = xfs_uuid_mount(mp);
+ if (error)
+ goto out_remove_errortag;
+
+ /*
+ * Update the preferred write size based on the information from the
+ * on-disk superblock.
+ */
+ mp->m_allocsize_log =
+ max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log);
+ mp->m_allocsize_blocks = 1U << (mp->m_allocsize_log - sbp->sb_blocklog);
+
+ /* set the low space thresholds for dynamic preallocation */
+ xfs_set_low_space_thresholds(mp);
+
+ /*
+ * If enabled, sparse inode chunk alignment is expected to match the
+ * cluster size. Full inode chunk alignment must match the chunk size,
+ * but that is checked on sb read verification...
+ */
+ if (xfs_has_sparseinodes(mp) &&
+ mp->m_sb.sb_spino_align !=
+ XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) {
+ xfs_warn(mp,
+ "Sparse inode block alignment (%u) must match cluster size (%llu).",
+ mp->m_sb.sb_spino_align,
+ XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw));
+ error = -EINVAL;
+ goto out_remove_uuid;
+ }
+
+ /*
+ * Check that the data (and log if separate) is an ok size.
+ */
+ error = xfs_check_sizes(mp);
+ if (error)
+ goto out_remove_uuid;
+
+ /*
+ * Initialize realtime fields in the mount structure
+ */
+ error = xfs_rtmount_init(mp);
+ if (error) {
+ xfs_warn(mp, "RT mount failed");
+ goto out_remove_uuid;
+ }
+
+ /*
+ * Copies the low order bits of the timestamp and the randomly
+ * set "sequence" number out of a UUID.
+ */
+ mp->m_fixedfsid[0] =
+ (get_unaligned_be16(&sbp->sb_uuid.b[8]) << 16) |
+ get_unaligned_be16(&sbp->sb_uuid.b[4]);
+ mp->m_fixedfsid[1] = get_unaligned_be32(&sbp->sb_uuid.b[0]);
+
+ error = xfs_da_mount(mp);
+ if (error) {
+ xfs_warn(mp, "Failed dir/attr init: %d", error);
+ goto out_remove_uuid;
+ }
+
+ /*
+ * Initialize the precomputed transaction reservations values.
+ */
+ xfs_trans_init(mp);
+
+ /*
+ * Allocate and initialize the per-ag data.
+ */
+ error = xfs_initialize_perag(mp, sbp->sb_agcount, mp->m_sb.sb_dblocks,
+ &mp->m_maxagi);
+ if (error) {
+ xfs_warn(mp, "Failed per-ag init: %d", error);
+ goto out_free_dir;
+ }
+
+ if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
+ xfs_warn(mp, "no log defined");
+ error = -EFSCORRUPTED;
+ goto out_free_perag;
+ }
+
+ error = xfs_inodegc_register_shrinker(mp);
+ if (error)
+ goto out_fail_wait;
+
+ /*
+ * Log's mount-time initialization. The first part of recovery can place
+ * some items on the AIL, to be handled when recovery is finished or
+ * cancelled.
+ */
+ error = xfs_log_mount(mp, mp->m_logdev_targp,
+ XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
+ XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
+ if (error) {
+ xfs_warn(mp, "log mount failed");
+ goto out_inodegc_shrinker;
+ }
+
+ /* Enable background inode inactivation workers. */
+ xfs_inodegc_start(mp);
+ xfs_blockgc_start(mp);
+
+ /*
+ * Now that we've recovered any pending superblock feature bit
+ * additions, we can finish setting up the attr2 behaviour for the
+ * mount. The noattr2 option overrides the superblock flag, so only
+ * check the superblock feature flag if the mount option is not set.
+ */
+ if (xfs_has_noattr2(mp)) {
+ mp->m_features &= ~XFS_FEAT_ATTR2;
+ } else if (!xfs_has_attr2(mp) &&
+ (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) {
+ mp->m_features |= XFS_FEAT_ATTR2;
+ }
+
+ /*
+ * Get and sanity-check the root inode.
+ * Save the pointer to it in the mount structure.
+ */
+ error = xfs_iget(mp, NULL, sbp->sb_rootino, XFS_IGET_UNTRUSTED,
+ XFS_ILOCK_EXCL, &rip);
+ if (error) {
+ xfs_warn(mp,
+ "Failed to read root inode 0x%llx, error %d",
+ sbp->sb_rootino, -error);
+ goto out_log_dealloc;
+ }
+
+ ASSERT(rip != NULL);
+
+ if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) {
+ xfs_warn(mp, "corrupted root inode %llu: not a directory",
+ (unsigned long long)rip->i_ino);
+ xfs_iunlock(rip, XFS_ILOCK_EXCL);
+ error = -EFSCORRUPTED;
+ goto out_rele_rip;
+ }
+ mp->m_rootip = rip; /* save it */
+
+ xfs_iunlock(rip, XFS_ILOCK_EXCL);
+
+ /*
+ * Initialize realtime inode pointers in the mount structure
+ */
+ error = xfs_rtmount_inodes(mp);
+ if (error) {
+ /*
+ * Free up the root inode.
+ */
+ xfs_warn(mp, "failed to read RT inodes");
+ goto out_rele_rip;
+ }
+
+ /* Make sure the summary counts are ok. */
+ error = xfs_check_summary_counts(mp);
+ if (error)
+ goto out_rtunmount;
+
+ /*
+ * If this is a read-only mount defer the superblock updates until
+ * the next remount into writeable mode. Otherwise we would never
+ * perform the update e.g. for the root filesystem.
+ */
+ if (mp->m_update_sb && !xfs_is_readonly(mp)) {
+ error = xfs_sync_sb(mp, false);
+ if (error) {
+ xfs_warn(mp, "failed to write sb changes");
+ goto out_rtunmount;
+ }
+ }
+
+ /*
+ * Initialise the XFS quota management subsystem for this mount
+ */
+ if (XFS_IS_QUOTA_ON(mp)) {
+ error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
+ if (error)
+ goto out_rtunmount;
+ } else {
+ /*
+ * If a file system had quotas running earlier, but decided to
+ * mount without -o uquota/pquota/gquota options, revoke the
+ * quotachecked license.
+ */
+ if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
+ xfs_notice(mp, "resetting quota flags");
+ error = xfs_mount_reset_sbqflags(mp);
+ if (error)
+ goto out_rtunmount;
+ }
+ }
+
+ /*
+ * Finish recovering the file system. This part needed to be delayed
+ * until after the root and real-time bitmap inodes were consistently
+ * read in. Temporarily create per-AG space reservations for metadata
+ * btree shape changes because space freeing transactions (for inode
+ * inactivation) require the per-AG reservation in lieu of reserving
+ * blocks.
+ */
+ error = xfs_fs_reserve_ag_blocks(mp);
+ if (error == -ENOSPC)
+ xfs_warn(mp,
+ "ENOSPC reserving per-AG metadata pool, log recovery may fail.");
+ error = xfs_log_mount_finish(mp);
+ xfs_fs_unreserve_ag_blocks(mp);
+ if (error) {
+ xfs_warn(mp, "log mount finish failed");
+ goto out_rtunmount;
+ }
+
+ /*
+ * Now the log is fully replayed, we can transition to full read-only
+ * mode for read-only mounts. This will sync all the metadata and clean
+ * the log so that the recovery we just performed does not have to be
+ * replayed again on the next mount.
+ *
+ * We use the same quiesce mechanism as the rw->ro remount, as they are
+ * semantically identical operations.
+ */
+ if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
+ xfs_log_clean(mp);
+
+ /*
+ * Complete the quota initialisation, post-log-replay component.
+ */
+ if (quotamount) {
+ ASSERT(mp->m_qflags == 0);
+ mp->m_qflags = quotaflags;
+
+ xfs_qm_mount_quotas(mp);
+ }
+
+ /*
+ * Now we are mounted, reserve a small amount of unused space for
+ * privileged transactions. This is needed so that transaction
+ * space required for critical operations can dip into this pool
+ * when at ENOSPC. This is needed for operations like create with
+ * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
+ * are not allowed to use this reserved space.
+ *
+ * This may drive us straight to ENOSPC on mount, but that implies
+ * we were already there on the last unmount. Warn if this occurs.
+ */
+ if (!xfs_is_readonly(mp)) {
+ resblks = xfs_default_resblks(mp);
+ error = xfs_reserve_blocks(mp, &resblks, NULL);
+ if (error)
+ xfs_warn(mp,
+ "Unable to allocate reserve blocks. Continuing without reserve pool.");
+
+ /* Reserve AG blocks for future btree expansion. */
+ error = xfs_fs_reserve_ag_blocks(mp);
+ if (error && error != -ENOSPC)
+ goto out_agresv;
+ }
+
+ return 0;
+
+ out_agresv:
+ xfs_fs_unreserve_ag_blocks(mp);
+ xfs_qm_unmount_quotas(mp);
+ out_rtunmount:
+ xfs_rtunmount_inodes(mp);
+ out_rele_rip:
+ xfs_irele(rip);
+ /* Clean out dquots that might be in memory after quotacheck. */
+ xfs_qm_unmount(mp);
+
+ /*
+ * Inactivate all inodes that might still be in memory after a log
+ * intent recovery failure so that reclaim can free them. Metadata
+ * inodes and the root directory shouldn't need inactivation, but the
+ * mount failed for some reason, so pull down all the state and flee.
+ */
+ xfs_inodegc_flush(mp);
+
+ /*
+ * Flush all inode reclamation work and flush the log.
+ * We have to do this /after/ rtunmount and qm_unmount because those
+ * two will have scheduled delayed reclaim for the rt/quota inodes.
+ *
+ * This is slightly different from the unmountfs call sequence
+ * because we could be tearing down a partially set up mount. In
+ * particular, if log_mount_finish fails we bail out without calling
+ * qm_unmount_quotas and therefore rely on qm_unmount to release the
+ * quota inodes.
+ */
+ xfs_unmount_flush_inodes(mp);
+ out_log_dealloc:
+ xfs_log_mount_cancel(mp);
+ out_inodegc_shrinker:
+ unregister_shrinker(&mp->m_inodegc_shrinker);
+ out_fail_wait:
+ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
+ xfs_buftarg_drain(mp->m_logdev_targp);
+ xfs_buftarg_drain(mp->m_ddev_targp);
+ out_free_perag:
+ xfs_free_perag(mp);
+ out_free_dir:
+ xfs_da_unmount(mp);
+ out_remove_uuid:
+ xfs_uuid_unmount(mp);
+ out_remove_errortag:
+ xfs_errortag_del(mp);
+ out_remove_error_sysfs:
+ xfs_error_sysfs_del(mp);
+ out_del_stats:
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
+ out_remove_sysfs:
+ xfs_sysfs_del(&mp->m_kobj);
+ out:
+ return error;
+}
+
+/*
+ * This flushes out the inodes, dquots and the superblock, unmounts the
+ * log and makes sure that incore structures are freed.
+ */
+void
+xfs_unmountfs(
+ struct xfs_mount *mp)
+{
+ uint64_t resblks;
+ int error;
+
+ /*
+ * Perform all on-disk metadata updates required to inactivate inodes
+ * that the VFS evicted earlier in the unmount process. Freeing inodes
+ * and discarding CoW fork preallocations can cause shape changes to
+ * the free inode and refcount btrees, respectively, so we must finish
+ * this before we discard the metadata space reservations. Metadata
+ * inodes and the root directory do not require inactivation.
+ */
+ xfs_inodegc_flush(mp);
+
+ xfs_blockgc_stop(mp);
+ xfs_fs_unreserve_ag_blocks(mp);
+ xfs_qm_unmount_quotas(mp);
+ xfs_rtunmount_inodes(mp);
+ xfs_irele(mp->m_rootip);
+
+ xfs_unmount_flush_inodes(mp);
+
+ xfs_qm_unmount(mp);
+
+ /*
+ * Unreserve any blocks we have so that when we unmount we don't account
+ * the reserved free space as used. This is really only necessary for
+ * lazy superblock counting because it trusts the incore superblock
+ * counters to be absolutely correct on clean unmount.
+ *
+ * We don't bother correcting this elsewhere for lazy superblock
+ * counting because on mount of an unclean filesystem we reconstruct the
+ * correct counter value and this is irrelevant.
+ *
+ * For non-lazy counter filesystems, this doesn't matter at all because
+ * we only ever apply deltas to the superblock and hence the incore
+ * value does not matter.
+ */
+ resblks = 0;
+ error = xfs_reserve_blocks(mp, &resblks, NULL);
+ if (error)
+ xfs_warn(mp, "Unable to free reserved block pool. "
+ "Freespace may not be correct on next mount.");
+
+ xfs_log_unmount(mp);
+ xfs_da_unmount(mp);
+ xfs_uuid_unmount(mp);
+
+#if defined(DEBUG)
+ xfs_errortag_clearall(mp);
+#endif
+ unregister_shrinker(&mp->m_inodegc_shrinker);
+ xfs_free_perag(mp);
+
+ xfs_errortag_del(mp);
+ xfs_error_sysfs_del(mp);
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
+ xfs_sysfs_del(&mp->m_kobj);
+}
+
+/*
+ * Determine whether modifications can proceed. The caller specifies the minimum
+ * freeze level for which modifications should not be allowed. This allows
+ * certain operations to proceed while the freeze sequence is in progress, if
+ * necessary.
+ */
+bool
+xfs_fs_writable(
+ struct xfs_mount *mp,
+ int level)
+{
+ ASSERT(level > SB_UNFROZEN);
+ if ((mp->m_super->s_writers.frozen >= level) ||
+ xfs_is_shutdown(mp) || xfs_is_readonly(mp))
+ return false;
+
+ return true;
+}
+
+/* Adjust m_fdblocks or m_frextents. */
+int
+xfs_mod_freecounter(
+ struct xfs_mount *mp,
+ struct percpu_counter *counter,
+ int64_t delta,
+ bool rsvd)
+{
+ int64_t lcounter;
+ long long res_used;
+ uint64_t set_aside = 0;
+ s32 batch;
+ bool has_resv_pool;
+
+ ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
+ has_resv_pool = (counter == &mp->m_fdblocks);
+ if (rsvd)
+ ASSERT(has_resv_pool);
+
+ if (delta > 0) {
+ /*
+ * If the reserve pool is depleted, put blocks back into it
+ * first. Most of the time the pool is full.
+ */
+ if (likely(!has_resv_pool ||
+ mp->m_resblks == mp->m_resblks_avail)) {
+ percpu_counter_add(counter, delta);
+ return 0;
+ }
+
+ spin_lock(&mp->m_sb_lock);
+ res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
+
+ if (res_used > delta) {
+ mp->m_resblks_avail += delta;
+ } else {
+ delta -= res_used;
+ mp->m_resblks_avail = mp->m_resblks;
+ percpu_counter_add(counter, delta);
+ }
+ spin_unlock(&mp->m_sb_lock);
+ return 0;
+ }
+
+ /*
+ * When taking blocks away, we need to be more accurate the closer we
+ * get to zero.
+ *
+ * If the counter has a value of less than 2 * max batch size,
+ * then make everything serialise as we are really close to
+ * ENOSPC.
+ */
+ if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
+ XFS_FDBLOCKS_BATCH) < 0)
+ batch = 1;
+ else
+ batch = XFS_FDBLOCKS_BATCH;
+
+ /*
+ * Set aside allocbt blocks because these blocks are tracked as free
+ * space but not available for allocation. Technically this means that a
+ * single reservation cannot consume all remaining free space, but the
+ * ratio of allocbt blocks to usable free blocks should be rather small.
+ * The tradeoff without this is that filesystems that maintain high
+ * perag block reservations can over reserve physical block availability
+ * and fail physical allocation, which leads to much more serious
+ * problems (i.e. transaction abort, pagecache discards, etc.) than
+ * slightly premature -ENOSPC.
+ */
+ if (has_resv_pool)
+ set_aside = xfs_fdblocks_unavailable(mp);
+ percpu_counter_add_batch(counter, delta, batch);
+ if (__percpu_counter_compare(counter, set_aside,
+ XFS_FDBLOCKS_BATCH) >= 0) {
+ /* we had space! */
+ return 0;
+ }
+
+ /*
+ * Take the sb lock to dip into the reserves before releasing the space
+ * that took us to ENOSPC.
+ */
+ spin_lock(&mp->m_sb_lock);
+ percpu_counter_add(counter, -delta);
+ if (!has_resv_pool || !rsvd)
+ goto fdblocks_enospc;
+
+ lcounter = (long long)mp->m_resblks_avail + delta;
+ if (lcounter >= 0) {
+ mp->m_resblks_avail = lcounter;
+ spin_unlock(&mp->m_sb_lock);
+ return 0;
+ }
+ xfs_warn_once(mp,
+"Reserve blocks depleted! Consider increasing reserve pool size.");
+
+fdblocks_enospc:
+ spin_unlock(&mp->m_sb_lock);
+ return -ENOSPC;
+}
+
+/*
+ * Used to free the superblock along various error paths.
+ */
+void
+xfs_freesb(
+ struct xfs_mount *mp)
+{
+ struct xfs_buf *bp = mp->m_sb_bp;
+
+ xfs_buf_lock(bp);
+ mp->m_sb_bp = NULL;
+ xfs_buf_relse(bp);
+}
+
+/*
+ * If the underlying (data/log/rt) device is readonly, there are some
+ * operations that cannot proceed.
+ */
+int
+xfs_dev_is_read_only(
+ struct xfs_mount *mp,
+ char *message)
+{
+ if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
+ xfs_readonly_buftarg(mp->m_logdev_targp) ||
+ (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
+ xfs_notice(mp, "%s required on read-only device.", message);
+ xfs_notice(mp, "write access unavailable, cannot proceed.");
+ return -EROFS;
+ }
+ return 0;
+}
+
+/* Force the summary counters to be recalculated at next mount. */
+void
+xfs_force_summary_recalc(
+ struct xfs_mount *mp)
+{
+ if (!xfs_has_lazysbcount(mp))
+ return;
+
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
+}
+
+/*
+ * Enable a log incompat feature flag in the primary superblock. The caller
+ * cannot have any other transactions in progress.
+ */
+int
+xfs_add_incompat_log_feature(
+ struct xfs_mount *mp,
+ uint32_t feature)
+{
+ struct xfs_dsb *dsb;
+ int error;
+
+ ASSERT(hweight32(feature) == 1);
+ ASSERT(!(feature & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+
+ /*
+ * Force the log to disk and kick the background AIL thread to reduce
+ * the chances that the bwrite will stall waiting for the AIL to unpin
+ * the primary superblock buffer. This isn't a data integrity
+ * operation, so we don't need a synchronous push.
+ */
+ error = xfs_log_force(mp, XFS_LOG_SYNC);
+ if (error)
+ return error;
+ xfs_ail_push_all(mp->m_ail);
+
+ /*
+ * Lock the primary superblock buffer to serialize all callers that
+ * are trying to set feature bits.
+ */
+ xfs_buf_lock(mp->m_sb_bp);
+ xfs_buf_hold(mp->m_sb_bp);
+
+ if (xfs_is_shutdown(mp)) {
+ error = -EIO;
+ goto rele;
+ }
+
+ if (xfs_sb_has_incompat_log_feature(&mp->m_sb, feature))
+ goto rele;
+
+ /*
+ * Write the primary superblock to disk immediately, because we need
+ * the log_incompat bit to be set in the primary super now to protect
+ * the log items that we're going to commit later.
+ */
+ dsb = mp->m_sb_bp->b_addr;
+ xfs_sb_to_disk(dsb, &mp->m_sb);
+ dsb->sb_features_log_incompat |= cpu_to_be32(feature);
+ error = xfs_bwrite(mp->m_sb_bp);
+ if (error)
+ goto shutdown;
+
+ /*
+ * Add the feature bits to the incore superblock before we unlock the
+ * buffer.
+ */
+ xfs_sb_add_incompat_log_features(&mp->m_sb, feature);
+ xfs_buf_relse(mp->m_sb_bp);
+
+ /* Log the superblock to disk. */
+ return xfs_sync_sb(mp, false);
+shutdown:
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+rele:
+ xfs_buf_relse(mp->m_sb_bp);
+ return error;
+}
+
+/*
+ * Clear all the log incompat flags from the superblock.
+ *
+ * The caller cannot be in a transaction, must ensure that the log does not
+ * contain any log items protected by any log incompat bit, and must ensure
+ * that there are no other threads that depend on the state of the log incompat
+ * feature flags in the primary super.
+ *
+ * Returns true if the superblock is dirty.
+ */
+bool
+xfs_clear_incompat_log_features(
+ struct xfs_mount *mp)
+{
+ bool ret = false;
+
+ if (!xfs_has_crc(mp) ||
+ !xfs_sb_has_incompat_log_feature(&mp->m_sb,
+ XFS_SB_FEAT_INCOMPAT_LOG_ALL) ||
+ xfs_is_shutdown(mp))
+ return false;
+
+ /*
+ * Update the incore superblock. We synchronize on the primary super
+ * buffer lock to be consistent with the add function, though at least
+ * in theory this shouldn't be necessary.
+ */
+ xfs_buf_lock(mp->m_sb_bp);
+ xfs_buf_hold(mp->m_sb_bp);
+
+ if (xfs_sb_has_incompat_log_feature(&mp->m_sb,
+ XFS_SB_FEAT_INCOMPAT_LOG_ALL)) {
+ xfs_sb_remove_incompat_log_features(&mp->m_sb);
+ ret = true;
+ }
+
+ xfs_buf_relse(mp->m_sb_bp);
+ return ret;
+}
+
+/*
+ * Update the in-core delayed block counter.
+ *
+ * We prefer to update the counter without having to take a spinlock for every
+ * counter update (i.e. batching). Each change to delayed allocation
+ * reservations can change can easily exceed the default percpu counter
+ * batching, so we use a larger batch factor here.
+ *
+ * Note that we don't currently have any callers requiring fast summation
+ * (e.g. percpu_counter_read) so we can use a big batch value here.
+ */
+#define XFS_DELALLOC_BATCH (4096)
+void
+xfs_mod_delalloc(
+ struct xfs_mount *mp,
+ int64_t delta)
+{
+ percpu_counter_add_batch(&mp->m_delalloc_blks, delta,
+ XFS_DELALLOC_BATCH);
+}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
new file mode 100644
index 000000000..69ddd5319
--- /dev/null
+++ b/fs/xfs/xfs_mount.h
@@ -0,0 +1,554 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_MOUNT_H__
+#define __XFS_MOUNT_H__
+
+struct xlog;
+struct xfs_inode;
+struct xfs_mru_cache;
+struct xfs_ail;
+struct xfs_quotainfo;
+struct xfs_da_geometry;
+struct xfs_perag;
+
+/* dynamic preallocation free space thresholds, 5% down to 1% */
+enum {
+ XFS_LOWSP_1_PCNT = 0,
+ XFS_LOWSP_2_PCNT,
+ XFS_LOWSP_3_PCNT,
+ XFS_LOWSP_4_PCNT,
+ XFS_LOWSP_5_PCNT,
+ XFS_LOWSP_MAX,
+};
+
+/*
+ * Error Configuration
+ *
+ * Error classes define the subsystem the configuration belongs to.
+ * Error numbers define the errors that are configurable.
+ */
+enum {
+ XFS_ERR_METADATA,
+ XFS_ERR_CLASS_MAX,
+};
+enum {
+ XFS_ERR_DEFAULT,
+ XFS_ERR_EIO,
+ XFS_ERR_ENOSPC,
+ XFS_ERR_ENODEV,
+ XFS_ERR_ERRNO_MAX,
+};
+
+#define XFS_ERR_RETRY_FOREVER -1
+
+/*
+ * Although retry_timeout is in jiffies which is normally an unsigned long,
+ * we limit the retry timeout to 86400 seconds, or one day. So even a
+ * signed 32-bit long is sufficient for a HZ value up to 24855. Making it
+ * signed lets us store the special "-1" value, meaning retry forever.
+ */
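+/*
+ * A quick sanity check of the arithmetic above (illustrative, not part of the
+ * original source): 86400 seconds * 24855 jiffies per second is 2,147,472,000
+ * jiffies, which still fits below the signed 32-bit limit of 2,147,483,647.
+ */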
+struct xfs_error_cfg {
+ struct xfs_kobj kobj;
+ int max_retries;
+ long retry_timeout; /* in jiffies, -1 = infinite */
+};
+
+/*
+ * Per-cpu deferred inode inactivation GC lists.
+ */
+struct xfs_inodegc {
+ struct llist_head list;
+ struct delayed_work work;
+
+ /* approximate count of inodes in the list */
+ unsigned int items;
+ unsigned int shrinker_hits;
+#if defined(DEBUG) || defined(XFS_WARN)
+ unsigned int cpu;
+#endif
+};
+
+/*
+ * The struct xfs_mount layout is optimised to separate read-mostly variables
+ * from variables that are frequently modified. We put the read-mostly variables
+ * first, then place all the other variables at the end.
+ *
+ * Typically, read-mostly variables are those that are set at mount time and
+ * never changed again, or only change rarely as a result of things like sysfs
+ * knobs being tweaked.
+ */
+typedef struct xfs_mount {
+ struct xfs_sb m_sb; /* copy of fs superblock */
+ struct super_block *m_super;
+ struct xfs_ail *m_ail; /* fs active log item list */
+ struct xfs_buf *m_sb_bp; /* buffer for superblock */
+ char *m_rtname; /* realtime device name */
+ char *m_logname; /* external log device name */
+ struct xfs_da_geometry *m_dir_geo; /* directory block geometry */
+ struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */
+ struct xlog *m_log; /* log specific stuff */
+ struct xfs_inode *m_rbmip; /* pointer to bitmap inode */
+ struct xfs_inode *m_rsumip; /* pointer to summary inode */
+ struct xfs_inode *m_rootip; /* pointer to root directory */
+ struct xfs_quotainfo *m_quotainfo; /* disk quota information */
+ xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
+ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
+ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
+ struct list_head m_mount_list; /* global mount list */
+ void __percpu *m_inodegc; /* percpu inodegc structures */
+
+ /*
+ * Optional cache of rt summary level per bitmap block with the
+ * invariant that m_rsum_cache[bbno] <= the minimum i for which
+ * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
+ * inode lock.
+ */
+ uint8_t *m_rsum_cache;
+ struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
+ struct workqueue_struct *m_buf_workqueue;
+ struct workqueue_struct *m_unwritten_workqueue;
+ struct workqueue_struct *m_reclaim_workqueue;
+ struct workqueue_struct *m_sync_workqueue;
+ struct workqueue_struct *m_blockgc_wq;
+ struct workqueue_struct *m_inodegc_wq;
+
+ int m_bsize; /* fs logical block size */
+ uint8_t m_blkbit_log; /* blocklog + NBBY */
+ uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
+ uint8_t m_agno_log; /* log #ag's */
+ uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
+ uint m_blockmask; /* sb_blocksize-1 */
+ uint m_blockwsize; /* sb_blocksize in words */
+ uint m_blockwmask; /* blockwsize-1 */
+ uint m_alloc_mxr[2]; /* max alloc btree records */
+ uint m_alloc_mnr[2]; /* min alloc btree records */
+ uint m_bmap_dmxr[2]; /* max bmap btree records */
+ uint m_bmap_dmnr[2]; /* min bmap btree records */
+ uint m_rmap_mxr[2]; /* max rmap btree records */
+ uint m_rmap_mnr[2]; /* min rmap btree records */
+ uint m_refc_mxr[2]; /* max refc btree records */
+ uint m_refc_mnr[2]; /* min refc btree records */
+ uint m_alloc_maxlevels; /* max alloc btree levels */
+ uint m_bm_maxlevels[2]; /* max bmap btree levels */
+ uint m_rmap_maxlevels; /* max rmap btree levels */
+ uint m_refc_maxlevels; /* max refcount btree level */
+ unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */
+ xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
+ uint m_alloc_set_aside; /* space we can't use */
+ uint m_ag_max_usable; /* max space per AG */
+ int m_dalign; /* stripe unit */
+ int m_swidth; /* stripe width */
+ xfs_agnumber_t m_maxagi; /* highest inode alloc group */
+ uint m_allocsize_log;/* min write size log bytes */
+ uint m_allocsize_blocks; /* min write size blocks */
+ int m_logbufs; /* number of log buffers */
+ int m_logbsize; /* size of each log buffer */
+ uint m_rsumlevels; /* rt summary levels */
+ uint m_rsumsize; /* size of rt summary, bytes */
+ int m_fixedfsid[2]; /* unchanged for life of FS */
+ uint m_qflags; /* quota status flags */
+ uint64_t m_features; /* active filesystem features */
+ uint64_t m_low_space[XFS_LOWSP_MAX];
+ uint64_t m_low_rtexts[XFS_LOWSP_MAX];
+ struct xfs_ino_geometry m_ino_geo; /* inode geometry */
+ struct xfs_trans_resv m_resv; /* precomputed res values */
+ /* low free space thresholds */
+ unsigned long m_opstate; /* dynamic state flags */
+ bool m_always_cow;
+ bool m_fail_unmount;
+ bool m_finobt_nores; /* no per-AG finobt resv. */
+ bool m_update_sb; /* sb needs update in mount */
+
+ /*
+ * Bitsets of per-fs metadata that have been checked and/or are sick.
+ * Callers must hold m_sb_lock to access these two fields.
+ */
+ uint8_t m_fs_checked;
+ uint8_t m_fs_sick;
+ /*
+ * Bitsets of rt metadata that have been checked and/or are sick.
+ * Callers must hold m_sb_lock to access this field.
+ */
+ uint8_t m_rt_checked;
+ uint8_t m_rt_sick;
+
+ /*
+ * End of read-mostly variables. Frequently written variables and locks
+ * should be placed below this comment from now on. The first variable
+ * here is marked as cacheline aligned so that it is separated from
+ * the read-mostly variables.
+ */
+
+ spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */
+ struct percpu_counter m_icount; /* allocated inodes counter */
+ struct percpu_counter m_ifree; /* free inodes counter */
+ struct percpu_counter m_fdblocks; /* free block counter */
+ struct percpu_counter m_frextents; /* free rt extent counter */
+
+ /*
+ * Count of data device blocks reserved for delayed allocations,
+ * including indlen blocks. Does not include allocated CoW staging
+ * extents or anything related to the rt device.
+ */
+ struct percpu_counter m_delalloc_blks;
+ /*
+ * Global count of allocation btree blocks in use across all AGs. Only
+ * used when perag reservation is enabled. Helps prevent block
+ * reservation from attempting to reserve allocation btree blocks.
+ */
+ atomic64_t m_allocbt_blks;
+
+ struct radix_tree_root m_perag_tree; /* per-ag accounting info */
+ spinlock_t m_perag_lock; /* lock for m_perag_tree */
+ uint64_t m_resblks; /* total reserved blocks */
+ uint64_t m_resblks_avail;/* available reserved blocks */
+ uint64_t m_resblks_save; /* reserved blks @ remount,ro */
+ struct delayed_work m_reclaim_work; /* background inode reclaim */
+ struct xfs_kobj m_kobj;
+ struct xfs_kobj m_error_kobj;
+ struct xfs_kobj m_error_meta_kobj;
+ struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
+ struct xstats m_stats; /* per-fs stats */
+ xfs_agnumber_t m_agfrotor; /* last ag where space found */
+ xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
+ spinlock_t m_agirotor_lock;/* .. and lock protecting it */
+
+ /* Memory shrinker to throttle and reprioritize inodegc */
+ struct shrinker m_inodegc_shrinker;
+ /*
+ * Workqueue item so that we can coalesce multiple inode flush attempts
+ * into a single flush.
+ */
+ struct work_struct m_flush_inodes_work;
+
+ /*
+ * Generation of the filesystem layout. This is incremented by each
+ * growfs, and used by the pNFS server to ensure the client updates
+ * its view of the block device once it gets a layout that might
+ * reference the newly added blocks. Does not need to be persistent
+ * as long as we only allow file system size increments, but if we
+ * ever support shrinks it would have to be persisted in addition
+ * to various other kinds of pain inflicted on the pNFS server.
+ */
+ uint32_t m_generation;
+ struct mutex m_growlock; /* growfs mutex */
+
+#ifdef DEBUG
+ /*
+ * Frequency with which errors are injected. Replaces xfs_etest; the
+ * value stored in here is the inverse of the frequency with which the
+ * error triggers. 1 = always, 2 = half the time, etc.
+ */
+ unsigned int *m_errortag;
+ struct xfs_kobj m_errortag_kobj;
+#endif
+} xfs_mount_t;
+
+#define M_IGEO(mp) (&(mp)->m_ino_geo)
+
+/*
+ * Flags for m_features.
+ *
+ * These are all the active features in the filesystem, regardless of how
+ * they are configured.
+ */
+#define XFS_FEAT_ATTR (1ULL << 0) /* xattrs present in fs */
+#define XFS_FEAT_NLINK (1ULL << 1) /* 32 bit link counts */
+#define XFS_FEAT_QUOTA (1ULL << 2) /* quota active */
+#define XFS_FEAT_ALIGN (1ULL << 3) /* inode alignment */
+#define XFS_FEAT_DALIGN (1ULL << 4) /* data alignment */
+#define XFS_FEAT_LOGV2 (1ULL << 5) /* version 2 logs */
+#define XFS_FEAT_SECTOR (1ULL << 6) /* sector size > 512 bytes */
+#define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */
+#define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens. */
+#define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */
+#define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */
+#define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */
+#define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */
+#define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */
+#define XFS_FEAT_V3INODES (1ULL << 14) /* Version 3 inodes */
+#define XFS_FEAT_PQUOTINO (1ULL << 15) /* non-shared proj/grp quotas */
+#define XFS_FEAT_FTYPE (1ULL << 16) /* inode type in dir */
+#define XFS_FEAT_FINOBT (1ULL << 17) /* free inode btree */
+#define XFS_FEAT_RMAPBT (1ULL << 18) /* reverse map btree */
+#define XFS_FEAT_REFLINK (1ULL << 19) /* reflinked files */
+#define XFS_FEAT_SPINODES (1ULL << 20) /* sparse inode chunks */
+#define XFS_FEAT_META_UUID (1ULL << 21) /* metadata UUID */
+#define XFS_FEAT_REALTIME (1ULL << 22) /* realtime device present */
+#define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */
+#define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */
+#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
+#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
+
+/* Mount features */
+#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
+#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */
+#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */
+#define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred
+ * I/O size in stat() */
+#define XFS_FEAT_WSYNC (1ULL << 52) /* synchronous metadata ops */
+#define XFS_FEAT_DIRSYNC (1ULL << 53) /* synchronous directory ops */
+#define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */
+#define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */
+#define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */
+#define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/
+#define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */
+#define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */
+#define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */
+#define XFS_FEAT_DAX_NEVER (1ULL << 61) /* DAX never enabled */
+#define XFS_FEAT_NORECOVERY (1ULL << 62) /* no recovery - dirty fs */
+#define XFS_FEAT_NOUUID (1ULL << 63) /* ignore uuid during mount */
+
+#define __XFS_HAS_FEAT(name, NAME) \
+static inline bool xfs_has_ ## name (struct xfs_mount *mp) \
+{ \
+ return mp->m_features & XFS_FEAT_ ## NAME; \
+}
+
+/* Some features can be added dynamically so they need a set wrapper, too. */
+#define __XFS_ADD_FEAT(name, NAME) \
+ __XFS_HAS_FEAT(name, NAME); \
+static inline void xfs_add_ ## name (struct xfs_mount *mp) \
+{ \
+ mp->m_features |= XFS_FEAT_ ## NAME; \
+ xfs_sb_version_add ## name(&mp->m_sb); \
+}
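+
+/*
+ * Illustrative expansion (not part of the original source): the invocation
+ * __XFS_HAS_FEAT(crc, CRC) below generates roughly
+ *
+ *    static inline bool xfs_has_crc(struct xfs_mount *mp)
+ *    {
+ *            return mp->m_features & XFS_FEAT_CRC;
+ *    }
+ *
+ * while __XFS_ADD_FEAT(attr2, ATTR2) additionally generates xfs_add_attr2(),
+ * which ORs XFS_FEAT_ATTR2 into m_features and calls
+ * xfs_sb_version_addattr2() to update the incore superblock.
+ */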
+
+/* Superblock features */
+__XFS_ADD_FEAT(attr, ATTR)
+__XFS_HAS_FEAT(nlink, NLINK)
+__XFS_ADD_FEAT(quota, QUOTA)
+__XFS_HAS_FEAT(align, ALIGN)
+__XFS_HAS_FEAT(dalign, DALIGN)
+__XFS_HAS_FEAT(logv2, LOGV2)
+__XFS_HAS_FEAT(sector, SECTOR)
+__XFS_HAS_FEAT(extflg, EXTFLG)
+__XFS_HAS_FEAT(asciici, ASCIICI)
+__XFS_HAS_FEAT(lazysbcount, LAZYSBCOUNT)
+__XFS_ADD_FEAT(attr2, ATTR2)
+__XFS_HAS_FEAT(parent, PARENT)
+__XFS_ADD_FEAT(projid32, PROJID32)
+__XFS_HAS_FEAT(crc, CRC)
+__XFS_HAS_FEAT(v3inodes, V3INODES)
+__XFS_HAS_FEAT(pquotino, PQUOTINO)
+__XFS_HAS_FEAT(ftype, FTYPE)
+__XFS_HAS_FEAT(finobt, FINOBT)
+__XFS_HAS_FEAT(rmapbt, RMAPBT)
+__XFS_HAS_FEAT(reflink, REFLINK)
+__XFS_HAS_FEAT(sparseinodes, SPINODES)
+__XFS_HAS_FEAT(metauuid, META_UUID)
+__XFS_HAS_FEAT(realtime, REALTIME)
+__XFS_HAS_FEAT(inobtcounts, INOBTCNT)
+__XFS_HAS_FEAT(bigtime, BIGTIME)
+__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
+__XFS_HAS_FEAT(large_extent_counts, NREXT64)
+
+/*
+ * Mount features
+ *
+ * These do not change dynamically - features that can come and go, such as 32
+ * bit inodes and read-only state, are kept as operational state rather than
+ * features.
+ */
+__XFS_HAS_FEAT(noattr2, NOATTR2)
+__XFS_HAS_FEAT(noalign, NOALIGN)
+__XFS_HAS_FEAT(allocsize, ALLOCSIZE)
+__XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE)
+__XFS_HAS_FEAT(wsync, WSYNC)
+__XFS_HAS_FEAT(dirsync, DIRSYNC)
+__XFS_HAS_FEAT(discard, DISCARD)
+__XFS_HAS_FEAT(grpid, GRPID)
+__XFS_HAS_FEAT(small_inums, SMALL_INUMS)
+__XFS_HAS_FEAT(ikeep, IKEEP)
+__XFS_HAS_FEAT(swalloc, SWALLOC)
+__XFS_HAS_FEAT(filestreams, FILESTREAMS)
+__XFS_HAS_FEAT(dax_always, DAX_ALWAYS)
+__XFS_HAS_FEAT(dax_never, DAX_NEVER)
+__XFS_HAS_FEAT(norecovery, NORECOVERY)
+__XFS_HAS_FEAT(nouuid, NOUUID)
+
+/*
+ * Operational mount state flags
+ *
+ * Use these with atomic bit ops only!
+ */
+#define XFS_OPSTATE_UNMOUNTING 0 /* filesystem is unmounting */
+#define XFS_OPSTATE_CLEAN 1 /* mount was clean */
+#define XFS_OPSTATE_SHUTDOWN 2 /* stop all fs operations */
+#define XFS_OPSTATE_INODE32 3 /* inode32 allocator active */
+#define XFS_OPSTATE_READONLY 4 /* read-only fs */
+
+/*
+ * If set, inactivation worker threads will be scheduled to process queued
+ * inodegc work. If not, queued inodes remain in memory waiting to be
+ * processed.
+ */
+#define XFS_OPSTATE_INODEGC_ENABLED 5
+/*
+ * If set, background speculative prealloc gc worker threads will be scheduled
+ * to process queued blockgc work. If not, inodes retain their preallocations
+ * until explicitly deleted.
+ */
+#define XFS_OPSTATE_BLOCKGC_ENABLED 6
+
+/* Kernel has logged a warning about online fsck being used on this fs. */
+#define XFS_OPSTATE_WARNED_SCRUB 7
+/* Kernel has logged a warning about shrink being used on this fs. */
+#define XFS_OPSTATE_WARNED_SHRINK 8
+/* Kernel has logged a warning about logged xattr updates being used. */
+#define XFS_OPSTATE_WARNED_LARP 9
+
+#define __XFS_IS_OPSTATE(name, NAME) \
+static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
+{ \
+ return test_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \
+} \
+static inline bool xfs_clear_ ## name (struct xfs_mount *mp) \
+{ \
+ return test_and_clear_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \
+} \
+static inline bool xfs_set_ ## name (struct xfs_mount *mp) \
+{ \
+ return test_and_set_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \
+}
+
+__XFS_IS_OPSTATE(unmounting, UNMOUNTING)
+__XFS_IS_OPSTATE(clean, CLEAN)
+__XFS_IS_OPSTATE(shutdown, SHUTDOWN)
+__XFS_IS_OPSTATE(inode32, INODE32)
+__XFS_IS_OPSTATE(readonly, READONLY)
+__XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED)
+__XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
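+
+/*
+ * Illustrative usage (not part of the original source): the generated opstate
+ * helpers are used along the lines of
+ *
+ *    if (xfs_is_shutdown(mp))
+ *            return -EIO;
+ *    ...
+ *    xfs_set_readonly(mp);
+ *
+ * where the set/clear variants return the previous value of the bit so that
+ * callers can detect state transitions.
+ */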
+
+static inline bool
+xfs_should_warn(struct xfs_mount *mp, long nr)
+{
+ return !test_and_set_bit(nr, &mp->m_opstate);
+}
+
+#define XFS_OPSTATE_STRINGS \
+ { (1UL << XFS_OPSTATE_UNMOUNTING), "unmounting" }, \
+ { (1UL << XFS_OPSTATE_CLEAN), "clean" }, \
+ { (1UL << XFS_OPSTATE_SHUTDOWN), "shutdown" }, \
+ { (1UL << XFS_OPSTATE_INODE32), "inode32" }, \
+ { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \
+ { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \
+ { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \
+ { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \
+ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \
+ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }
+
+/*
+ * Max and min values for mount-option defined I/O
+ * preallocation sizes.
+ */
+#define XFS_MAX_IO_LOG 30 /* 1G */
+#define XFS_MIN_IO_LOG PAGE_SHIFT
+
+void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname,
+ int lnnum);
+#define xfs_force_shutdown(m,f) \
+ xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
+
+#define SHUTDOWN_META_IO_ERROR (1u << 0) /* write attempt to metadata failed */
+#define SHUTDOWN_LOG_IO_ERROR (1u << 1) /* write attempt to the log failed */
+#define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */
+#define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */
+#define SHUTDOWN_CORRUPT_ONDISK (1u << 4) /* corrupt metadata on device */
+
+#define XFS_SHUTDOWN_STRINGS \
+ { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \
+ { SHUTDOWN_LOG_IO_ERROR, "log_io" }, \
+ { SHUTDOWN_FORCE_UMOUNT, "force_umount" }, \
+ { SHUTDOWN_CORRUPT_INCORE, "corruption" }
+
+/*
+ * Flags for xfs_mountfs
+ */
+#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */
+
+static inline xfs_agnumber_t
+xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
+{
+ xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d);
+ do_div(ld, mp->m_sb.sb_agblocks);
+ return (xfs_agnumber_t) ld;
+}
+
+static inline xfs_agblock_t
+xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
+{
+ xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d);
+ return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
+}
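+
+/*
+ * Worked example with made-up geometry (not part of the original source):
+ * with a 4096-byte block size (sb_blocklog = 12, so 8 basic blocks per fsb)
+ * and sb_agblocks = 16384, daddr 262144 converts to fsblock 32768, which
+ * xfs_daddr_to_agno()/xfs_daddr_to_agbno() split into AG 2, AG block 0.
+ */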
+
+int xfs_buf_hash_init(struct xfs_perag *pag);
+void xfs_buf_hash_destroy(struct xfs_perag *pag);
+
+extern void xfs_uuid_table_free(void);
+extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
+extern int xfs_mountfs(xfs_mount_t *mp);
+extern void xfs_unmountfs(xfs_mount_t *);
+
+/*
+ * Deltas for the block count can vary from 1 to very large, but lock contention
+ * only occurs on frequent small block count updates such as in the delayed
+ * allocation path for buffered writes (page at a time updates). Hence we set
+ * a large batch count (1024) to minimise global counter updates except when
+ * we get near to ENOSPC and we have to be very accurate with our updates.
+ */
+#define XFS_FDBLOCKS_BATCH 1024
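+
+/*
+ * Illustrative numbers (not part of the original source): with a batch size
+ * of 1024, xfs_mod_freecounter() drops to a batch of 1 once the counter may
+ * have fallen below 2 * 1024 = 2048 blocks (within 8MiB of ENOSPC on a 4k
+ * block size filesystem), so the final approach to zero is fully serialised.
+ */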
+
+/*
+ * Estimate the amount of free space that is not available to userspace and is
+ * not explicitly reserved from the incore fdblocks. This includes:
+ *
+ * - The minimum number of blocks needed to support splitting a bmap btree
+ * - The blocks currently in use by the freespace btrees because they record
+ * the actual blocks that will fill per-AG metadata space reservations
+ */
+static inline uint64_t
+xfs_fdblocks_unavailable(
+ struct xfs_mount *mp)
+{
+ return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
+}
+
+int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+ int64_t delta, bool rsvd);
+
+static inline int
+xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved)
+{
+ return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved);
+}
+
+static inline int
+xfs_mod_frextents(struct xfs_mount *mp, int64_t delta)
+{
+ return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false);
+}
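+
+/*
+ * Illustrative calls (not part of the original source): reserving eight data
+ * blocks without dipping into the reserve pool is
+ *
+ *    error = xfs_mod_fdblocks(mp, -8, false);
+ *
+ * and returning them is xfs_mod_fdblocks(mp, 8, false). A negative delta takes
+ * blocks out of the free space counter and may fail with -ENOSPC; a positive
+ * delta gives them back, refilling the reserve pool first if it is depleted.
+ */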
+
+extern int xfs_readsb(xfs_mount_t *, int);
+extern void xfs_freesb(xfs_mount_t *);
+extern bool xfs_fs_writable(struct xfs_mount *mp, int level);
+extern int xfs_sb_validate_fsb_count(struct xfs_sb *, uint64_t);
+
+extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
+
+extern void xfs_set_low_space_thresholds(struct xfs_mount *);
+
+int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
+ xfs_off_t count_fsb);
+
+struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
+ int error_class, int error);
+void xfs_force_summary_recalc(struct xfs_mount *mp);
+int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature);
+bool xfs_clear_incompat_log_features(struct xfs_mount *mp);
+void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta);
+
+#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
new file mode 100644
index 000000000..f85e3b07a
--- /dev/null
+++ b/fs/xfs/xfs_mru_cache.c
@@ -0,0 +1,542 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_mru_cache.h"
+
+/*
+ * The MRU Cache data structure consists of a data store, an array of lists and
+ * a lock to protect its internal state. At initialisation time, the client
+ * supplies an element lifetime in milliseconds and a group count, as well as a
+ * function pointer to call when deleting elements. A data structure for
+ * queueing up work in the form of timed callbacks is also included.
+ *
+ * The group count controls how many lists are created, and thereby how finely
+ * the elements are grouped in time. When reaping occurs, all the elements in
+ * all the lists whose time has expired are deleted.
+ *
+ * To give an example of how this works in practice, consider a client that
+ * initialises an MRU Cache with a lifetime of ten seconds and a group count of
+ * five. Five internal lists will be created, each representing a two second
+ * period in time. When the first element is added, time zero for the data
+ * structure is initialised to the current time.
+ *
+ * All the elements added in the first two seconds are appended to the first
+ * list. Elements added in the third second go into the second list, and so on.
+ * If an element is accessed at any point, it is removed from its list and
+ * inserted at the head of the current most-recently-used list.
+ *
+ * The reaper function will have nothing to do until at least twelve seconds
+ * have elapsed since the first element was added. The reason for this is that
+ * if it were called at t=11s, there could be elements in the first list that
+ * have only been inactive for nine seconds, so it still does nothing. If it is
+ * called anywhere between t=12 and t=14 seconds, it will delete all the
+ * elements that remain in the first list. It's therefore possible for elements
+ * to remain in the data store even after they've been inactive for up to
+ * (t + t/g) seconds, where t is the inactive element lifetime and g is the
+ * number of groups.
+ *
+ * The above example assumes that the reaper function gets called at least once
+ * every (t/g) seconds. If it is called less frequently, unused elements will
+ * accumulate in the reap list until the reaper function is eventually called.
+ * The current implementation uses work queue callbacks to carefully time the
+ * reaper function calls, so this should happen rarely, if at all.
+ *
+ * From a design perspective, the primary reason for the choice of a list array
+ * representing discrete time intervals is that it's only practical to reap
+ * expired elements in groups of some appreciable size. This automatically
+ * introduces a granularity to element lifetimes, so there's no point storing an
+ * individual timeout with each element that specifies a more precise reap time.
+ * The bonus is a saving of sizeof(long) bytes of memory per element stored.
+ *
+ * The elements could have been stored in just one list, but an array of
+ * counters or pointers would need to be maintained to allow them to be divided
+ * up into discrete time groups. More critically, the process of touching or
+ * removing an element would involve walking large portions of the entire list,
+ * which would have a detrimental effect on performance. The additional memory
+ * requirement for the array of list heads is minimal.
+ *
+ * When an element is touched or deleted, it needs to be removed from its
+ * current list. Doubly linked lists are used to make the list maintenance
+ * portion of these operations O(1). Since reaper timing can be imprecise,
+ * inserts and lookups can occur when there are no free lists available. When
+ * this happens, all the elements on the LRU list need to be migrated to the end
+ * of the reap list. To keep the list maintenance portion of these operations
+ * O(1) also, list tails need to be accessible without walking the entire list.
+ * This is the reason why doubly linked list heads are used.
+ */
+
+/*
+ * An MRU Cache is a dynamic data structure that stores its elements in a way
+ * that allows efficient lookups, but also groups them into discrete time
+ * intervals based on insertion time. This allows elements to be efficiently
+ * and automatically reaped after a fixed period of inactivity.
+ *
+ * When a client data pointer is stored in the MRU Cache it needs to be added to
+ * both the data store and to one of the lists. It must also be possible to
+ * access each of these entries via the other, i.e. to:
+ *
+ * a) Walk a list, removing the corresponding data store entry for each item.
+ * b) Look up a data store entry, then access its list entry directly.
+ *
+ * To achieve both of these goals, each entry must contain both a list entry and
+ * a key, in addition to the user's data pointer. Note that it's not a good
+ * idea to have the client embed one of these structures at the top of their own
+ * data structure, because inserting the same item more than once would most
+ * likely result in a loop in one of the lists. That's a sure-fire recipe for
+ * an infinite loop in the code.
+ */
+struct xfs_mru_cache {
+ struct radix_tree_root store; /* Core storage data structure. */
+ struct list_head *lists; /* Array of lists, one per grp. */
+ struct list_head reap_list; /* Elements overdue for reaping. */
+ spinlock_t lock; /* Lock to protect this struct. */
+ unsigned int grp_count; /* Number of discrete groups. */
+ unsigned int grp_time; /* Time period spanned by grps. */
+ unsigned int lru_grp; /* Group containing time zero. */
+ unsigned long time_zero; /* Time first element was added. */
+ xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
+ struct delayed_work work; /* Workqueue data for reaping. */
+ unsigned int queued; /* work has been queued */
+ void *data;
+};
+
+static struct workqueue_struct *xfs_mru_reap_wq;
+
+/*
+ * When inserting, destroying or reaping, it's first necessary to update the
+ * lists relative to a particular time. In the case of destroying, that time
+ * will be well in the future to ensure that all items are moved to the reap
+ * list. In all other cases though, the time will be the current time.
+ *
+ * This function enters a loop, moving the contents of the LRU list to the reap
+ * list again and again until either a) the lists are all empty, or b) time zero
+ * has been advanced sufficiently to be within the immediate element lifetime.
+ *
+ * Case a) above is detected by counting how many groups are migrated and
+ * stopping when they've all been moved. Case b) is detected by monitoring the
+ * time_zero field, which is updated as each group is migrated.
+ *
+ * The return value is the earliest time that more migration could be needed, or
+ * zero if there's no need to schedule more work because the lists are empty.
+ */
+STATIC unsigned long
+_xfs_mru_cache_migrate(
+ struct xfs_mru_cache *mru,
+ unsigned long now)
+{
+ unsigned int grp;
+ unsigned int migrated = 0;
+ struct list_head *lru_list;
+
+ /* Nothing to do if the data store is empty. */
+ if (!mru->time_zero)
+ return 0;
+
+ /* While time zero is older than the time spanned by all the lists. */
+ while (mru->time_zero <= now - mru->grp_count * mru->grp_time) {
+
+ /*
+ * If the LRU list isn't empty, migrate its elements to the tail
+ * of the reap list.
+ */
+ lru_list = mru->lists + mru->lru_grp;
+ if (!list_empty(lru_list))
+ list_splice_init(lru_list, mru->reap_list.prev);
+
+ /*
+ * Advance the LRU group number, freeing the old LRU list to
+ * become the new MRU list; advance time zero accordingly.
+ */
+ mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count;
+ mru->time_zero += mru->grp_time;
+
+ /*
+ * If reaping is so far behind that all the elements on all the
+ * lists have been migrated to the reap list, it's now empty.
+ */
+ if (++migrated == mru->grp_count) {
+ mru->lru_grp = 0;
+ mru->time_zero = 0;
+ return 0;
+ }
+ }
+
+ /* Find the first non-empty list from the LRU end. */
+ for (grp = 0; grp < mru->grp_count; grp++) {
+
+ /* Check the grp'th list from the LRU end. */
+ lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count);
+ if (!list_empty(lru_list))
+ return mru->time_zero +
+ (mru->grp_count + grp) * mru->grp_time;
+ }
+
+ /* All the lists must be empty. */
+ mru->lru_grp = 0;
+ mru->time_zero = 0;
+ return 0;
+}
+
+/*
+ * When inserting or doing a lookup, an element needs to be inserted into the
+ * MRU list. The lists must be migrated first to ensure that they're
+ * up-to-date, otherwise the new element could be given a shorter lifetime in
+ * the cache than it should.
+ */
+STATIC void
+_xfs_mru_cache_list_insert(
+ struct xfs_mru_cache *mru,
+ struct xfs_mru_cache_elem *elem)
+{
+ unsigned int grp = 0;
+ unsigned long now = jiffies;
+
+ /*
+ * If the data store is empty, initialise time zero, leave grp set to
+ * zero and start the work queue timer if necessary. Otherwise, set grp
+ * to the number of group times that have elapsed since time zero.
+ */
+ if (!_xfs_mru_cache_migrate(mru, now)) {
+ mru->time_zero = now;
+ if (!mru->queued) {
+ mru->queued = 1;
+ queue_delayed_work(xfs_mru_reap_wq, &mru->work,
+ mru->grp_count * mru->grp_time);
+ }
+ } else {
+ grp = (now - mru->time_zero) / mru->grp_time;
+ grp = (mru->lru_grp + grp) % mru->grp_count;
+ }
+
+ /* Insert the element at the tail of the corresponding list. */
+ list_add_tail(&elem->list_node, mru->lists + grp);
+}
+
+/*
+ * When destroying or reaping, all the elements that were migrated to the reap
+ * list need to be deleted. For each element this involves removing it from the
+ * data store, removing it from the reap list, calling the client's free
+ * function and deleting the element from the element cache.
+ *
+ * We get called holding the mru->lock, which we drop and then reacquire.
+ * Sparse needs special help with this to tell it we know what we are doing.
+ */
+STATIC void
+_xfs_mru_cache_clear_reap_list(
+ struct xfs_mru_cache *mru)
+ __releases(mru->lock) __acquires(mru->lock)
+{
+ struct xfs_mru_cache_elem *elem, *next;
+ struct list_head tmp;
+
+ INIT_LIST_HEAD(&tmp);
+ list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) {
+
+ /* Remove the element from the data store. */
+ radix_tree_delete(&mru->store, elem->key);
+
+ /*
+ * Move the element to a temporary list so it can be freed
+ * without needing to hold the lock.
+ */
+ list_move(&elem->list_node, &tmp);
+ }
+ spin_unlock(&mru->lock);
+
+ list_for_each_entry_safe(elem, next, &tmp, list_node) {
+ list_del_init(&elem->list_node);
+ mru->free_func(mru->data, elem);
+ }
+
+ spin_lock(&mru->lock);
+}
+
+/*
+ * We fire the reap timer every group expiry interval so
+ * we always have a reaper ready to run. This makes shutdown
+ * and flushing of the reaper easy to do. Hence we need to
+ * keep track of when the next reap must occur so we can determine
+ * at each interval whether there is anything we need to do.
+ */
+STATIC void
+_xfs_mru_cache_reap(
+ struct work_struct *work)
+{
+ struct xfs_mru_cache *mru =
+ container_of(work, struct xfs_mru_cache, work.work);
+ unsigned long now, next;
+
+ ASSERT(mru && mru->lists);
+ if (!mru || !mru->lists)
+ return;
+
+ spin_lock(&mru->lock);
+ next = _xfs_mru_cache_migrate(mru, jiffies);
+ _xfs_mru_cache_clear_reap_list(mru);
+
+ mru->queued = next;
+ if (mru->queued > 0) {
+ now = jiffies;
+ if (next <= now)
+ next = 0;
+ else
+ next -= now;
+ queue_delayed_work(xfs_mru_reap_wq, &mru->work, next);
+ }
+
+ spin_unlock(&mru->lock);
+}
+
+int
+xfs_mru_cache_init(void)
+{
+ xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
+ XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1);
+ if (!xfs_mru_reap_wq)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_mru_cache_uninit(void)
+{
+ destroy_workqueue(xfs_mru_reap_wq);
+}
+
+/*
+ * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create()
+ * with the address of the pointer, a lifetime value in milliseconds, a group
+ * count and a free function to use when deleting elements. This function
+ * returns 0 if the initialisation was successful.
+ */
+int
+xfs_mru_cache_create(
+ struct xfs_mru_cache **mrup,
+ void *data,
+ unsigned int lifetime_ms,
+ unsigned int grp_count,
+ xfs_mru_cache_free_func_t free_func)
+{
+ struct xfs_mru_cache *mru = NULL;
+ int err = 0, grp;
+ unsigned int grp_time;
+
+ if (mrup)
+ *mrup = NULL;
+
+ if (!mrup || !grp_count || !lifetime_ms || !free_func)
+ return -EINVAL;
+
+ if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
+ return -EINVAL;
+
+ if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
+ return -ENOMEM;
+
+ /* An extra list is needed to avoid reaping up to a grp_time early. */
+ mru->grp_count = grp_count + 1;
+ mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);
+
+ if (!mru->lists) {
+ err = -ENOMEM;
+ goto exit;
+ }
+
+ for (grp = 0; grp < mru->grp_count; grp++)
+ INIT_LIST_HEAD(mru->lists + grp);
+
+ /*
+ * We use GFP_KERNEL radix tree preload and do inserts under a
+ * spinlock so GFP_ATOMIC is appropriate for the radix tree itself.
+ */
+ INIT_RADIX_TREE(&mru->store, GFP_ATOMIC);
+ INIT_LIST_HEAD(&mru->reap_list);
+ spin_lock_init(&mru->lock);
+ INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap);
+
+ mru->grp_time = grp_time;
+ mru->free_func = free_func;
+ mru->data = data;
+ *mrup = mru;
+
+exit:
+ if (err && mru && mru->lists)
+ kmem_free(mru->lists);
+ if (err && mru)
+ kmem_free(mru);
+
+ return err;
+}
+
+/*
+ * Call xfs_mru_cache_flush() to flush out all cached entries, calling their
+ * free functions as they're deleted. When this function returns, the caller is
+ * guaranteed that all the free functions for all the elements have finished
+ * executing and the reaper is not running.
+ */
+static void
+xfs_mru_cache_flush(
+ struct xfs_mru_cache *mru)
+{
+ if (!mru || !mru->lists)
+ return;
+
+ spin_lock(&mru->lock);
+ if (mru->queued) {
+ spin_unlock(&mru->lock);
+ cancel_delayed_work_sync(&mru->work);
+ spin_lock(&mru->lock);
+ }
+
+ _xfs_mru_cache_migrate(mru, jiffies + mru->grp_count * mru->grp_time);
+ _xfs_mru_cache_clear_reap_list(mru);
+
+ spin_unlock(&mru->lock);
+}
+
+void
+xfs_mru_cache_destroy(
+ struct xfs_mru_cache *mru)
+{
+ if (!mru || !mru->lists)
+ return;
+
+ xfs_mru_cache_flush(mru);
+
+ kmem_free(mru->lists);
+ kmem_free(mru);
+}
+
+/*
+ * To insert an element, call xfs_mru_cache_insert() with the data store, the
+ * element's key and the client data pointer. This function returns 0 on
+ * success or ENOMEM if memory for the data element couldn't be allocated.
+ */
+int
+xfs_mru_cache_insert(
+ struct xfs_mru_cache *mru,
+ unsigned long key,
+ struct xfs_mru_cache_elem *elem)
+{
+ int error;
+
+ ASSERT(mru && mru->lists);
+ if (!mru || !mru->lists)
+ return -EINVAL;
+
+ if (radix_tree_preload(GFP_NOFS))
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&elem->list_node);
+ elem->key = key;
+
+ spin_lock(&mru->lock);
+ error = radix_tree_insert(&mru->store, key, elem);
+ radix_tree_preload_end();
+ if (!error)
+ _xfs_mru_cache_list_insert(mru, elem);
+ spin_unlock(&mru->lock);
+
+ return error;
+}
+
+/*
+ * To remove an element without calling the free function, call
+ * xfs_mru_cache_remove() with the data store and the element's key. On success
+ * the removed element is returned, otherwise this
+ * function will return a NULL pointer.
+ */
+struct xfs_mru_cache_elem *
+xfs_mru_cache_remove(
+ struct xfs_mru_cache *mru,
+ unsigned long key)
+{
+ struct xfs_mru_cache_elem *elem;
+
+ ASSERT(mru && mru->lists);
+ if (!mru || !mru->lists)
+ return NULL;
+
+ spin_lock(&mru->lock);
+ elem = radix_tree_delete(&mru->store, key);
+ if (elem)
+ list_del(&elem->list_node);
+ spin_unlock(&mru->lock);
+
+ return elem;
+}
+
+/*
+ * To remove an element and call the free function, call xfs_mru_cache_delete()
+ * with the data store and the element's key.
+ */
+void
+xfs_mru_cache_delete(
+ struct xfs_mru_cache *mru,
+ unsigned long key)
+{
+ struct xfs_mru_cache_elem *elem;
+
+ elem = xfs_mru_cache_remove(mru, key);
+ if (elem)
+ mru->free_func(mru->data, elem);
+}
+
+/*
+ * To look up an element using its key, call xfs_mru_cache_lookup() with the
+ * data store and the element's key. If found, the element will be moved to the
+ * head of the MRU list to indicate that it's been touched.
+ *
+ * The internal data structures are protected by a spinlock that is STILL HELD
+ * when this function returns. Call xfs_mru_cache_done() to release it. Note
+ * that it is not safe to call any function that might sleep in the interim.
+ *
+ * The implementation could have used reference counting to avoid this
+ * restriction, but since most clients simply want to get, set or test a member
+ * of the returned data structure, the extra per-element memory isn't warranted.
+ *
+ * If the element isn't found, this function returns NULL and the spinlock is
+ * released. xfs_mru_cache_done() should NOT be called when this occurs.
+ *
+ * Because sparse isn't smart enough to know about conditional lock return
+ * status, we need to help it get it right by annotating the path that does
+ * not release the lock.
+ */
+struct xfs_mru_cache_elem *
+xfs_mru_cache_lookup(
+ struct xfs_mru_cache *mru,
+ unsigned long key)
+{
+ struct xfs_mru_cache_elem *elem;
+
+ ASSERT(mru && mru->lists);
+ if (!mru || !mru->lists)
+ return NULL;
+
+ spin_lock(&mru->lock);
+ elem = radix_tree_lookup(&mru->store, key);
+ if (elem) {
+ list_del(&elem->list_node);
+ _xfs_mru_cache_list_insert(mru, elem);
+ __release(mru_lock); /* help sparse not be stupid */
+ } else
+ spin_unlock(&mru->lock);
+
+ return elem;
+}
+
+/*
+ * To release the internal data structure spinlock after having performed an
+ * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
+ * with the data store pointer.
+ */
+void
+xfs_mru_cache_done(
+ struct xfs_mru_cache *mru)
+ __releases(mru->lock)
+{
+ spin_unlock(&mru->lock);
+}
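+
+/*
+ * Illustrative sketch of the lookup protocol described above, reusing the
+ * hypothetical struct foo_item from the insert example. The internal
+ * spinlock is held from a successful lookup until the matching
+ * xfs_mru_cache_done() call, so nothing that might sleep may run in
+ * between.
+ *
+ *   elem = xfs_mru_cache_lookup(mru, key);
+ *   if (elem) {
+ *           item = container_of(elem, struct foo_item, mru);
+ *           item->payload++;
+ *           xfs_mru_cache_done(mru);
+ *   }
+ */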
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
new file mode 100644
index 000000000..f1fde1ecf
--- /dev/null
+++ b/fs/xfs/xfs_mru_cache.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_MRU_CACHE_H__
+#define __XFS_MRU_CACHE_H__
+
+struct xfs_mru_cache;
+
+struct xfs_mru_cache_elem {
+ struct list_head list_node;
+ unsigned long key;
+};
+
+/* Function pointer type for callback to free a client's data pointer. */
+typedef void (*xfs_mru_cache_free_func_t)(void *, struct xfs_mru_cache_elem *);
+
+int xfs_mru_cache_init(void);
+void xfs_mru_cache_uninit(void);
+int xfs_mru_cache_create(struct xfs_mru_cache **mrup, void *data,
+ unsigned int lifetime_ms, unsigned int grp_count,
+ xfs_mru_cache_free_func_t free_func);
+void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
+int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
+ struct xfs_mru_cache_elem *elem);
+struct xfs_mru_cache_elem *
+xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
+void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
+struct xfs_mru_cache_elem *
+xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
+void xfs_mru_cache_done(struct xfs_mru_cache *mru);
+
+#endif /* __XFS_MRU_CACHE_H__ */
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
new file mode 100644
index 000000000..c4078d0ec
--- /dev/null
+++ b/fs/xfs/xfs_notify_failure.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Fujitsu. All Rights Reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_alloc.h"
+#include "xfs_bit.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_trans.h"
+#include "xfs_ag.h"
+
+#include <linux/mm.h>
+#include <linux/dax.h>
+
+struct xfs_failure_info {
+ xfs_agblock_t startblock;
+ xfs_extlen_t blockcount;
+ int mf_flags;
+ bool want_shutdown;
+};
+
+static pgoff_t
+xfs_failure_pgoff(
+ struct xfs_mount *mp,
+ const struct xfs_rmap_irec *rec,
+ const struct xfs_failure_info *notify)
+{
+ loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset);
+
+ if (notify->startblock > rec->rm_startblock)
+ pos += XFS_FSB_TO_B(mp,
+ notify->startblock - rec->rm_startblock);
+ return pos >> PAGE_SHIFT;
+}
+
+static unsigned long
+xfs_failure_pgcnt(
+ struct xfs_mount *mp,
+ const struct xfs_rmap_irec *rec,
+ const struct xfs_failure_info *notify)
+{
+ xfs_agblock_t end_rec;
+ xfs_agblock_t end_notify;
+ xfs_agblock_t start_cross;
+ xfs_agblock_t end_cross;
+
+ start_cross = max(rec->rm_startblock, notify->startblock);
+
+ end_rec = rec->rm_startblock + rec->rm_blockcount;
+ end_notify = notify->startblock + notify->blockcount;
+ end_cross = min(end_rec, end_notify);
+
+ return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
+}
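+
+/*
+ * Worked example of the intersection arithmetic above (the numbers are
+ * illustrative only, assuming 4096-byte filesystem blocks and 4096-byte
+ * pages): if the rmap record covers agblocks [100, 200) and the notified
+ * failure covers agblocks [150, 300), then start_cross = 150 and
+ * end_cross = 200, so mf_dax_kill_procs() is asked to kill 50 pages
+ * starting at the page that maps file block rm_offset + 50.
+ */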
+
+static int
+xfs_dax_failure_fn(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *data)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip;
+ struct xfs_failure_info *notify = data;
+ int error = 0;
+
+ if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
+ (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
+ notify->want_shutdown = true;
+ return 0;
+ }
+
+ /* Get files that are incore; filter out others that are not in use. */
+ error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
+ 0, &ip);
+ /* Continue the rmap query if the inode isn't incore */
+ if (error == -ENODATA)
+ return 0;
+ if (error) {
+ notify->want_shutdown = true;
+ return 0;
+ }
+
+ error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
+ xfs_failure_pgoff(mp, rec, notify),
+ xfs_failure_pgcnt(mp, rec, notify),
+ notify->mf_flags);
+ xfs_irele(ip);
+ return error;
+}
+
+static int
+xfs_dax_notify_ddev_failure(
+ struct xfs_mount *mp,
+ xfs_daddr_t daddr,
+ xfs_daddr_t bblen,
+ int mf_flags)
+{
+ struct xfs_failure_info notify = { .mf_flags = mf_flags };
+ struct xfs_trans *tp = NULL;
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_buf *agf_bp = NULL;
+ int error = 0;
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr);
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
+ xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen);
+ xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ for (; agno <= end_agno; agno++) {
+ struct xfs_rmap_irec ri_low = { };
+ struct xfs_rmap_irec ri_high;
+ struct xfs_agf *agf;
+ xfs_agblock_t agend;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
+ if (error) {
+ xfs_perag_put(pag);
+ break;
+ }
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
+
+ /*
+ * Set the rmap range from ri_low to ri_high, which represents
+ * a [start, end] range in which we are looking for files or metadata.
+ */
+ memset(&ri_high, 0xFF, sizeof(ri_high));
+ ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
+ if (agno == end_agno)
+ ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);
+
+ agf = agf_bp->b_addr;
+ agend = min(be32_to_cpu(agf->agf_length),
+ ri_high.rm_startblock);
+ notify.startblock = ri_low.rm_startblock;
+ notify.blockcount = agend - ri_low.rm_startblock;
+
+ error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+ xfs_dax_failure_fn, &notify);
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(tp, agf_bp);
+ xfs_perag_put(pag);
+ if (error)
+ break;
+
+ fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
+ }
+
+ xfs_trans_cancel(tp);
+ if (error || notify.want_shutdown) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+ if (!error)
+ error = -EFSCORRUPTED;
+ }
+ return error;
+}
+
+static int
+xfs_dax_notify_failure(
+ struct dax_device *dax_dev,
+ u64 offset,
+ u64 len,
+ int mf_flags)
+{
+ struct xfs_mount *mp = dax_holder(dax_dev);
+ u64 ddev_start;
+ u64 ddev_end;
+
+ if (!(mp->m_super->s_flags & SB_BORN)) {
+ xfs_warn(mp, "filesystem is not ready for notify_failure()!");
+ return -EIO;
+ }
+
+ if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
+ xfs_debug(mp,
+ "notify_failure() not supported on realtime device!");
+ return -EOPNOTSUPP;
+ }
+
+ if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
+ mp->m_logdev_targp != mp->m_ddev_targp) {
+ xfs_err(mp, "ondisk log corrupt, shutting down fs!");
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_has_rmapbt(mp)) {
+ xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
+ return -EOPNOTSUPP;
+ }
+
+ ddev_start = mp->m_ddev_targp->bt_dax_part_off;
+ ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
+
+ /* Ignore ranges that fall entirely outside the filesystem area */
+ if (offset + len < ddev_start)
+ return -ENXIO;
+ if (offset > ddev_end)
+ return -ENXIO;
+
+ /* Calculate the real range when it touches the boundary */
+ if (offset > ddev_start)
+ offset -= ddev_start;
+ else {
+ len -= ddev_start - offset;
+ offset = 0;
+ }
+ if (offset + len > ddev_end)
+ len -= ddev_end - offset;
+
+ return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
+ mf_flags);
+}
+
+const struct dax_holder_operations xfs_dax_holder_operations = {
+ .notify_failure = xfs_dax_notify_failure,
+};
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
new file mode 100644
index 000000000..9737b5a9f
--- /dev/null
+++ b/fs/xfs/xfs_ondisk.h
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2016 Oracle.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ONDISK_H
+#define __XFS_ONDISK_H
+
+#define XFS_CHECK_STRUCT_SIZE(structname, size) \
+ BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
+ #structname ") is wrong, expected " #size)
+
+#define XFS_CHECK_OFFSET(structname, member, off) \
+ BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \
+ "XFS: offsetof(" #structname ", " #member ") is wrong, " \
+ "expected " #off)
+
+#define XFS_CHECK_VALUE(value, expected) \
+ BUILD_BUG_ON_MSG((value) != (expected), \
+ "XFS: value of " #value " is wrong, expected " #expected)
+
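+/*
+ * For example, if struct xfs_agf ever changed size, the
+ * XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224) check below would stop the
+ * build with "XFS: sizeof(struct xfs_agf) is wrong, expected 224".
+ */
+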
+static inline void __init
+xfs_check_ondisk_structs(void)
+{
+ /* ag/file structures */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_acl, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dqblk, 136);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_key, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24);
+ XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t, 4);
+
+ /* dir/attr trees */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 88);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_rmt_hdr, 56);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da3_blkinfo, 56);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da3_intnode, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da3_node_hdr, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_blk_hdr, 48);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_data_hdr, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free_hdr, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf_hdr, 64);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_entry_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_hdr_t, 32);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4);
+
+ /*
+ * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
+ * 4 bytes anyway so it's not obviously a problem. Hence for the moment
+ * we don't check this structure. This can be reinstated when the attr
+ * definitions are updated to use C99 VLA definitions.
+ *
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12);
+ */
+
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7);
+ XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
+ XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16);
+ XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0);
+ XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3);
+ XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0);
+ XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1);
+ XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
+
+ /* log structures */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_extent_32, 12);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28);
+ XFS_CHECK_STRUCT_SIZE(xfs_log_timestamp_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_log_legacy_timestamp, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format, 40);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bui_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bud_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_cui_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_cud_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rui_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16);
+
+ XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_rui_log_format, rui_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_efi_log_format, efi_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16);
+
+ /*
+ * The v5 superblock format extended several v4 header structures with
+ * additional data. While new fields are only accessible on v5
+ * superblocks, it's important that the v5 structures place original v4
+ * fields/headers in the correct location on-disk. For example, we must
+ * be able to find magic values at the same location in certain blocks
+ * regardless of superblock version.
+ *
+ * The following checks ensure that various v5 data structures place the
+ * subset of v4 metadata associated with the same type of block at the
+ * start of the on-disk block. If there is no data structure definition
+ * for certain types of v4 blocks, traverse down to the first field of
+ * common metadata (e.g., magic value) and make sure it is at offset
+ * zero.
+ */
+ XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr, 0);
+ XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0);
+
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat, 192);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers, 24);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat_req, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers_req, 64);
+
+ /*
+ * Make sure the incore inode timestamp range corresponds to hand
+ * converted values based on the ondisk format specification.
+ */
+ XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MIN - XFS_BIGTIME_EPOCH_OFFSET,
+ XFS_LEGACY_TIME_MIN);
+ XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MAX - XFS_BIGTIME_EPOCH_OFFSET,
+ 16299260424LL);
+
+ /* Do the same with the incore quota expiration range. */
+ XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4);
+ XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT,
+ 16299260424LL);
+}
+
+#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
new file mode 100644
index 000000000..37a24f0f7
--- /dev/null
+++ b/fs/xfs/xfs_pnfs.c
@@ -0,0 +1,335 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_iomap.h"
+#include "xfs_pnfs.h"
+
+/*
+ * Ensure that we do not have any outstanding pNFS layouts that can be used by
+ * clients to directly read from or write to this inode. This must be called
+ * before every operation that can remove blocks from the extent map.
+ * Additionally we call it during the write operation, where we aren't concerned
+ * about exposing unallocated blocks but just want to provide basic
+ * synchronization between a local writer and pNFS clients. mmap writes would
+ * also benefit from this sort of synchronization, but due to the tricky locking
+ * rules in the page fault path we don't bother.
+ */
+int
+xfs_break_leased_layouts(
+ struct inode *inode,
+ uint *iolock,
+ bool *did_unlock)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ int error;
+
+ while ((error = break_layout(inode, false)) == -EWOULDBLOCK) {
+ xfs_iunlock(ip, *iolock);
+ *did_unlock = true;
+ error = break_layout(inode, true);
+ *iolock &= ~XFS_IOLOCK_SHARED;
+ *iolock |= XFS_IOLOCK_EXCL;
+ xfs_ilock(ip, *iolock);
+ }
+
+ return error;
+}
+
+/*
+ * Get a unique ID including its location so that the client can identify
+ * the exported device.
+ */
+int
+xfs_fs_get_uuid(
+ struct super_block *sb,
+ u8 *buf,
+ u32 *len,
+ u64 *offset)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ xfs_notice_once(mp,
+"Using experimental pNFS feature, use at your own risk!");
+
+ if (*len < sizeof(uuid_t))
+ return -EINVAL;
+
+ memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t));
+ *len = sizeof(uuid_t);
+ *offset = offsetof(struct xfs_dsb, sb_uuid);
+ return 0;
+}
+
+/*
+ * We cannot use file-based VFS helpers such as file_modified() to update
+ * inode state as we modify the data/metadata in the inode here. Hence we have
+ * to open code the timestamp updates and SUID/SGID stripping. We also need
+ * to set the inode prealloc flag to ensure that the extents we allocate are not
+ * removed if the inode is reclaimed from memory before xfs_fs_commit_blocks()
+ * is called by the client to indicate that data has been written and the file
+ * size can be extended.
+ */
+static int
+xfs_fs_map_update_inode(
+ struct xfs_inode *ip)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
+ 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ VFS_I(ip)->i_mode &= ~S_ISUID;
+ if (VFS_I(ip)->i_mode & S_IXGRP)
+ VFS_I(ip)->i_mode &= ~S_ISGID;
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ ip->i_diflags |= XFS_DIFLAG_PREALLOC;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return xfs_trans_commit(tp);
+}
+
+/*
+ * Get a layout for the pNFS client.
+ */
+int
+xfs_fs_map_blocks(
+ struct inode *inode,
+ loff_t offset,
+ u64 length,
+ struct iomap *iomap,
+ bool write,
+ u32 *device_generation)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap;
+ xfs_fileoff_t offset_fsb, end_fsb;
+ loff_t limit;
+ int bmapi_flags = XFS_BMAPI_ENTIRE;
+ int nimaps = 1;
+ uint lock_flags;
+ int error = 0;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ /*
+ * We can't export inodes residing on the realtime device. The realtime
+ * device doesn't have a UUID to identify it, so the client has no way
+ * to find it.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ return -ENXIO;
+
+ /*
+ * The pNFS block layout spec actually supports reflink like
+ * functionality, but the Linux pNFS server doesn't implement it yet.
+ */
+ if (xfs_is_reflink_inode(ip))
+ return -ENXIO;
+
+ /*
+ * Lock out any other I/O before we flush and invalidate the pagecache,
+ * and then hand out a layout to the remote system. This is very
+ * similar to direct I/O, except that the synchronization is much more
+ * complicated. See the comment near xfs_break_leased_layouts
+ * for a detailed explanation.
+ */
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ error = -EINVAL;
+ limit = mp->m_super->s_maxbytes;
+ if (!write)
+ limit = max(limit, round_up(i_size_read(inode),
+ inode->i_sb->s_blocksize));
+ if (offset > limit)
+ goto out_unlock;
+ if (offset > limit - length)
+ length = limit - offset;
+
+ error = filemap_write_and_wait(inode->i_mapping);
+ if (error)
+ goto out_unlock;
+ error = invalidate_inode_pages2(inode->i_mapping);
+ if (WARN_ON_ONCE(error))
+ goto out_unlock;
+
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+ lock_flags = xfs_ilock_data_map_shared(ip);
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+ &imap, &nimaps, bmapi_flags);
+
+ ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK);
+
+ if (!error && write &&
+ (!nimaps || imap.br_startblock == HOLESTARTBLOCK)) {
+ if (offset + length > XFS_ISIZE(ip))
+ end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
+ else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
+ end_fsb = min(end_fsb, imap.br_startoff +
+ imap.br_blockcount);
+ xfs_iunlock(ip, lock_flags);
+
+ error = xfs_iomap_write_direct(ip, offset_fsb,
+ end_fsb - offset_fsb, 0, &imap);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Ensure the next transaction is committed synchronously so
+ * that the blocks allocated and handed out to the client are
+ * guaranteed to be present even after a server crash.
+ */
+ error = xfs_fs_map_update_inode(ip);
+ if (!error)
+ error = xfs_log_force_inode(ip);
+ if (error)
+ goto out_unlock;
+
+ } else {
+ xfs_iunlock(ip, lock_flags);
+ }
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
+ *device_generation = mp->m_generation;
+ return error;
+out_unlock:
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+}
+
+/*
+ * Ensure the size update falls into a valid allocated block.
+ */
+static int
+xfs_pnfs_validate_isize(
+ struct xfs_inode *ip,
+ xfs_off_t isize)
+{
+ struct xfs_bmbt_irec imap;
+ int nimaps = 1;
+ int error = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1,
+ &imap, &nimaps, 0);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (error)
+ return error;
+
+ if (imap.br_startblock == HOLESTARTBLOCK ||
+ imap.br_startblock == DELAYSTARTBLOCK ||
+ imap.br_state == XFS_EXT_UNWRITTEN)
+ return -EIO;
+ return 0;
+}
+
+/*
+ * Make sure the blocks described by maps are stable on disk. This includes
+ * converting any unwritten extents, flushing the disk cache and updating the
+ * time stamps.
+ *
+ * Note that we rely on the caller to always send us a timestamp update so that
+ * we always commit a transaction here. If that stops being true we will have
+ * to manually flush the cache here similar to what the fsync code path does
+ * for datasyncs on files that have no dirty metadata.
+ */
+int
+xfs_fs_commit_blocks(
+ struct inode *inode,
+ struct iomap *maps,
+ int nr_maps,
+ struct iattr *iattr)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ bool update_isize = false;
+ int error, i;
+ loff_t size;
+
+ ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME));
+
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ size = i_size_read(inode);
+ if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) {
+ update_isize = true;
+ size = iattr->ia_size;
+ }
+
+ for (i = 0; i < nr_maps; i++) {
+ u64 start, length, end;
+
+ start = maps[i].offset;
+ if (start > size)
+ continue;
+
+ end = start + maps[i].length;
+ if (end > size)
+ end = size;
+
+ length = end - start;
+ if (!length)
+ continue;
+
+ /*
+ * Make sure reads through the pagecache see the new data.
+ */
+ error = invalidate_inode_pages2_range(inode->i_mapping,
+ start >> PAGE_SHIFT,
+ (end - 1) >> PAGE_SHIFT);
+ WARN_ON_ONCE(error);
+
+ error = xfs_iomap_write_unwritten(ip, start, length, false);
+ if (error)
+ goto out_drop_iolock;
+ }
+
+ if (update_isize) {
+ error = xfs_pnfs_validate_isize(ip, size);
+ if (error)
+ goto out_drop_iolock;
+ }
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
+ goto out_drop_iolock;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
+ setattr_copy(&init_user_ns, inode, iattr);
+ if (update_isize) {
+ i_size_write(inode, iattr->ia_size);
+ ip->i_disk_size = iattr->ia_size;
+ }
+
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp);
+
+out_drop_iolock:
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+}
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
new file mode 100644
index 000000000..940c6c2ad
--- /dev/null
+++ b/fs/xfs/xfs_pnfs.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XFS_PNFS_H
+#define _XFS_PNFS_H 1
+
+#ifdef CONFIG_EXPORTFS_BLOCK_OPS
+int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
+int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
+ struct iomap *iomap, bool write, u32 *device_generation);
+int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
+ struct iattr *iattr);
+
+int xfs_break_leased_layouts(struct inode *inode, uint *iolock,
+ bool *did_unlock);
+#else
+static inline int
+xfs_break_leased_layouts(struct inode *inode, uint *iolock, bool *did_unlock)
+{
+ return 0;
+}
+#endif /* CONFIG_EXPORTFS_BLOCK_OPS */
+#endif /* _XFS_PNFS_H */
diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c
new file mode 100644
index 000000000..c283b801c
--- /dev/null
+++ b/fs/xfs/xfs_pwork.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trace.h"
+#include "xfs_sysctl.h"
+#include "xfs_pwork.h"
+#include <linux/nmi.h>
+
+/*
+ * Parallel Work Queue
+ * ===================
+ *
+ * Abstract away the details of running a large and "obviously" parallelizable
+ * task across multiple CPUs. Callers initialize the pwork control object with
+ * a desired level of parallelization and a work function. Next, they embed
+ * struct xfs_pwork in whatever structure they use to pass work context to a
+ * worker thread and queue that pwork. The work function will be passed the
+ * pwork item when it is run (from process context) and any returned error will
+ * be recorded in xfs_pwork_ctl.error. Work functions should check for errors
+ * and abort if necessary; the non-zeroness of xfs_pwork_ctl.error does not
+ * stop workqueue item processing.
+ *
+ * This is the rough equivalent of the xfsprogs workqueue code, though we can't
+ * reuse that name here.
+ */
+
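+/*
+ * Minimal usage sketch of the interface described above; everything
+ * prefixed with "foo" is a hypothetical client, not code in this file.
+ * The client embeds struct xfs_pwork in its per-item context, queues as
+ * many items as it likes (a single item is shown), and then destroys the
+ * control structure to wait for completion and collect the first error.
+ *
+ *   struct foo_work {
+ *           struct xfs_pwork        pwork;
+ *           xfs_agnumber_t          agno;
+ *   };
+ *
+ *   static int foo_work_fn(struct xfs_mount *mp, struct xfs_pwork *pwork)
+ *   {
+ *           struct foo_work *fw = container_of(pwork, struct foo_work, pwork);
+ *
+ *           if (xfs_pwork_want_abort(pwork))
+ *                   return 0;
+ *           return foo_process_ag(mp, fw->agno);
+ *   }
+ *
+ *   error = xfs_pwork_init(mp, &pctl, foo_work_fn, "foo");
+ *   if (error)
+ *           return error;
+ *   xfs_pwork_queue(&pctl, &fw->pwork);
+ *   error = xfs_pwork_destroy(&pctl);
+ */
+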
+/* Invoke our caller's function. */
+static void
+xfs_pwork_work(
+ struct work_struct *work)
+{
+ struct xfs_pwork *pwork;
+ struct xfs_pwork_ctl *pctl;
+ int error;
+
+ pwork = container_of(work, struct xfs_pwork, work);
+ pctl = pwork->pctl;
+ error = pctl->work_fn(pctl->mp, pwork);
+ if (error && !pctl->error)
+ pctl->error = error;
+ if (atomic_dec_and_test(&pctl->nr_work))
+ wake_up(&pctl->poll_wait);
+}
+
+/*
+ * Set up control data for parallel work. @work_fn is the function that will
+ * be called. @tag will be written into the kernel threads. The level of
+ * parallelism comes from the debug-only pwork_threads knob when it is set;
+ * otherwise no explicit limit is requested from the workqueue.
+ */
+int
+xfs_pwork_init(
+ struct xfs_mount *mp,
+ struct xfs_pwork_ctl *pctl,
+ xfs_pwork_work_fn work_fn,
+ const char *tag)
+{
+ unsigned int nr_threads = 0;
+
+#ifdef DEBUG
+ if (xfs_globals.pwork_threads >= 0)
+ nr_threads = xfs_globals.pwork_threads;
+#endif
+ trace_xfs_pwork_init(mp, nr_threads, current->pid);
+
+ pctl->wq = alloc_workqueue("%s-%d",
+ WQ_UNBOUND | WQ_SYSFS | WQ_FREEZABLE, nr_threads, tag,
+ current->pid);
+ if (!pctl->wq)
+ return -ENOMEM;
+ pctl->work_fn = work_fn;
+ pctl->error = 0;
+ pctl->mp = mp;
+ atomic_set(&pctl->nr_work, 0);
+ init_waitqueue_head(&pctl->poll_wait);
+
+ return 0;
+}
+
+/* Queue some parallel work. */
+void
+xfs_pwork_queue(
+ struct xfs_pwork_ctl *pctl,
+ struct xfs_pwork *pwork)
+{
+ INIT_WORK(&pwork->work, xfs_pwork_work);
+ pwork->pctl = pctl;
+ atomic_inc(&pctl->nr_work);
+ queue_work(pctl->wq, &pwork->work);
+}
+
+/* Wait for the work to finish and tear down the control structure. */
+int
+xfs_pwork_destroy(
+ struct xfs_pwork_ctl *pctl)
+{
+ destroy_workqueue(pctl->wq);
+ pctl->wq = NULL;
+ return pctl->error;
+}
+
+/*
+ * Wait for the work to finish by polling completion status and touch the soft
+ * lockup watchdog. This is for callers such as mount which hold locks.
+ */
+void
+xfs_pwork_poll(
+ struct xfs_pwork_ctl *pctl)
+{
+ while (wait_event_timeout(pctl->poll_wait,
+ atomic_read(&pctl->nr_work) == 0, HZ) == 0)
+ touch_softlockup_watchdog();
+}
diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h
new file mode 100644
index 000000000..c0ef81fc8
--- /dev/null
+++ b/fs/xfs/xfs_pwork.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_PWORK_H__
+#define __XFS_PWORK_H__
+
+struct xfs_pwork;
+struct xfs_mount;
+
+typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork);
+
+/*
+ * Parallel work coordination structure.
+ */
+struct xfs_pwork_ctl {
+ struct workqueue_struct *wq;
+ struct xfs_mount *mp;
+ xfs_pwork_work_fn work_fn;
+ struct wait_queue_head poll_wait;
+ atomic_t nr_work;
+ int error;
+};
+
+/*
+ * Embed this parallel work control item inside your own work structure,
+ * then queue work with it.
+ */
+struct xfs_pwork {
+ struct work_struct work;
+ struct xfs_pwork_ctl *pctl;
+};
+
+#define XFS_PWORK_SINGLE_THREADED { .pctl = NULL }
+
+/* Have we been told to abort? */
+static inline bool
+xfs_pwork_ctl_want_abort(
+ struct xfs_pwork_ctl *pctl)
+{
+ return pctl && pctl->error;
+}
+
+/* Have we been told to abort? */
+static inline bool
+xfs_pwork_want_abort(
+ struct xfs_pwork *pwork)
+{
+ return xfs_pwork_ctl_want_abort(pwork->pctl);
+}
+
+int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl,
+ xfs_pwork_work_fn work_fn, const char *tag);
+void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork);
+int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl);
+void xfs_pwork_poll(struct xfs_pwork_ctl *pctl);
+
+#endif /* __XFS_PWORK_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
new file mode 100644
index 000000000..18bb4ec4d
--- /dev/null
+++ b/fs/xfs/xfs_qm.c
@@ -0,0 +1,1921 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_iwalk.h"
+#include "xfs_quota.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_qm.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_error.h"
+#include "xfs_ag.h"
+#include "xfs_ialloc.h"
+#include "xfs_log_priv.h"
+
+/*
+ * The global quota manager. There is only one of these for the entire
+ * system, _not_ one per file system. XQM keeps track of the overall
+ * quota functionality, including maintaining the freelist and hash
+ * tables of dquots.
+ */
+STATIC int xfs_qm_init_quotainos(struct xfs_mount *mp);
+STATIC int xfs_qm_init_quotainfo(struct xfs_mount *mp);
+
+STATIC void xfs_qm_destroy_quotainos(struct xfs_quotainfo *qi);
+STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp);
+/*
+ * We use the batch lookup interface to iterate over the dquots as it
+ * currently is the only interface into the radix tree code that allows
+ * fuzzy lookups instead of exact matches. Holding the lock over multiple
+ * operations is fine as all callers run only during mount/umount
+ * or quotaoff.
+ */
+#define XFS_DQ_LOOKUP_BATCH 32
+
+STATIC int
+xfs_qm_dquot_walk(
+ struct xfs_mount *mp,
+ xfs_dqtype_t type,
+ int (*execute)(struct xfs_dquot *dqp, void *data),
+ void *data)
+{
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
+ uint32_t next_index;
+ int last_error = 0;
+ int skipped;
+ int nr_found;
+
+restart:
+ skipped = 0;
+ next_index = 0;
+ nr_found = 0;
+
+ while (1) {
+ struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH];
+ int error = 0;
+ int i;
+
+ mutex_lock(&qi->qi_tree_lock);
+ nr_found = radix_tree_gang_lookup(tree, (void **)batch,
+ next_index, XFS_DQ_LOOKUP_BATCH);
+ if (!nr_found) {
+ mutex_unlock(&qi->qi_tree_lock);
+ break;
+ }
+
+ for (i = 0; i < nr_found; i++) {
+ struct xfs_dquot *dqp = batch[i];
+
+ next_index = dqp->q_id + 1;
+
+ error = execute(batch[i], data);
+ if (error == -EAGAIN) {
+ skipped++;
+ continue;
+ }
+ if (error && last_error != -EFSCORRUPTED)
+ last_error = error;
+ }
+
+ mutex_unlock(&qi->qi_tree_lock);
+
+ /* bail out if the filesystem is corrupted. */
+ if (last_error == -EFSCORRUPTED) {
+ skipped = 0;
+ break;
+ }
+ /* we're done if id overflows back to zero */
+ if (!next_index)
+ break;
+ }
+
+ if (skipped) {
+ delay(1);
+ goto restart;
+ }
+
+ return last_error;
+}
+
+
+/*
+ * Purge a dquot from all tracking data structures and free it.
+ */
+STATIC int
+xfs_qm_dqpurge(
+ struct xfs_dquot *dqp,
+ void *data)
+{
+ struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
+ int error = -EAGAIN;
+
+ xfs_dqlock(dqp);
+ if ((dqp->q_flags & XFS_DQFLAG_FREEING) || dqp->q_nrefs != 0)
+ goto out_unlock;
+
+ dqp->q_flags |= XFS_DQFLAG_FREEING;
+
+ xfs_dqflock(dqp);
+
+ /*
+ * If we are turning this type of quota off, we don't care
+ * about the dirty metadata sitting in this dquot. OTOH, if
+ * we're unmounting, we do care, so we flush it and wait.
+ */
+ if (XFS_DQ_IS_DIRTY(dqp)) {
+ struct xfs_buf *bp = NULL;
+
+ /*
+ * We don't care about getting disk errors here. We need
+ * to purge this dquot anyway, so we go ahead regardless.
+ */
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (!error) {
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ } else if (error == -EAGAIN) {
+ dqp->q_flags &= ~XFS_DQFLAG_FREEING;
+ goto out_unlock;
+ }
+ xfs_dqflock(dqp);
+ }
+
+ ASSERT(atomic_read(&dqp->q_pincount) == 0);
+ ASSERT(xlog_is_shutdown(dqp->q_logitem.qli_item.li_log) ||
+ !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags));
+
+ xfs_dqfunlock(dqp);
+ xfs_dqunlock(dqp);
+
+ radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id);
+ qi->qi_dquots--;
+
+ /*
+ * We move dquots to the freelist as soon as their reference count
+ * hits zero, so it really should be on the freelist here.
+ */
+ ASSERT(!list_empty(&dqp->q_lru));
+ list_lru_del(&qi->qi_lru, &dqp->q_lru);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
+
+ xfs_qm_dqdestroy(dqp);
+ return 0;
+
+out_unlock:
+ xfs_dqunlock(dqp);
+ return error;
+}
+
+/*
+ * Purge the dquot cache.
+ */
+static void
+xfs_qm_dqpurge_all(
+ struct xfs_mount *mp)
+{
+ xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL);
+ xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL);
+ xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL);
+}
+
+/*
+ * Just destroy the quotainfo structure.
+ */
+void
+xfs_qm_unmount(
+ struct xfs_mount *mp)
+{
+ if (mp->m_quotainfo) {
+ xfs_qm_dqpurge_all(mp);
+ xfs_qm_destroy_quotainfo(mp);
+ }
+}
+
+/*
+ * Called from the vfsops layer.
+ */
+void
+xfs_qm_unmount_quotas(
+ xfs_mount_t *mp)
+{
+ /*
+ * Release the dquots that the root inode, et al. might be holding,
+ * before we flush quotas and blow away the quotainfo structure.
+ */
+ ASSERT(mp->m_rootip);
+ xfs_qm_dqdetach(mp->m_rootip);
+ if (mp->m_rbmip)
+ xfs_qm_dqdetach(mp->m_rbmip);
+ if (mp->m_rsumip)
+ xfs_qm_dqdetach(mp->m_rsumip);
+
+ /*
+ * Release the quota inodes.
+ */
+ if (mp->m_quotainfo) {
+ if (mp->m_quotainfo->qi_uquotaip) {
+ xfs_irele(mp->m_quotainfo->qi_uquotaip);
+ mp->m_quotainfo->qi_uquotaip = NULL;
+ }
+ if (mp->m_quotainfo->qi_gquotaip) {
+ xfs_irele(mp->m_quotainfo->qi_gquotaip);
+ mp->m_quotainfo->qi_gquotaip = NULL;
+ }
+ if (mp->m_quotainfo->qi_pquotaip) {
+ xfs_irele(mp->m_quotainfo->qi_pquotaip);
+ mp->m_quotainfo->qi_pquotaip = NULL;
+ }
+ }
+}
+
+STATIC int
+xfs_qm_dqattach_one(
+ struct xfs_inode *ip,
+ xfs_dqtype_t type,
+ bool doalloc,
+ struct xfs_dquot **IO_idqpp)
+{
+ struct xfs_dquot *dqp;
+ int error;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ error = 0;
+
+ /*
+ * See if we already have it in the inode itself. IO_idqpp is &i_udquot
+ * or &i_gdquot. This made the code look weird, but made the logic a lot
+ * simpler.
+ */
+ dqp = *IO_idqpp;
+ if (dqp) {
+ trace_xfs_dqattach_found(dqp);
+ return 0;
+ }
+
+ /*
+ * Find the dquot from somewhere. This bumps the reference count of
+ * dquot and returns it locked. This can return ENOENT if dquot didn't
+ * exist on disk and we didn't ask it to allocate; ESRCH if quotas got
+ * turned off suddenly.
+ */
+ error = xfs_qm_dqget_inode(ip, type, doalloc, &dqp);
+ if (error)
+ return error;
+
+ trace_xfs_dqattach_get(dqp);
+
+ /*
+ * dqget may have dropped and re-acquired the ilock, but it guarantees
+ * that the dquot returned is the one that should go in the inode.
+ */
+ *IO_idqpp = dqp;
+ xfs_dqunlock(dqp);
+ return 0;
+}
+
+static bool
+xfs_qm_need_dqattach(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (!XFS_IS_QUOTA_ON(mp))
+ return false;
+ if (!XFS_NOT_DQATTACHED(mp, ip))
+ return false;
+ if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
+ return false;
+ return true;
+}
+
+/*
+ * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
+ * into account.
+ * If @doalloc is true, the dquot(s) will be allocated if needed.
+ * Inode may get unlocked and relocked in here, and the caller must deal with
+ * the consequences.
+ */
+int
+xfs_qm_dqattach_locked(
+ xfs_inode_t *ip,
+ bool doalloc)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ int error = 0;
+
+ if (!xfs_qm_need_dqattach(ip))
+ return 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
+ error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER,
+ doalloc, &ip->i_udquot);
+ if (error)
+ goto done;
+ ASSERT(ip->i_udquot);
+ }
+
+ if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
+ error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_GROUP,
+ doalloc, &ip->i_gdquot);
+ if (error)
+ goto done;
+ ASSERT(ip->i_gdquot);
+ }
+
+ if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
+ error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_PROJ,
+ doalloc, &ip->i_pdquot);
+ if (error)
+ goto done;
+ ASSERT(ip->i_pdquot);
+ }
+
+done:
+ /*
+ * Don't worry about the dquots that we may have attached before any
+ * error - they'll get detached later if it has not already been done.
+ */
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ return error;
+}
+
+int
+xfs_qm_dqattach(
+ struct xfs_inode *ip)
+{
+ int error;
+
+ if (!xfs_qm_need_dqattach(ip))
+ return 0;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_qm_dqattach_locked(ip, false);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ return error;
+}
+
+/*
+ * Release dquots (and their references) if any.
+ * The inode should be locked EXCL except when this is called by
+ * xfs_ireclaim.
+ */
+void
+xfs_qm_dqdetach(
+ xfs_inode_t *ip)
+{
+ if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot))
+ return;
+
+ trace_xfs_dquot_dqdetach(ip);
+
+ ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino));
+ if (ip->i_udquot) {
+ xfs_qm_dqrele(ip->i_udquot);
+ ip->i_udquot = NULL;
+ }
+ if (ip->i_gdquot) {
+ xfs_qm_dqrele(ip->i_gdquot);
+ ip->i_gdquot = NULL;
+ }
+ if (ip->i_pdquot) {
+ xfs_qm_dqrele(ip->i_pdquot);
+ ip->i_pdquot = NULL;
+ }
+}
+
+struct xfs_qm_isolate {
+ struct list_head buffers;
+ struct list_head dispose;
+};
+
+static enum lru_status
+xfs_qm_dquot_isolate(
+ struct list_head *item,
+ struct list_lru_one *lru,
+ spinlock_t *lru_lock,
+ void *arg)
+ __releases(lru_lock) __acquires(lru_lock)
+{
+ struct xfs_dquot *dqp = container_of(item,
+ struct xfs_dquot, q_lru);
+ struct xfs_qm_isolate *isol = arg;
+
+ if (!xfs_dqlock_nowait(dqp))
+ goto out_miss_busy;
+
+ /*
+ * This dquot has acquired a reference in the meantime; remove it from
+ * the freelist and try again.
+ */
+ if (dqp->q_nrefs) {
+ xfs_dqunlock(dqp);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants);
+
+ trace_xfs_dqreclaim_want(dqp);
+ list_lru_isolate(lru, &dqp->q_lru);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
+ return LRU_REMOVED;
+ }
+
+ /*
+ * If the dquot is dirty, flush it. If it's already being flushed, just
+ * skip it so there is time for the IO to complete before we try to
+ * reclaim it again on the next LRU pass.
+ */
+ if (!xfs_dqflock_nowait(dqp)) {
+ xfs_dqunlock(dqp);
+ goto out_miss_busy;
+ }
+
+ if (XFS_DQ_IS_DIRTY(dqp)) {
+ struct xfs_buf *bp = NULL;
+ int error;
+
+ trace_xfs_dqreclaim_dirty(dqp);
+
+ /* we have to drop the LRU lock to flush the dquot */
+ spin_unlock(lru_lock);
+
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (error)
+ goto out_unlock_dirty;
+
+ xfs_buf_delwri_queue(bp, &isol->buffers);
+ xfs_buf_relse(bp);
+ goto out_unlock_dirty;
+ }
+ xfs_dqfunlock(dqp);
+
+ /*
+ * Prevent lookups now that we are past the point of no return.
+ */
+ dqp->q_flags |= XFS_DQFLAG_FREEING;
+ xfs_dqunlock(dqp);
+
+ ASSERT(dqp->q_nrefs == 0);
+ list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
+ trace_xfs_dqreclaim_done(dqp);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
+ return LRU_REMOVED;
+
+out_miss_busy:
+ trace_xfs_dqreclaim_busy(dqp);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
+ return LRU_SKIP;
+
+out_unlock_dirty:
+ trace_xfs_dqreclaim_busy(dqp);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
+ xfs_dqunlock(dqp);
+ spin_lock(lru_lock);
+ return LRU_RETRY;
+}
+
+static unsigned long
+xfs_qm_shrink_scan(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_quotainfo *qi = container_of(shrink,
+ struct xfs_quotainfo, qi_shrinker);
+ struct xfs_qm_isolate isol;
+ unsigned long freed;
+ int error;
+
+ if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM))
+ return 0;
+
+ INIT_LIST_HEAD(&isol.buffers);
+ INIT_LIST_HEAD(&isol.dispose);
+
+ freed = list_lru_shrink_walk(&qi->qi_lru, sc,
+ xfs_qm_dquot_isolate, &isol);
+
+ error = xfs_buf_delwri_submit(&isol.buffers);
+ if (error)
+ xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
+
+ while (!list_empty(&isol.dispose)) {
+ struct xfs_dquot *dqp;
+
+ dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru);
+ list_del_init(&dqp->q_lru);
+ xfs_qm_dqfree_one(dqp);
+ }
+
+ return freed;
+}
+
+static unsigned long
+xfs_qm_shrink_count(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_quotainfo *qi = container_of(shrink,
+ struct xfs_quotainfo, qi_shrinker);
+
+ return list_lru_shrink_count(&qi->qi_lru, sc);
+}
+
+STATIC void
+xfs_qm_set_defquota(
+ struct xfs_mount *mp,
+ xfs_dqtype_t type,
+ struct xfs_quotainfo *qinf)
+{
+ struct xfs_dquot *dqp;
+ struct xfs_def_quota *defq;
+ int error;
+
+ error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
+ if (error)
+ return;
+
+ defq = xfs_get_defquota(qinf, xfs_dquot_type(dqp));
+
+ /*
+ * Timers and warnings have already been set; just set the
+ * default limits for this quota type.
+ */
+ defq->blk.hard = dqp->q_blk.hardlimit;
+ defq->blk.soft = dqp->q_blk.softlimit;
+ defq->ino.hard = dqp->q_ino.hardlimit;
+ defq->ino.soft = dqp->q_ino.softlimit;
+ defq->rtb.hard = dqp->q_rtb.hardlimit;
+ defq->rtb.soft = dqp->q_rtb.softlimit;
+ xfs_qm_dqdestroy(dqp);
+}
+
+/* Initialize quota time limits from the root dquot. */
+static void
+xfs_qm_init_timelimits(
+ struct xfs_mount *mp,
+ xfs_dqtype_t type)
+{
+ struct xfs_quotainfo *qinf = mp->m_quotainfo;
+ struct xfs_def_quota *defq;
+ struct xfs_dquot *dqp;
+ int error;
+
+ defq = xfs_get_defquota(qinf, type);
+
+ defq->blk.time = XFS_QM_BTIMELIMIT;
+ defq->ino.time = XFS_QM_ITIMELIMIT;
+ defq->rtb.time = XFS_QM_RTBTIMELIMIT;
+
+ /*
+ * We try to get the limits from the superuser's limits fields.
+ * This is quite hacky, but it is standard quota practice.
+ *
+ * Since we may not have done a quotacheck by this point, just read
+ * the dquot without attaching it to any hashtables or lists.
+ */
+ error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
+ if (error)
+ return;
+
+ /*
+ * The warnings and timers set the grace period given to a user or
+ * group before writes are no longer allowed. If it is zero, a
+ * default is used.
+ */
+ if (dqp->q_blk.timer)
+ defq->blk.time = dqp->q_blk.timer;
+ if (dqp->q_ino.timer)
+ defq->ino.time = dqp->q_ino.timer;
+ if (dqp->q_rtb.timer)
+ defq->rtb.time = dqp->q_rtb.timer;
+
+ xfs_qm_dqdestroy(dqp);
+}
+
+/*
+ * This initializes all the quota information that's kept in the
+ * mount structure
+ */
+STATIC int
+xfs_qm_init_quotainfo(
+ struct xfs_mount *mp)
+{
+ struct xfs_quotainfo *qinf;
+ int error;
+
+ ASSERT(XFS_IS_QUOTA_ON(mp));
+
+ qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0);
+
+ error = list_lru_init(&qinf->qi_lru);
+ if (error)
+ goto out_free_qinf;
+
+ /*
+ * See if the quota inodes are set up, and if not, allocate them,
+ * and change the superblock accordingly.
+ */
+ error = xfs_qm_init_quotainos(mp);
+ if (error)
+ goto out_free_lru;
+
+ INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
+ INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
+ INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
+ mutex_init(&qinf->qi_tree_lock);
+
+ /* mutex used to serialize quotaoffs */
+ mutex_init(&qinf->qi_quotaofflock);
+
+ /* Precalc some constants */
+ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
+ qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen);
+ if (xfs_has_bigtime(mp)) {
+ qinf->qi_expiry_min =
+ xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MIN);
+ qinf->qi_expiry_max =
+ xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MAX);
+ } else {
+ qinf->qi_expiry_min = XFS_DQ_LEGACY_EXPIRY_MIN;
+ qinf->qi_expiry_max = XFS_DQ_LEGACY_EXPIRY_MAX;
+ }
+ trace_xfs_quota_expiry_range(mp, qinf->qi_expiry_min,
+ qinf->qi_expiry_max);
+
+ mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
+
+ xfs_qm_init_timelimits(mp, XFS_DQTYPE_USER);
+ xfs_qm_init_timelimits(mp, XFS_DQTYPE_GROUP);
+ xfs_qm_init_timelimits(mp, XFS_DQTYPE_PROJ);
+
+ if (XFS_IS_UQUOTA_ON(mp))
+ xfs_qm_set_defquota(mp, XFS_DQTYPE_USER, qinf);
+ if (XFS_IS_GQUOTA_ON(mp))
+ xfs_qm_set_defquota(mp, XFS_DQTYPE_GROUP, qinf);
+ if (XFS_IS_PQUOTA_ON(mp))
+ xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf);
+
+ qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
+ qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
+ qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
+ qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
+
+ error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s",
+ mp->m_super->s_id);
+ if (error)
+ goto out_free_inos;
+
+ return 0;
+
+out_free_inos:
+ mutex_destroy(&qinf->qi_quotaofflock);
+ mutex_destroy(&qinf->qi_tree_lock);
+ xfs_qm_destroy_quotainos(qinf);
+out_free_lru:
+ list_lru_destroy(&qinf->qi_lru);
+out_free_qinf:
+ kmem_free(qinf);
+ mp->m_quotainfo = NULL;
+ return error;
+}
+
+/*
+ * Gets called when unmounting a filesystem or when all quotas get
+ * turned off.
+ * This purges the quota inodes, destroys locks and frees itself.
+ */
+void
+xfs_qm_destroy_quotainfo(
+ struct xfs_mount *mp)
+{
+ struct xfs_quotainfo *qi;
+
+ qi = mp->m_quotainfo;
+ ASSERT(qi != NULL);
+
+ unregister_shrinker(&qi->qi_shrinker);
+ list_lru_destroy(&qi->qi_lru);
+ xfs_qm_destroy_quotainos(qi);
+ mutex_destroy(&qi->qi_tree_lock);
+ mutex_destroy(&qi->qi_quotaofflock);
+ kmem_free(qi);
+ mp->m_quotainfo = NULL;
+}
+
+/*
+ * Create an inode and return with a reference already taken, but unlocked.
+ * This is how we create quota inodes.
+ */
+STATIC int
+xfs_qm_qino_alloc(
+ struct xfs_mount *mp,
+ struct xfs_inode **ipp,
+ unsigned int flags)
+{
+ struct xfs_trans *tp;
+ int error;
+ bool need_alloc = true;
+
+ *ipp = NULL;
+ /*
+ * With a superblock that doesn't have a separate pquotino, we
+ * share an inode between gquota and pquota. If the on-disk
+ * superblock has GQUOTA and the filesystem is now mounted
+ * with PQUOTA, just use sb_gquotino for sb_pquotino and
+ * vice-versa.
+ */
+ if (!xfs_has_pquotino(mp) &&
+ (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) {
+ xfs_ino_t ino = NULLFSINO;
+
+ if ((flags & XFS_QMOPT_PQUOTA) &&
+ (mp->m_sb.sb_gquotino != NULLFSINO)) {
+ ino = mp->m_sb.sb_gquotino;
+ if (XFS_IS_CORRUPT(mp,
+ mp->m_sb.sb_pquotino != NULLFSINO))
+ return -EFSCORRUPTED;
+ } else if ((flags & XFS_QMOPT_GQUOTA) &&
+ (mp->m_sb.sb_pquotino != NULLFSINO)) {
+ ino = mp->m_sb.sb_pquotino;
+ if (XFS_IS_CORRUPT(mp,
+ mp->m_sb.sb_gquotino != NULLFSINO))
+ return -EFSCORRUPTED;
+ }
+ if (ino != NULLFSINO) {
+ error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
+ if (error)
+ return error;
+ mp->m_sb.sb_gquotino = NULLFSINO;
+ mp->m_sb.sb_pquotino = NULLFSINO;
+ need_alloc = false;
+ }
+ }
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
+ need_alloc ? XFS_QM_QINOCREATE_SPACE_RES(mp) : 0,
+ 0, 0, &tp);
+ if (error)
+ return error;
+
+ if (need_alloc) {
+ xfs_ino_t ino;
+
+ error = xfs_dialloc(&tp, 0, S_IFREG, &ino);
+ if (!error)
+ error = xfs_init_new_inode(&init_user_ns, tp, NULL, ino,
+ S_IFREG, 1, 0, 0, false, ipp);
+ if (error) {
+ xfs_trans_cancel(tp);
+ return error;
+ }
+ }
+
+ /*
+ * Make the changes in the superblock, and log those too.
+ * sbfields arg may contain fields other than *QUOTINO;
+ * VERSIONNUM for example.
+ */
+ spin_lock(&mp->m_sb_lock);
+ if (flags & XFS_QMOPT_SBVERSION) {
+ ASSERT(!xfs_has_quota(mp));
+
+ xfs_add_quota(mp);
+ mp->m_sb.sb_uquotino = NULLFSINO;
+ mp->m_sb.sb_gquotino = NULLFSINO;
+ mp->m_sb.sb_pquotino = NULLFSINO;
+
+ /* qflags will get updated fully _after_ quotacheck */
+ mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT;
+ }
+ if (flags & XFS_QMOPT_UQUOTA)
+ mp->m_sb.sb_uquotino = (*ipp)->i_ino;
+ else if (flags & XFS_QMOPT_GQUOTA)
+ mp->m_sb.sb_gquotino = (*ipp)->i_ino;
+ else
+ mp->m_sb.sb_pquotino = (*ipp)->i_ino;
+ spin_unlock(&mp->m_sb_lock);
+ xfs_log_sb(tp);
+
+ error = xfs_trans_commit(tp);
+ if (error) {
+ ASSERT(xfs_is_shutdown(mp));
+ xfs_alert(mp, "%s failed (error %d)!", __func__, error);
+ }
+ if (need_alloc)
+ xfs_finish_inode_setup(*ipp);
+ return error;
+}
+
+
+STATIC void
+xfs_qm_reset_dqcounts(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type)
+{
+ struct xfs_dqblk *dqb;
+ int j;
+
+ trace_xfs_reset_dqcounts(bp, _RET_IP_);
+
+ /*
+ * Reset all counters and timers. They'll be
+ * started afresh by xfs_qm_quotacheck.
+ */
+#ifdef DEBUG
+ j = (int)XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) /
+ sizeof(struct xfs_dqblk);
+ ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
+#endif
+ dqb = bp->b_addr;
+ for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
+ struct xfs_disk_dquot *ddq;
+
+ ddq = (struct xfs_disk_dquot *)&dqb[j];
+
+ /*
+ * Do a sanity check, and if needed, repair the dqblk. Don't
+ * output any warnings because it's perfectly possible to
+ * find uninitialised dquot blks. See comment in
+ * xfs_dquot_verify.
+ */
+ if (xfs_dqblk_verify(mp, &dqb[j], id + j) ||
+ (dqb[j].dd_diskdq.d_type & XFS_DQTYPE_REC_MASK) != type)
+ xfs_dqblk_repair(mp, &dqb[j], id + j, type);
+
+ /*
+ * Reset the type in case we are reusing the group quota file for
+ * project quotas or vice versa
+ */
+ ddq->d_type = type;
+ ddq->d_bcount = 0;
+ ddq->d_icount = 0;
+ ddq->d_rtbcount = 0;
+
+ /*
+ * dquot id 0 stores the default grace period and the maximum
+ * warning limit that were set by the administrator, so we
+ * should not reset them.
+ */
+ if (ddq->d_id != 0) {
+ ddq->d_btimer = 0;
+ ddq->d_itimer = 0;
+ ddq->d_rtbtimer = 0;
+ ddq->d_bwarns = 0;
+ ddq->d_iwarns = 0;
+ ddq->d_rtbwarns = 0;
+ if (xfs_has_bigtime(mp))
+ ddq->d_type |= XFS_DQTYPE_BIGTIME;
+ }
+
+ if (xfs_has_crc(mp)) {
+ xfs_update_cksum((char *)&dqb[j],
+ sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+ }
+}
+
+STATIC int
+xfs_qm_reset_dqcounts_all(
+ struct xfs_mount *mp,
+ xfs_dqid_t firstid,
+ xfs_fsblock_t bno,
+ xfs_filblks_t blkcnt,
+ xfs_dqtype_t type,
+ struct list_head *buffer_list)
+{
+ struct xfs_buf *bp;
+ int error = 0;
+
+ ASSERT(blkcnt > 0);
+
+ /*
+ * Blkcnt arg can be a very big number, and might even be
+ * larger than the log itself. So, we have to break it up into
+ * manageable-sized transactions.
+ * Note that we don't start a permanent transaction here; we might
+ * not be able to get a log reservation for the whole thing up front,
+ * and we don't really care to either, because we just discard
+ * everything if we were to crash in the middle of this loop.
+ */
+ while (blkcnt--) {
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, bno),
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+ &xfs_dquot_buf_ops);
+
+ /*
+ * CRC and validation errors will return -EFSCORRUPTED here. If
+ * this occurs, re-read without CRC validation so that we can
+ * repair the damage via xfs_qm_reset_dqcounts(). This process
+ * will leave a trace in the log indicating corruption has
+ * been detected.
+ */
+ if (error == -EFSCORRUPTED) {
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, bno),
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+ NULL);
+ }
+
+ if (error)
+ break;
+
+ /*
+ * A corrupt buffer might not have a verifier attached, so
+ * make sure we have the correct one attached before writeback
+ * occurs.
+ */
+ bp->b_ops = &xfs_dquot_buf_ops;
+ xfs_qm_reset_dqcounts(mp, bp, firstid, type);
+ xfs_buf_delwri_queue(bp, buffer_list);
+ xfs_buf_relse(bp);
+
+		/* go to the next block. */
+ bno++;
+ firstid += mp->m_quotainfo->qi_dqperchunk;
+ }
+
+ return error;
+}
+
+/*
+ * Iterate over all allocated dquot blocks in this quota inode, zeroing all
+ * counters for every chunk of dquots that we find.
+ */
+STATIC int
+xfs_qm_reset_dqcounts_buf(
+ struct xfs_mount *mp,
+ struct xfs_inode *qip,
+ xfs_dqtype_t type,
+ struct list_head *buffer_list)
+{
+ struct xfs_bmbt_irec *map;
+ int i, nmaps; /* number of map entries */
+ int error; /* return value */
+ xfs_fileoff_t lblkno;
+ xfs_filblks_t maxlblkcnt;
+ xfs_dqid_t firstid;
+ xfs_fsblock_t rablkno;
+ xfs_filblks_t rablkcnt;
+
+ error = 0;
+ /*
+ * This looks racy, but we can't keep an inode lock across a
+ * trans_reserve. But, this gets called during quotacheck, and that
+ * happens only at mount time which is single threaded.
+ */
+ if (qip->i_nblocks == 0)
+ return 0;
+
+ map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0);
+
+ lblkno = 0;
+ maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+ do {
+ uint lock_mode;
+
+ nmaps = XFS_DQITER_MAP_SIZE;
+ /*
+ * We aren't changing the inode itself. Just changing
+ * some of its data. No new blocks are added here, and
+ * the inode is never added to the transaction.
+ */
+ lock_mode = xfs_ilock_data_map_shared(qip);
+ error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno,
+ map, &nmaps, 0);
+ xfs_iunlock(qip, lock_mode);
+ if (error)
+ break;
+
+ ASSERT(nmaps <= XFS_DQITER_MAP_SIZE);
+ for (i = 0; i < nmaps; i++) {
+ ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
+ ASSERT(map[i].br_blockcount);
+
+ lblkno += map[i].br_blockcount;
+
+ if (map[i].br_startblock == HOLESTARTBLOCK)
+ continue;
+
+ firstid = (xfs_dqid_t) map[i].br_startoff *
+ mp->m_quotainfo->qi_dqperchunk;
+ /*
+ * Do a read-ahead on the next extent.
+ */
+ if ((i+1 < nmaps) &&
+ (map[i+1].br_startblock != HOLESTARTBLOCK)) {
+ rablkcnt = map[i+1].br_blockcount;
+ rablkno = map[i+1].br_startblock;
+ while (rablkcnt--) {
+ xfs_buf_readahead(mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, rablkno),
+ mp->m_quotainfo->qi_dqchunklen,
+ &xfs_dquot_buf_ops);
+ rablkno++;
+ }
+ }
+ /*
+ * Iterate thru all the blks in the extent and
+ * reset the counters of all the dquots inside them.
+ */
+ error = xfs_qm_reset_dqcounts_all(mp, firstid,
+ map[i].br_startblock,
+ map[i].br_blockcount,
+ type, buffer_list);
+ if (error)
+ goto out;
+ }
+ } while (nmaps > 0);
+
+out:
+ kmem_free(map);
+ return error;
+}
+
+/*
+ * Called by xfs_qm_dqusage_adjust while doing a quotacheck.
+ *
+ * Given the inode and a dquot id, this updates both the incore dquot as well
+ * as the buffer copy. This is so that once the quotacheck is done, we can
+ * just log all the buffers, as opposed to logging numerous updates to
+ * individual dquots.
+ */
+STATIC int
+xfs_qm_quotacheck_dqadjust(
+ struct xfs_inode *ip,
+ xfs_dqtype_t type,
+ xfs_qcnt_t nblks,
+ xfs_qcnt_t rtblks)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_dquot *dqp;
+ xfs_dqid_t id;
+ int error;
+
+ id = xfs_qm_id_for_quotatype(ip, type);
+ error = xfs_qm_dqget(mp, id, type, true, &dqp);
+ if (error) {
+ /*
+ * Shouldn't be able to turn off quotas here.
+ */
+ ASSERT(error != -ESRCH);
+ ASSERT(error != -ENOENT);
+ return error;
+ }
+
+ trace_xfs_dqadjust(dqp);
+
+ /*
+ * Adjust the inode count and the block count to reflect this inode's
+ * resource usage.
+ */
+ dqp->q_ino.count++;
+ dqp->q_ino.reserved++;
+ if (nblks) {
+ dqp->q_blk.count += nblks;
+ dqp->q_blk.reserved += nblks;
+ }
+ if (rtblks) {
+ dqp->q_rtb.count += rtblks;
+ dqp->q_rtb.reserved += rtblks;
+ }
+
+ /*
+ * Set default limits, adjust timers (since we changed usages)
+ *
+ * There are no timers for the default values set in the root dquot.
+ */
+ if (dqp->q_id) {
+ xfs_qm_adjust_dqlimits(dqp);
+ xfs_qm_adjust_dqtimers(dqp);
+ }
+
+ dqp->q_flags |= XFS_DQFLAG_DIRTY;
+ xfs_qm_dqput(dqp);
+ return 0;
+}
+
+/*
+ * Callback routine supplied to the inode walk. Given an inumber, find its
+ * dquots and update them to account for resources taken by that inode.
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_dqusage_adjust(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ void *data)
+{
+ struct xfs_inode *ip;
+ xfs_qcnt_t nblks;
+ xfs_filblks_t rtblks = 0; /* total rt blks */
+ int error;
+
+ ASSERT(XFS_IS_QUOTA_ON(mp));
+
+ /*
+ * rootino must have its resources accounted for, not so with the quota
+ * inodes.
+ */
+ if (xfs_is_quota_inode(&mp->m_sb, ino))
+ return 0;
+
+ /*
+ * We don't _need_ to take the ilock EXCL here because quotacheck runs
+ * at mount time and therefore nobody will be racing chown/chproj.
+ */
+ error = xfs_iget(mp, tp, ino, XFS_IGET_DONTCACHE, 0, &ip);
+ if (error == -EINVAL || error == -ENOENT)
+ return 0;
+ if (error)
+ return error;
+
+ ASSERT(ip->i_delayed_blks == 0);
+
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+ if (error)
+ goto error0;
+
+ xfs_bmap_count_leaves(ifp, &rtblks);
+ }
+
+ nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks;
+
+ /*
+ * Add the (disk blocks and inode) resources occupied by this
+ * inode to its dquots. We do this adjustment in the incore dquot,
+ * and also copy the changes to its buffer.
+ * We don't care about putting these changes in a transaction
+ * envelope because if we crash in the middle of a 'quotacheck'
+ * we have to start from the beginning anyway.
+ * Once we're done, we'll log all the dquot bufs.
+ *
+ * The *QUOTA_ON checks below may look pretty racy, but quotachecks
+ * and quotaoffs don't race. (Quotachecks happen at mount time only).
+ */
+ if (XFS_IS_UQUOTA_ON(mp)) {
+ error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_USER, nblks,
+ rtblks);
+ if (error)
+ goto error0;
+ }
+
+ if (XFS_IS_GQUOTA_ON(mp)) {
+ error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_GROUP, nblks,
+ rtblks);
+ if (error)
+ goto error0;
+ }
+
+ if (XFS_IS_PQUOTA_ON(mp)) {
+ error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_PROJ, nblks,
+ rtblks);
+ if (error)
+ goto error0;
+ }
+
+error0:
+ xfs_irele(ip);
+ return error;
+}
+
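+/*
+ * Flush one dirty dquot to its backing buffer during quotacheck and queue
+ * that buffer on the supplied delwri list. Returns -EAGAIN if the flush
+ * lock cannot be obtained immediately so that the caller may retry.
+ */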
+STATIC int
+xfs_qm_flush_one(
+ struct xfs_dquot *dqp,
+ void *data)
+{
+ struct xfs_mount *mp = dqp->q_mount;
+ struct list_head *buffer_list = data;
+ struct xfs_buf *bp = NULL;
+ int error = 0;
+
+ xfs_dqlock(dqp);
+ if (dqp->q_flags & XFS_DQFLAG_FREEING)
+ goto out_unlock;
+ if (!XFS_DQ_IS_DIRTY(dqp))
+ goto out_unlock;
+
+ /*
+ * The only way the dquot is already flush locked by the time quotacheck
+ * gets here is if reclaim flushed it before the dqadjust walk dirtied
+ * it for the final time. Quotacheck collects all dquot bufs in the
+ * local delwri queue before dquots are dirtied, so reclaim can't have
+ * possibly queued it for I/O. The only way out is to push the buffer to
+ * cycle the flush lock.
+ */
+ if (!xfs_dqflock_nowait(dqp)) {
+ /* buf is pinned in-core by delwri list */
+ error = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ if (error)
+ goto out_unlock;
+
+ if (!(bp->b_flags & _XBF_DELWRI_Q)) {
+ error = -EAGAIN;
+ xfs_buf_relse(bp);
+ goto out_unlock;
+ }
+ xfs_buf_unlock(bp);
+
+ xfs_buf_delwri_pushbuf(bp, buffer_list);
+ xfs_buf_rele(bp);
+
+ error = -EAGAIN;
+ goto out_unlock;
+ }
+
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (error)
+ goto out_unlock;
+
+ xfs_buf_delwri_queue(bp, buffer_list);
+ xfs_buf_relse(bp);
+out_unlock:
+ xfs_dqunlock(dqp);
+ return error;
+}
+
+/*
+ * Walk thru all the filesystem inodes and construct a consistent view
+ * of the disk quota world. If the quotacheck fails, disable quotas.
+ */
+STATIC int
+xfs_qm_quotacheck(
+ xfs_mount_t *mp)
+{
+ int error, error2;
+ uint flags;
+ LIST_HEAD (buffer_list);
+ struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip;
+ struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip;
+ struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip;
+
+ flags = 0;
+
+ ASSERT(uip || gip || pip);
+ ASSERT(XFS_IS_QUOTA_ON(mp));
+
+ xfs_notice(mp, "Quotacheck needed: Please wait.");
+
+ /*
+ * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
+ * their counters to zero. We need a clean slate.
+ * We don't log our changes till later.
+ */
+ if (uip) {
+ error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_DQTYPE_USER,
+ &buffer_list);
+ if (error)
+ goto error_return;
+ flags |= XFS_UQUOTA_CHKD;
+ }
+
+ if (gip) {
+ error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_DQTYPE_GROUP,
+ &buffer_list);
+ if (error)
+ goto error_return;
+ flags |= XFS_GQUOTA_CHKD;
+ }
+
+ if (pip) {
+ error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_DQTYPE_PROJ,
+ &buffer_list);
+ if (error)
+ goto error_return;
+ flags |= XFS_PQUOTA_CHKD;
+ }
+
+ error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true,
+ NULL);
+ if (error) {
+ /*
+ * The inode walk may have partially populated the dquot
+ * caches. We must purge them before disabling quota and
+ * tearing down the quotainfo, or else the dquots will leak.
+ */
+ xfs_qm_dqpurge_all(mp);
+ goto error_return;
+ }
+
+ /*
+ * We've made all the changes that we need to make incore. Flush them
+ * down to disk buffers if everything was updated successfully.
+ */
+ if (XFS_IS_UQUOTA_ON(mp)) {
+ error = xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_flush_one,
+ &buffer_list);
+ }
+ if (XFS_IS_GQUOTA_ON(mp)) {
+ error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_flush_one,
+ &buffer_list);
+ if (!error)
+ error = error2;
+ }
+ if (XFS_IS_PQUOTA_ON(mp)) {
+ error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_flush_one,
+ &buffer_list);
+ if (!error)
+ error = error2;
+ }
+
+ error2 = xfs_buf_delwri_submit(&buffer_list);
+ if (!error)
+ error = error2;
+
+ /*
+ * We can get this error if we couldn't do a dquot allocation inside
+	 * xfs_qm_dqusage_adjust during the inode walk. We don't care about the
+	 * dirty dquots that might be cached; we just want to get rid of them
+ * and turn quotaoff. The dquots won't be attached to any of the inodes
+ * at this point (because we intentionally didn't in dqget_noattach).
+ */
+ if (error) {
+ xfs_qm_dqpurge_all(mp);
+ goto error_return;
+ }
+
+ /*
+	 * If one type of quota is off, then it will lose its
+ * quotachecked status, since we won't be doing accounting for
+ * that type anymore.
+ */
+ mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD;
+ mp->m_qflags |= flags;
+
+ error_return:
+ xfs_buf_delwri_cancel(&buffer_list);
+
+ if (error) {
+ xfs_warn(mp,
+ "Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
+ error);
+ /*
+ * We must turn off quotas.
+ */
+ ASSERT(mp->m_quotainfo != NULL);
+ xfs_qm_destroy_quotainfo(mp);
+ if (xfs_mount_reset_sbqflags(mp)) {
+ xfs_warn(mp,
+ "Quotacheck: Failed to reset quota flags.");
+ }
+ } else
+ xfs_notice(mp, "Quotacheck: Done.");
+ return error;
+}
+
+/*
+ * This is called from xfs_mountfs to start quotas and initialize all
+ * necessary data structures like quotainfo. This is also responsible for
+ * running a quotacheck as necessary. We are guaranteed that the superblock
+ * is consistently read in at this point.
+ *
+ * If we fail here, the mount will continue with quota turned off. We don't
+ * need to indicate success or failure at all.
+ */
+void
+xfs_qm_mount_quotas(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+ uint sbf;
+
+ /*
+	 * If quotas on realtime volumes are not supported, we disable
+ * quotas immediately.
+ */
+ if (mp->m_sb.sb_rextents) {
+ xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
+ mp->m_qflags = 0;
+ goto write_changes;
+ }
+
+ ASSERT(XFS_IS_QUOTA_ON(mp));
+
+ /*
+ * Allocate the quotainfo structure inside the mount struct, and
+ * create quotainode(s), and change/rev superblock if necessary.
+ */
+ error = xfs_qm_init_quotainfo(mp);
+ if (error) {
+ /*
+ * We must turn off quotas.
+ */
+ ASSERT(mp->m_quotainfo == NULL);
+ mp->m_qflags = 0;
+ goto write_changes;
+ }
+ /*
+ * If any of the quotas are not consistent, do a quotacheck.
+ */
+ if (XFS_QM_NEED_QUOTACHECK(mp)) {
+ error = xfs_qm_quotacheck(mp);
+ if (error) {
+ /* Quotacheck failed and disabled quotas. */
+ return;
+ }
+ }
+ /*
+	 * If one type of quota is off, then it will lose its
+ * quotachecked status, since we won't be doing accounting for
+ * that type anymore.
+ */
+ if (!XFS_IS_UQUOTA_ON(mp))
+ mp->m_qflags &= ~XFS_UQUOTA_CHKD;
+ if (!XFS_IS_GQUOTA_ON(mp))
+ mp->m_qflags &= ~XFS_GQUOTA_CHKD;
+ if (!XFS_IS_PQUOTA_ON(mp))
+ mp->m_qflags &= ~XFS_PQUOTA_CHKD;
+
+ write_changes:
+ /*
+ * We actually don't have to acquire the m_sb_lock at all.
+ * This can only be called from mount, and that's single threaded. XXX
+ */
+ spin_lock(&mp->m_sb_lock);
+ sbf = mp->m_sb.sb_qflags;
+ mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
+ spin_unlock(&mp->m_sb_lock);
+
+ if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
+ if (xfs_sync_sb(mp, false)) {
+ /*
+ * We could only have been turning quotas off.
+ * We aren't in very good shape actually because
+ * the incore structures are convinced that quotas are
+			 * off, but the on-disk superblock doesn't know that!
+ */
+ ASSERT(!(XFS_IS_QUOTA_ON(mp)));
+ xfs_alert(mp, "%s: Superblock update failed!",
+ __func__);
+ }
+ }
+
+ if (error) {
+ xfs_warn(mp, "Failed to initialize disk quotas.");
+ return;
+ }
+}
+
+/*
+ * This is called after the superblock has been read in and we're ready to
+ * iget the quota inodes.
+ */
+STATIC int
+xfs_qm_init_quotainos(
+ xfs_mount_t *mp)
+{
+ struct xfs_inode *uip = NULL;
+ struct xfs_inode *gip = NULL;
+ struct xfs_inode *pip = NULL;
+ int error;
+ uint flags = 0;
+
+ ASSERT(mp->m_quotainfo);
+
+ /*
+ * Get the uquota and gquota inodes
+ */
+ if (xfs_has_quota(mp)) {
+ if (XFS_IS_UQUOTA_ON(mp) &&
+ mp->m_sb.sb_uquotino != NULLFSINO) {
+ ASSERT(mp->m_sb.sb_uquotino > 0);
+ error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+ 0, 0, &uip);
+ if (error)
+ return error;
+ }
+ if (XFS_IS_GQUOTA_ON(mp) &&
+ mp->m_sb.sb_gquotino != NULLFSINO) {
+ ASSERT(mp->m_sb.sb_gquotino > 0);
+ error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+ 0, 0, &gip);
+ if (error)
+ goto error_rele;
+ }
+ if (XFS_IS_PQUOTA_ON(mp) &&
+ mp->m_sb.sb_pquotino != NULLFSINO) {
+ ASSERT(mp->m_sb.sb_pquotino > 0);
+ error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
+ 0, 0, &pip);
+ if (error)
+ goto error_rele;
+ }
+ } else {
+ flags |= XFS_QMOPT_SBVERSION;
+ }
+
+ /*
+ * Create the three inodes, if they don't exist already. The changes
+ * made above will get added to a transaction and logged in one of
+ * the qino_alloc calls below. If the device is readonly,
+ * temporarily switch to read-write to do this.
+ */
+ if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
+ error = xfs_qm_qino_alloc(mp, &uip,
+ flags | XFS_QMOPT_UQUOTA);
+ if (error)
+ goto error_rele;
+
+ flags &= ~XFS_QMOPT_SBVERSION;
+ }
+ if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
+ error = xfs_qm_qino_alloc(mp, &gip,
+ flags | XFS_QMOPT_GQUOTA);
+ if (error)
+ goto error_rele;
+
+ flags &= ~XFS_QMOPT_SBVERSION;
+ }
+ if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
+ error = xfs_qm_qino_alloc(mp, &pip,
+ flags | XFS_QMOPT_PQUOTA);
+ if (error)
+ goto error_rele;
+ }
+
+ mp->m_quotainfo->qi_uquotaip = uip;
+ mp->m_quotainfo->qi_gquotaip = gip;
+ mp->m_quotainfo->qi_pquotaip = pip;
+
+ return 0;
+
+error_rele:
+ if (uip)
+ xfs_irele(uip);
+ if (gip)
+ xfs_irele(gip);
+ if (pip)
+ xfs_irele(pip);
+ return error;
+}
+
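+/* Release the quota inodes held in the quotainfo structure, if any. */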
+STATIC void
+xfs_qm_destroy_quotainos(
+ struct xfs_quotainfo *qi)
+{
+ if (qi->qi_uquotaip) {
+ xfs_irele(qi->qi_uquotaip);
+ qi->qi_uquotaip = NULL; /* paranoia */
+ }
+ if (qi->qi_gquotaip) {
+ xfs_irele(qi->qi_gquotaip);
+ qi->qi_gquotaip = NULL;
+ }
+ if (qi->qi_pquotaip) {
+ xfs_irele(qi->qi_pquotaip);
+ qi->qi_pquotaip = NULL;
+ }
+}
+
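+/*
+ * Remove a dquot from its radix tree, drop the incore dquot count, and
+ * destroy it.
+ */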
+STATIC void
+xfs_qm_dqfree_one(
+ struct xfs_dquot *dqp)
+{
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+
+ mutex_lock(&qi->qi_tree_lock);
+ radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id);
+
+ qi->qi_dquots--;
+ mutex_unlock(&qi->qi_tree_lock);
+
+ xfs_qm_dqdestroy(dqp);
+}
+
+/* --------------- utility functions for vnodeops ---------------- */
+
+
+/*
+ * Given an inode, a uid, gid and prid, make sure that we have
+ * allocated relevant dquot(s) on disk, and that we won't exceed inode
+ * quotas by creating this file.
+ * This also attaches dquot(s) to the given inode after locking it,
+ * and returns the dquots corresponding to the uid and/or gid.
+ *
+ * in : inode (unlocked)
+ * out : udquot, gdquot and pdquot with references taken and unlocked
+ */
+int
+xfs_qm_vop_dqalloc(
+ struct xfs_inode *ip,
+ kuid_t uid,
+ kgid_t gid,
+ prid_t prid,
+ uint flags,
+ struct xfs_dquot **O_udqpp,
+ struct xfs_dquot **O_gdqpp,
+ struct xfs_dquot **O_pdqpp)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ struct user_namespace *user_ns = inode->i_sb->s_user_ns;
+ struct xfs_dquot *uq = NULL;
+ struct xfs_dquot *gq = NULL;
+ struct xfs_dquot *pq = NULL;
+ int error;
+ uint lockflags;
+
+ if (!XFS_IS_QUOTA_ON(mp))
+ return 0;
+
+ lockflags = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lockflags);
+
+ if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip))
+ gid = inode->i_gid;
+
+ /*
+ * Attach the dquot(s) to this inode, doing a dquot allocation
+ * if necessary. The dquot(s) will not be locked.
+ */
+ if (XFS_NOT_DQATTACHED(mp, ip)) {
+ error = xfs_qm_dqattach_locked(ip, true);
+ if (error) {
+ xfs_iunlock(ip, lockflags);
+ return error;
+ }
+ }
+
+ if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
+ ASSERT(O_udqpp);
+ if (!uid_eq(inode->i_uid, uid)) {
+ /*
+ * What we need is the dquot that has this uid, and
+ * if we send the inode to dqget, the uid of the inode
+ * takes priority over what's sent in the uid argument.
+ * We must unlock inode here before calling dqget if
+ * we're not sending the inode, because otherwise
+ * we'll deadlock by doing trans_reserve while
+ * holding ilock.
+ */
+ xfs_iunlock(ip, lockflags);
+ error = xfs_qm_dqget(mp, from_kuid(user_ns, uid),
+ XFS_DQTYPE_USER, true, &uq);
+ if (error) {
+ ASSERT(error != -ENOENT);
+ return error;
+ }
+ /*
+ * Get the ilock in the right order.
+ */
+ xfs_dqunlock(uq);
+ lockflags = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, lockflags);
+ } else {
+ /*
+ * Take an extra reference, because we'll return
+ * this to caller
+ */
+ ASSERT(ip->i_udquot);
+ uq = xfs_qm_dqhold(ip->i_udquot);
+ }
+ }
+ if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
+ ASSERT(O_gdqpp);
+ if (!gid_eq(inode->i_gid, gid)) {
+ xfs_iunlock(ip, lockflags);
+ error = xfs_qm_dqget(mp, from_kgid(user_ns, gid),
+ XFS_DQTYPE_GROUP, true, &gq);
+ if (error) {
+ ASSERT(error != -ENOENT);
+ goto error_rele;
+ }
+ xfs_dqunlock(gq);
+ lockflags = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, lockflags);
+ } else {
+ ASSERT(ip->i_gdquot);
+ gq = xfs_qm_dqhold(ip->i_gdquot);
+ }
+ }
+ if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
+ ASSERT(O_pdqpp);
+ if (ip->i_projid != prid) {
+ xfs_iunlock(ip, lockflags);
+ error = xfs_qm_dqget(mp, prid,
+ XFS_DQTYPE_PROJ, true, &pq);
+ if (error) {
+ ASSERT(error != -ENOENT);
+ goto error_rele;
+ }
+ xfs_dqunlock(pq);
+ lockflags = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, lockflags);
+ } else {
+ ASSERT(ip->i_pdquot);
+ pq = xfs_qm_dqhold(ip->i_pdquot);
+ }
+ }
+ trace_xfs_dquot_dqalloc(ip);
+
+ xfs_iunlock(ip, lockflags);
+ if (O_udqpp)
+ *O_udqpp = uq;
+ else
+ xfs_qm_dqrele(uq);
+ if (O_gdqpp)
+ *O_gdqpp = gq;
+ else
+ xfs_qm_dqrele(gq);
+ if (O_pdqpp)
+ *O_pdqpp = pq;
+ else
+ xfs_qm_dqrele(pq);
+ return 0;
+
+error_rele:
+ xfs_qm_dqrele(gq);
+ xfs_qm_dqrele(uq);
+ return error;
+}
+
+/*
+ * Actually transfer ownership, and do dquot modifications.
+ * These were already reserved.
+ */
+struct xfs_dquot *
+xfs_qm_vop_chown(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_dquot **IO_olddq,
+ struct xfs_dquot *newdq)
+{
+ struct xfs_dquot *prevdq;
+ uint bfield = XFS_IS_REALTIME_INODE(ip) ?
+ XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(XFS_IS_QUOTA_ON(ip->i_mount));
+
+ /* old dquot */
+ prevdq = *IO_olddq;
+ ASSERT(prevdq);
+ ASSERT(prevdq != newdq);
+
+ xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_nblocks));
+ xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
+
+ /* the sparkling new dquot */
+ xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_nblocks);
+ xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
+
+ /*
+ * Back when we made quota reservations for the chown, we reserved the
+ * ondisk blocks + delalloc blocks with the new dquot. Now that we've
+ * switched the dquots, decrease the new dquot's block reservation
+ * (having already bumped up the real counter) so that we don't have
+ * any reservation to give back when we commit.
+ */
+ xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS,
+ -ip->i_delayed_blks);
+
+ /*
+ * Give the incore reservation for delalloc blocks back to the old
+ * dquot. We don't normally handle delalloc quota reservations
+ * transactionally, so just lock the dquot and subtract from the
+ * reservation. Dirty the transaction because it's too late to turn
+ * back now.
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ xfs_dqlock(prevdq);
+ ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks);
+ prevdq->q_blk.reserved -= ip->i_delayed_blks;
+ xfs_dqunlock(prevdq);
+
+ /*
+ * Take an extra reference, because the inode is going to keep
+ * this dquot pointer even after the trans_commit.
+ */
+ *IO_olddq = xfs_qm_dqhold(newdq);
+
+ return prevdq;
+}
+
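+/*
+ * Make sure dquots are attached to every distinct inode (up to four of
+ * them) involved in a rename.
+ */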
+int
+xfs_qm_vop_rename_dqattach(
+ struct xfs_inode **i_tab)
+{
+ struct xfs_mount *mp = i_tab[0]->i_mount;
+ int i;
+
+ if (!XFS_IS_QUOTA_ON(mp))
+ return 0;
+
+ for (i = 0; (i < 4 && i_tab[i]); i++) {
+ struct xfs_inode *ip = i_tab[i];
+ int error;
+
+ /*
+ * Watch out for duplicate entries in the table.
+ */
+ if (i == 0 || ip != i_tab[i-1]) {
+ if (XFS_NOT_DQATTACHED(mp, ip)) {
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+ }
+ }
+ }
+ return 0;
+}
+
+void
+xfs_qm_vop_create_dqattach(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (!XFS_IS_QUOTA_ON(mp))
+ return;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (udqp && XFS_IS_UQUOTA_ON(mp)) {
+ ASSERT(ip->i_udquot == NULL);
+ ASSERT(i_uid_read(VFS_I(ip)) == udqp->q_id);
+
+ ip->i_udquot = xfs_qm_dqhold(udqp);
+ xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
+ }
+ if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
+ ASSERT(ip->i_gdquot == NULL);
+ ASSERT(i_gid_read(VFS_I(ip)) == gdqp->q_id);
+
+ ip->i_gdquot = xfs_qm_dqhold(gdqp);
+ xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
+ }
+ if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
+ ASSERT(ip->i_pdquot == NULL);
+ ASSERT(ip->i_projid == pdqp->q_id);
+
+ ip->i_pdquot = xfs_qm_dqhold(pdqp);
+ xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
+ }
+}
+
+/* Decide if this inode's dquot is near an enforcement boundary. */
+bool
+xfs_inode_near_dquot_enforcement(
+ struct xfs_inode *ip,
+ xfs_dqtype_t type)
+{
+ struct xfs_dquot *dqp;
+ int64_t freesp;
+
+ /* We only care for quotas that are enabled and enforced. */
+ dqp = xfs_inode_dquot(ip, type);
+ if (!dqp || !xfs_dquot_is_enforced(dqp))
+ return false;
+
+ if (xfs_dquot_res_over_limits(&dqp->q_ino) ||
+ xfs_dquot_res_over_limits(&dqp->q_rtb))
+ return true;
+
+ /* For space on the data device, check the various thresholds. */
+ if (!dqp->q_prealloc_hi_wmark)
+ return false;
+
+ if (dqp->q_blk.reserved < dqp->q_prealloc_lo_wmark)
+ return false;
+
+ if (dqp->q_blk.reserved >= dqp->q_prealloc_hi_wmark)
+ return true;
+
+ freesp = dqp->q_prealloc_hi_wmark - dqp->q_blk.reserved;
+ if (freesp < dqp->q_low_space[XFS_QLOWSP_5_PCNT])
+ return true;
+
+ return false;
+}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
new file mode 100644
index 000000000..9683f0457
--- /dev/null
+++ b/fs/xfs/xfs_qm.h
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_QM_H__
+#define __XFS_QM_H__
+
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+
+struct xfs_inode;
+
+extern struct kmem_cache *xfs_dqtrx_cache;
+
+/*
+ * Number of bmaps that we ask from bmapi when doing a quotacheck.
+ * We make this restriction to keep the memory usage to a minimum.
+ */
+#define XFS_DQITER_MAP_SIZE 10
+
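+/*
+ * A dquot with no limits and no usage counts is considered uninitialized;
+ * xfs_qm_scall_getquota reports such dquots as nonexistent.
+ */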
+#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
+ !dqp->q_blk.hardlimit && \
+ !dqp->q_blk.softlimit && \
+ !dqp->q_rtb.hardlimit && \
+ !dqp->q_rtb.softlimit && \
+ !dqp->q_ino.hardlimit && \
+ !dqp->q_ino.softlimit && \
+ !dqp->q_blk.count && \
+ !dqp->q_rtb.count && \
+ !dqp->q_ino.count)
+
+struct xfs_quota_limits {
+ xfs_qcnt_t hard; /* default hard limit */
+ xfs_qcnt_t soft; /* default soft limit */
+ time64_t time; /* limit for timers */
+};
+
+/* Defaults for each quota type: time limits, warn limits, usage limits */
+struct xfs_def_quota {
+ struct xfs_quota_limits blk;
+ struct xfs_quota_limits ino;
+ struct xfs_quota_limits rtb;
+};
+
+/*
+ * Various quota information for individual filesystems.
+ * The mount structure keeps a pointer to this.
+ */
+struct xfs_quotainfo {
+ struct radix_tree_root qi_uquota_tree;
+ struct radix_tree_root qi_gquota_tree;
+ struct radix_tree_root qi_pquota_tree;
+ struct mutex qi_tree_lock;
+ struct xfs_inode *qi_uquotaip; /* user quota inode */
+ struct xfs_inode *qi_gquotaip; /* group quota inode */
+ struct xfs_inode *qi_pquotaip; /* project quota inode */
+ struct list_lru qi_lru;
+ int qi_dquots;
+ struct mutex qi_quotaofflock;/* to serialize quotaoff */
+ xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
+ uint qi_dqperchunk; /* # ondisk dq in above chunk */
+ struct xfs_def_quota qi_usr_default;
+ struct xfs_def_quota qi_grp_default;
+ struct xfs_def_quota qi_prj_default;
+ struct shrinker qi_shrinker;
+
+ /* Minimum and maximum quota expiration timestamp values. */
+ time64_t qi_expiry_min;
+ time64_t qi_expiry_max;
+};
+
+static inline struct radix_tree_root *
+xfs_dquot_tree(
+ struct xfs_quotainfo *qi,
+ xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ return &qi->qi_uquota_tree;
+ case XFS_DQTYPE_GROUP:
+ return &qi->qi_gquota_tree;
+ case XFS_DQTYPE_PROJ:
+ return &qi->qi_pquota_tree;
+ default:
+ ASSERT(0);
+ }
+ return NULL;
+}
+
+static inline struct xfs_inode *
+xfs_quota_inode(struct xfs_mount *mp, xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ return mp->m_quotainfo->qi_uquotaip;
+ case XFS_DQTYPE_GROUP:
+ return mp->m_quotainfo->qi_gquotaip;
+ case XFS_DQTYPE_PROJ:
+ return mp->m_quotainfo->qi_pquotaip;
+ default:
+ ASSERT(0);
+ }
+ return NULL;
+}
+
+extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp,
+ uint field, int64_t delta);
+extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
+extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *);
+
+/*
+ * We keep the usr, grp, and prj dquots separately so that locking will be
+ * easier to do at commit time. All transactions that we know of at this point
+ * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
+ */
+enum {
+ XFS_QM_TRANS_USR = 0,
+ XFS_QM_TRANS_GRP,
+ XFS_QM_TRANS_PRJ,
+ XFS_QM_TRANS_DQTYPES
+};
+#define XFS_QM_TRANS_MAXDQS 2
+struct xfs_dquot_acct {
+ struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS];
+};
+
+/*
+ * Users are allowed to have a usage exceeding their softlimit for
+ * a period this long.
+ */
+#define XFS_QM_BTIMELIMIT (7 * 24*60*60) /* 1 week */
+#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */
+#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */
+
+extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
+
+/* quota ops */
+extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
+extern int xfs_qm_scall_getquota(struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type,
+ struct qc_dqblk *dst);
+extern int xfs_qm_scall_getquota_next(struct xfs_mount *mp,
+ xfs_dqid_t *id,
+ xfs_dqtype_t type,
+ struct qc_dqblk *dst);
+extern int xfs_qm_scall_setqlim(struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type,
+ struct qc_dqblk *newlim);
+extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
+extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
+
+static inline struct xfs_def_quota *
+xfs_get_defquota(struct xfs_quotainfo *qi, xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ return &qi->qi_usr_default;
+ case XFS_DQTYPE_GROUP:
+ return &qi->qi_grp_default;
+ case XFS_DQTYPE_PROJ:
+ return &qi->qi_prj_default;
+ default:
+ ASSERT(0);
+ return NULL;
+ }
+}
+
+#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
new file mode 100644
index 000000000..b77673dd0
--- /dev/null
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+
+
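+/*
+ * Clamp the block and inode counts in a statfs reply to the limits stored
+ * in the given dquot.
+ */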
+STATIC void
+xfs_fill_statvfs_from_dquot(
+ struct kstatfs *statp,
+ struct xfs_dquot *dqp)
+{
+ uint64_t limit;
+
+ limit = dqp->q_blk.softlimit ?
+ dqp->q_blk.softlimit :
+ dqp->q_blk.hardlimit;
+ if (limit && statp->f_blocks > limit) {
+ statp->f_blocks = limit;
+ statp->f_bfree = statp->f_bavail =
+ (statp->f_blocks > dqp->q_blk.reserved) ?
+ (statp->f_blocks - dqp->q_blk.reserved) : 0;
+ }
+
+ limit = dqp->q_ino.softlimit ?
+ dqp->q_ino.softlimit :
+ dqp->q_ino.hardlimit;
+ if (limit && statp->f_files > limit) {
+ statp->f_files = limit;
+ statp->f_ffree =
+ (statp->f_files > dqp->q_ino.reserved) ?
+ (statp->f_files - dqp->q_ino.reserved) : 0;
+ }
+}
+
+
+/*
+ * Directory tree accounting is implemented using project quotas, where
+ * the project identifier is inherited from parent directories.
+ * A statvfs (df, etc.) of a directory that is using project quota should
+ * return a statvfs of the project, not the entire filesystem.
+ * This makes such trees appear as if they are filesystems in themselves.
+ */
+void
+xfs_qm_statvfs(
+ struct xfs_inode *ip,
+ struct kstatfs *statp)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_dquot *dqp;
+
+ if (!xfs_qm_dqget(mp, ip->i_projid, XFS_DQTYPE_PROJ, false, &dqp)) {
+ xfs_fill_statvfs_from_dquot(statp, dqp);
+ xfs_qm_dqput(dqp);
+ }
+}
+
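+/*
+ * Decide at mount time whether quotas can be turned on. If a quotacheck
+ * will be needed, defer enabling quotas and hand the saved flags back to
+ * the caller via *quotaflags.
+ */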
+int
+xfs_qm_newmount(
+ xfs_mount_t *mp,
+ uint *needquotamount,
+ uint *quotaflags)
+{
+ uint quotaondisk;
+ uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0;
+
+ quotaondisk = xfs_has_quota(mp) &&
+ (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT);
+
+ if (quotaondisk) {
+ uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT;
+ pquotaondisk = mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT;
+ gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT;
+ }
+
+ /*
+ * If the device itself is read-only, we can't allow
+ * the user to change the state of quota on the mount -
+ * this would generate a transaction on the ro device,
+ * which would lead to an I/O error and shutdown
+ */
+
+ if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
+ (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) ||
+ (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
+ (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) ||
+ (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
+ (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) &&
+ xfs_dev_is_read_only(mp, "changing quota state")) {
+ xfs_warn(mp, "please mount with%s%s%s%s.",
+ (!quotaondisk ? "out quota" : ""),
+ (uquotaondisk ? " usrquota" : ""),
+ (gquotaondisk ? " grpquota" : ""),
+ (pquotaondisk ? " prjquota" : ""));
+ return -EPERM;
+ }
+
+ if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
+ /*
+ * Call mount_quotas at this point only if we won't have to do
+ * a quotacheck.
+ */
+ if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) {
+ /*
+ * If an error occurred, qm_mount_quotas code
+ * has already disabled quotas. So, just finish
+ * mounting, and get on with the boring life
+ * without disk quotas.
+ */
+ xfs_qm_mount_quotas(mp);
+ } else {
+ /*
+ * Clear the quota flags, but remember them. This
+ * is so that the quota code doesn't get invoked
+ * before we're ready. This can happen when an
+ * inode goes inactive and wants to free blocks,
+ * or via xfs_log_mount_finish.
+ */
+ *needquotamount = true;
+ *quotaflags = mp->m_qflags;
+ mp->m_qflags = 0;
+ }
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
new file mode 100644
index 000000000..392cb39cc
--- /dev/null
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_icache.h"
+
+int
+xfs_qm_scall_quotaoff(
+ xfs_mount_t *mp,
+ uint flags)
+{
+ /*
+ * No file system can have quotas enabled on disk but not in core.
+ * Note that quota utilities (like quotaoff) _expect_
+ * errno == -EEXIST here.
+ */
+ if ((mp->m_qflags & flags) == 0)
+ return -EEXIST;
+
+ /*
+ * We do not support actually turning off quota accounting any more.
+ * Just log a warning and ignore the accounting related flags.
+ */
+ if (flags & XFS_ALL_QUOTA_ACCT)
+ xfs_info(mp, "disabling of quota accounting not supported.");
+
+ mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
+ mp->m_qflags &= ~(flags & XFS_ALL_QUOTA_ENFD);
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_qflags = mp->m_qflags;
+ spin_unlock(&mp->m_sb_lock);
+ mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
+
+	/* XXX what to do on error? Revert to the old values incore? */
+ return xfs_sync_sb(mp, false);
+}
+
+STATIC int
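+/* Truncate a single quota file back to zero length. */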
+xfs_qm_scall_trunc_qfile(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ struct xfs_inode *ip;
+ struct xfs_trans *tp;
+ int error;
+
+ if (ino == NULLFSINO)
+ return 0;
+
+ error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ if (error) {
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ goto out_put;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ ip->i_disk_size = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
+ if (error) {
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+ }
+
+ ASSERT(ip->i_df.if_nextents == 0);
+
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ error = xfs_trans_commit(tp);
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+out_put:
+ xfs_irele(ip);
+ return error;
+}
+
+int
+xfs_qm_scall_trunc_qfiles(
+ xfs_mount_t *mp,
+ uint flags)
+{
+ int error = -EINVAL;
+
+ if (!xfs_has_quota(mp) || flags == 0 ||
+ (flags & ~XFS_QMOPT_QUOTALL)) {
+ xfs_debug(mp, "%s: flags=%x m_qflags=%x",
+ __func__, flags, mp->m_qflags);
+ return -EINVAL;
+ }
+
+ if (flags & XFS_QMOPT_UQUOTA) {
+ error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
+ if (error)
+ return error;
+ }
+ if (flags & XFS_QMOPT_GQUOTA) {
+ error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+ if (error)
+ return error;
+ }
+ if (flags & XFS_QMOPT_PQUOTA)
+ error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
+
+ return error;
+}
+
+/*
+ * Switch on (a given) quota enforcement for a filesystem. This takes
+ * effect immediately.
+ * (Switching on quota accounting must be done at mount time.)
+ */
+int
+xfs_qm_scall_quotaon(
+ xfs_mount_t *mp,
+ uint flags)
+{
+ int error;
+ uint qf;
+
+ /*
+ * Switching on quota accounting must be done at mount time,
+ * only consider quota enforcement stuff here.
+ */
+ flags &= XFS_ALL_QUOTA_ENFD;
+
+ if (flags == 0) {
+ xfs_debug(mp, "%s: zero flags, m_qflags=%x",
+ __func__, mp->m_qflags);
+ return -EINVAL;
+ }
+
+ /*
+ * Can't enforce without accounting. We check the superblock
+ * qflags here instead of m_qflags because rootfs can have
+	 * quota accounting enabled on disk without m_qflags knowing about it.
+ */
+ if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
+ (flags & XFS_UQUOTA_ENFD)) ||
+ ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
+ (flags & XFS_GQUOTA_ENFD)) ||
+ ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
+ (flags & XFS_PQUOTA_ENFD))) {
+ xfs_debug(mp,
+ "%s: Can't enforce without acct, flags=%x sbflags=%x",
+ __func__, flags, mp->m_sb.sb_qflags);
+ return -EINVAL;
+ }
+ /*
+	 * If everything's up to date incore, then don't waste time.
+ */
+ if ((mp->m_qflags & flags) == flags)
+ return -EEXIST;
+
+ /*
+ * Change sb_qflags on disk but not incore mp->qflags
+ * if this is the root filesystem.
+ */
+ spin_lock(&mp->m_sb_lock);
+ qf = mp->m_sb.sb_qflags;
+ mp->m_sb.sb_qflags = qf | flags;
+ spin_unlock(&mp->m_sb_lock);
+
+ /*
+ * There's nothing to change if it's the same.
+ */
+ if ((qf & flags) == flags)
+ return -EEXIST;
+
+ error = xfs_sync_sb(mp, false);
+ if (error)
+ return error;
+ /*
+ * If we aren't trying to switch on quota enforcement, we are done.
+ */
+ if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) !=
+ (mp->m_qflags & XFS_UQUOTA_ACCT)) ||
+ ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) !=
+ (mp->m_qflags & XFS_PQUOTA_ACCT)) ||
+ ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
+ (mp->m_qflags & XFS_GQUOTA_ACCT)))
+ return 0;
+
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -ESRCH;
+
+ /*
+ * Switch on quota enforcement in core.
+ */
+ mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
+ mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
+ mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
+
+ return 0;
+}
+
+#define XFS_QC_MASK (QC_LIMIT_MASK | QC_TIMER_MASK)
+
+/*
+ * Adjust limits of this quota, and the defaults if passed in. Returns true
+ * if the new limits made sense and were applied, false otherwise.
+ */
+static inline bool
+xfs_setqlim_limits(
+ struct xfs_mount *mp,
+ struct xfs_dquot_res *res,
+ struct xfs_quota_limits *qlim,
+ xfs_qcnt_t hard,
+ xfs_qcnt_t soft,
+ const char *tag)
+{
+ /* The hard limit can't be less than the soft limit. */
+ if (hard != 0 && hard < soft) {
+ xfs_debug(mp, "%shard %lld < %ssoft %lld", tag, hard, tag,
+ soft);
+ return false;
+ }
+
+ res->hardlimit = hard;
+ res->softlimit = soft;
+ if (qlim) {
+ qlim->hard = hard;
+ qlim->soft = soft;
+ }
+
+ return true;
+}
+
+static inline void
+xfs_setqlim_timer(
+ struct xfs_mount *mp,
+ struct xfs_dquot_res *res,
+ struct xfs_quota_limits *qlim,
+ s64 timer)
+{
+ if (qlim) {
+ /* Set the length of the default grace period. */
+ res->timer = xfs_dquot_set_grace_period(timer);
+ qlim->time = res->timer;
+ } else {
+ /* Set the grace period expiration on a quota. */
+ res->timer = xfs_dquot_set_timeout(mp, timer);
+ }
+}
+
+/*
+ * Adjust quota limits, and start/stop timers accordingly.
+ */
+int
+xfs_qm_scall_setqlim(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type,
+ struct qc_dqblk *newlim)
+{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ struct xfs_dquot *dqp;
+ struct xfs_trans *tp;
+ struct xfs_def_quota *defq;
+ struct xfs_dquot_res *res;
+ struct xfs_quota_limits *qlim;
+ int error;
+ xfs_qcnt_t hard, soft;
+
+ if (newlim->d_fieldmask & ~XFS_QC_MASK)
+ return -EINVAL;
+ if ((newlim->d_fieldmask & XFS_QC_MASK) == 0)
+ return 0;
+
+ /*
+ * Get the dquot (locked) before we start, as we need to do a
+ * transaction to allocate it if it doesn't exist. Once we have the
+ * dquot, unlock it so we can start the next transaction safely. We hold
+ * a reference to the dquot, so it's safe to do this unlock/lock without
+	 * it being reclaimed in the meantime.
+ */
+ error = xfs_qm_dqget(mp, id, type, true, &dqp);
+ if (error) {
+ ASSERT(error != -ENOENT);
+ return error;
+ }
+
+ defq = xfs_get_defquota(q, xfs_dquot_type(dqp));
+ xfs_dqunlock(dqp);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp);
+ if (error)
+ goto out_rele;
+
+ xfs_dqlock(dqp);
+ xfs_trans_dqjoin(tp, dqp);
+
+ /*
+ * Update quota limits, warnings, and timers, and the defaults
+ * if we're touching id == 0.
+ *
+ * Make sure that hardlimits are >= soft limits before changing.
+ *
+ * Update warnings counter(s) if requested.
+ *
+ * Timelimits for the super user set the relative time the other users
+ * can be over quota for this file system. If it is zero a default is
+ * used. Ditto for the default soft and hard limit values (already
+ * done, above), and for warnings.
+ *
+ * For other IDs, userspace can bump out the grace period if over
+ * the soft limit.
+ */
+
+ /* Blocks on the data device. */
+ hard = (newlim->d_fieldmask & QC_SPC_HARD) ?
+ (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) :
+ dqp->q_blk.hardlimit;
+ soft = (newlim->d_fieldmask & QC_SPC_SOFT) ?
+ (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) :
+ dqp->q_blk.softlimit;
+ res = &dqp->q_blk;
+ qlim = id == 0 ? &defq->blk : NULL;
+
+ if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk"))
+ xfs_dquot_set_prealloc_limits(dqp);
+ if (newlim->d_fieldmask & QC_SPC_TIMER)
+ xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer);
+
+ /* Blocks on the realtime device. */
+ hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ?
+ (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) :
+ dqp->q_rtb.hardlimit;
+ soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ?
+ (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) :
+ dqp->q_rtb.softlimit;
+ res = &dqp->q_rtb;
+ qlim = id == 0 ? &defq->rtb : NULL;
+
+ xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb");
+ if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
+ xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer);
+
+ /* Inodes */
+ hard = (newlim->d_fieldmask & QC_INO_HARD) ?
+ (xfs_qcnt_t) newlim->d_ino_hardlimit :
+ dqp->q_ino.hardlimit;
+ soft = (newlim->d_fieldmask & QC_INO_SOFT) ?
+ (xfs_qcnt_t) newlim->d_ino_softlimit :
+ dqp->q_ino.softlimit;
+ res = &dqp->q_ino;
+ qlim = id == 0 ? &defq->ino : NULL;
+
+ xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino");
+ if (newlim->d_fieldmask & QC_INO_TIMER)
+ xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer);
+
+ if (id != 0) {
+ /*
+ * If the user is now over quota, start the timelimit.
+ * The user will not be 'warned'.
+ * Note that we keep the timers ticking, whether enforcement
+ * is on or off. We don't really want to bother with iterating
+ * over all ondisk dquots and turning the timers on/off.
+ */
+ xfs_qm_adjust_dqtimers(dqp);
+ }
+ dqp->q_flags |= XFS_DQFLAG_DIRTY;
+ xfs_trans_log_dquot(tp, dqp);
+
+ error = xfs_trans_commit(tp);
+
+out_rele:
+ xfs_qm_dqrele(dqp);
+ return error;
+}
+
+/* Fill out the quota context. */
+static void
+xfs_qm_scall_getquota_fill_qc(
+ struct xfs_mount *mp,
+ xfs_dqtype_t type,
+ const struct xfs_dquot *dqp,
+ struct qc_dqblk *dst)
+{
+ memset(dst, 0, sizeof(*dst));
+ dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_blk.hardlimit);
+ dst->d_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_blk.softlimit);
+ dst->d_ino_hardlimit = dqp->q_ino.hardlimit;
+ dst->d_ino_softlimit = dqp->q_ino.softlimit;
+ dst->d_space = XFS_FSB_TO_B(mp, dqp->q_blk.reserved);
+ dst->d_ino_count = dqp->q_ino.reserved;
+ dst->d_spc_timer = dqp->q_blk.timer;
+ dst->d_ino_timer = dqp->q_ino.timer;
+ dst->d_ino_warns = 0;
+ dst->d_spc_warns = 0;
+ dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit);
+ dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit);
+ dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved);
+ dst->d_rt_spc_timer = dqp->q_rtb.timer;
+ dst->d_rt_spc_warns = 0;
+
+ /*
+ * Internally, we don't reset all the timers when quota enforcement
+ * gets turned off. No need to confuse the user level code,
+ * so return zeroes in that case.
+ */
+ if (!xfs_dquot_is_enforced(dqp)) {
+ dst->d_spc_timer = 0;
+ dst->d_ino_timer = 0;
+ dst->d_rt_spc_timer = 0;
+ }
+
+#ifdef DEBUG
+ if (xfs_dquot_is_enforced(dqp) && dqp->q_id != 0) {
+ if ((dst->d_space > dst->d_spc_softlimit) &&
+ (dst->d_spc_softlimit > 0)) {
+ ASSERT(dst->d_spc_timer != 0);
+ }
+ if ((dst->d_ino_count > dqp->q_ino.softlimit) &&
+ (dqp->q_ino.softlimit > 0)) {
+ ASSERT(dst->d_ino_timer != 0);
+ }
+ }
+#endif
+}
+
+/* Return the quota information for the dquot matching id. */
+int
+xfs_qm_scall_getquota(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type,
+ struct qc_dqblk *dst)
+{
+ struct xfs_dquot *dqp;
+ int error;
+
+ /*
+ * Expedite pending inodegc work at the start of a quota reporting
+ * scan but don't block waiting for it to complete.
+ */
+ if (id == 0)
+ xfs_inodegc_push(mp);
+
+ /*
+ * Try to get the dquot. We don't want it allocated on disk, so don't
+ * set doalloc. If it doesn't exist, we'll get ENOENT back.
+ */
+ error = xfs_qm_dqget(mp, id, type, false, &dqp);
+ if (error)
+ return error;
+
+ /*
+ * If everything's NULL, this dquot doesn't quite exist as far as
+ * our utility programs are concerned.
+ */
+ if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+ error = -ENOENT;
+ goto out_put;
+ }
+
+ xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst);
+
+out_put:
+ xfs_qm_dqput(dqp);
+ return error;
+}
+
+/*
+ * Return the quota information for the first initialized dquot whose id
+ * is at least as high as id.
+ */
+int
+xfs_qm_scall_getquota_next(
+ struct xfs_mount *mp,
+ xfs_dqid_t *id,
+ xfs_dqtype_t type,
+ struct qc_dqblk *dst)
+{
+ struct xfs_dquot *dqp;
+ int error;
+
+ /* Flush inodegc work at the start of a quota reporting scan. */
+ if (*id == 0)
+ xfs_inodegc_push(mp);
+
+ error = xfs_qm_dqget_next(mp, *id, type, &dqp);
+ if (error)
+ return error;
+
+ /* Fill in the ID we actually read from disk */
+ *id = dqp->q_id;
+
+ xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst);
+
+ xfs_qm_dqput(dqp);
+ return error;
+}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
new file mode 100644
index 000000000..dcc785fdd
--- /dev/null
+++ b/fs/xfs/xfs_quota.h
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_QUOTA_H__
+#define __XFS_QUOTA_H__
+
+#include "xfs_quota_defs.h"
+
+/*
+ * Kernel only quota definitions and functions
+ */
+
+struct xfs_trans;
+struct xfs_buf;
+
+/*
+ * This check is done typically without holding the inode lock;
+ * that may seem racy, but it is harmless in the context that it is used.
+ * The inode cannot go inactive as long as a reference is kept, and
+ * therefore if dquot(s) were attached, they'll stay consistent.
+ * If, for example, the ownership of the inode changes while
+ * we didn't have the inode locked, the appropriate dquot(s) will be
+ * attached atomically.
+ */
+#define XFS_NOT_DQATTACHED(mp, ip) \
+ ((XFS_IS_UQUOTA_ON(mp) && (ip)->i_udquot == NULL) || \
+ (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \
+ (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL))
+
+#define XFS_QM_NEED_QUOTACHECK(mp) \
+ ((XFS_IS_UQUOTA_ON(mp) && \
+ (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
+ (XFS_IS_GQUOTA_ON(mp) && \
+ (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \
+ (XFS_IS_PQUOTA_ON(mp) && \
+ (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
+
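+/* Return the quotacheck-completed flag for the given quota type, or 0. */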
+static inline uint
+xfs_quota_chkd_flag(
+ xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ return XFS_UQUOTA_CHKD;
+ case XFS_DQTYPE_GROUP:
+ return XFS_GQUOTA_CHKD;
+ case XFS_DQTYPE_PROJ:
+ return XFS_PQUOTA_CHKD;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * The structure kept inside the xfs_trans_t keeps track of dquot changes
+ * within a transaction so that they can be applied later.
+ */
+struct xfs_dqtrx {
+ struct xfs_dquot *qt_dquot; /* the dquot this refers to */
+
+ uint64_t qt_blk_res; /* blks reserved on a dquot */
+ int64_t qt_bcount_delta; /* dquot blk count changes */
+ int64_t qt_delbcnt_delta; /* delayed dquot blk count changes */
+
+ uint64_t qt_rtblk_res; /* # blks reserved on a dquot */
+ uint64_t qt_rtblk_res_used;/* # blks used from reservation */
+ int64_t qt_rtbcount_delta;/* dquot realtime blk changes */
+ int64_t qt_delrtb_delta; /* delayed RT blk count changes */
+
+ uint64_t qt_ino_res; /* inode reserved on a dquot */
+ uint64_t qt_ino_res_used; /* inodes used from the reservation */
+ int64_t qt_icount_delta; /* dquot inode count changes */
+};
+
+#ifdef CONFIG_XFS_QUOTA
+extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *);
+extern void xfs_trans_free_dqinfo(struct xfs_trans *);
+extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
+ uint, int64_t);
+extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
+extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
+int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip,
+ int64_t dblocks, int64_t rblocks, bool force);
+extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
+ struct xfs_mount *, struct xfs_dquot *,
+ struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint);
+int xfs_trans_reserve_quota_icreate(struct xfs_trans *tp,
+ struct xfs_dquot *udqp, struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp, int64_t dblocks);
+
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, kuid_t, kgid_t,
+ prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
+ struct xfs_dquot **);
+extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
+ struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *);
+extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
+extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
+ struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
+extern int xfs_qm_dqattach(struct xfs_inode *);
+extern int xfs_qm_dqattach_locked(struct xfs_inode *ip, bool doalloc);
+extern void xfs_qm_dqdetach(struct xfs_inode *);
+extern void xfs_qm_dqrele(struct xfs_dquot *);
+extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *);
+extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *);
+extern void xfs_qm_mount_quotas(struct xfs_mount *);
+extern void xfs_qm_unmount(struct xfs_mount *);
+extern void xfs_qm_unmount_quotas(struct xfs_mount *);
+
+static inline int
+xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
+{
+ return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
+}
+bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type);
+#else
+static inline int
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
+ prid_t prid, uint flags, struct xfs_dquot **udqp,
+ struct xfs_dquot **gdqp, struct xfs_dquot **pdqp)
+{
+ *udqp = NULL;
+ *gdqp = NULL;
+ *pdqp = NULL;
+ return 0;
+}
+#define xfs_trans_dup_dqinfo(tp, tp2)
+#define xfs_trans_free_dqinfo(tp)
+#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) do { } while (0)
+#define xfs_trans_apply_dquot_deltas(tp)
+#define xfs_trans_unreserve_and_mod_dquots(tp)
+static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
+ struct xfs_inode *ip, int64_t dblocks, int64_t rblocks,
+ bool force)
+{
+ return 0;
+}
+static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
+ struct xfs_mount *mp, struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
+ int64_t nblks, long nions, uint flags)
+{
+ return 0;
+}
+
+static inline int
+xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
+{
+ return 0;
+}
+
+static inline int
+xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, int64_t dblocks)
+{
+ return 0;
+}
+
+#define xfs_qm_vop_create_dqattach(tp, ip, u, g, p)
+#define xfs_qm_vop_rename_dqattach(it) (0)
+#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
+#define xfs_qm_dqattach(ip) (0)
+#define xfs_qm_dqattach_locked(ip, fl) (0)
+#define xfs_qm_dqdetach(ip)
+#define xfs_qm_dqrele(d) do { (d) = (d); } while(0)
+#define xfs_qm_statvfs(ip, s) do { } while(0)
+#define xfs_qm_newmount(mp, a, b) (0)
+#define xfs_qm_mount_quotas(mp)
+#define xfs_qm_unmount(mp)
+#define xfs_qm_unmount_quotas(mp)
+#define xfs_inode_near_dquot_enforcement(ip, type) (false)
+#endif /* CONFIG_XFS_QUOTA */
+
+static inline int
+xfs_quota_unreserve_blkres(struct xfs_inode *ip, int64_t blocks)
+{
+ return xfs_quota_reserve_blkres(ip, -blocks);
+}
+
+extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
+
+#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
new file mode 100644
index 000000000..9c162e699
--- /dev/null
+++ b/fs/xfs/xfs_quotaops.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2008, Christoph Hellwig
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_icache.h"
+#include "xfs_qm.h"
+
+
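+/*
+ * Fill in one qc_type_state from a quota inode and the default limits for
+ * that quota type, grabbing and releasing the inode if it is not already
+ * in memory.
+ */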
+static void
+xfs_qm_fill_state(
+ struct qc_type_state *tstate,
+ struct xfs_mount *mp,
+ struct xfs_inode *ip,
+ xfs_ino_t ino,
+ struct xfs_def_quota *defq)
+{
+ bool tempqip = false;
+
+ tstate->ino = ino;
+ if (!ip && ino == NULLFSINO)
+ return;
+ if (!ip) {
+ if (xfs_iget(mp, NULL, ino, 0, 0, &ip))
+ return;
+ tempqip = true;
+ }
+ tstate->flags |= QCI_SYSFILE;
+ tstate->blocks = ip->i_nblocks;
+ tstate->nextents = ip->i_df.if_nextents;
+ tstate->spc_timelimit = (u32)defq->blk.time;
+ tstate->ino_timelimit = (u32)defq->ino.time;
+ tstate->rt_spc_timelimit = (u32)defq->rtb.time;
+ tstate->spc_warnlimit = 0;
+ tstate->ino_warnlimit = 0;
+ tstate->rt_spc_warnlimit = 0;
+ if (tempqip)
+ xfs_irele(ip);
+}
+
+/*
+ * Return quota status information, such as enforcements, quota file inode
+ * numbers etc.
+ */
+static int
+xfs_fs_get_quota_state(
+ struct super_block *sb,
+ struct qc_state *state)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+
+ memset(state, 0, sizeof(*state));
+ if (!XFS_IS_QUOTA_ON(mp))
+ return 0;
+ state->s_incoredqs = q->qi_dquots;
+ if (XFS_IS_UQUOTA_ON(mp))
+ state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED;
+ if (XFS_IS_UQUOTA_ENFORCED(mp))
+ state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED;
+ if (XFS_IS_GQUOTA_ON(mp))
+ state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED;
+ if (XFS_IS_GQUOTA_ENFORCED(mp))
+ state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED;
+ if (XFS_IS_PQUOTA_ON(mp))
+ state->s_state[PRJQUOTA].flags |= QCI_ACCT_ENABLED;
+ if (XFS_IS_PQUOTA_ENFORCED(mp))
+ state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED;
+
+ xfs_qm_fill_state(&state->s_state[USRQUOTA], mp, q->qi_uquotaip,
+ mp->m_sb.sb_uquotino, &q->qi_usr_default);
+ xfs_qm_fill_state(&state->s_state[GRPQUOTA], mp, q->qi_gquotaip,
+ mp->m_sb.sb_gquotino, &q->qi_grp_default);
+ xfs_qm_fill_state(&state->s_state[PRJQUOTA], mp, q->qi_pquotaip,
+ mp->m_sb.sb_pquotino, &q->qi_prj_default);
+ return 0;
+}
+
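+/*
+ * Map a VFS quota type to the XFS dquot type; anything other than user or
+ * group quota is treated as project quota.
+ */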
+STATIC xfs_dqtype_t
+xfs_quota_type(int type)
+{
+ switch (type) {
+ case USRQUOTA:
+ return XFS_DQTYPE_USER;
+ case GRPQUOTA:
+ return XFS_DQTYPE_GROUP;
+ default:
+ return XFS_DQTYPE_PROJ;
+ }
+}
+
+#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK)
+
+/*
+ * Adjust quota timers & warnings
+ */
+static int
+xfs_fs_set_info(
+ struct super_block *sb,
+ int type,
+ struct qc_info *info)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ struct qc_dqblk newlim;
+
+ if (sb_rdonly(sb))
+ return -EROFS;
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -ENOSYS;
+ if (info->i_fieldmask & ~XFS_QC_SETINFO_MASK)
+ return -EINVAL;
+ if ((info->i_fieldmask & XFS_QC_SETINFO_MASK) == 0)
+ return 0;
+
+ newlim.d_fieldmask = info->i_fieldmask;
+ newlim.d_spc_timer = info->i_spc_timelimit;
+ newlim.d_ino_timer = info->i_ino_timelimit;
+ newlim.d_rt_spc_timer = info->i_rt_spc_timelimit;
+ newlim.d_ino_warns = info->i_ino_warnlimit;
+ newlim.d_spc_warns = info->i_spc_warnlimit;
+ newlim.d_rt_spc_warns = info->i_rt_spc_warnlimit;
+
+ return xfs_qm_scall_setqlim(mp, 0, xfs_quota_type(type), &newlim);
+}
+
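+/* Translate VFS FS_QUOTA_* flags into the equivalent XFS quota flags. */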
+static unsigned int
+xfs_quota_flags(unsigned int uflags)
+{
+ unsigned int flags = 0;
+
+ if (uflags & FS_QUOTA_UDQ_ACCT)
+ flags |= XFS_UQUOTA_ACCT;
+ if (uflags & FS_QUOTA_PDQ_ACCT)
+ flags |= XFS_PQUOTA_ACCT;
+ if (uflags & FS_QUOTA_GDQ_ACCT)
+ flags |= XFS_GQUOTA_ACCT;
+ if (uflags & FS_QUOTA_UDQ_ENFD)
+ flags |= XFS_UQUOTA_ENFD;
+ if (uflags & FS_QUOTA_GDQ_ENFD)
+ flags |= XFS_GQUOTA_ENFD;
+ if (uflags & FS_QUOTA_PDQ_ENFD)
+ flags |= XFS_PQUOTA_ENFD;
+
+ return flags;
+}
+
+STATIC int
+xfs_quota_enable(
+ struct super_block *sb,
+ unsigned int uflags)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ if (sb_rdonly(sb))
+ return -EROFS;
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -ENOSYS;
+
+ return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags));
+}
+
+STATIC int
+xfs_quota_disable(
+ struct super_block *sb,
+ unsigned int uflags)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ if (sb_rdonly(sb))
+ return -EROFS;
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -ENOSYS;
+
+ return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags));
+}
+
+STATIC int
+xfs_fs_rm_xquota(
+ struct super_block *sb,
+ unsigned int uflags)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ unsigned int flags = 0;
+
+ if (sb_rdonly(sb))
+ return -EROFS;
+
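+ /* The quota files can only be truncated while quota accounting is off. */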
+ if (XFS_IS_QUOTA_ON(mp))
+ return -EINVAL;
+
+ if (uflags & ~(FS_USER_QUOTA | FS_GROUP_QUOTA | FS_PROJ_QUOTA))
+ return -EINVAL;
+
+ if (uflags & FS_USER_QUOTA)
+ flags |= XFS_QMOPT_UQUOTA;
+ if (uflags & FS_GROUP_QUOTA)
+ flags |= XFS_QMOPT_GQUOTA;
+ if (uflags & FS_PROJ_QUOTA)
+ flags |= XFS_QMOPT_PQUOTA;
+
+ return xfs_qm_scall_trunc_qfiles(mp, flags);
+}
+
+STATIC int
+xfs_fs_get_dqblk(
+ struct super_block *sb,
+ struct kqid qid,
+ struct qc_dqblk *qdq)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ xfs_dqid_t id;
+
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -ENOSYS;
+
+ id = from_kqid(&init_user_ns, qid);
+ return xfs_qm_scall_getquota(mp, id, xfs_quota_type(qid.type), qdq);
+}
+
+/* Return quota info for active quota >= this qid */
+STATIC int
+xfs_fs_get_nextdqblk(
+ struct super_block *sb,
+ struct kqid *qid,
+ struct qc_dqblk *qdq)
+{
+ int ret;
+ struct xfs_mount *mp = XFS_M(sb);
+ xfs_dqid_t id;
+
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -ENOSYS;
+
+ id = from_kqid(&init_user_ns, *qid);
+ ret = xfs_qm_scall_getquota_next(mp, &id, xfs_quota_type(qid->type),
+ qdq);
+ if (ret)
+ return ret;
+
+ /* ID may be different, so convert back what we got */
+ *qid = make_kqid(current_user_ns(), qid->type, id);
+ return 0;
+}
+
+STATIC int
+xfs_fs_set_dqblk(
+ struct super_block *sb,
+ struct kqid qid,
+ struct qc_dqblk *qdq)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ if (sb_rdonly(sb))
+ return -EROFS;
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -ENOSYS;
+
+ return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
+ xfs_quota_type(qid.type), qdq);
+}
+
+const struct quotactl_ops xfs_quotactl_operations = {
+ .get_state = xfs_fs_get_quota_state,
+ .set_info = xfs_fs_set_info,
+ .quota_enable = xfs_quota_enable,
+ .quota_disable = xfs_quota_disable,
+ .rm_xquota = xfs_fs_rm_xquota,
+ .get_dqblk = xfs_fs_get_dqblk,
+ .get_nextdqblk = xfs_fs_get_nextdqblk,
+ .set_dqblk = xfs_fs_set_dqblk,
+};
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
new file mode 100644
index 000000000..858e3e9eb
--- /dev/null
+++ b/fs/xfs/xfs_refcount_item.c
@@ -0,0 +1,723 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_shared.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_refcount_item.h"
+#include "xfs_log.h"
+#include "xfs_refcount.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+
+struct kmem_cache *xfs_cui_cache;
+struct kmem_cache *xfs_cud_cache;
+
+static const struct xfs_item_ops xfs_cui_item_ops;
+
+static inline struct xfs_cui_log_item *CUI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_cui_log_item, cui_item);
+}
+
+STATIC void
+xfs_cui_item_free(
+ struct xfs_cui_log_item *cuip)
+{
+ kmem_free(cuip->cui_item.li_lv_shadow);
+ if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
+ kmem_free(cuip);
+ else
+ kmem_cache_free(xfs_cui_cache, cuip);
+}
+
+/*
+ * Freeing the CUI requires that we remove it from the AIL if it has already
+ * been placed there. However, the CUI may not yet have been placed in the AIL
+ * when called by xfs_cui_release() from CUD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the CUI.
+ */
+STATIC void
+xfs_cui_release(
+ struct xfs_cui_log_item *cuip)
+{
+ ASSERT(atomic_read(&cuip->cui_refcount) > 0);
+ if (!atomic_dec_and_test(&cuip->cui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&cuip->cui_item, 0);
+ xfs_cui_item_free(cuip);
+}
+
+STATIC void
+xfs_cui_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
+
+ *nvecs += 1;
+ *nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given cui log item. We use only 1 iovec, and we point that
+ * at the cui_log_format structure embedded in the cui item.
+ * It is at this point that we assert that all of the extent
+ * slots in the cui item have been filled.
+ */
+STATIC void
+xfs_cui_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ ASSERT(atomic_read(&cuip->cui_next_extent) ==
+ cuip->cui_format.cui_nextents);
+
+ cuip->cui_format.cui_type = XFS_LI_CUI;
+ cuip->cui_format.cui_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format,
+ xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents));
+}
+
+/*
+ * The unpin operation is the last place a CUI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the CUI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the CUI to either construct
+ * and commit the CUD or drop the CUD's reference in the event of error. Simply
+ * drop the log's CUI reference now that the log is done with it.
+ */
+STATIC void
+xfs_cui_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
+
+ xfs_cui_release(cuip);
+}
+
+/*
+ * The CUI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, a CUD isn't going to be
+ * constructed and thus we free the CUI here directly.
+ */
+STATIC void
+xfs_cui_item_release(
+ struct xfs_log_item *lip)
+{
+ xfs_cui_release(CUI_ITEM(lip));
+}
+
+/*
+ * Allocate and initialize a cui item with the given number of extents.
+ */
+STATIC struct xfs_cui_log_item *
+xfs_cui_init(
+ struct xfs_mount *mp,
+ uint nextents)
+{
+ struct xfs_cui_log_item *cuip;
+
+ ASSERT(nextents > 0);
+ if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
+ cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
+ 0);
+ else
+ cuip = kmem_cache_zalloc(xfs_cui_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+
+ xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
+ cuip->cui_format.cui_nextents = nextents;
+ cuip->cui_format.cui_id = (uintptr_t)(void *)cuip;
+ atomic_set(&cuip->cui_next_extent, 0);
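+ /*
+ * Two references: one for the log (dropped at unpin or release time)
+ * and one for whoever finishes or cancels the deferred work.
+ */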
+ atomic_set(&cuip->cui_refcount, 2);
+
+ return cuip;
+}
+
+static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_cud_log_item, cud_item);
+}
+
+STATIC void
+xfs_cud_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_cud_log_format);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given cud log item. We use only 1 iovec, and we point that
+ * at the cud_log_format structure embedded in the cud item.
+ * Unlike the cui, the cud has no extent array, so there are no
+ * extent slots to check here.
+ */
+STATIC void
+xfs_cud_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ cudp->cud_format.cud_type = XFS_LI_CUD;
+ cudp->cud_format.cud_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format,
+ sizeof(struct xfs_cud_log_format));
+}
+
+/*
+ * The CUD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the CUI and free the
+ * CUD.
+ */
+STATIC void
+xfs_cud_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
+
+ xfs_cui_release(cudp->cud_cuip);
+ kmem_free(cudp->cud_item.li_lv_shadow);
+ kmem_cache_free(xfs_cud_cache, cudp);
+}
+
+static struct xfs_log_item *
+xfs_cud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &CUD_ITEM(lip)->cud_cuip->cui_item;
+}
+
+static const struct xfs_item_ops xfs_cud_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
+ .iop_size = xfs_cud_item_size,
+ .iop_format = xfs_cud_item_format,
+ .iop_release = xfs_cud_item_release,
+ .iop_intent = xfs_cud_item_intent,
+};
+
+static struct xfs_cud_log_item *
+xfs_trans_get_cud(
+ struct xfs_trans *tp,
+ struct xfs_cui_log_item *cuip)
+{
+ struct xfs_cud_log_item *cudp;
+
+ cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
+ &xfs_cud_item_ops);
+ cudp->cud_cuip = cuip;
+ cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
+
+ xfs_trans_add_item(tp, &cudp->cud_item);
+ return cudp;
+}
+
+/*
+ * Finish a refcount update and log it to the CUD. Note that the
+ * transaction is marked dirty regardless of whether the refcount
+ * update succeeds or fails, in order to support the CUI/CUD lifecycle rules.
+ */
+static int
+xfs_trans_log_finish_refcount_update(
+ struct xfs_trans *tp,
+ struct xfs_cud_log_item *cudp,
+ enum xfs_refcount_intent_type type,
+ xfs_fsblock_t startblock,
+ xfs_extlen_t blockcount,
+ xfs_fsblock_t *new_fsb,
+ xfs_extlen_t *new_len,
+ struct xfs_btree_cur **pcur)
+{
+ int error;
+
+ error = xfs_refcount_finish_one(tp, type, startblock,
+ blockcount, new_fsb, new_len, pcur);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the CUI and frees the CUD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
+ set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
+
+ return error;
+}
+
+/* Sort refcount intents by AG. */
+static int
+xfs_refcount_update_diff_items(
+ void *priv,
+ const struct list_head *a,
+ const struct list_head *b)
+{
+ struct xfs_mount *mp = priv;
+ struct xfs_refcount_intent *ra;
+ struct xfs_refcount_intent *rb;
+
+ ra = container_of(a, struct xfs_refcount_intent, ri_list);
+ rb = container_of(b, struct xfs_refcount_intent, ri_list);
+ return XFS_FSB_TO_AGNO(mp, ra->ri_startblock) -
+ XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
+}
+
+/* Set the phys extent flags for this refcount update. */
+static void
+xfs_trans_set_refcount_flags(
+ struct xfs_phys_extent *refc,
+ enum xfs_refcount_intent_type type)
+{
+ refc->pe_flags = 0;
+ switch (type) {
+ case XFS_REFCOUNT_INCREASE:
+ case XFS_REFCOUNT_DECREASE:
+ case XFS_REFCOUNT_ALLOC_COW:
+ case XFS_REFCOUNT_FREE_COW:
+ refc->pe_flags |= type;
+ break;
+ default:
+ ASSERT(0);
+ }
+}
+
+/* Log refcount updates in the intent item. */
+STATIC void
+xfs_refcount_update_log_item(
+ struct xfs_trans *tp,
+ struct xfs_cui_log_item *cuip,
+ struct xfs_refcount_intent *refc)
+{
+ uint next_extent;
+ struct xfs_phys_extent *ext;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
+
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1;
+ ASSERT(next_extent < cuip->cui_format.cui_nextents);
+ ext = &cuip->cui_format.cui_extents[next_extent];
+ ext->pe_startblock = refc->ri_startblock;
+ ext->pe_len = refc->ri_blockcount;
+ xfs_trans_set_refcount_flags(ext, refc->ri_type);
+}
+
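+/* Create and log a CUI intent item for all the deferred refcount updates. */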
+static struct xfs_log_item *
+xfs_refcount_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count);
+ struct xfs_refcount_intent *refc;
+
+ ASSERT(count > 0);
+
+ xfs_trans_add_item(tp, &cuip->cui_item);
+ if (sort)
+ list_sort(mp, items, xfs_refcount_update_diff_items);
+ list_for_each_entry(refc, items, ri_list)
+ xfs_refcount_update_log_item(tp, cuip, refc);
+ return &cuip->cui_item;
+}
+
+/* Get a CUD so we can process all the deferred refcount updates. */
+static struct xfs_log_item *
+xfs_refcount_update_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item;
+}
+
+/* Process a deferred refcount update. */
+STATIC int
+xfs_refcount_update_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_refcount_intent *refc;
+ xfs_fsblock_t new_fsb;
+ xfs_extlen_t new_aglen;
+ int error;
+
+ refc = container_of(item, struct xfs_refcount_intent, ri_list);
+ error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done),
+ refc->ri_type, refc->ri_startblock, refc->ri_blockcount,
+ &new_fsb, &new_aglen, state);
+
+ /* Did we run out of reservation? Requeue what we didn't finish. */
+ if (!error && new_aglen > 0) {
+ ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE ||
+ refc->ri_type == XFS_REFCOUNT_DECREASE);
+ refc->ri_startblock = new_fsb;
+ refc->ri_blockcount = new_aglen;
+ return -EAGAIN;
+ }
+ kmem_cache_free(xfs_refcount_intent_cache, refc);
+ return error;
+}
+
+/* Abort all pending CUIs. */
+STATIC void
+xfs_refcount_update_abort_intent(
+ struct xfs_log_item *intent)
+{
+ xfs_cui_release(CUI_ITEM(intent));
+}
+
+/* Cancel a deferred refcount update. */
+STATIC void
+xfs_refcount_update_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_refcount_intent *refc;
+
+ refc = container_of(item, struct xfs_refcount_intent, ri_list);
+ kmem_cache_free(xfs_refcount_intent_cache, refc);
+}
+
+const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
+ .max_items = XFS_CUI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_refcount_update_create_intent,
+ .abort_intent = xfs_refcount_update_abort_intent,
+ .create_done = xfs_refcount_update_create_done,
+ .finish_item = xfs_refcount_update_finish_item,
+ .finish_cleanup = xfs_refcount_finish_one_cleanup,
+ .cancel_item = xfs_refcount_update_cancel_item,
+};
+
+/* Is this recovered CUI ok? */
+static inline bool
+xfs_cui_validate_phys(
+ struct xfs_mount *mp,
+ struct xfs_phys_extent *refc)
+{
+ if (!xfs_has_reflink(mp))
+ return false;
+
+ if (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)
+ return false;
+
+ switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) {
+ case XFS_REFCOUNT_INCREASE:
+ case XFS_REFCOUNT_DECREASE:
+ case XFS_REFCOUNT_ALLOC_COW:
+ case XFS_REFCOUNT_FREE_COW:
+ break;
+ default:
+ return false;
+ }
+
+ return xfs_verify_fsbext(mp, refc->pe_startblock, refc->pe_len);
+}
+
+/*
+ * Process a refcount update intent item that was recovered from the log.
+ * We need to update the refcountbt.
+ */
+STATIC int
+xfs_cui_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
+ struct xfs_phys_extent *refc;
+ struct xfs_cud_log_item *cudp;
+ struct xfs_trans *tp;
+ struct xfs_btree_cur *rcur = NULL;
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ xfs_fsblock_t new_fsb;
+ xfs_extlen_t new_len;
+ unsigned int refc_type;
+ bool requeue_only = false;
+ enum xfs_refcount_intent_type type;
+ int i;
+ int error = 0;
+
+ /*
+ * First check the validity of the extents described by the
+ * CUI. If any are bad, then assume that all are bad and
+ * just toss the CUI.
+ */
+ for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
+ if (!xfs_cui_validate_phys(mp,
+ &cuip->cui_format.cui_extents[i])) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &cuip->cui_format,
+ sizeof(cuip->cui_format));
+ return -EFSCORRUPTED;
+ }
+ }
+
+ /*
+ * Under normal operation, refcount updates are deferred, so we
+ * wouldn't be adding them directly to a transaction. All
+ * refcount updates manage reservation usage internally and
+ * dynamically by deferring work that won't fit in the
+ * transaction. Normally, any work that needs to be deferred
+ * gets attached to the same defer_ops that scheduled the
+ * refcount update. However, we're in log recovery here, so we
+ * use the passed-in defer_ops to finish up any work that
+ * doesn't fit. We need to reserve enough blocks to handle a
+ * full btree split on either end of the refcount range.
+ */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ cudp = xfs_trans_get_cud(tp, cuip);
+
+ for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
+ refc = &cuip->cui_format.cui_extents[i];
+ refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
+ switch (refc_type) {
+ case XFS_REFCOUNT_INCREASE:
+ case XFS_REFCOUNT_DECREASE:
+ case XFS_REFCOUNT_ALLOC_COW:
+ case XFS_REFCOUNT_FREE_COW:
+ type = refc_type;
+ break;
+ default:
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &cuip->cui_format,
+ sizeof(cuip->cui_format));
+ error = -EFSCORRUPTED;
+ goto abort_error;
+ }
+ if (requeue_only) {
+ new_fsb = refc->pe_startblock;
+ new_len = refc->pe_len;
+ } else
+ error = xfs_trans_log_finish_refcount_update(tp, cudp,
+ type, refc->pe_startblock, refc->pe_len,
+ &new_fsb, &new_len, &rcur);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &cuip->cui_format,
+ sizeof(cuip->cui_format));
+ if (error)
+ goto abort_error;
+
+ /* Requeue what we didn't finish. */
+ if (new_len > 0) {
+ irec.br_startblock = new_fsb;
+ irec.br_blockcount = new_len;
+ switch (type) {
+ case XFS_REFCOUNT_INCREASE:
+ xfs_refcount_increase_extent(tp, &irec);
+ break;
+ case XFS_REFCOUNT_DECREASE:
+ xfs_refcount_decrease_extent(tp, &irec);
+ break;
+ case XFS_REFCOUNT_ALLOC_COW:
+ xfs_refcount_alloc_cow_extent(tp,
+ irec.br_startblock,
+ irec.br_blockcount);
+ break;
+ case XFS_REFCOUNT_FREE_COW:
+ xfs_refcount_free_cow_extent(tp,
+ irec.br_startblock,
+ irec.br_blockcount);
+ break;
+ default:
+ ASSERT(0);
+ }
+ requeue_only = true;
+ }
+ }
+
+ xfs_refcount_finish_one_cleanup(tp, rcur, error);
+ return xfs_defer_ops_capture_and_commit(tp, capture_list);
+
+abort_error:
+ xfs_refcount_finish_one_cleanup(tp, rcur, error);
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+STATIC bool
+xfs_cui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return CUI_ITEM(lip)->cui_format.cui_id == intent_id;
+}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_cui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_cud_log_item *cudp;
+ struct xfs_cui_log_item *cuip;
+ struct xfs_phys_extent *extp;
+ unsigned int count;
+
+ count = CUI_ITEM(intent)->cui_format.cui_nextents;
+ extp = CUI_ITEM(intent)->cui_format.cui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
+
+ cuip = xfs_cui_init(tp->t_mountp, count);
+ memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp));
+ atomic_set(&cuip->cui_next_extent, count);
+ xfs_trans_add_item(tp, &cuip->cui_item);
+ set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
+ return &cuip->cui_item;
+}
+
+static const struct xfs_item_ops xfs_cui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_cui_item_size,
+ .iop_format = xfs_cui_item_format,
+ .iop_unpin = xfs_cui_item_unpin,
+ .iop_release = xfs_cui_item_release,
+ .iop_recover = xfs_cui_item_recover,
+ .iop_match = xfs_cui_item_match,
+ .iop_relog = xfs_cui_item_relog,
+};
+
+static inline void
+xfs_cui_copy_format(
+ struct xfs_cui_log_format *dst,
+ const struct xfs_cui_log_format *src)
+{
+ unsigned int i;
+
+ memcpy(dst, src, offsetof(struct xfs_cui_log_format, cui_extents));
+
+ for (i = 0; i < src->cui_nextents; i++)
+ memcpy(&dst->cui_extents[i], &src->cui_extents[i],
+ sizeof(struct xfs_phys_extent));
+}
+
+/*
+ * This routine is called to create an in-core extent refcount update
+ * item from the cui format structure which was logged on disk.
+ * It allocates an in-core cui, copies the extents from the format
+ * structure into it, and adds the cui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_cui_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_cui_log_item *cuip;
+ struct xfs_cui_log_format *cui_formatp;
+ size_t len;
+
+ cui_formatp = item->ri_buf[0].i_addr;
+
+ if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
+ xfs_cui_copy_format(&cuip->cui_format, cui_formatp);
+ atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
+ /*
+ * Insert the intent into the AIL directly and drop one reference so
+ * that finishing or canceling the work will drop the other.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &cuip->cui_item, lsn);
+ xfs_cui_release(cuip);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_cui_item_ops = {
+ .item_type = XFS_LI_CUI,
+ .commit_pass2 = xlog_recover_cui_commit_pass2,
+};
+
+/*
+ * This routine is called when a CUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding CUI if it
+ * was still in the log. To do this it searches the AIL for the CUI with an id
+ * equal to that in the CUD format structure. If we find it we drop the CUD
+ * reference, which removes the CUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_cud_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_cud_log_format *cud_formatp;
+
+ cud_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_CUI, cud_formatp->cud_cui_id);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_cud_item_ops = {
+ .item_type = XFS_LI_CUD,
+ .commit_pass2 = xlog_recover_cud_commit_pass2,
+};
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
new file mode 100644
index 000000000..eb0ab1368
--- /dev/null
+++ b/fs/xfs/xfs_refcount_item.h
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_REFCOUNT_ITEM_H__
+#define __XFS_REFCOUNT_ITEM_H__
+
+/*
+ * There are (currently) two pairs of refcount btree redo item types:
+ * increase and decrease. The log items for these are CUI (refcount
+ * update intent) and CUD (refcount update done). The redo item type
+ * is encoded in the flags field of each xfs_phys_extent.
+ *
+ * *I items should be recorded in the *first* of a series of rolled
+ * transactions, and the *D items should be recorded in the same
+ * transaction that records the associated refcountbt updates.
+ *
+ * Should the system crash after the commit of the first transaction
+ * but before the commit of the final transaction in a series, log
+ * recovery will use the redo information recorded by the intent items
+ * to replay the refcountbt metadata updates.
+ */
+
+/* kernel only CUI/CUD definitions */
+
+struct xfs_mount;
+struct kmem_cache;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define XFS_CUI_MAX_FAST_EXTENTS 16
+
+/*
+ * This is the "refcount update intent" log item. It is used to log
+ * the fact that some reference counts need to change. It is used in
+ * conjunction with the "refcount update done" log item described
+ * below.
+ *
+ * These log items follow the same rules as struct xfs_efi_log_item;
+ * see the comments about that structure (in xfs_extfree_item.h) for
+ * more details.
+ */
+struct xfs_cui_log_item {
+ struct xfs_log_item cui_item;
+ atomic_t cui_refcount;
+ atomic_t cui_next_extent;
+ struct xfs_cui_log_format cui_format;
+};
+
+static inline size_t
+xfs_cui_log_item_sizeof(
+ unsigned int nr)
+{
+ return offsetof(struct xfs_cui_log_item, cui_format) +
+ xfs_cui_log_format_sizeof(nr);
+}
+
+/*
+ * This is the "refcount update done" log item. It is used to log the
+ * fact that some refcountbt updates mentioned in an earlier cui item
+ * have been performed.
+ */
+struct xfs_cud_log_item {
+ struct xfs_log_item cud_item;
+ struct xfs_cui_log_item *cud_cuip;
+ struct xfs_cud_log_format cud_format;
+};
+
+extern struct kmem_cache *xfs_cui_cache;
+extern struct kmem_cache *xfs_cud_cache;
+
+#endif /* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
new file mode 100644
index 000000000..93bdd2568
--- /dev/null
+++ b/fs/xfs/xfs_reflink.c
@@ -0,0 +1,1715 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_bit.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_reflink.h"
+#include "xfs_iomap.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+
+/*
+ * Copy on Write of Shared Blocks
+ *
+ * XFS must preserve "the usual" file semantics even when two files share
+ * the same physical blocks. This means that a write to one file must not
+ * alter the blocks in a different file; the way that we'll do that is
+ * through the use of a copy-on-write mechanism. At a high level, that
+ * means that when we want to write to a shared block, we allocate a new
+ * block, write the data to the new block, and if that succeeds we map the
+ * new block into the file.
+ *
+ * XFS provides a "delayed allocation" mechanism that defers the allocation
+ * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
+ * possible. This reduces fragmentation by enabling the filesystem to ask
+ * for bigger chunks less often, which is exactly what we want for CoW.
+ *
+ * The delalloc mechanism begins when the kernel wants to make a block
+ * writable (write_begin or page_mkwrite). If the offset is not mapped, we
+ * create a delalloc mapping, which is a regular in-core extent, but without
+ * a real startblock. (For delalloc mappings, the startblock encodes both
+ * a flag that this is a delalloc mapping, and a worst-case estimate of how
+ * many blocks might be required to put the mapping into the BMBT.) delalloc
+ * mappings are a reservation against the free space in the filesystem;
+ * adjacent mappings can also be combined into fewer larger mappings.
+ *
+ * As an optimization, the CoW extent size hint (cowextsz) creates
+ * outsized aligned delalloc reservations in the hope of landing out of
+ * order nearby CoW writes in a single extent on disk, thereby reducing
+ * fragmentation and improving future performance.
+ *
+ * D: --RRRRRRSSSRRRRRRRR--- (data fork)
+ * C: ------DDDDDDD--------- (CoW fork)
+ *
+ * When dirty pages are being written out (typically in writepage), the
+ * delalloc reservations are converted into unwritten mappings by
+ * allocating blocks and replacing the delalloc mapping with real ones.
+ * A delalloc mapping can be replaced by several unwritten ones if the
+ * free space is fragmented.
+ *
+ * D: --RRRRRRSSSRRRRRRRR---
+ * C: ------UUUUUUU---------
+ *
+ * We want to adapt the delalloc mechanism for copy-on-write, since the
+ * write paths are similar. The first two steps (creating the reservation
+ * and allocating the blocks) are exactly the same as delalloc except that
+ * the mappings must be stored in a separate CoW fork because we do not want
+ * to disturb the mapping in the data fork until we're sure that the write
+ * succeeded. IO completion in this case is the process of removing the old
+ * mapping from the data fork and moving the new mapping from the CoW fork to
+ * the data fork. This will be discussed shortly.
+ *
+ * For now, unaligned directio writes will be bounced back to the page cache.
+ * Block-aligned directio writes will use the same mechanism as buffered
+ * writes.
+ *
+ * Just prior to submitting the actual disk write requests, we convert
+ * the extents representing the range of the file actually being written
+ * (as opposed to extra pieces created for the cowextsize hint) to real
+ * extents. This will become important in the next step:
+ *
+ * D: --RRRRRRSSSRRRRRRRR---
+ * C: ------UUrrUUU---------
+ *
+ * CoW remapping must be done after the data block write completes,
+ * because we don't want to destroy the old data fork map until we're sure
+ * the new block has been written. Since the new mappings are kept in a
+ * separate fork, we can simply iterate these mappings to find the ones
+ * that cover the file blocks that we just CoW'd. For each extent, simply
+ * unmap the corresponding range in the data fork, map the new range into
+ * the data fork, and remove the extent from the CoW fork. Because of
+ * the presence of the cowextsize hint, however, we must be careful
+ * only to remap the blocks that we've actually written out -- we must
+ * never remap delalloc reservations nor CoW staging blocks that have
+ * yet to be written. This corresponds exactly to the real extents in
+ * the CoW fork:
+ *
+ * D: --RRRRRRrrSRRRRRRRR---
+ * C: ------UU--UUU---------
+ *
+ * Since the remapping operation can be applied to an arbitrary file
+ * range, we record the need for the remap step as a flag in the ioend
+ * instead of declaring a new IO type. This is required for direct io
+ * because we only have ioend for the whole dio, and we have to be able to
+ * remember the presence of unwritten blocks and CoW blocks with a single
+ * ioend structure. Better yet, the more ground we can cover with one
+ * ioend, the better.
+ */
+
+/*
+ * Given an AG extent, find the lowest-numbered run of shared blocks
+ * within that range and return the range in fbno/flen. If
+ * find_end_of_shared is true, return the longest contiguous extent of
+ * shared blocks. If there are no shared extents, fbno and flen will
+ * be set to NULLAGBLOCK and 0, respectively.
+ */
+static int
+xfs_reflink_find_shared(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ xfs_agblock_t *fbno,
+ xfs_extlen_t *flen,
+ bool find_end_of_shared)
+{
+ struct xfs_buf *agbp;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+ if (error)
+ return error;
+
+ cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag);
+
+ error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
+ find_end_of_shared);
+
+ xfs_btree_del_cursor(cur, error);
+
+ xfs_trans_brelse(tp, agbp);
+ return error;
+}
+
+/*
+ * Trim the mapping to the next block where there's a change in the
+ * shared/unshared status. More specifically, this means that we
+ * find the lowest-numbered extent of shared blocks that coincides with
+ * the given block mapping. If the shared extent overlaps the start of
+ * the mapping, trim the mapping to the end of the shared extent. If
+ * the shared region intersects the mapping, trim the mapping to the
+ * start of the shared extent. If there are no shared regions that
+ * overlap, just return the original extent.
+ */
+int
+xfs_reflink_trim_around_shared(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *irec,
+ bool *shared)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ xfs_agblock_t agbno;
+ xfs_extlen_t aglen;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int error = 0;
+
+ /* Holes, unwritten, and delalloc extents cannot be shared */
+ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
+ *shared = false;
+ return 0;
+ }
+
+ trace_xfs_reflink_trim_around_shared(ip, irec);
+
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock));
+ agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
+ aglen = irec->br_blockcount;
+
+ error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen,
+ true);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ *shared = false;
+ if (fbno == NULLAGBLOCK) {
+ /* No shared blocks at all. */
+ return 0;
+ }
+
+ if (fbno == agbno) {
+ /*
+ * The start of this extent is shared. Truncate the
+ * mapping at the end of the shared region so that a
+ * subsequent iteration starts at the start of the
+ * unshared region.
+ */
+ irec->br_blockcount = flen;
+ *shared = true;
+ return 0;
+ }
+
+ /*
+ * There's a shared extent midway through this extent.
+ * Truncate the mapping at the start of the shared
+ * extent so that a subsequent iteration starts at the
+ * start of the shared region.
+ */
+ irec->br_blockcount = fbno - agbno;
+ return 0;
+}
+
+int
+xfs_bmap_trim_cow(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ bool *shared)
+{
+ /* We can't update any real extents in always COW mode. */
+ if (xfs_is_always_cow_inode(ip) &&
+ !isnullstartblock(imap->br_startblock)) {
+ *shared = true;
+ return 0;
+ }
+
+ /* Trim the mapping to the nearest shared extent boundary. */
+ return xfs_reflink_trim_around_shared(ip, imap, shared);
+}
+
+static int
+xfs_reflink_convert_cow_locked(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_filblks_t count_fsb)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ struct xfs_btree_cur *dummy_cur = NULL;
+ int dummy_logflags;
+ int error = 0;
+
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
+ return 0;
+
+ do {
+ if (got.br_startoff >= offset_fsb + count_fsb)
+ break;
+ if (got.br_state == XFS_EXT_NORM)
+ continue;
+ if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
+ return -EIO;
+
+ xfs_trim_extent(&got, offset_fsb, count_fsb);
+ if (!got.br_blockcount)
+ continue;
+
+ got.br_state = XFS_EXT_NORM;
+ error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
+ XFS_COW_FORK, &icur, &dummy_cur, &got,
+ &dummy_logflags);
+ if (error)
+ return error;
+ } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
+
+ return error;
+}
+
+/* Convert all of the unwritten CoW extents in a file's range to real ones. */
+int
+xfs_reflink_convert_cow(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
+ xfs_filblks_t count_fsb = end_fsb - offset_fsb;
+ int error;
+
+ ASSERT(count != 0);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Find the extent that maps the given range in the COW fork. Even if the extent
+ * is not shared, we might have a preallocation for it in the COW fork. If so,
+ * we use that rather than trigger a new allocation.
+ */
+static int
+xfs_find_trim_cow_extent(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ bool *found)
+{
+ xfs_fileoff_t offset_fsb = imap->br_startoff;
+ xfs_filblks_t count_fsb = imap->br_blockcount;
+ struct xfs_iext_cursor icur;
+
+ *found = false;
+
+ /*
+ * If we don't find an overlapping extent, trim the range we need to
+ * allocate to fit the hole we found.
+ */
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap))
+ cmap->br_startoff = offset_fsb + count_fsb;
+ if (cmap->br_startoff > offset_fsb) {
+ xfs_trim_extent(imap, imap->br_startoff,
+ cmap->br_startoff - imap->br_startoff);
+ return xfs_bmap_trim_cow(ip, imap, shared);
+ }
+
+ *shared = true;
+ if (isnullstartblock(cmap->br_startblock)) {
+ xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount);
+ return 0;
+ }
+
+ /* real extent found - no need to allocate */
+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
+ *found = true;
+ return 0;
+}
+
+static int
+xfs_reflink_convert_unwritten(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool convert_now)
+{
+ xfs_fileoff_t offset_fsb = imap->br_startoff;
+ xfs_filblks_t count_fsb = imap->br_blockcount;
+ int error;
+
+ /*
+ * cmap might be larger than imap due to the cowextsize hint.
+ */
+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
+
+ /*
+ * COW fork extents are supposed to remain unwritten until we're ready
+ * to initiate a disk write. For direct I/O we are going to write the
+ * data and need the conversion, but for buffered writes we're done.
+ */
+ if (!convert_now || cmap->br_state == XFS_EXT_NORM)
+ return 0;
+
+ trace_xfs_reflink_convert_cow(ip, cmap);
+
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ if (!error)
+ cmap->br_state = XFS_EXT_NORM;
+
+ return error;
+}
+
+static int
+xfs_reflink_fill_cow_hole(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ uint *lockmode,
+ bool convert_now)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ xfs_filblks_t resaligned;
+ xfs_extlen_t resblks;
+ int nimaps;
+ int error;
+ bool found;
+
+ resaligned = xfs_aligned_fsb_count(imap->br_startoff,
+ imap->br_blockcount, xfs_get_cowextsz_hint(ip));
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+
+ xfs_iunlock(ip, *lockmode);
+ *lockmode = 0;
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
+ false, &tp);
+ if (error)
+ return error;
+
+ *lockmode = XFS_ILOCK_EXCL;
+
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
+ if (error || !*shared)
+ goto out_trans_cancel;
+
+ if (found) {
+ xfs_trans_cancel(tp);
+ goto convert;
+ }
+
+ ASSERT(cmap->br_startoff > imap->br_startoff);
+
+ /* Allocate the entire reservation as unwritten blocks. */
+ nimaps = 1;
+ error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
+ &nimaps);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_inode_set_cowblocks_tag(ip);
+ error = xfs_trans_commit(tp);
+ if (error)
+ return error;
+
+ /*
+ * Allocation succeeded but the requested range was not even partially
+ * satisfied? Bail out!
+ */
+ if (nimaps == 0)
+ return -ENOSPC;
+
+convert:
+ return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+static int
+xfs_reflink_fill_delalloc(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ uint *lockmode,
+ bool convert_now)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int nimaps;
+ int error;
+ bool found;
+
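+ /*
+ * Keep converting delalloc reservations to unwritten extents until
+ * the COW mapping extends past the start of the range being filled
+ * (or until a real extent is found).
+ */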
+ do {
+ xfs_iunlock(ip, *lockmode);
+ *lockmode = 0;
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
+ false, &tp);
+ if (error)
+ return error;
+
+ *lockmode = XFS_ILOCK_EXCL;
+
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
+ &found);
+ if (error || !*shared)
+ goto out_trans_cancel;
+
+ if (found) {
+ xfs_trans_cancel(tp);
+ break;
+ }
+
+ ASSERT(isnullstartblock(cmap->br_startblock) ||
+ cmap->br_startblock == DELAYSTARTBLOCK);
+
+ /*
+ * Replace delalloc reservation with an unwritten extent.
+ */
+ nimaps = 1;
+ error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
+ cmap->br_blockcount,
+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
+ cmap, &nimaps);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_inode_set_cowblocks_tag(ip);
+ error = xfs_trans_commit(tp);
+ if (error)
+ return error;
+
+ /*
+ * Allocation succeeded but the requested range was not even
+ * partially satisfied? Bail out!
+ */
+ if (nimaps == 0)
+ return -ENOSPC;
+ } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
+
+ return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+/* Allocate all CoW reservations covering a range of blocks in a file. */
+int
+xfs_reflink_allocate_cow(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ uint *lockmode,
+ bool convert_now)
+{
+ int error;
+ bool found;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
+ if (error || !*shared)
+ return error;
+
+ /* CoW fork has a real extent */
+ if (found)
+ return xfs_reflink_convert_unwritten(ip, imap, cmap,
+ convert_now);
+
+ /*
+ * CoW fork does not have an extent and data extent is shared.
+ * Allocate a real extent in the CoW fork.
+ */
+ if (cmap->br_startoff > imap->br_startoff)
+ return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
+ lockmode, convert_now);
+
+ /*
+ * CoW fork has a delalloc reservation. Replace it with a real extent.
+ * There may or may not be a data fork mapping.
+ */
+ if (isnullstartblock(cmap->br_startblock) ||
+ cmap->br_startblock == DELAYSTARTBLOCK)
+ return xfs_reflink_fill_delalloc(ip, imap, cmap, shared,
+ lockmode, convert_now);
+
+ /* Shouldn't get here. */
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Cancel CoW reservations for some block range of an inode.
+ *
+ * If cancel_real is true this function cancels all COW fork extents for the
+ * inode; if cancel_real is false, real extents are not cleared.
+ *
+ * Caller must have already joined the inode to the current transaction. The
+ * inode will be joined to the transaction returned to the caller.
+ */
+int
+xfs_reflink_cancel_cow_blocks(
+ struct xfs_inode *ip,
+ struct xfs_trans **tpp,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb,
+ bool cancel_real)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
+ struct xfs_bmbt_irec got, del;
+ struct xfs_iext_cursor icur;
+ int error = 0;
+
+ if (!xfs_inode_has_cow_data(ip))
+ return 0;
+ if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
+ return 0;
+
+ /* Walk backwards until we're out of the I/O range... */
+ while (got.br_startoff + got.br_blockcount > offset_fsb) {
+ del = got;
+ xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
+
+ /* Extent delete may have bumped ext forward */
+ if (!del.br_blockcount) {
+ xfs_iext_prev(ifp, &icur);
+ goto next_extent;
+ }
+
+ trace_xfs_reflink_cancel_cow(ip, &del);
+
+ if (isnullstartblock(del.br_startblock)) {
+ error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
+ &icur, &got, &del);
+ if (error)
+ break;
+ } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
+ ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);
+
+ /* Free the CoW orphan record. */
+ xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
+ del.br_blockcount);
+
+ xfs_free_extent_later(*tpp, del.br_startblock,
+ del.br_blockcount, NULL);
+
+ /* Roll the transaction */
+ error = xfs_defer_finish(tpp);
+ if (error)
+ break;
+
+ /* Remove the mapping from the CoW fork. */
+ xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
+
+ /* Remove the quota reservation */
+ error = xfs_quota_unreserve_blkres(ip,
+ del.br_blockcount);
+ if (error)
+ break;
+ } else {
+ /* Didn't do anything, push cursor back. */
+ xfs_iext_prev(ifp, &icur);
+ }
+next_extent:
+ if (!xfs_iext_get_extent(ifp, &icur, &got))
+ break;
+ }
+
+ /* clear tag if cow fork is emptied */
+ if (!ifp->if_bytes)
+ xfs_inode_clear_cowblocks_tag(ip);
+ return error;
+}
+
+/*
+ * Cancel CoW reservations for some byte range of an inode.
+ *
+ * If cancel_real is true this function cancels all COW fork extents for the
+ * inode; if cancel_real is false, real extents are not cleared.
+ */
+int
+xfs_reflink_cancel_cow_range(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count,
+ bool cancel_real)
+{
+ struct xfs_trans *tp;
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t end_fsb;
+ int error;
+
+ trace_xfs_reflink_cancel_cow_range(ip, offset, count);
+ ASSERT(ip->i_cowfp);
+
+ offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+ if (count == NULLFILEOFF)
+ end_fsb = NULLFILEOFF;
+ else
+ end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+
+ /* Start a rolling transaction to remove the mappings */
+ error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
+ 0, 0, 0, &tp);
+ if (error)
+ goto out;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /* Scrape out the old CoW reservations */
+ error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
+ cancel_real);
+ if (error)
+ goto out_cancel;
+
+ error = xfs_trans_commit(tp);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+ trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Remap part of the CoW fork into the data fork.
+ *
+ * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
+ * into the data fork; this function will remap what it can (at the end of the
+ * range) and update @end_fsb appropriately. Each remap gets its own
+ * transaction because we can end up merging and splitting bmbt blocks for
+ * every remap operation and we'd like to keep the block reservation
+ * requirements as low as possible.
+ */
+STATIC int
+xfs_reflink_end_cow_extent(
+ struct xfs_inode *ip,
+ xfs_fileoff_t *offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got, del, data;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
+ unsigned int resblks;
+ int nmaps;
+ int error;
+
+ /* No COW extents? That's easy! */
+ if (ifp->if_bytes == 0) {
+ *offset_fsb = end_fsb;
+ return 0;
+ }
+
+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ /*
+ * Lock the inode. We have to ijoin without automatic unlock because
+ * the lead transaction is the refcountbt record deletion; the data
+ * fork update follows as a deferred log item.
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error)
+ goto out_cancel;
+
+ /*
+ * In the case of racing, overlapping AIO writes, no COW extents might
+ * be left by the time I/O completes for the loser of the race. In
+ * that case we are done.
+ */
+ if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
+ got.br_startoff >= end_fsb) {
+ *offset_fsb = end_fsb;
+ goto out_cancel;
+ }
+
+ /*
+ * Only remap real extents that contain data. With AIO, speculative
+ * preallocations can leak into the range we are called upon, and we
+ * need to skip them. Preserve @got for the eventual CoW fork
+ * deletion; from now on @del represents the mapping that we're
+ * actually remapping.
+ */
+ while (!xfs_bmap_is_written_extent(&got)) {
+ if (!xfs_iext_next_extent(ifp, &icur, &got) ||
+ got.br_startoff >= end_fsb) {
+ *offset_fsb = end_fsb;
+ goto out_cancel;
+ }
+ }
+ del = got;
+
+ /* Grab the corresponding mapping in the data fork. */
+ nmaps = 1;
+ error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
+ &nmaps, 0);
+ if (error)
+ goto out_cancel;
+
+ /* We can only remap the smaller of the two extent sizes. */
+ data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
+ del.br_blockcount = data.br_blockcount;
+
+ trace_xfs_reflink_cow_remap_from(ip, &del);
+ trace_xfs_reflink_cow_remap_to(ip, &data);
+
+ if (xfs_bmap_is_real_extent(&data)) {
+ /*
+ * If the extent we're remapping is backed by storage (written
+ * or not), unmap the extent and drop its refcount.
+ */
+ xfs_bmap_unmap_extent(tp, ip, &data);
+ xfs_refcount_decrease_extent(tp, &data);
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
+ -data.br_blockcount);
+ } else if (data.br_startblock == DELAYSTARTBLOCK) {
+ int done;
+
+ /*
+ * If the extent we're remapping is a delalloc reservation,
+ * we can use the regular bunmapi function to release the
+ * incore state. Dropping the delalloc reservation takes care
+ * of the quota reservation for us.
+ */
+ error = xfs_bunmapi(NULL, ip, data.br_startoff,
+ data.br_blockcount, 0, 1, &done);
+ if (error)
+ goto out_cancel;
+ ASSERT(done);
+ }
+
+ /* Free the CoW orphan record. */
+ xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
+
+ /* Map the new blocks into the data fork. */
+ xfs_bmap_map_extent(tp, ip, &del);
+
+ /* Charge this new data fork mapping to the on-disk quota. */
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
+ (long)del.br_blockcount);
+
+ /* Remove the mapping from the CoW fork. */
+ xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
+
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ /* Update the caller about how much progress we made. */
+ *offset_fsb = del.br_startoff + del.br_blockcount;
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Remap parts of a file's data fork after a successful CoW.
+ */
+int
+xfs_reflink_end_cow(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count)
+{
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t end_fsb;
+ int error = 0;
+
+ trace_xfs_reflink_end_cow(ip, offset, count);
+
+ offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+ end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+
+ /*
+ * Walk forwards until we've remapped the I/O range. The loop function
+ * repeatedly cycles the ILOCK to allocate one transaction per remapped
+ * extent.
+ *
+ * If we're being called by writeback then the pages will still
+ * have PageWriteback set, which prevents races with reflink remapping
+ * and truncate. Reflink remapping prevents races with writeback by
+ * taking the iolock and mmaplock before flushing the pages and
+ * remapping, which means there won't be any further writeback or page
+ * cache dirtying until the reflink completes.
+ *
+ * We should never have two threads issuing writeback for the same file
+ * region. There are also post-eof checks in the writeback
+ * preparation code so that we don't bother writing out pages that are
+ * about to be truncated.
+ *
+ * If we're being called as part of directio write completion, the dio
+ * count is still elevated, which reflink and truncate will wait for.
+ * Reflink remapping takes the iolock and mmaplock and waits for
+ * pending dio to finish, which should prevent any directio until the
+ * remap completes. Multiple concurrent directio writes to the same
+ * region are handled by end_cow processing only occurring for the
+ * threads which succeed; the outcome of multiple overlapping direct
+ * writes is not well defined anyway.
+ *
+ * It's possible that a buffered write and a direct write could collide
+ * here (the buffered write stumbles in after the dio flushes and
+ * invalidates the page cache and immediately queues writeback), but we
+ * have never supported this 100%. If either disk write succeeds the
+ * blocks will be remapped.
+ */
+ while (end_fsb > offset_fsb && !error)
+ error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
+
+ if (error)
+ trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Free all CoW staging blocks that are still referenced by the ondisk refcount
+ * metadata. The ondisk metadata does not track which inode created the
+ * staging extent, so callers must ensure that there are no cached inodes with
+ * live CoW staging extents.
+ */
+int
+xfs_reflink_recover_cow(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ int error = 0;
+
+ if (!xfs_has_reflink(mp))
+ return 0;
+
+ for_each_perag(mp, agno, pag) {
+ error = xfs_refcount_recover_cow_leftovers(mp, pag);
+ if (error) {
+ xfs_perag_put(pag);
+ break;
+ }
+ }
+
+ return error;
+}
+
+/*
+ * Reflinking (Block) Ranges of Two Files Together
+ *
+ * First, ensure that the reflink flag is set on both inodes. The flag is an
+ * optimization to avoid unnecessary refcount btree lookups in the write path.
+ *
+ * Now we can iteratively remap the range of extents (and holes) in src to the
+ * corresponding ranges in dest. Let drange and srange denote the ranges of
+ * logical blocks in dest and src touched by the reflink operation.
+ *
+ * While the length of drange is greater than zero,
+ * - Read src's bmbt at the start of srange ("imap")
+ * - If imap doesn't exist, make imap appear to start at the end of srange
+ * with zero length.
+ * - If imap starts before srange, advance imap to start at srange.
+ * - If imap goes beyond srange, truncate imap to end at the end of srange.
+ * - Punch (imap start - srange start + imap len) blocks from dest at
+ * offset (drange start).
+ * - If imap points to a real range of pblks,
+ * > Increase the refcount of the imap's pblks
+ * > Map imap's pblks into dest at the offset
+ * (drange start + imap start - srange start)
+ * - Advance drange and srange by (imap start - srange start + imap len)
+ *
+ * Finally, if the reflink made dest longer, update both the in-core and
+ * on-disk file sizes.
+ *
+ * ASCII Art Demonstration:
+ *
+ * Let's say we want to reflink this source file:
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS (src file)
+ * <-------------------->
+ *
+ * into this destination file:
+ *
+ * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
+ * <-------------------->
+ * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
+ * Observe that the range has different logical offsets in either file.
+ *
+ * Consider that the first extent in the source file doesn't line up with our
+ * reflink range. Unmapping and remapping are separate operations, so we can
+ * unmap more blocks from the destination file than we remap.
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ * <------->
+ * --DDDDD---------DDDDD--DDD
+ * <------->
+ *
+ * Now remap the source extent into the destination file:
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ * <------->
+ * --DDDDD--SSSSSSSDDDDD--DDD
+ * <------->
+ *
+ * Do likewise with the second hole and extent in our range. Holes in the
+ * unmap range don't affect our operation.
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ * <---->
+ * --DDDDD--SSSSSSS-SSSSS-DDD
+ * <---->
+ *
+ * Finally, unmap and remap part of the third extent. This will increase the
+ * size of the destination file.
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ * <----->
+ * --DDDDD--SSSSSSS-SSSSS----SSS
+ * <----->
+ *
+ * Once we update the destination file's i_size, we're done.
+ */
+
+/*
+ * Ensure the reflink bit is set in both inodes.
+ */
+STATIC int
+xfs_reflink_set_inode_flag(
+ struct xfs_inode *src,
+ struct xfs_inode *dest)
+{
+ struct xfs_mount *mp = src->i_mount;
+ int error;
+ struct xfs_trans *tp;
+
+ if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
+ return 0;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
+ goto out_error;
+
+ /* Lock both files against IO */
+ if (src->i_ino == dest->i_ino)
+ xfs_ilock(src, XFS_ILOCK_EXCL);
+ else
+ xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL);
+
+ if (!xfs_is_reflink_inode(src)) {
+ trace_xfs_reflink_set_inode_flag(src);
+ xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
+ src->i_diflags2 |= XFS_DIFLAG2_REFLINK;
+ xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
+ xfs_ifork_init_cow(src);
+ } else
+ xfs_iunlock(src, XFS_ILOCK_EXCL);
+
+ if (src->i_ino == dest->i_ino)
+ goto commit_flags;
+
+ if (!xfs_is_reflink_inode(dest)) {
+ trace_xfs_reflink_set_inode_flag(dest);
+ xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
+ dest->i_diflags2 |= XFS_DIFLAG2_REFLINK;
+ xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
+ xfs_ifork_init_cow(dest);
+ } else
+ xfs_iunlock(dest, XFS_ILOCK_EXCL);
+
+commit_flags:
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_error;
+ return error;
+
+out_error:
+ trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Update destination inode size & cowextsize hint, if necessary.
+ */
+int
+xfs_reflink_update_dest(
+ struct xfs_inode *dest,
+ xfs_off_t newlen,
+ xfs_extlen_t cowextsize,
+ unsigned int remap_flags)
+{
+ struct xfs_mount *mp = dest->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
+ return 0;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
+ goto out_error;
+
+ xfs_ilock(dest, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
+
+ if (newlen > i_size_read(VFS_I(dest))) {
+ trace_xfs_reflink_update_inode_size(dest, newlen);
+ i_size_write(VFS_I(dest), newlen);
+ dest->i_disk_size = newlen;
+ }
+
+ if (cowextsize) {
+ dest->i_cowextsize = cowextsize;
+ dest->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ }
+
+ xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_error;
+ return error;
+
+out_error:
+ trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Do we have enough reserve in this AG to handle a reflink? The refcount
+ * btree already reserved all the space it needs, but the rmap btree can grow
+ * infinitely, so we won't allow more reflinks when the AG is down to the
+ * btree reserves.
+ */
+static int
+xfs_reflink_ag_has_free_space(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int error = 0;
+
+ if (!xfs_has_rmapbt(mp))
+ return 0;
+
+ pag = xfs_perag_get(mp, agno);
+ if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
+ xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
+ error = -ENOSPC;
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Remap the given extent into the file. The dmap blockcount will be set to
+ * the number of blocks that were actually remapped.
+ */
+STATIC int
+xfs_reflink_remap_extent(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *dmap,
+ xfs_off_t new_isize)
+{
+ struct xfs_bmbt_irec smap;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ xfs_off_t newlen;
+ int64_t qdelta = 0;
+ unsigned int resblks;
+ bool quota_reserved = true;
+ bool smap_real;
+ bool dmap_written = xfs_bmap_is_written_extent(dmap);
+ int iext_delta = 0;
+ int nimaps;
+ int error;
+
+ /*
+ * Start a rolling transaction to switch the mappings.
+ *
+ * Adding a written extent to the extent map can cause a bmbt split,
+ * and removing a mapped extent from the extent map can cause a bmbt split.
+ * The two operations cannot both cause a split since they operate on
+ * the same index in the bmap btree, so we only need a reservation for
+ * one bmbt split if either thing is happening. However, we haven't
+ * locked the inode yet, so we reserve assuming this is the case.
+ *
+ * The first allocation call tries to reserve enough space to handle
+ * mapping dmap into a sparse part of the file plus the bmbt split. We
+ * haven't locked the inode or read the existing mapping yet, so we do
+ * not know for sure that we need the space. This should succeed most
+ * of the time.
+ *
+ * If the first attempt fails, try again but reserving only enough
+ * space to handle a bmbt split. This is the hard minimum requirement,
+ * and we revisit quota reservations later when we know more about what
+ * we're remapping.
+ */
+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ resblks + dmap->br_blockcount, 0, false, &tp);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ quota_reserved = false;
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ resblks, 0, false, &tp);
+ }
+ if (error)
+ goto out;
+
+ /*
+ * Read what's currently mapped in the destination file into smap.
+ * If smap isn't a hole, we will have to remove it before we can add
+ * dmap to the destination file.
+ */
+ nimaps = 1;
+ error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
+ &smap, &nimaps, 0);
+ if (error)
+ goto out_cancel;
+ ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
+ smap_real = xfs_bmap_is_real_extent(&smap);
+
+ /*
+ * We can only remap as many blocks as the smaller of the two extent
+ * maps, because we can only remap one extent at a time.
+ */
+ dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
+ ASSERT(dmap->br_blockcount == smap.br_blockcount);
+
+ trace_xfs_reflink_remap_extent_dest(ip, &smap);
+
+ /*
+ * Two extents mapped to the same physical block must not have
+ * different states; that's filesystem corruption. Move on to the next
+ * extent if they're both holes or both the same physical extent.
+ */
+ if (dmap->br_startblock == smap.br_startblock) {
+ if (dmap->br_state != smap.br_state)
+ error = -EFSCORRUPTED;
+ goto out_cancel;
+ }
+
+ /* If both extents are unwritten, leave them alone. */
+ if (dmap->br_state == XFS_EXT_UNWRITTEN &&
+ smap.br_state == XFS_EXT_UNWRITTEN)
+ goto out_cancel;
+
+ /* No reflinking if the AG of the dest mapping is low on space. */
+ if (dmap_written) {
+ error = xfs_reflink_ag_has_free_space(mp,
+ XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
+ if (error)
+ goto out_cancel;
+ }
+
+ /*
+ * Increase quota reservation if we think the quota block counter for
+ * this file could increase.
+ *
+ * If we are mapping a written extent into the file, we need to have
+ * enough quota block count reservation to handle the blocks in that
+ * extent. We log only the delta to the quota block counts, so if the
+ * extent we're unmapping also has blocks allocated to it, we don't
+ * need a quota reservation for the extent itself.
+ *
+ * Note that if we're replacing a delalloc reservation with a written
+ * extent, we have to take the full quota reservation because removing
+ * the delalloc reservation gives the block count back to the quota
+ * count. This is suboptimal, but the VFS flushed the dest range
+ * before we started. That should have removed all the delalloc
+ * reservations, but we code defensively.
+ *
+ * xfs_trans_alloc_inode above already tried to grab an even larger
+ * quota reservation, and kicked off a blockgc scan if it couldn't.
+ * If we can't get a potentially smaller quota reservation now, we're
+ * done.
+ */
+ if (!quota_reserved && !smap_real && dmap_written) {
+ error = xfs_trans_reserve_quota_nblks(tp, ip,
+ dmap->br_blockcount, 0, false);
+ if (error)
+ goto out_cancel;
+ }
+
+ if (smap_real)
+ ++iext_delta;
+
+ if (dmap_written)
+ ++iext_delta;
+
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, iext_delta);
+ if (error)
+ goto out_cancel;
+
+ if (smap_real) {
+ /*
+ * If the extent we're unmapping is backed by storage (written
+ * or not), unmap the extent and drop its refcount.
+ */
+ xfs_bmap_unmap_extent(tp, ip, &smap);
+ xfs_refcount_decrease_extent(tp, &smap);
+ qdelta -= smap.br_blockcount;
+ } else if (smap.br_startblock == DELAYSTARTBLOCK) {
+ int done;
+
+ /*
+ * If the extent we're unmapping is a delalloc reservation,
+ * we can use the regular bunmapi function to release the
+ * incore state. Dropping the delalloc reservation takes care
+ * of the quota reservation for us.
+ */
+ error = xfs_bunmapi(NULL, ip, smap.br_startoff,
+ smap.br_blockcount, 0, 1, &done);
+ if (error)
+ goto out_cancel;
+ ASSERT(done);
+ }
+
+ /*
+ * If the extent we're sharing is backed by written storage, increase
+ * its refcount and map it into the file.
+ */
+ if (dmap_written) {
+ xfs_refcount_increase_extent(tp, dmap);
+ xfs_bmap_map_extent(tp, ip, dmap);
+ qdelta += dmap->br_blockcount;
+ }
+
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
+
+ /* Update dest isize if needed. */
+ newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
+ newlen = min_t(xfs_off_t, newlen, new_isize);
+ if (newlen > i_size_read(VFS_I(ip))) {
+ trace_xfs_reflink_update_inode_size(ip, newlen);
+ i_size_write(VFS_I(ip), newlen);
+ ip->i_disk_size = newlen;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ }
+
+ /* Commit everything and unlock. */
+ error = xfs_trans_commit(tp);
+ goto out_unlock;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+ if (error)
+ trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
+ return error;
+}
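The quota bookkeeping in xfs_reflink_remap_extent() nets out to a simple rule: unmapping a real extent credits its blocks back, mapping a written extent charges for them, so swapping one shared data extent for another is quota-neutral. A tiny illustrative model of that delta (user-space sketch with a hypothetical helper name, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* Models the qdelta accumulation above; 'blocks' is the remapped length. */
static int64_t qdelta(int smap_real, int dmap_written, uint64_t blocks)
{
	int64_t d = 0;

	if (smap_real)          /* unmapped a real extent: credit its blocks */
		d -= (int64_t)blocks;
	if (dmap_written)       /* mapped a written extent: charge its blocks */
		d += (int64_t)blocks;
	return d;
}

int main(void)
{
	printf("%lld\n", (long long)qdelta(1, 1, 8));   /* data over data:  0 */
	printf("%lld\n", (long long)qdelta(0, 1, 8));   /* fill a hole:    +8 */
	printf("%lld\n", (long long)qdelta(1, 0, 8));   /* punch to hole:  -8 */
	return 0;
}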
+
+/* Remap a range of one file to the other. */
+int
+xfs_reflink_remap_blocks(
+ struct xfs_inode *src,
+ loff_t pos_in,
+ struct xfs_inode *dest,
+ loff_t pos_out,
+ loff_t remap_len,
+ loff_t *remapped)
+{
+ struct xfs_bmbt_irec imap;
+ struct xfs_mount *mp = src->i_mount;
+ xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in);
+ xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out);
+ xfs_filblks_t len;
+ xfs_filblks_t remapped_len = 0;
+ xfs_off_t new_isize = pos_out + remap_len;
+ int nimaps;
+ int error = 0;
+
+ len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
+ XFS_MAX_FILEOFF);
+
+ trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
+
+ while (len > 0) {
+ unsigned int lock_mode;
+
+ /* Read extent from the source file */
+ nimaps = 1;
+ lock_mode = xfs_ilock_data_map_shared(src);
+ error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
+ xfs_iunlock(src, lock_mode);
+ if (error)
+ break;
+ /*
+ * The caller supposedly flushed all dirty pages in the source
+ * file range, which means that writeback should have allocated
+ * or deleted all delalloc reservations in that range. If we
+ * find one, that's a good sign that something is seriously
+ * wrong here.
+ */
+ ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
+ if (imap.br_startblock == DELAYSTARTBLOCK) {
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ trace_xfs_reflink_remap_extent_src(src, &imap);
+
+ /* Remap into the destination file at the given offset. */
+ imap.br_startoff = destoff;
+ error = xfs_reflink_remap_extent(dest, &imap, new_isize);
+ if (error)
+ break;
+
+ if (fatal_signal_pending(current)) {
+ error = -EINTR;
+ break;
+ }
+
+ /* Advance drange/srange */
+ srcoff += imap.br_blockcount;
+ destoff += imap.br_blockcount;
+ len -= imap.br_blockcount;
+ remapped_len += imap.br_blockcount;
+ }
+
+ if (error)
+ trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+ *remapped = min_t(loff_t, remap_len,
+ XFS_FSB_TO_B(src->i_mount, remapped_len));
+ return error;
+}
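Note the asymmetric rounding in xfs_reflink_remap_blocks(): the starting offsets are truncated to block boundaries (XFS_B_TO_FSBT) while the length is rounded up (XFS_B_TO_FSB), and the byte count reported back is clamped to remap_len. A small user-space sketch of that rounding, assuming 4096-byte filesystem blocks; the helpers below are stand-ins, not the XFS macros:

#include <stdio.h>
#include <stdint.h>

#define BLOCKLOG 12u    /* assumed: 4096-byte filesystem blocks */

static uint64_t b_to_fsbt(uint64_t b) { return b >> BLOCKLOG; }          /* truncate */
static uint64_t b_to_fsb(uint64_t b)                                     /* round up */
{
	return (b + (1ull << BLOCKLOG) - 1) >> BLOCKLOG;
}

int main(void)
{
	uint64_t pos_in = 8192, remap_len = 10000;

	/* srcoff = 2 blocks, len = 3 blocks for this byte range */
	printf("srcoff %llu, len %llu\n",
	       (unsigned long long)b_to_fsbt(pos_in),
	       (unsigned long long)b_to_fsb(remap_len));
	return 0;
}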
+
+/*
+ * If we're reflinking to a point past the destination file's EOF, we must
+ * zero any speculative post-EOF preallocations that sit between the old EOF
+ * and the destination file offset.
+ */
+static int
+xfs_reflink_zero_posteof(
+ struct xfs_inode *ip,
+ loff_t pos)
+{
+ loff_t isize = i_size_read(VFS_I(ip));
+
+ if (pos <= isize)
+ return 0;
+
+ trace_xfs_zero_eof(ip, isize, pos - isize);
+ return xfs_zero_range(ip, isize, pos - isize, NULL);
+}
+
+/*
+ * Prepare two files for range cloning. Upon a successful return both inodes
+ * will have the iolock and mmaplock held, the page cache of the out file will
+ * be truncated, and any leases on the out file will have been broken. This
+ * function borrows heavily from xfs_file_aio_write_checks.
+ *
+ * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
+ * checked that the bytes beyond EOF physically match. Hence we cannot use the
+ * EOF block in the source dedupe range because it's not a complete block match,
+ * and so can introduce corruption into the file that has its block replaced.
+ *
+ * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
+ * "block aligned" for the purposes of cloning entire files. However, if the
+ * source file range includes the EOF block and it lands within the existing EOF
+ * of the destination file, then we can expose stale data from beyond the source
+ * file EOF in the destination file.
+ *
+ * XFS doesn't support partial block sharing, so in both cases we have to check
+ * these cases ourselves. For dedupe, we can simply round the length to dedupe
+ * down to the previous whole block and ignore the partial EOF block. While this
+ * means we can't dedupe the last block of a file, this is an acceptable
+ * tradeoff for simplicity of implementation.
+ *
+ * For cloning, we want to share the partial EOF block if it is also the new EOF
+ * block of the destination file. If the partial EOF block lies inside the
+ * existing destination EOF, then we have to abort the clone to avoid exposing
+ * stale data in the destination file. Hence we reject these clone attempts with
+ * -EINVAL in this case.
+ */
+int
+xfs_reflink_remap_prep(
+ struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ loff_t *len,
+ unsigned int remap_flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct xfs_inode *src = XFS_I(inode_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct xfs_inode *dest = XFS_I(inode_out);
+ int ret;
+
+ /* Lock both files against IO */
+ ret = xfs_ilock2_io_mmap(src, dest);
+ if (ret)
+ return ret;
+
+ /* Check file eligibility and prepare for block sharing. */
+ ret = -EINVAL;
+ /* Don't reflink realtime inodes */
+ if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
+ goto out_unlock;
+
+ /* Don't share DAX file data with non-DAX file. */
+ if (IS_DAX(inode_in) != IS_DAX(inode_out))
+ goto out_unlock;
+
+ if (!IS_DAX(inode_in))
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags);
+ else
+ ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, &xfs_read_iomap_ops);
+ if (ret || *len == 0)
+ goto out_unlock;
+
+ /* Attach dquots to dest inode before changing block map */
+ ret = xfs_qm_dqattach(dest);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Zero existing post-eof speculative preallocations in the destination
+ * file.
+ */
+ ret = xfs_reflink_zero_posteof(dest, pos_out);
+ if (ret)
+ goto out_unlock;
+
+ /* Set flags and remap blocks. */
+ ret = xfs_reflink_set_inode_flag(src, dest);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * If pos_out > EOF, we may have dirtied blocks between EOF and
+ * pos_out. In that case, we need to extend the flush and unmap to cover
+ * from EOF to the end of the copy length.
+ */
+ if (pos_out > XFS_ISIZE(dest)) {
+ loff_t flen = *len + (pos_out - XFS_ISIZE(dest));
+ ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
+ } else {
+ ret = xfs_flush_unmap_range(dest, pos_out, *len);
+ }
+ if (ret)
+ goto out_unlock;
+
+ return 0;
+out_unlock:
+ xfs_iunlock2_io_mmap(src, dest);
+ return ret;
+}
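As a worked example of the EOF-block policy described in the comment above xfs_reflink_remap_prep(): with 4096-byte blocks, a 10000-byte dedupe request is quietly trimmed to 8192 bytes, while a clone whose partial tail block would land inside the destination's existing EOF must be rejected. The sketch below models only that policy decision in user space; the real checks live in generic_remap_file_range_prep()/dax_remap_file_range_prep(), and the block size here is an assumption.

#include <stdio.h>
#include <stdint.h>

#define BLKSZ 4096u     /* assumed filesystem block size */

/* Dedupe: drop any partial EOF block by rounding the length down. */
static uint64_t dedupe_trim(uint64_t len)
{
	return len & ~(uint64_t)(BLKSZ - 1);
}

/*
 * Clone: a partial tail block is only acceptable if it becomes the new EOF
 * of the destination; landing inside the existing EOF must fail (-EINVAL).
 */
static int clone_ok(uint64_t pos_out, uint64_t len, uint64_t dest_isize)
{
	if (len % BLKSZ == 0)
		return 1;
	return pos_out + len >= dest_isize;
}

int main(void)
{
	printf("dedupe 10000 -> %llu\n", (unsigned long long)dedupe_trim(10000));
	printf("clone past EOF: %d\n", clone_ok(0, 6000, 4096));    /* ok */
	printf("clone inside EOF: %d\n", clone_ok(0, 6000, 65536)); /* reject */
	return 0;
}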
+
+/* Does this inode need the reflink flag? */
+int
+xfs_reflink_inode_has_shared_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ bool *has_shared)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp;
+ struct xfs_iext_cursor icur;
+ bool found;
+ int error;
+
+ ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ *has_shared = false;
+ found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
+ while (found) {
+ struct xfs_perag *pag;
+ xfs_agblock_t agbno;
+ xfs_extlen_t aglen;
+ xfs_agblock_t rbno;
+ xfs_extlen_t rlen;
+
+ if (isnullstartblock(got.br_startblock) ||
+ got.br_state != XFS_EXT_NORM)
+ goto next;
+
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock));
+ agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
+ aglen = got.br_blockcount;
+ error = xfs_reflink_find_shared(pag, tp, agbno, aglen,
+ &rbno, &rlen, false);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ /* Is there still a shared block here? */
+ if (rbno != NULLAGBLOCK) {
+ *has_shared = true;
+ return 0;
+ }
+next:
+ found = xfs_iext_next_extent(ifp, &icur, &got);
+ }
+
+ return 0;
+}
+
+/*
+ * Clear the inode reflink flag if there are no shared extents.
+ *
+ * The caller is responsible for joining the inode to the transaction passed in.
+ * The inode will be joined to the transaction that is returned to the caller.
+ */
+int
+xfs_reflink_clear_inode_flag(
+ struct xfs_inode *ip,
+ struct xfs_trans **tpp)
+{
+ bool needs_flag;
+ int error = 0;
+
+ ASSERT(xfs_is_reflink_inode(ip));
+
+ error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
+ if (error || needs_flag)
+ return error;
+
+ /*
+ * We didn't find any shared blocks so turn off the reflink flag.
+ * First, get rid of any leftover CoW mappings.
+ */
+ error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
+ true);
+ if (error)
+ return error;
+
+ /* Clear the inode flag. */
+ trace_xfs_reflink_unset_inode_flag(ip);
+ ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ xfs_inode_clear_cowblocks_tag(ip);
+ xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
+
+ return error;
+}
+
+/*
+ * Clear the inode reflink flag if there are no shared extents and the size
+ * hasn't changed.
+ */
+STATIC int
+xfs_reflink_try_clear_inode_flag(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error = 0;
+
+ /* Start a rolling transaction to remove the mappings */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_reflink_clear_inode_flag(ip, &tp);
+ if (error)
+ goto cancel;
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out;
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+cancel:
+ xfs_trans_cancel(tp);
+out:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Pre-COW all shared blocks within a given byte range of a file and turn off
+ * the reflink flag if we unshare all of the file's blocks.
+ */
+int
+xfs_reflink_unshare(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ struct inode *inode = VFS_I(ip);
+ int error;
+
+ if (!xfs_is_reflink_inode(ip))
+ return 0;
+
+ trace_xfs_reflink_unshare(ip, offset, len);
+
+ inode_dio_wait(inode);
+
+ error = iomap_file_unshare(inode, offset, len,
+ &xfs_buffered_write_iomap_ops);
+ if (error)
+ goto out;
+
+ error = filemap_write_and_wait_range(inode->i_mapping, offset,
+ offset + len - 1);
+ if (error)
+ goto out;
+
+ /* Turn off the reflink flag if possible. */
+ error = xfs_reflink_try_clear_inode_flag(ip);
+ if (error)
+ goto out;
+ return 0;
+
+out:
+ trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
+ return error;
+}
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
new file mode 100644
index 000000000..65c5dfe17
--- /dev/null
+++ b/fs/xfs/xfs_reflink.h
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_REFLINK_H
+#define __XFS_REFLINK_H 1
+
+static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
+{
+ return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
+}
+
+static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
+{
+ return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
+}
+
+extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
+ struct xfs_bmbt_irec *irec, bool *shared);
+int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+ bool *shared);
+
+int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap, bool *shared, uint *lockmode,
+ bool convert_now);
+extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t count);
+
+extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
+ struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb, bool cancel_real);
+extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t count, bool cancel_real);
+extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t count);
+extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
+extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, loff_t len,
+ unsigned int remap_flags);
+extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp,
+ struct xfs_inode *ip, bool *has_shared);
+extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
+ struct xfs_trans **tpp);
+extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len);
+extern int xfs_reflink_remap_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, loff_t *len,
+ unsigned int remap_flags);
+extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
+ struct xfs_inode *dest, loff_t pos_out, loff_t remap_len,
+ loff_t *remapped);
+extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
+ xfs_extlen_t cowextsize, unsigned int remap_flags);
+
+#endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
new file mode 100644
index 000000000..534504ede
--- /dev/null
+++ b/fs/xfs/xfs_rmap_item.c
@@ -0,0 +1,727 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_shared.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_rmap_item.h"
+#include "xfs_log.h"
+#include "xfs_rmap.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+
+struct kmem_cache *xfs_rui_cache;
+struct kmem_cache *xfs_rud_cache;
+
+static const struct xfs_item_ops xfs_rui_item_ops;
+
+static inline struct xfs_rui_log_item *RUI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_rui_log_item, rui_item);
+}
+
+STATIC void
+xfs_rui_item_free(
+ struct xfs_rui_log_item *ruip)
+{
+ kmem_free(ruip->rui_item.li_lv_shadow);
+ if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
+ kmem_free(ruip);
+ else
+ kmem_cache_free(xfs_rui_cache, ruip);
+}
+
+/*
+ * Freeing the RUI requires that we remove it from the AIL if it has already
+ * been placed there. However, the RUI may not yet have been placed in the AIL
+ * when called by xfs_rui_release() from RUD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the RUI.
+ */
+STATIC void
+xfs_rui_release(
+ struct xfs_rui_log_item *ruip)
+{
+ ASSERT(atomic_read(&ruip->rui_refcount) > 0);
+ if (!atomic_dec_and_test(&ruip->rui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&ruip->rui_item, 0);
+ xfs_rui_item_free(ruip);
+}
+
+STATIC void
+xfs_rui_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
+
+ *nvecs += 1;
+ *nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given rui log item. We use only 1 iovec, and we point that
+ * at the rui_log_format structure embedded in the rui item.
+ * It is at this point that we assert that all of the extent
+ * slots in the rui item have been filled.
+ */
+STATIC void
+xfs_rui_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ ASSERT(atomic_read(&ruip->rui_next_extent) ==
+ ruip->rui_format.rui_nextents);
+
+ ruip->rui_format.rui_type = XFS_LI_RUI;
+ ruip->rui_format.rui_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format,
+ xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents));
+}
+
+/*
+ * The unpin operation is the last place an RUI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the RUI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the RUI to either construct
+ * and commit the RUD or drop the RUD's reference in the event of error. Simply
+ * drop the log's RUI reference now that the log is done with it.
+ */
+STATIC void
+xfs_rui_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
+
+ xfs_rui_release(ruip);
+}
+
+/*
+ * The RUI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, an RUD isn't going to be
+ * constructed and thus we free the RUI here directly.
+ */
+STATIC void
+xfs_rui_item_release(
+ struct xfs_log_item *lip)
+{
+ xfs_rui_release(RUI_ITEM(lip));
+}
+
+/*
+ * Allocate and initialize an rui item with the given number of extents.
+ */
+STATIC struct xfs_rui_log_item *
+xfs_rui_init(
+ struct xfs_mount *mp,
+ uint nextents)
+
+{
+ struct xfs_rui_log_item *ruip;
+
+ ASSERT(nextents > 0);
+ if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
+ ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
+ else
+ ruip = kmem_cache_zalloc(xfs_rui_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+
+ xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
+ ruip->rui_format.rui_nextents = nextents;
+ ruip->rui_format.rui_id = (uintptr_t)(void *)ruip;
+ atomic_set(&ruip->rui_next_extent, 0);
+ atomic_set(&ruip->rui_refcount, 2);
+
+ return ruip;
+}
+
+static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_rud_log_item, rud_item);
+}
+
+STATIC void
+xfs_rud_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_rud_log_format);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given rud log item. We use only 1 iovec, and we point that
+ * at the rud_log_format structure embedded in the rud item.
+ */
+STATIC void
+xfs_rud_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ rudp->rud_format.rud_type = XFS_LI_RUD;
+ rudp->rud_format.rud_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUD_FORMAT, &rudp->rud_format,
+ sizeof(struct xfs_rud_log_format));
+}
+
+/*
+ * The RUD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the RUI and free the
+ * RUD.
+ */
+STATIC void
+xfs_rud_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
+
+ xfs_rui_release(rudp->rud_ruip);
+ kmem_free(rudp->rud_item.li_lv_shadow);
+ kmem_cache_free(xfs_rud_cache, rudp);
+}
+
+static struct xfs_log_item *
+xfs_rud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &RUD_ITEM(lip)->rud_ruip->rui_item;
+}
+
+static const struct xfs_item_ops xfs_rud_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
+ .iop_size = xfs_rud_item_size,
+ .iop_format = xfs_rud_item_format,
+ .iop_release = xfs_rud_item_release,
+ .iop_intent = xfs_rud_item_intent,
+};
+
+static struct xfs_rud_log_item *
+xfs_trans_get_rud(
+ struct xfs_trans *tp,
+ struct xfs_rui_log_item *ruip)
+{
+ struct xfs_rud_log_item *rudp;
+
+ rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
+ &xfs_rud_item_ops);
+ rudp->rud_ruip = ruip;
+ rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
+
+ xfs_trans_add_item(tp, &rudp->rud_item);
+ return rudp;
+}
+
+/* Set the map extent flags for this reverse mapping. */
+static void
+xfs_trans_set_rmap_flags(
+ struct xfs_map_extent *rmap,
+ enum xfs_rmap_intent_type type,
+ int whichfork,
+ xfs_exntst_t state)
+{
+ rmap->me_flags = 0;
+ if (state == XFS_EXT_UNWRITTEN)
+ rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN;
+ if (whichfork == XFS_ATTR_FORK)
+ rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK;
+ switch (type) {
+ case XFS_RMAP_MAP:
+ rmap->me_flags |= XFS_RMAP_EXTENT_MAP;
+ break;
+ case XFS_RMAP_MAP_SHARED:
+ rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
+ break;
+ case XFS_RMAP_UNMAP:
+ rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP;
+ break;
+ case XFS_RMAP_UNMAP_SHARED:
+ rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
+ break;
+ case XFS_RMAP_CONVERT:
+ rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT;
+ break;
+ case XFS_RMAP_CONVERT_SHARED:
+ rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
+ break;
+ case XFS_RMAP_ALLOC:
+ rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC;
+ break;
+ case XFS_RMAP_FREE:
+ rmap->me_flags |= XFS_RMAP_EXTENT_FREE;
+ break;
+ default:
+ ASSERT(0);
+ }
+}
+
+/*
+ * Finish an rmap update and log it to the RUD. Note that the transaction is
+ * marked dirty regardless of whether the rmap update succeeds or fails to
+ * support the RUI/RUD lifecycle rules.
+ */
+static int
+xfs_trans_log_finish_rmap_update(
+ struct xfs_trans *tp,
+ struct xfs_rud_log_item *rudp,
+ enum xfs_rmap_intent_type type,
+ uint64_t owner,
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ xfs_exntst_t state,
+ struct xfs_btree_cur **pcur)
+{
+ int error;
+
+ error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff,
+ startblock, blockcount, state, pcur);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the RUI and frees the RUD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
+ set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
+
+ return error;
+}
+
+/* Sort rmap intents by AG. */
+static int
+xfs_rmap_update_diff_items(
+ void *priv,
+ const struct list_head *a,
+ const struct list_head *b)
+{
+ struct xfs_mount *mp = priv;
+ struct xfs_rmap_intent *ra;
+ struct xfs_rmap_intent *rb;
+
+ ra = container_of(a, struct xfs_rmap_intent, ri_list);
+ rb = container_of(b, struct xfs_rmap_intent, ri_list);
+ return XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) -
+ XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
+}
+
+/* Log rmap updates in the intent item. */
+STATIC void
+xfs_rmap_update_log_item(
+ struct xfs_trans *tp,
+ struct xfs_rui_log_item *ruip,
+ struct xfs_rmap_intent *rmap)
+{
+ uint next_extent;
+ struct xfs_map_extent *map;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
+
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1;
+ ASSERT(next_extent < ruip->rui_format.rui_nextents);
+ map = &ruip->rui_format.rui_extents[next_extent];
+ map->me_owner = rmap->ri_owner;
+ map->me_startblock = rmap->ri_bmap.br_startblock;
+ map->me_startoff = rmap->ri_bmap.br_startoff;
+ map->me_len = rmap->ri_bmap.br_blockcount;
+ xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork,
+ rmap->ri_bmap.br_state);
+}
+
+static struct xfs_log_item *
+xfs_rmap_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count);
+ struct xfs_rmap_intent *rmap;
+
+ ASSERT(count > 0);
+
+ xfs_trans_add_item(tp, &ruip->rui_item);
+ if (sort)
+ list_sort(mp, items, xfs_rmap_update_diff_items);
+ list_for_each_entry(rmap, items, ri_list)
+ xfs_rmap_update_log_item(tp, ruip, rmap);
+ return &ruip->rui_item;
+}
+
+/* Get an RUD so we can process all the deferred rmap updates. */
+static struct xfs_log_item *
+xfs_rmap_update_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item;
+}
+
+/* Process a deferred rmap update. */
+STATIC int
+xfs_rmap_update_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_rmap_intent *rmap;
+ int error;
+
+ rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+ error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done),
+ rmap->ri_type, rmap->ri_owner, rmap->ri_whichfork,
+ rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock,
+ rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state,
+ state);
+ kmem_cache_free(xfs_rmap_intent_cache, rmap);
+ return error;
+}
+
+/* Abort all pending RUIs. */
+STATIC void
+xfs_rmap_update_abort_intent(
+ struct xfs_log_item *intent)
+{
+ xfs_rui_release(RUI_ITEM(intent));
+}
+
+/* Cancel a deferred rmap update. */
+STATIC void
+xfs_rmap_update_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_rmap_intent *rmap;
+
+ rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+ kmem_cache_free(xfs_rmap_intent_cache, rmap);
+}
+
+const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
+ .max_items = XFS_RUI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_rmap_update_create_intent,
+ .abort_intent = xfs_rmap_update_abort_intent,
+ .create_done = xfs_rmap_update_create_done,
+ .finish_item = xfs_rmap_update_finish_item,
+ .finish_cleanup = xfs_rmap_finish_one_cleanup,
+ .cancel_item = xfs_rmap_update_cancel_item,
+};
+
+/* Is this recovered RUI ok? */
+static inline bool
+xfs_rui_validate_map(
+ struct xfs_mount *mp,
+ struct xfs_map_extent *rmap)
+{
+ if (!xfs_has_rmapbt(mp))
+ return false;
+
+ if (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)
+ return false;
+
+ switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
+ case XFS_RMAP_EXTENT_MAP:
+ case XFS_RMAP_EXTENT_MAP_SHARED:
+ case XFS_RMAP_EXTENT_UNMAP:
+ case XFS_RMAP_EXTENT_UNMAP_SHARED:
+ case XFS_RMAP_EXTENT_CONVERT:
+ case XFS_RMAP_EXTENT_CONVERT_SHARED:
+ case XFS_RMAP_EXTENT_ALLOC:
+ case XFS_RMAP_EXTENT_FREE:
+ break;
+ default:
+ return false;
+ }
+
+ if (!XFS_RMAP_NON_INODE_OWNER(rmap->me_owner) &&
+ !xfs_verify_ino(mp, rmap->me_owner))
+ return false;
+
+ if (!xfs_verify_fileext(mp, rmap->me_startoff, rmap->me_len))
+ return false;
+
+ return xfs_verify_fsbext(mp, rmap->me_startblock, rmap->me_len);
+}
+
+/*
+ * Process an rmap update intent item that was recovered from the log.
+ * We need to update the rmapbt.
+ */
+STATIC int
+xfs_rui_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
+{
+ struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
+ struct xfs_map_extent *rmap;
+ struct xfs_rud_log_item *rudp;
+ struct xfs_trans *tp;
+ struct xfs_btree_cur *rcur = NULL;
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ enum xfs_rmap_intent_type type;
+ xfs_exntst_t state;
+ int i;
+ int whichfork;
+ int error = 0;
+
+ /*
+ * First check the validity of the extents described by the
+ * RUI. If any are bad, then assume that all are bad and
+ * just toss the RUI.
+ */
+ for (i = 0; i < ruip->rui_format.rui_nextents; i++) {
+ if (!xfs_rui_validate_map(mp,
+ &ruip->rui_format.rui_extents[i])) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &ruip->rui_format,
+ sizeof(ruip->rui_format));
+ return -EFSCORRUPTED;
+ }
+ }
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ mp->m_rmap_maxlevels, 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+ rudp = xfs_trans_get_rud(tp, ruip);
+
+ for (i = 0; i < ruip->rui_format.rui_nextents; i++) {
+ rmap = &ruip->rui_format.rui_extents[i];
+ state = (rmap->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+ whichfork = (rmap->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
+ case XFS_RMAP_EXTENT_MAP:
+ type = XFS_RMAP_MAP;
+ break;
+ case XFS_RMAP_EXTENT_MAP_SHARED:
+ type = XFS_RMAP_MAP_SHARED;
+ break;
+ case XFS_RMAP_EXTENT_UNMAP:
+ type = XFS_RMAP_UNMAP;
+ break;
+ case XFS_RMAP_EXTENT_UNMAP_SHARED:
+ type = XFS_RMAP_UNMAP_SHARED;
+ break;
+ case XFS_RMAP_EXTENT_CONVERT:
+ type = XFS_RMAP_CONVERT;
+ break;
+ case XFS_RMAP_EXTENT_CONVERT_SHARED:
+ type = XFS_RMAP_CONVERT_SHARED;
+ break;
+ case XFS_RMAP_EXTENT_ALLOC:
+ type = XFS_RMAP_ALLOC;
+ break;
+ case XFS_RMAP_EXTENT_FREE:
+ type = XFS_RMAP_FREE;
+ break;
+ default:
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &ruip->rui_format,
+ sizeof(ruip->rui_format));
+ error = -EFSCORRUPTED;
+ goto abort_error;
+ }
+ error = xfs_trans_log_finish_rmap_update(tp, rudp, type,
+ rmap->me_owner, whichfork,
+ rmap->me_startoff, rmap->me_startblock,
+ rmap->me_len, state, &rcur);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ rmap, sizeof(*rmap));
+ if (error)
+ goto abort_error;
+
+ }
+
+ xfs_rmap_finish_one_cleanup(tp, rcur, error);
+ return xfs_defer_ops_capture_and_commit(tp, capture_list);
+
+abort_error:
+ xfs_rmap_finish_one_cleanup(tp, rcur, error);
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+STATIC bool
+xfs_rui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return RUI_ITEM(lip)->rui_format.rui_id == intent_id;
+}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_rui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_rud_log_item *rudp;
+ struct xfs_rui_log_item *ruip;
+ struct xfs_map_extent *extp;
+ unsigned int count;
+
+ count = RUI_ITEM(intent)->rui_format.rui_nextents;
+ extp = RUI_ITEM(intent)->rui_format.rui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
+
+ ruip = xfs_rui_init(tp->t_mountp, count);
+ memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp));
+ atomic_set(&ruip->rui_next_extent, count);
+ xfs_trans_add_item(tp, &ruip->rui_item);
+ set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
+ return &ruip->rui_item;
+}
+
+static const struct xfs_item_ops xfs_rui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_rui_item_size,
+ .iop_format = xfs_rui_item_format,
+ .iop_unpin = xfs_rui_item_unpin,
+ .iop_release = xfs_rui_item_release,
+ .iop_recover = xfs_rui_item_recover,
+ .iop_match = xfs_rui_item_match,
+ .iop_relog = xfs_rui_item_relog,
+};
+
+static inline void
+xfs_rui_copy_format(
+ struct xfs_rui_log_format *dst,
+ const struct xfs_rui_log_format *src)
+{
+ unsigned int i;
+
+ memcpy(dst, src, offsetof(struct xfs_rui_log_format, rui_extents));
+
+ for (i = 0; i < src->rui_nextents; i++)
+ memcpy(&dst->rui_extents[i], &src->rui_extents[i],
+ sizeof(struct xfs_map_extent));
+}
+
+/*
+ * This routine is called to create an in-core extent rmap update
+ * item from the rui format structure which was logged on disk.
+ * It allocates an in-core rui, copies the extents from the format
+ * structure into it, and adds the rui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_rui_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_rui_log_item *ruip;
+ struct xfs_rui_log_format *rui_formatp;
+ size_t len;
+
+ rui_formatp = item->ri_buf[0].i_addr;
+
+ if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
+ xfs_rui_copy_format(&ruip->rui_format, rui_formatp);
+ atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
+ /*
+ * Insert the intent into the AIL directly and drop one reference so
+ * that finishing or canceling the work will drop the other.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &ruip->rui_item, lsn);
+ xfs_rui_release(ruip);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_rui_item_ops = {
+ .item_type = XFS_LI_RUI,
+ .commit_pass2 = xlog_recover_rui_commit_pass2,
+};
+
+/*
+ * This routine is called when an RUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding RUI if it
+ * was still in the log. To do this it searches the AIL for the RUI with an id
+ * equal to that in the RUD format structure. If we find it we drop the RUD
+ * reference, which removes the RUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_rud_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_rud_log_format *rud_formatp;
+
+ rud_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ rud_formatp, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_RUI, rud_formatp->rud_rui_id);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_rud_item_ops = {
+ .item_type = XFS_LI_RUD,
+ .commit_pass2 = xlog_recover_rud_commit_pass2,
+};
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
new file mode 100644
index 000000000..802e5119e
--- /dev/null
+++ b/fs/xfs/xfs_rmap_item.h
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_RMAP_ITEM_H__
+#define __XFS_RMAP_ITEM_H__
+
+/*
+ * There are (currently) three pairs of rmap btree redo item types: map, unmap,
+ * and convert. The common abbreviations for these are RUI (rmap update
+ * intent) and RUD (rmap update done). The redo item type is encoded in the
+ * flags field of each xfs_map_extent.
+ *
+ * *I items should be recorded in the *first* of a series of rolled
+ * transactions, and the *D items should be recorded in the same transaction
+ * that records the associated rmapbt updates. Typically, the first
+ * transaction will record a bmbt update, followed by some number of
+ * transactions containing rmapbt updates, and finally transactions with any
+ * bnobt/cntbt updates.
+ *
+ * Should the system crash after the commit of the first transaction but
+ * before the commit of the final transaction in a series, log recovery will
+ * use the redo information recorded by the intent items to replay the
+ * (rmapbt/bnobt/cntbt) metadata updates in the non-first transaction.
+ */
+
+/* kernel only RUI/RUD definitions */
+
+struct xfs_mount;
+struct kmem_cache;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define XFS_RUI_MAX_FAST_EXTENTS 16
+
+/*
+ * This is the "rmap update intent" log item. It is used to log the fact that
+ * some reverse mappings need to change. It is used in conjunction with the
+ * "rmap update done" log item described below.
+ *
+ * These log items follow the same rules as struct xfs_efi_log_item; see the
+ * comments about that structure (in xfs_extfree_item.h) for more details.
+ */
+struct xfs_rui_log_item {
+ struct xfs_log_item rui_item;
+ atomic_t rui_refcount;
+ atomic_t rui_next_extent;
+ struct xfs_rui_log_format rui_format;
+};
+
+static inline size_t
+xfs_rui_log_item_sizeof(
+ unsigned int nr)
+{
+ return offsetof(struct xfs_rui_log_item, rui_format) +
+ xfs_rui_log_format_sizeof(nr);
+}
+
+/*
+ * This is the "rmap update done" log item. It is used to log the fact that
+ * some rmapbt updates mentioned in an earlier rui item have been performed.
+ */
+struct xfs_rud_log_item {
+ struct xfs_log_item rud_item;
+ struct xfs_rui_log_item *rud_ruip;
+ struct xfs_rud_log_format rud_format;
+};
+
+extern struct kmem_cache *xfs_rui_cache;
+extern struct kmem_cache *xfs_rud_cache;
+
+#endif /* __XFS_RMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
new file mode 100644
index 000000000..292d5e54a
--- /dev/null
+++ b/fs/xfs/xfs_rtalloc.c
@@ -0,0 +1,1409 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_icache.h"
+#include "xfs_rtalloc.h"
+#include "xfs_sb.h"
+
+/*
+ * Read and return the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ */
+static int
+xfs_rtget_summary(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ int log, /* log2 of extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
+{
+ return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum);
+}
+
+/*
+ * Return whether there are any free extents in the size range given
+ * by low and high, for the bitmap block bbno.
+ */
+STATIC int /* error */
+xfs_rtany_summary(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ int low, /* low log2 extent size */
+ int high, /* high log2 extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ int *stat) /* out: any good extents here? */
+{
+ int error; /* error value */
+ int log; /* loop counter, log2 of ext. size */
+ xfs_suminfo_t sum; /* summary data */
+
+ /* There are no extents at levels < m_rsum_cache[bbno]. */
+ if (mp->m_rsum_cache && low < mp->m_rsum_cache[bbno])
+ low = mp->m_rsum_cache[bbno];
+
+ /*
+ * Loop over logs of extent sizes.
+ */
+ for (log = low; log <= high; log++) {
+ /*
+ * Get one summary datum.
+ */
+ error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
+ if (error) {
+ return error;
+ }
+ /*
+ * If there are any, return success.
+ */
+ if (sum) {
+ *stat = 1;
+ goto out;
+ }
+ }
+ /*
+ * Found nothing, return failure.
+ */
+ *stat = 0;
+out:
+ /* There were no extents at levels < log. */
+ if (mp->m_rsum_cache && log > mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log;
+ return 0;
+}
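The m_rsum_cache handling in xfs_rtany_summary() is easier to see in isolation: the cache holds, per bitmap block, a lower bound on the summary levels worth scanning, and each scan raises that bound to the first level where free space was found (or past 'high' when nothing was). A hypothetical stand-alone model, not the kernel code:

#include <stdio.h>

#define LEVELS 10

static int summary[LEVELS] = { 0, 0, 0, 0, 0, 0, 1, 0, 2, 0 };
static int rsum_cache = 3;      /* levels below 3 already proven empty */

static int any_summary(int low, int high)
{
	int log;

	if (low < rsum_cache)
		low = rsum_cache;       /* skip levels proven empty earlier */
	for (log = low; log <= high; log++)
		if (summary[log])
			break;
	if (log > rsum_cache)
		rsum_cache = log;       /* remember: nothing below 'log' */
	return log <= high;
}

int main(void)
{
	int any = any_summary(0, 9);

	/* finds level 6 and raises the cached lower bound from 3 to 6 */
	printf("any=%d cache=%d\n", any, rsum_cache);
	return 0;
}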
+
+
+/*
+ * Copy and transform the summary file, given the old and new
+ * parameters in the mount structures.
+ */
+STATIC int /* error */
+xfs_rtcopy_summary(
+ xfs_mount_t *omp, /* old file system mount point */
+ xfs_mount_t *nmp, /* new file system mount point */
+ xfs_trans_t *tp) /* transaction pointer */
+{
+ xfs_rtblock_t bbno; /* bitmap block number */
+ struct xfs_buf *bp; /* summary buffer */
+ int error; /* error return value */
+ int log; /* summary level number (log length) */
+ xfs_suminfo_t sum; /* summary data */
+ xfs_fsblock_t sumbno; /* summary block number */
+
+ bp = NULL;
+ for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
+ for (bbno = omp->m_sb.sb_rbmblocks - 1;
+ (xfs_srtblock_t)bbno >= 0;
+ bbno--) {
+ error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
+ &sumbno, &sum);
+ if (error)
+ return error;
+ if (sum == 0)
+ continue;
+ error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
+ &bp, &sumbno);
+ if (error)
+ return error;
+ error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
+ &bp, &sumbno);
+ if (error)
+ return error;
+ ASSERT(sum > 0);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Mark an extent specified by start and len allocated.
+ * Updates all the summary information as well as the bitmap.
+ */
+STATIC int /* error */
+xfs_rtallocate_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* start block to allocate */
+ xfs_extlen_t len, /* length to allocate */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb) /* in/out: summary block number */
+{
+ xfs_rtblock_t end; /* end of the allocated extent */
+ int error; /* error value */
+ xfs_rtblock_t postblock = 0; /* first block allocated > end */
+ xfs_rtblock_t preblock = 0; /* first block allocated < start */
+
+ end = start + len - 1;
+ /*
+ * Assume we're allocating out of the middle of a free extent.
+ * We need to find the beginning and end of the extent so we can
+ * properly update the summary.
+ */
+ error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ if (error) {
+ return error;
+ }
+ /*
+ * Find the next allocated block (end of free extent).
+ */
+ error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
+ if (error) {
+ return error;
+ }
+ /*
+ * Decrement the summary information corresponding to the entire
+ * (old) free extent.
+ */
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ /*
+ * If there are blocks not being allocated at the front of the
+ * old extent, add summary data for them to be free.
+ */
+ if (preblock < start) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(start - preblock),
+ XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * If there are blocks not being allocated at the end of the
+ * old extent, add summary data for them to be free.
+ */
+ if (postblock > end) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock - end),
+ XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * Modify the bitmap to mark this extent allocated.
+ */
+ error = xfs_rtmodify_range(mp, tp, start, len, 0);
+ return error;
+}
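To put numbers on the summary bookkeeping above: allocating rtextents [120,135] out of a free run spanning [100,163] removes one free run of 64 rtextents and leaves a 20-rtextent piece in front and a 28-rtextent piece behind, so the level-6 summary count drops by one and the level-4 count rises by two. A small sketch of that arithmetic; log2_len() is a stand-in for XFS_RTBLOCKLOG() and relies on GCC/Clang for __builtin_clzll():

#include <stdio.h>
#include <stdint.h>

static int log2_len(uint64_t len)       /* floor(log2), like XFS_RTBLOCKLOG() */
{
	return 63 - __builtin_clzll(len);
}

int main(void)
{
	uint64_t preblock = 100, postblock = 163;       /* free run [100,163]   */
	uint64_t start = 120, end = 135;                /* allocating [120,135] */

	/* the whole old free run disappears from its summary level... */
	printf("summary[%d] -= 1\n", log2_len(postblock + 1 - preblock));   /* 6 */
	/* ...replaced by the leftover pieces in front of and behind it */
	if (preblock < start)
		printf("summary[%d] += 1\n", log2_len(start - preblock));   /* 4 */
	if (postblock > end)
		printf("summary[%d] += 1\n", log2_len(postblock - end));    /* 4 */
	return 0;
}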
+
+/*
+ * Attempt to allocate an extent minlen<=len<=maxlen starting from
+ * bitmap block bbno. If we don't get maxlen then use prod to trim
+ * the length, if given. Returns error; returns starting block in *rtblock.
+ * The lengths are all in rtextents.
+ */
+STATIC int /* error */
+xfs_rtallocate_extent_block(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ xfs_extlen_t minlen, /* minimum length to allocate */
+ xfs_extlen_t maxlen, /* maximum length to allocate */
+ xfs_extlen_t *len, /* out: actual length allocated */
+ xfs_rtblock_t *nextp, /* out: next block to try */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ xfs_extlen_t prod, /* extent product factor */
+ xfs_rtblock_t *rtblock) /* out: start block allocated */
+{
+ xfs_rtblock_t besti; /* best rtblock found so far */
+ xfs_rtblock_t bestlen; /* best length found so far */
+ xfs_rtblock_t end; /* last rtblock in chunk */
+ int error; /* error value */
+ xfs_rtblock_t i; /* current rtblock trying */
+ xfs_rtblock_t next; /* next rtblock to try */
+ int stat; /* status from internal calls */
+
+ /*
+ * Loop over all the extents starting in this bitmap block,
+ * looking for one that's long enough.
+ */
+ for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0,
+ end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
+ i <= end;
+ i++) {
+ /* Make sure we don't scan off the end of the rt volume. */
+ maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i;
+
+ /*
+ * See if there's a free extent of maxlen starting at i.
+ * If it's not so then next will contain the first non-free.
+ */
+ error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat);
+ if (error) {
+ return error;
+ }
+ if (stat) {
+ /*
+ * i for maxlen is all free, allocate and return that.
+ */
+ error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp,
+ rsb);
+ if (error) {
+ return error;
+ }
+ *len = maxlen;
+ *rtblock = i;
+ return 0;
+ }
+ /*
+ * In the case where we have a variable-sized allocation
+ * request, figure out how big this free piece is,
+ * and if it's big enough for the minimum, and the best
+ * so far, remember it.
+ */
+ if (minlen < maxlen) {
+ xfs_rtblock_t thislen; /* this extent size */
+
+ thislen = next - i;
+ if (thislen >= minlen && thislen > bestlen) {
+ besti = i;
+ bestlen = thislen;
+ }
+ }
+ /*
+ * If not done yet, find the start of the next free space.
+ */
+ if (next < end) {
+ error = xfs_rtfind_forw(mp, tp, next, end, &i);
+ if (error) {
+ return error;
+ }
+ } else
+ break;
+ }
+ /*
+ * Searched the whole thing & didn't find a maxlen free extent.
+ */
+ if (minlen < maxlen && besti != -1) {
+ xfs_extlen_t p; /* amount to trim length by */
+
+ /*
+ * If size should be a multiple of prod, make that so.
+ */
+ if (prod > 1) {
+ div_u64_rem(bestlen, prod, &p);
+ if (p)
+ bestlen -= p;
+ }
+
+ /*
+ * Allocate besti for bestlen & return that.
+ */
+ error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ *len = bestlen;
+ *rtblock = besti;
+ return 0;
+ }
+ /*
+ * Allocation failed. Set *nextp to the next block to try.
+ */
+ *nextp = next;
+ *rtblock = NULLRTBLOCK;
+ return 0;
+}
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, starting at block
+ * bno. If we don't get maxlen then use prod to trim the length, if given.
+ * Returns error; returns starting block in *rtblock.
+ * The lengths are all in rtextents.
+ */
+STATIC int /* error */
+xfs_rtallocate_extent_exact(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to allocate */
+ xfs_extlen_t minlen, /* minimum length to allocate */
+ xfs_extlen_t maxlen, /* maximum length to allocate */
+ xfs_extlen_t *len, /* out: actual length allocated */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ xfs_extlen_t prod, /* extent product factor */
+ xfs_rtblock_t *rtblock) /* out: start block allocated */
+{
+ int error; /* error value */
+ xfs_extlen_t i; /* extent length trimmed due to prod */
+ int isfree; /* extent is free */
+ xfs_rtblock_t next; /* next block to try (dummy) */
+
+ ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ /*
+ * Check if the range in question (for maxlen) is free.
+ */
+ error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree);
+ if (error) {
+ return error;
+ }
+ if (isfree) {
+ /*
+ * If it is, allocate it and return success.
+ */
+ error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ *len = maxlen;
+ *rtblock = bno;
+ return 0;
+ }
+ /*
+ * If not, allocate what there is, if it's at least minlen.
+ */
+ maxlen = next - bno;
+ if (maxlen < minlen) {
+ /*
+ * Failed, return failure status.
+ */
+ *rtblock = NULLRTBLOCK;
+ return 0;
+ }
+ /*
+ * Trim off tail of extent, if prod is specified.
+ */
+ if (prod > 1 && (i = maxlen % prod)) {
+ maxlen -= i;
+ if (maxlen < minlen) {
+ /*
+ * Now we can't do it, return failure status.
+ */
+ *rtblock = NULLRTBLOCK;
+ return 0;
+ }
+ }
+ /*
+ * Allocate what we can and return it.
+ */
+ error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ *len = maxlen;
+ *rtblock = bno;
+ return 0;
+}
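The prod trimming in xfs_rtallocate_extent_exact() (and in the block allocator above it) just drops the remainder so the allocated length stays a multiple of the caller's product factor, and gives up if that shortens it below minlen. A trivial model with made-up values:

#include <stdio.h>

int main(void)
{
	unsigned int minlen = 8, maxlen = 13, prod = 4;
	unsigned int rem = maxlen % prod;

	if (prod > 1 && rem) {
		maxlen -= rem;                  /* 13 -> 12, a multiple of 4 */
		if (maxlen < minlen) {
			printf("fail: trimmed below minlen\n");
			return 0;
		}
	}
	printf("allocate %u rtextents\n", maxlen);      /* 12 */
	return 0;
}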
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, starting as near
+ * to bno as possible. If we don't get maxlen then use prod to trim
+ * the length, if given. The lengths are all in rtextents.
+ */
+STATIC int /* error */
+xfs_rtallocate_extent_near(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to allocate */
+ xfs_extlen_t minlen, /* minimum length to allocate */
+ xfs_extlen_t maxlen, /* maximum length to allocate */
+ xfs_extlen_t *len, /* out: actual length allocated */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ xfs_extlen_t prod, /* extent product factor */
+ xfs_rtblock_t *rtblock) /* out: start block allocated */
+{
+ int any; /* any useful extents from summary */
+ xfs_rtblock_t bbno; /* bitmap block number */
+ int error; /* error value */
+ int i; /* bitmap block offset (loop control) */
+ int j; /* secondary loop control */
+ int log2len; /* log2 of minlen */
+ xfs_rtblock_t n; /* next block to try */
+ xfs_rtblock_t r; /* result block */
+
+ ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ /*
+ * If the block number given is off the end, silently set it to
+ * the last block.
+ */
+ if (bno >= mp->m_sb.sb_rextents)
+ bno = mp->m_sb.sb_rextents - 1;
+
+ /* Make sure we don't run off the end of the rt volume. */
+ maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno;
+ if (maxlen < minlen) {
+ *rtblock = NULLRTBLOCK;
+ return 0;
+ }
+
+ /*
+ * Try the exact allocation first.
+ */
+ error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len,
+ rbpp, rsb, prod, &r);
+ if (error) {
+ return error;
+ }
+ /*
+ * If the exact allocation worked, return that.
+ */
+ if (r != NULLRTBLOCK) {
+ *rtblock = r;
+ return 0;
+ }
+ bbno = XFS_BITTOBLOCK(mp, bno);
+ i = 0;
+ ASSERT(minlen != 0);
+ log2len = xfs_highbit32(minlen);
+ /*
+ * Loop over all bitmap blocks (bbno + i is current block).
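+	 * The loop control at the bottom walks i outward from the starting
+	 * bitmap block, alternating sides (0, 1, -1, 2, -2, ...) until it
+	 * runs off one end of the bitmap, after which it continues in the
+	 * remaining direction only.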
+ */
+ for (;;) {
+ /*
+ * Get summary information of extents of all useful levels
+ * starting in this bitmap block.
+ */
+ error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1,
+ bbno + i, rbpp, rsb, &any);
+ if (error) {
+ return error;
+ }
+ /*
+ * If there are any useful extents starting here, try
+ * allocating one.
+ */
+ if (any) {
+ /*
+ * On the positive side of the starting location.
+ */
+ if (i >= 0) {
+ /*
+ * Try to allocate an extent starting in
+ * this block.
+ */
+ error = xfs_rtallocate_extent_block(mp, tp,
+ bbno + i, minlen, maxlen, len, &n, rbpp,
+ rsb, prod, &r);
+ if (error) {
+ return error;
+ }
+ /*
+ * If it worked, return it.
+ */
+ if (r != NULLRTBLOCK) {
+ *rtblock = r;
+ return 0;
+ }
+ }
+ /*
+ * On the negative side of the starting location.
+ */
+ else { /* i < 0 */
+ /*
+ * Loop backwards through the bitmap blocks from
+ * the starting point-1 up to where we are now.
+ * There should be an extent which ends in this
+ * bitmap block and is long enough.
+ */
+ for (j = -1; j > i; j--) {
+ /*
+ * Grab the summary information for
+ * this bitmap block.
+ */
+ error = xfs_rtany_summary(mp, tp,
+ log2len, mp->m_rsumlevels - 1,
+ bbno + j, rbpp, rsb, &any);
+ if (error) {
+ return error;
+ }
+ /*
+ * If there's no extent given in the
+ * summary that means the extent we
+ * found must carry over from an
+ * earlier block. If there is an
+ * extent given, we've already tried
+ * that allocation, don't do it again.
+ */
+ if (any)
+ continue;
+ error = xfs_rtallocate_extent_block(mp,
+ tp, bbno + j, minlen, maxlen,
+ len, &n, rbpp, rsb, prod, &r);
+ if (error) {
+ return error;
+ }
+ /*
+ * If it works, return the extent.
+ */
+ if (r != NULLRTBLOCK) {
+ *rtblock = r;
+ return 0;
+ }
+ }
+ /*
+ * There weren't intervening bitmap blocks
+ * with a long enough extent, or the
+ * allocation didn't work for some reason
+				 * (i.e. it's a little too short).
+ * Try to allocate from the summary block
+ * that we found.
+ */
+ error = xfs_rtallocate_extent_block(mp, tp,
+ bbno + i, minlen, maxlen, len, &n, rbpp,
+ rsb, prod, &r);
+ if (error) {
+ return error;
+ }
+ /*
+ * If it works, return the extent.
+ */
+ if (r != NULLRTBLOCK) {
+ *rtblock = r;
+ return 0;
+ }
+ }
+ }
+ /*
+		 * Loop control. If we were on the positive side, and there
+		 * are still blocks left on the negative side, go there.
+ */
+ if (i > 0 && (int)bbno - i >= 0)
+ i = -i;
+ /*
+ * If positive, and no more negative, but there are more
+ * positive, go there.
+ */
+ else if (i > 0 && (int)bbno + i < mp->m_sb.sb_rbmblocks - 1)
+ i++;
+ /*
+ * If negative or 0 (just started), and there are positive
+ * blocks to go, go there. The 0 case moves to block 1.
+ */
+ else if (i <= 0 && (int)bbno - i < mp->m_sb.sb_rbmblocks - 1)
+ i = 1 - i;
+ /*
+ * If negative or 0 and there are more negative blocks,
+ * go there.
+ */
+ else if (i <= 0 && (int)bbno + i > 0)
+ i--;
+ /*
+ * Must be done. Return failure.
+ */
+ else
+ break;
+ }
+ *rtblock = NULLRTBLOCK;
+ return 0;
+}
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, with no position
+ * specified. If we don't get maxlen then use prod to trim
+ * the length, if given. The lengths are all in rtextents.
+ */
+STATIC int /* error */
+xfs_rtallocate_extent_size(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_extlen_t minlen, /* minimum length to allocate */
+ xfs_extlen_t maxlen, /* maximum length to allocate */
+ xfs_extlen_t *len, /* out: actual length allocated */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ xfs_extlen_t prod, /* extent product factor */
+ xfs_rtblock_t *rtblock) /* out: start block allocated */
+{
+ int error; /* error value */
+ int i; /* bitmap block number */
+ int l; /* level number (loop control) */
+ xfs_rtblock_t n; /* next block to be tried */
+ xfs_rtblock_t r; /* result block number */
+ xfs_suminfo_t sum; /* summary information for extents */
+
+ ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ ASSERT(maxlen != 0);
+
+ /*
+ * Loop over all the levels starting with maxlen.
+ * At each level, look at all the bitmap blocks, to see if there
+ * are extents starting there that are long enough (>= maxlen).
+ * Note, only on the initial level can the allocation fail if
+ * the summary says there's an extent.
+ */
+ for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) {
+ /*
+ * Loop over all the bitmap blocks.
+ */
+ for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
+ /*
+ * Get the summary for this level/block.
+ */
+ error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
+ &sum);
+ if (error) {
+ return error;
+ }
+ /*
+ * Nothing there, on to the next block.
+ */
+ if (!sum)
+ continue;
+ /*
+ * Try allocating the extent.
+ */
+ error = xfs_rtallocate_extent_block(mp, tp, i, maxlen,
+ maxlen, len, &n, rbpp, rsb, prod, &r);
+ if (error) {
+ return error;
+ }
+ /*
+ * If it worked, return that.
+ */
+ if (r != NULLRTBLOCK) {
+ *rtblock = r;
+ return 0;
+ }
+ /*
+ * If the "next block to try" returned from the
+ * allocator is beyond the next bitmap block,
+ * skip to that bitmap block.
+ */
+ if (XFS_BITTOBLOCK(mp, n) > i + 1)
+ i = XFS_BITTOBLOCK(mp, n) - 1;
+ }
+ }
+ /*
+ * Didn't find any maxlen blocks. Try smaller ones, unless
+ * we're asking for a fixed size extent.
+ */
+ if (minlen > --maxlen) {
+ *rtblock = NULLRTBLOCK;
+ return 0;
+ }
+ ASSERT(minlen != 0);
+ ASSERT(maxlen != 0);
+
+ /*
+ * Loop over sizes, from maxlen down to minlen.
+ * This time, when we do the allocations, allow smaller ones
+ * to succeed.
+ */
+ for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) {
+ /*
+ * Loop over all the bitmap blocks, try an allocation
+ * starting in that block.
+ */
+ for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
+ /*
+ * Get the summary information for this level/block.
+ */
+ error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
+ &sum);
+ if (error) {
+ return error;
+ }
+ /*
+ * If nothing there, go on to next.
+ */
+ if (!sum)
+ continue;
+ /*
+ * Try the allocation. Make sure the specified
+ * minlen/maxlen are in the possible range for
+ * this summary level.
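+			 * Summary level l counts free extents whose length
+			 * is in the range [2^l, 2^(l+1) - 1], so clamp the
+			 * requested min/max lengths to that window.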
+ */
+ error = xfs_rtallocate_extent_block(mp, tp, i,
+ XFS_RTMAX(minlen, 1 << l),
+ XFS_RTMIN(maxlen, (1 << (l + 1)) - 1),
+ len, &n, rbpp, rsb, prod, &r);
+ if (error) {
+ return error;
+ }
+ /*
+ * If it worked, return that extent.
+ */
+ if (r != NULLRTBLOCK) {
+ *rtblock = r;
+ return 0;
+ }
+ /*
+ * If the "next block to try" returned from the
+ * allocator is beyond the next bitmap block,
+ * skip to that bitmap block.
+ */
+ if (XFS_BITTOBLOCK(mp, n) > i + 1)
+ i = XFS_BITTOBLOCK(mp, n) - 1;
+ }
+ }
+ /*
+ * Got nothing, return failure.
+ */
+ *rtblock = NULLRTBLOCK;
+ return 0;
+}
+
+/*
+ * Allocate space to the bitmap or summary file, and zero it, for growfs.
+ */
+STATIC int
+xfs_growfs_rt_alloc(
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_extlen_t oblocks, /* old count of blocks */
+ xfs_extlen_t nblocks, /* new count of blocks */
+ struct xfs_inode *ip) /* inode (bitmap/summary) */
+{
+ xfs_fileoff_t bno; /* block number in file */
+ struct xfs_buf *bp; /* temporary buffer for zeroing */
+ xfs_daddr_t d; /* disk block address */
+ int error; /* error return value */
+ xfs_fsblock_t fsbno; /* filesystem block for bno */
+ struct xfs_bmbt_irec map; /* block map output */
+ int nmap; /* number of block maps */
+ int resblks; /* space reservation */
+ enum xfs_blft buf_type;
+ struct xfs_trans *tp;
+
+ if (ip == mp->m_rsumip)
+ buf_type = XFS_BLFT_RTSUMMARY_BUF;
+ else
+ buf_type = XFS_BLFT_RTBITMAP_BUF;
+
+ /*
+ * Allocate space to the file, as necessary.
+ */
+ while (oblocks < nblocks) {
+ resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
+ /*
+ * Reserve space & log for one extent added to the file.
+ */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, resblks,
+ 0, 0, &tp);
+ if (error)
+ return error;
+ /*
+ * Lock the inode.
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * Allocate blocks to the bitmap file.
+ */
+ nmap = 1;
+ error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
+ XFS_BMAPI_METADATA, 0, &map, &nmap);
+ if (!error && nmap < 1)
+ error = -ENOSPC;
+ if (error)
+ goto out_trans_cancel;
+ /*
+ * Free any blocks freed up in the transaction, then commit.
+ */
+ error = xfs_trans_commit(tp);
+ if (error)
+ return error;
+ /*
+ * Now we need to clear the allocated blocks.
+ * Do this one block per transaction, to keep it simple.
+ */
+ for (bno = map.br_startoff, fsbno = map.br_startblock;
+ bno < map.br_startoff + map.br_blockcount;
+ bno++, fsbno++) {
+ /*
+ * Reserve log for one block zeroing.
+ */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero,
+ 0, 0, 0, &tp);
+ if (error)
+ return error;
+ /*
+ * Lock the bitmap inode.
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ /*
+ * Get a buffer for the block.
+ */
+ d = XFS_FSB_TO_DADDR(mp, fsbno);
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize, 0, &bp);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_trans_buf_set_type(tp, bp, buf_type);
+ bp->b_ops = &xfs_rtbuf_ops;
+ memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
+ xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
+ /*
+ * Commit the transaction.
+ */
+ error = xfs_trans_commit(tp);
+ if (error)
+ return error;
+ }
+ /*
+ * Go on to the next extent, if any.
+ */
+ oblocks = map.br_startoff + map.br_blockcount;
+ }
+
+ return 0;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+static void
+xfs_alloc_rsum_cache(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_extlen_t rbmblocks) /* number of rt bitmap blocks */
+{
+ /*
+ * The rsum cache is initialized to all zeroes, which is trivially a
+ * lower bound on the minimum level with any free extents. We can
+ * continue without the cache if it couldn't be allocated.
+ */
+ mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL);
+ if (!mp->m_rsum_cache)
+ xfs_warn(mp, "could not allocate realtime summary cache");
+}
+
+/*
+ * Visible (exported) functions.
+ */
+
+/*
+ * Grow the realtime area of the filesystem.
+ */
+int
+xfs_growfs_rt(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_growfs_rt_t *in) /* growfs rt input struct */
+{
+ xfs_rtblock_t bmbno; /* bitmap block number */
+ struct xfs_buf *bp; /* temporary buffer */
+ int error; /* error return value */
+ xfs_mount_t *nmp; /* new (fake) mount structure */
+ xfs_rfsblock_t nrblocks; /* new number of realtime blocks */
+ xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
+ xfs_rtblock_t nrextents; /* new number of realtime extents */
+ uint8_t nrextslog; /* new log2 of sb_rextents */
+ xfs_extlen_t nrsumblocks; /* new number of summary blocks */
+ uint nrsumlevels; /* new rt summary levels */
+ uint nrsumsize; /* new size of rt summary, bytes */
+ xfs_sb_t *nsbp; /* new superblock */
+ xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */
+ xfs_extlen_t rsumblocks; /* current number of rt summary blks */
+ xfs_sb_t *sbp; /* old superblock */
+ xfs_fsblock_t sumbno; /* summary block number */
+ uint8_t *rsum_cache; /* old summary cache */
+
+ sbp = &mp->m_sb;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* Needs to have been mounted with an rt device. */
+ if (!XFS_IS_REALTIME_MOUNT(mp))
+ return -EINVAL;
+ /*
+ * Mount should fail if the rt bitmap/summary files don't load, but
+ * we'll check anyway.
+ */
+ if (!mp->m_rbmip || !mp->m_rsumip)
+ return -EINVAL;
+
+ /* Shrink not supported. */
+ if (in->newblocks <= sbp->sb_rblocks)
+ return -EINVAL;
+
+ /* Can only change rt extent size when adding rt volume. */
+ if (sbp->sb_rblocks > 0 && in->extsize != sbp->sb_rextsize)
+ return -EINVAL;
+
+ /* Range check the extent size. */
+ if (XFS_FSB_TO_B(mp, in->extsize) > XFS_MAX_RTEXTSIZE ||
+ XFS_FSB_TO_B(mp, in->extsize) < XFS_MIN_RTEXTSIZE)
+ return -EINVAL;
+
+ /* Unsupported realtime features. */
+ if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp))
+ return -EOPNOTSUPP;
+
+ nrblocks = in->newblocks;
+ error = xfs_sb_validate_fsb_count(sbp, nrblocks);
+ if (error)
+ return error;
+ /*
+ * Read in the last block of the device, make sure it exists.
+ */
+ error = xfs_buf_read_uncached(mp->m_rtdev_targp,
+ XFS_FSB_TO_BB(mp, nrblocks - 1),
+ XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+ if (error)
+ return error;
+ xfs_buf_relse(bp);
+
+ /*
+ * Calculate new parameters. These are the final values to be reached.
+ */
+ nrextents = nrblocks;
+ do_div(nrextents, in->extsize);
+ nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize);
+ nrextslog = xfs_highbit32(nrextents);
+ nrsumlevels = nrextslog + 1;
+ nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks;
+ nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+ nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
+ /*
+ * New summary size can't be more than half the size of
+ * the log. This prevents us from getting a log overflow,
+ * since we'll log basically the whole summary file at once.
+ */
+ if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
+ return -EINVAL;
+ /*
+ * Get the old block counts for bitmap and summary inodes.
+ * These can't change since other growfs callers are locked out.
+ */
+ rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_disk_size);
+ rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_disk_size);
+ /*
+ * Allocate space to the bitmap and summary files, as necessary.
+ */
+ error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip);
+ if (error)
+ return error;
+ error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
+ if (error)
+ return error;
+
+ rsum_cache = mp->m_rsum_cache;
+ if (nrbmblocks != sbp->sb_rbmblocks)
+ xfs_alloc_rsum_cache(mp, nrbmblocks);
+
+ /*
+ * Allocate a new (fake) mount/sb.
+ */
+ nmp = kmem_alloc(sizeof(*nmp), 0);
+ /*
+ * Loop over the bitmap blocks.
+ * We will do everything one bitmap block at a time.
+ * Skip the current block if it is exactly full.
+ * This also deals with the case where there were no rtextents before.
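+	 * In other words: start at the last existing bitmap block if it
+	 * still has unused bits (sb_rextents is not a multiple of the bits
+	 * per bitmap block), otherwise start at the first new bitmap block.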
+ */
+ for (bmbno = sbp->sb_rbmblocks -
+ ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
+ bmbno < nrbmblocks;
+ bmbno++) {
+ struct xfs_trans *tp;
+ xfs_rfsblock_t nrblocks_step;
+
+ *nmp = *mp;
+ nsbp = &nmp->m_sb;
+ /*
+ * Calculate new sb and mount fields for this round.
+ */
+ nsbp->sb_rextsize = in->extsize;
+ nsbp->sb_rbmblocks = bmbno + 1;
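+		/*
+		 * Each bitmap block tracks NBBY * blocksize rtextents, and
+		 * each rtextent is sb_rextsize fs blocks, so this is the
+		 * largest rt block count this many bitmap blocks can cover.
+		 */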
+ nrblocks_step = (bmbno + 1) * NBBY * nsbp->sb_blocksize *
+ nsbp->sb_rextsize;
+ nsbp->sb_rblocks = min(nrblocks, nrblocks_step);
+ nsbp->sb_rextents = nsbp->sb_rblocks;
+ do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
+ ASSERT(nsbp->sb_rextents != 0);
+ nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
+ nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
+ nrsumsize =
+ (uint)sizeof(xfs_suminfo_t) * nrsumlevels *
+ nsbp->sb_rbmblocks;
+ nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+ nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
+ /*
+ * Start a transaction, get the log reservation.
+ */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtfree, 0, 0, 0,
+ &tp);
+ if (error)
+ break;
+ /*
+ * Lock out other callers by grabbing the bitmap inode lock.
+ */
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+ xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ /*
+ * Update the bitmap inode's size ondisk and incore. We need
+ * to update the incore size so that inode inactivation won't
+ * punch what it thinks are "posteof" blocks.
+ */
+ mp->m_rbmip->i_disk_size =
+ nsbp->sb_rbmblocks * nsbp->sb_blocksize;
+ i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_disk_size);
+ xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+ /*
+ * Get the summary inode into the transaction.
+ */
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+ xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ /*
+ * Update the summary inode's size. We need to update the
+ * incore size so that inode inactivation won't punch what it
+ * thinks are "posteof" blocks.
+ */
+ mp->m_rsumip->i_disk_size = nmp->m_rsumsize;
+ i_size_write(VFS_I(mp->m_rsumip), mp->m_rsumip->i_disk_size);
+ xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE);
+ /*
+ * Copy summary data from old to new sizes.
+ * Do this when the real size (not block-aligned) changes.
+ */
+ if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks ||
+ mp->m_rsumlevels != nmp->m_rsumlevels) {
+ error = xfs_rtcopy_summary(mp, nmp, tp);
+ if (error)
+ goto error_cancel;
+ }
+ /*
+ * Update superblock fields.
+ */
+ if (nsbp->sb_rextsize != sbp->sb_rextsize)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE,
+ nsbp->sb_rextsize - sbp->sb_rextsize);
+ if (nsbp->sb_rbmblocks != sbp->sb_rbmblocks)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
+ nsbp->sb_rbmblocks - sbp->sb_rbmblocks);
+ if (nsbp->sb_rblocks != sbp->sb_rblocks)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS,
+ nsbp->sb_rblocks - sbp->sb_rblocks);
+ if (nsbp->sb_rextents != sbp->sb_rextents)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS,
+ nsbp->sb_rextents - sbp->sb_rextents);
+ if (nsbp->sb_rextslog != sbp->sb_rextslog)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
+ nsbp->sb_rextslog - sbp->sb_rextslog);
+ /*
+ * Free new extent.
+ */
+ bp = NULL;
+ error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
+ nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
+ if (error) {
+error_cancel:
+ xfs_trans_cancel(tp);
+ break;
+ }
+ /*
+ * Mark more blocks free in the superblock.
+ */
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS,
+ nsbp->sb_rextents - sbp->sb_rextents);
+ /*
+ * Update mp values into the real mp structure.
+ */
+ mp->m_rsumlevels = nrsumlevels;
+ mp->m_rsumsize = nrsumsize;
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ break;
+
+ /* Ensure the mount RT feature flag is now set. */
+ mp->m_features |= XFS_FEAT_REALTIME;
+ }
+ if (error)
+ goto out_free;
+
+ /* Update secondary superblocks now the physical grow has completed */
+ error = xfs_update_secondary_sbs(mp);
+
+out_free:
+ /*
+ * Free the fake mp structure.
+ */
+ kmem_free(nmp);
+
+ /*
+ * If we had to allocate a new rsum_cache, we either need to free the
+ * old one (if we succeeded) or free the new one and restore the old one
+ * (if there was an error).
+ */
+ if (rsum_cache != mp->m_rsum_cache) {
+ if (error) {
+ kmem_free(mp->m_rsum_cache);
+ mp->m_rsum_cache = rsum_cache;
+ } else {
+ kmem_free(rsum_cache);
+ }
+ }
+
+ return error;
+}
+
+/*
+ * Allocate an extent in the realtime subvolume, with the usual allocation
+ * parameters. The length units are all in realtime extents, as is the
+ * result block number.
+ */
+int /* error */
+xfs_rtallocate_extent(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to allocate */
+ xfs_extlen_t minlen, /* minimum length to allocate */
+ xfs_extlen_t maxlen, /* maximum length to allocate */
+ xfs_extlen_t *len, /* out: actual length allocated */
+ int wasdel, /* was a delayed allocation extent */
+ xfs_extlen_t prod, /* extent product factor */
+ xfs_rtblock_t *rtblock) /* out: start block allocated */
+{
+ xfs_mount_t *mp = tp->t_mountp;
+ int error; /* error value */
+ xfs_rtblock_t r; /* result allocated block */
+ xfs_fsblock_t sb; /* summary file block number */
+ struct xfs_buf *sumbp; /* summary file block buffer */
+
+ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+ ASSERT(minlen > 0 && minlen <= maxlen);
+
+ /*
+ * If prod is set then figure out what to do to minlen and maxlen.
+ */
+ if (prod > 1) {
+ xfs_extlen_t i;
+
+ if ((i = maxlen % prod))
+ maxlen -= i;
+ if ((i = minlen % prod))
+ minlen += prod - i;
+ if (maxlen < minlen) {
+ *rtblock = NULLRTBLOCK;
+ return 0;
+ }
+ }
+
+retry:
+ sumbp = NULL;
+ if (bno == 0) {
+ error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len,
+ &sumbp, &sb, prod, &r);
+ } else {
+ error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen,
+ len, &sumbp, &sb, prod, &r);
+ }
+
+ if (error)
+ return error;
+
+ /*
+ * If it worked, update the superblock.
+ */
+ if (r != NULLRTBLOCK) {
+ long slen = (long)*len;
+
+ ASSERT(*len >= minlen && *len <= maxlen);
+ if (wasdel)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen);
+ else
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen);
+ } else if (prod > 1) {
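+		/*
+		 * The prod-aligned attempt failed; retry once without the
+		 * alignment constraint before reporting failure.
+		 */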
+ prod = 1;
+ goto retry;
+ }
+
+ *rtblock = r;
+ return 0;
+}
+
+/*
+ * Initialize realtime fields in the mount structure.
+ */
+int /* error */
+xfs_rtmount_init(
+ struct xfs_mount *mp) /* file system mount structure */
+{
+ struct xfs_buf *bp; /* buffer for last block of subvolume */
+ struct xfs_sb *sbp; /* filesystem superblock copy in mount */
+ xfs_daddr_t d; /* address of last block of subvolume */
+ int error;
+
+ sbp = &mp->m_sb;
+ if (sbp->sb_rblocks == 0)
+ return 0;
+ if (mp->m_rtdev_targp == NULL) {
+ xfs_warn(mp,
+ "Filesystem has a realtime volume, use rtdev=device option");
+ return -ENODEV;
+ }
+ mp->m_rsumlevels = sbp->sb_rextslog + 1;
+ mp->m_rsumsize =
+ (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels *
+ sbp->sb_rbmblocks;
+ mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize);
+ mp->m_rbmip = mp->m_rsumip = NULL;
+ /*
+ * Check that the realtime section is an ok size.
+ */
+ d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+ if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
+ xfs_warn(mp, "realtime mount -- %llu != %llu",
+ (unsigned long long) XFS_BB_TO_FSB(mp, d),
+ (unsigned long long) mp->m_sb.sb_rblocks);
+ return -EFBIG;
+ }
+ error = xfs_buf_read_uncached(mp->m_rtdev_targp,
+ d - XFS_FSB_TO_BB(mp, 1),
+ XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+ if (error) {
+ xfs_warn(mp, "realtime device size check failed");
+ return error;
+ }
+ xfs_buf_relse(bp);
+ return 0;
+}
+
+static int
+xfs_rtalloc_count_frextent(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ uint64_t *valp = priv;
+
+ *valp += rec->ar_extcount;
+ return 0;
+}
+
+/*
+ * Reinitialize the number of free realtime extents from the realtime bitmap.
+ * Callers must ensure that there is no other activity in the filesystem.
+ */
+int
+xfs_rtalloc_reinit_frextents(
+ struct xfs_mount *mp)
+{
+ uint64_t val = 0;
+ int error;
+
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent,
+ &val);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_frextents = val;
+ spin_unlock(&mp->m_sb_lock);
+ percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
+ return 0;
+}
+
+/*
+ * Get the bitmap and summary inodes and the summary cache into the mount
+ * structure at mount time.
+ */
+int /* error */
+xfs_rtmount_inodes(
+ xfs_mount_t *mp) /* file system mount structure */
+{
+ int error; /* error return value */
+ xfs_sb_t *sbp;
+
+ sbp = &mp->m_sb;
+ error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
+ if (error)
+ return error;
+ ASSERT(mp->m_rbmip != NULL);
+
+ error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
+ if (error) {
+ xfs_irele(mp->m_rbmip);
+ return error;
+ }
+ ASSERT(mp->m_rsumip != NULL);
+ xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks);
+ return 0;
+}
+
+void
+xfs_rtunmount_inodes(
+ struct xfs_mount *mp)
+{
+ kmem_free(mp->m_rsum_cache);
+ if (mp->m_rbmip)
+ xfs_irele(mp->m_rbmip);
+ if (mp->m_rsumip)
+ xfs_irele(mp->m_rsumip);
+}
+
+/*
+ * Pick an extent for allocation at the start of a new realtime file.
+ * Use the sequence number stored in the atime field of the bitmap inode.
+ * Translate this to a fraction of the rtextents, and return the product
+ * of rtextents and the fraction.
+ * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
+ */
+int /* error */
+xfs_rtpick_extent(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_extlen_t len, /* allocation length (rtextents) */
+ xfs_rtblock_t *pick) /* result rt extent */
+{
+ xfs_rtblock_t b; /* result block */
+ int log2; /* log of sequence number */
+ uint64_t resid; /* residual after log removed */
+ uint64_t seq; /* sequence number of file creation */
+ uint64_t *seqp; /* pointer to seqno in inode */
+
+ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+
+ seqp = (uint64_t *)&VFS_I(mp->m_rbmip)->i_atime;
+ if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) {
+ mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
+ *seqp = 0;
+ }
+ seq = *seqp;
+ if ((log2 = xfs_highbit64(seq)) == -1)
+ b = 0;
+ else {
+ resid = seq - (1ULL << log2);
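+		/*
+		 * For seq = 2^log2 + resid, use the fraction
+		 * (2 * resid + 1) / 2^(log2 + 1) of the rt extents, which
+		 * generates the sequence described above.
+		 */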
+ b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >>
+ (log2 + 1);
+ if (b >= mp->m_sb.sb_rextents)
+ div64_u64_rem(b, mp->m_sb.sb_rextents, &b);
+ if (b + len > mp->m_sb.sb_rextents)
+ b = mp->m_sb.sb_rextents - len;
+ }
+ *seqp = seq + 1;
+ xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+ *pick = b;
+ return 0;
+}
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
new file mode 100644
index 000000000..62c7ad79c
--- /dev/null
+++ b/fs/xfs/xfs_rtalloc.h
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_RTALLOC_H__
+#define __XFS_RTALLOC_H__
+
+/* kernel only definitions and functions */
+
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * XXX: Most of the realtime allocation functions deal in units of realtime
+ * extents, not realtime blocks. This looks funny when paired with the type
+ * name and screams for a larger cleanup.
+ */
+struct xfs_rtalloc_rec {
+ xfs_rtblock_t ar_startext;
+ xfs_rtblock_t ar_extcount;
+};
+
+typedef int (*xfs_rtalloc_query_range_fn)(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv);
+
+#ifdef CONFIG_XFS_RT
+/*
+ * Function prototypes for exported functions.
+ */
+
+/*
+ * Allocate an extent in the realtime subvolume, with the usual allocation
+ * parameters. The length units are all in realtime extents, as is the
+ * result block number.
+ */
+int /* error */
+xfs_rtallocate_extent(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to allocate */
+ xfs_extlen_t minlen, /* minimum length to allocate */
+ xfs_extlen_t maxlen, /* maximum length to allocate */
+ xfs_extlen_t *len, /* out: actual length allocated */
+ int wasdel, /* was a delayed allocation extent */
+ xfs_extlen_t prod, /* extent product factor */
+ xfs_rtblock_t *rtblock); /* out: start block allocated */
+
+/*
+ * Free an extent in the realtime subvolume. Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int /* error */
+xfs_rtfree_extent(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to free */
+ xfs_extlen_t len); /* length of extent freed */
+
+/*
+ * Initialize realtime fields in the mount structure.
+ */
+int /* error */
+xfs_rtmount_init(
+ struct xfs_mount *mp); /* file system mount structure */
+void
+xfs_rtunmount_inodes(
+ struct xfs_mount *mp);
+
+/*
+ * Get the bitmap and summary inodes into the mount structure
+ * at mount time.
+ */
+int /* error */
+xfs_rtmount_inodes(
+ struct xfs_mount *mp); /* file system mount structure */
+
+/*
+ * Pick an extent for allocation at the start of a new realtime file.
+ * Use the sequence number stored in the atime field of the bitmap inode.
+ * Translate this to a fraction of the rtextents, and return the product
+ * of rtextents and the fraction.
+ * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
+ */
+int /* error */
+xfs_rtpick_extent(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_extlen_t len, /* allocation length (rtextents) */
+ xfs_rtblock_t *pick); /* result rt extent */
+
+/*
+ * Grow the realtime area of the filesystem.
+ */
+int
+xfs_growfs_rt(
+ struct xfs_mount *mp, /* file system mount structure */
+ xfs_growfs_rt_t *in); /* user supplied growfs struct */
+
+/*
+ * From xfs_rtbitmap.c
+ */
+int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
+int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_extlen_t len, int val,
+ xfs_rtblock_t *new, int *stat);
+int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_rtblock_t limit,
+ xfs_rtblock_t *rtblock);
+int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_rtblock_t limit,
+ xfs_rtblock_t *rtblock);
+int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_extlen_t len, int val);
+int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp,
+ int log, xfs_rtblock_t bbno, int delta,
+ struct xfs_buf **rbpp, xfs_fsblock_t *rsb,
+ xfs_suminfo_t *sum);
+int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
+ xfs_rtblock_t bbno, int delta, struct xfs_buf **rbpp,
+ xfs_fsblock_t *rsb);
+int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_extlen_t len,
+ struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
+int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *low_rec,
+ const struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn, void *priv);
+int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv);
+bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
+int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_extlen_t len,
+ bool *is_free);
+int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
+#else
+# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
+# define xfs_rtfree_extent(t,b,l) (ENOSYS)
+# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
+# define xfs_growfs_rt(mp,in) (ENOSYS)
+# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS)
+# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS)
+# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS)
+# define xfs_verify_rtbno(m, r) (false)
+# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS)
+# define xfs_rtalloc_reinit_frextents(m) (0)
+static inline int /* error */
+xfs_rtmount_init(
+ xfs_mount_t *mp) /* file system mount structure */
+{
+ if (mp->m_sb.sb_rblocks == 0)
+ return 0;
+
+ xfs_warn(mp, "Not built with CONFIG_XFS_RT");
+ return -ENOSYS;
+}
+# define xfs_rtmount_inodes(m)  (((m)->m_sb.sb_rblocks == 0) ? 0 : (ENOSYS))
+# define xfs_rtunmount_inodes(m)
+#endif /* CONFIG_XFS_RT */
+
+#endif /* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
new file mode 100644
index 000000000..90a77cd3e
--- /dev/null
+++ b/fs/xfs/xfs_stats.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+
+struct xstats xfsstats;
+
+static int counter_val(struct xfsstats __percpu *stats, int idx)
+{
+ int val = 0, cpu;
+
+ for_each_possible_cpu(cpu)
+ val += *(((__u32 *)per_cpu_ptr(stats, cpu) + idx));
+ return val;
+}
+
+int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
+{
+ int i, j;
+ int len = 0;
+ uint64_t xs_xstrat_bytes = 0;
+ uint64_t xs_write_bytes = 0;
+ uint64_t xs_read_bytes = 0;
+ uint64_t defer_relog = 0;
+
+ static const struct xstats_entry {
+ char *desc;
+ int endpoint;
+ } xstats[] = {
+ { "extent_alloc", xfsstats_offset(xs_abt_lookup) },
+ { "abt", xfsstats_offset(xs_blk_mapr) },
+ { "blk_map", xfsstats_offset(xs_bmbt_lookup) },
+ { "bmbt", xfsstats_offset(xs_dir_lookup) },
+ { "dir", xfsstats_offset(xs_trans_sync) },
+ { "trans", xfsstats_offset(xs_ig_attempts) },
+ { "ig", xfsstats_offset(xs_log_writes) },
+ { "log", xfsstats_offset(xs_try_logspace)},
+ { "push_ail", xfsstats_offset(xs_xstrat_quick)},
+ { "xstrat", xfsstats_offset(xs_write_calls) },
+ { "rw", xfsstats_offset(xs_attr_get) },
+ { "attr", xfsstats_offset(xs_iflush_count)},
+ { "icluster", xfsstats_offset(vn_active) },
+ { "vnodes", xfsstats_offset(xb_get) },
+ { "buf", xfsstats_offset(xs_abtb_2) },
+ { "abtb2", xfsstats_offset(xs_abtc_2) },
+ { "abtc2", xfsstats_offset(xs_bmbt_2) },
+ { "bmbt2", xfsstats_offset(xs_ibt_2) },
+ { "ibt2", xfsstats_offset(xs_fibt_2) },
+ { "fibt2", xfsstats_offset(xs_rmap_2) },
+ { "rmapbt", xfsstats_offset(xs_refcbt_2) },
+ { "refcntbt", xfsstats_offset(xs_qm_dqreclaims)},
+ /* we print both series of quota information together */
+ { "qm", xfsstats_offset(xs_xstrat_bytes)},
+ };
+
+ /* Loop over all stats groups */
+
+ for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
+ len += scnprintf(buf + len, PATH_MAX - len, "%s",
+ xstats[i].desc);
+ /* inner loop does each group */
+ for (; j < xstats[i].endpoint; j++)
+ len += scnprintf(buf + len, PATH_MAX - len, " %u",
+ counter_val(stats, j));
+ len += scnprintf(buf + len, PATH_MAX - len, "\n");
+ }
+ /* extra precision counters */
+ for_each_possible_cpu(i) {
+ xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes;
+ xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes;
+ xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes;
+ defer_relog += per_cpu_ptr(stats, i)->s.defer_relog;
+ }
+
+ len += scnprintf(buf + len, PATH_MAX-len, "xpc %llu %llu %llu\n",
+ xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
+ len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n",
+ defer_relog);
+ len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n",
+#if defined(DEBUG)
+ 1);
+#else
+ 0);
+#endif
+
+ return len;
+}
+
+void xfs_stats_clearall(struct xfsstats __percpu *stats)
+{
+ int c;
+ uint32_t vn_active;
+
+ xfs_notice(NULL, "Clearing xfsstats");
+ for_each_possible_cpu(c) {
+ preempt_disable();
+ /* save vn_active, it's a universal truth! */
+ vn_active = per_cpu_ptr(stats, c)->s.vn_active;
+ memset(per_cpu_ptr(stats, c), 0, sizeof(*stats));
+ per_cpu_ptr(stats, c)->s.vn_active = vn_active;
+ preempt_enable();
+ }
+}
+
+#ifdef CONFIG_PROC_FS
+/* legacy quota interfaces */
+#ifdef CONFIG_XFS_QUOTA
+
+#define XFSSTAT_START_XQMSTAT xfsstats_offset(xs_qm_dqreclaims)
+#define XFSSTAT_END_XQMSTAT xfsstats_offset(xs_qm_dquot)
+
+static int xqm_proc_show(struct seq_file *m, void *v)
+{
+ /* maximum; incore; ratio free to inuse; freelist */
+ seq_printf(m, "%d\t%d\t%d\t%u\n",
+ 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT),
+ 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1));
+ return 0;
+}
+
+/* legacy quota stats interface no 2 */
+static int xqmstat_proc_show(struct seq_file *m, void *v)
+{
+ int j;
+
+ seq_puts(m, "qm");
+ for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++)
+ seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
+ seq_putc(m, '\n');
+ return 0;
+}
+#endif /* CONFIG_XFS_QUOTA */
+
+int
+xfs_init_procfs(void)
+{
+ if (!proc_mkdir("fs/xfs", NULL))
+ return -ENOMEM;
+
+ if (!proc_symlink("fs/xfs/stat", NULL,
+ "/sys/fs/xfs/stats/stats"))
+ goto out;
+
+#ifdef CONFIG_XFS_QUOTA
+ if (!proc_create_single("fs/xfs/xqmstat", 0, NULL, xqmstat_proc_show))
+ goto out;
+ if (!proc_create_single("fs/xfs/xqm", 0, NULL, xqm_proc_show))
+ goto out;
+#endif
+ return 0;
+
+out:
+ remove_proc_subtree("fs/xfs", NULL);
+ return -ENOMEM;
+}
+
+void
+xfs_cleanup_procfs(void)
+{
+ remove_proc_subtree("fs/xfs", NULL);
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
new file mode 100644
index 000000000..43ffba74f
--- /dev/null
+++ b/fs/xfs/xfs_stats.h
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_STATS_H__
+#define __XFS_STATS_H__
+
+
+#include <linux/percpu.h>
+
+/*
+ * The btree stats arrays have fixed offsets for the different stats. We
+ * store the base index in the btree cursor via XFS_STATS_CALC_INDEX() and
+ * that allows us to use fixed offsets into the stats array for each btree
+ * stat. These index offsets are defined in the order they will be emitted
+ * in the stats files, so it is possible to add new btree stat types by
+ * appending to the enum list below.
+ */
+enum {
+ __XBTS_lookup = 0,
+ __XBTS_compare = 1,
+ __XBTS_insrec = 2,
+ __XBTS_delrec = 3,
+ __XBTS_newroot = 4,
+ __XBTS_killroot = 5,
+ __XBTS_increment = 6,
+ __XBTS_decrement = 7,
+ __XBTS_lshift = 8,
+ __XBTS_rshift = 9,
+ __XBTS_split = 10,
+ __XBTS_join = 11,
+ __XBTS_alloc = 12,
+ __XBTS_free = 13,
+ __XBTS_moves = 14,
+
+ __XBTS_MAX = 15,
+};
+
+/*
+ * XFS global statistics
+ */
+struct __xfsstats {
+ uint32_t xs_allocx;
+ uint32_t xs_allocb;
+ uint32_t xs_freex;
+ uint32_t xs_freeb;
+ uint32_t xs_abt_lookup;
+ uint32_t xs_abt_compare;
+ uint32_t xs_abt_insrec;
+ uint32_t xs_abt_delrec;
+ uint32_t xs_blk_mapr;
+ uint32_t xs_blk_mapw;
+ uint32_t xs_blk_unmap;
+ uint32_t xs_add_exlist;
+ uint32_t xs_del_exlist;
+ uint32_t xs_look_exlist;
+ uint32_t xs_cmp_exlist;
+ uint32_t xs_bmbt_lookup;
+ uint32_t xs_bmbt_compare;
+ uint32_t xs_bmbt_insrec;
+ uint32_t xs_bmbt_delrec;
+ uint32_t xs_dir_lookup;
+ uint32_t xs_dir_create;
+ uint32_t xs_dir_remove;
+ uint32_t xs_dir_getdents;
+ uint32_t xs_trans_sync;
+ uint32_t xs_trans_async;
+ uint32_t xs_trans_empty;
+ uint32_t xs_ig_attempts;
+ uint32_t xs_ig_found;
+ uint32_t xs_ig_frecycle;
+ uint32_t xs_ig_missed;
+ uint32_t xs_ig_dup;
+ uint32_t xs_ig_reclaims;
+ uint32_t xs_ig_attrchg;
+ uint32_t xs_log_writes;
+ uint32_t xs_log_blocks;
+ uint32_t xs_log_noiclogs;
+ uint32_t xs_log_force;
+ uint32_t xs_log_force_sleep;
+ uint32_t xs_try_logspace;
+ uint32_t xs_sleep_logspace;
+ uint32_t xs_push_ail;
+ uint32_t xs_push_ail_success;
+ uint32_t xs_push_ail_pushbuf;
+ uint32_t xs_push_ail_pinned;
+ uint32_t xs_push_ail_locked;
+ uint32_t xs_push_ail_flushing;
+ uint32_t xs_push_ail_restarts;
+ uint32_t xs_push_ail_flush;
+ uint32_t xs_xstrat_quick;
+ uint32_t xs_xstrat_split;
+ uint32_t xs_write_calls;
+ uint32_t xs_read_calls;
+ uint32_t xs_attr_get;
+ uint32_t xs_attr_set;
+ uint32_t xs_attr_remove;
+ uint32_t xs_attr_list;
+ uint32_t xs_iflush_count;
+ uint32_t xs_icluster_flushcnt;
+ uint32_t xs_icluster_flushinode;
+ uint32_t vn_active; /* # vnodes not on free lists */
+ uint32_t vn_alloc; /* # times vn_alloc called */
+ uint32_t vn_get; /* # times vn_get called */
+ uint32_t vn_hold; /* # times vn_hold called */
+ uint32_t vn_rele; /* # times vn_rele called */
+ uint32_t vn_reclaim; /* # times vn_reclaim called */
+ uint32_t vn_remove; /* # times vn_remove called */
+ uint32_t vn_free; /* # times vn_free called */
+ uint32_t xb_get;
+ uint32_t xb_create;
+ uint32_t xb_get_locked;
+ uint32_t xb_get_locked_waited;
+ uint32_t xb_busy_locked;
+ uint32_t xb_miss_locked;
+ uint32_t xb_page_retries;
+ uint32_t xb_page_found;
+ uint32_t xb_get_read;
+/* Version 2 btree counters */
+ uint32_t xs_abtb_2[__XBTS_MAX];
+ uint32_t xs_abtc_2[__XBTS_MAX];
+ uint32_t xs_bmbt_2[__XBTS_MAX];
+ uint32_t xs_ibt_2[__XBTS_MAX];
+ uint32_t xs_fibt_2[__XBTS_MAX];
+ uint32_t xs_rmap_2[__XBTS_MAX];
+ uint32_t xs_refcbt_2[__XBTS_MAX];
+ uint32_t xs_qm_dqreclaims;
+ uint32_t xs_qm_dqreclaim_misses;
+ uint32_t xs_qm_dquot_dups;
+ uint32_t xs_qm_dqcachemisses;
+ uint32_t xs_qm_dqcachehits;
+ uint32_t xs_qm_dqwants;
+ uint32_t xs_qm_dquot;
+ uint32_t xs_qm_dquot_unused;
+/* Extra precision counters */
+ uint64_t xs_xstrat_bytes;
+ uint64_t xs_write_bytes;
+ uint64_t xs_read_bytes;
+ uint64_t defer_relog;
+};
+
+#define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t))
+
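+/*
+ * The union lets the statistics be addressed either by field name (via .s)
+ * or by 32-bit word index (via .a[]); xfs_stats_format() and the *_OFF
+ * macros below rely on the indexed view.
+ */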
+struct xfsstats {
+ union {
+ struct __xfsstats s;
+ uint32_t a[xfsstats_offset(xs_qm_dquot)];
+ };
+};
+
+/*
+ * Simple wrapper for getting the array index of a struct member offset.
+ */
+#define XFS_STATS_CALC_INDEX(member) \
+ (offsetof(struct __xfsstats, member) / (int)sizeof(uint32_t))
+
+
+int xfs_stats_format(struct xfsstats __percpu *stats, char *buf);
+void xfs_stats_clearall(struct xfsstats __percpu *stats);
+extern struct xstats xfsstats;
+
+#define XFS_STATS_INC(mp, v) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v++; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v++; \
+} while (0)
+
+#define XFS_STATS_DEC(mp, v) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v--; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v--; \
+} while (0)
+
+#define XFS_STATS_ADD(mp, v, inc) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v += (inc); \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v += (inc); \
+} while (0)
+
+#define XFS_STATS_INC_OFF(mp, off) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]++; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]++; \
+} while (0)
+
+#define XFS_STATS_DEC_OFF(mp, off) \
+do { \
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]--;	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]--;	\
+} while (0)
+
+#define XFS_STATS_ADD_OFF(mp, off, inc) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off] += (inc); \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off] += (inc); \
+} while (0)
+
+#if defined(CONFIG_PROC_FS)
+
+extern int xfs_init_procfs(void);
+extern void xfs_cleanup_procfs(void);
+
+
+#else /* !CONFIG_PROC_FS */
+
+static inline int xfs_init_procfs(void)
+{
+ return 0;
+}
+
+static inline void xfs_cleanup_procfs(void)
+{
+}
+
+#endif /* !CONFIG_PROC_FS */
+
+#endif /* __XFS_STATS_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
new file mode 100644
index 000000000..12662b169
--- /dev/null
+++ b/fs/xfs/xfs_super.c
@@ -0,0 +1,2407 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_alloc.h"
+#include "xfs_fsops.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_dir2.h"
+#include "xfs_extfree_item.h"
+#include "xfs_mru_cache.h"
+#include "xfs_inode_item.h"
+#include "xfs_icache.h"
+#include "xfs_trace.h"
+#include "xfs_icreate_item.h"
+#include "xfs_filestream.h"
+#include "xfs_quota.h"
+#include "xfs_sysfs.h"
+#include "xfs_ondisk.h"
+#include "xfs_rmap_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_bmap_item.h"
+#include "xfs_reflink.h"
+#include "xfs_pwork.h"
+#include "xfs_ag.h"
+#include "xfs_defer.h"
+#include "xfs_attr_item.h"
+#include "xfs_xattr.h"
+#include "xfs_iunlink_item.h"
+
+#include <linux/magic.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+
+static const struct super_operations xfs_super_operations;
+
+static struct kset *xfs_kset; /* top-level xfs sysfs dir */
+#ifdef DEBUG
+static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+static LIST_HEAD(xfs_mount_list);
+static DEFINE_SPINLOCK(xfs_mount_list_lock);
+
+static inline void xfs_mount_list_add(struct xfs_mount *mp)
+{
+ spin_lock(&xfs_mount_list_lock);
+ list_add(&mp->m_mount_list, &xfs_mount_list);
+ spin_unlock(&xfs_mount_list_lock);
+}
+
+static inline void xfs_mount_list_del(struct xfs_mount *mp)
+{
+ spin_lock(&xfs_mount_list_lock);
+ list_del(&mp->m_mount_list);
+ spin_unlock(&xfs_mount_list_lock);
+}
+#else /* !CONFIG_HOTPLUG_CPU */
+static inline void xfs_mount_list_add(struct xfs_mount *mp) {}
+static inline void xfs_mount_list_del(struct xfs_mount *mp) {}
+#endif
+
+enum xfs_dax_mode {
+ XFS_DAX_INODE = 0,
+ XFS_DAX_ALWAYS = 1,
+ XFS_DAX_NEVER = 2,
+};
+
+static void
+xfs_mount_set_dax_mode(
+ struct xfs_mount *mp,
+ enum xfs_dax_mode mode)
+{
+ switch (mode) {
+ case XFS_DAX_INODE:
+ mp->m_features &= ~(XFS_FEAT_DAX_ALWAYS | XFS_FEAT_DAX_NEVER);
+ break;
+ case XFS_DAX_ALWAYS:
+ mp->m_features |= XFS_FEAT_DAX_ALWAYS;
+ mp->m_features &= ~XFS_FEAT_DAX_NEVER;
+ break;
+ case XFS_DAX_NEVER:
+ mp->m_features |= XFS_FEAT_DAX_NEVER;
+ mp->m_features &= ~XFS_FEAT_DAX_ALWAYS;
+ break;
+ }
+}
+
+static const struct constant_table dax_param_enums[] = {
+ {"inode", XFS_DAX_INODE },
+ {"always", XFS_DAX_ALWAYS },
+ {"never", XFS_DAX_NEVER },
+ {}
+};
+
+/*
+ * Table driven mount option parser.
+ */
+enum {
+ Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
+ Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
+ Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
+ Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep,
+ Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2,
+ Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
+ Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
+ Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
+ Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum,
+};
+
+static const struct fs_parameter_spec xfs_fs_parameters[] = {
+ fsparam_u32("logbufs", Opt_logbufs),
+ fsparam_string("logbsize", Opt_logbsize),
+ fsparam_string("logdev", Opt_logdev),
+ fsparam_string("rtdev", Opt_rtdev),
+ fsparam_flag("wsync", Opt_wsync),
+ fsparam_flag("noalign", Opt_noalign),
+ fsparam_flag("swalloc", Opt_swalloc),
+ fsparam_u32("sunit", Opt_sunit),
+ fsparam_u32("swidth", Opt_swidth),
+ fsparam_flag("nouuid", Opt_nouuid),
+ fsparam_flag("grpid", Opt_grpid),
+ fsparam_flag("nogrpid", Opt_nogrpid),
+ fsparam_flag("bsdgroups", Opt_bsdgroups),
+ fsparam_flag("sysvgroups", Opt_sysvgroups),
+ fsparam_string("allocsize", Opt_allocsize),
+ fsparam_flag("norecovery", Opt_norecovery),
+ fsparam_flag("inode64", Opt_inode64),
+ fsparam_flag("inode32", Opt_inode32),
+ fsparam_flag("ikeep", Opt_ikeep),
+ fsparam_flag("noikeep", Opt_noikeep),
+ fsparam_flag("largeio", Opt_largeio),
+ fsparam_flag("nolargeio", Opt_nolargeio),
+ fsparam_flag("attr2", Opt_attr2),
+ fsparam_flag("noattr2", Opt_noattr2),
+ fsparam_flag("filestreams", Opt_filestreams),
+ fsparam_flag("quota", Opt_quota),
+ fsparam_flag("noquota", Opt_noquota),
+ fsparam_flag("usrquota", Opt_usrquota),
+ fsparam_flag("grpquota", Opt_grpquota),
+ fsparam_flag("prjquota", Opt_prjquota),
+ fsparam_flag("uquota", Opt_uquota),
+ fsparam_flag("gquota", Opt_gquota),
+ fsparam_flag("pquota", Opt_pquota),
+ fsparam_flag("uqnoenforce", Opt_uqnoenforce),
+ fsparam_flag("gqnoenforce", Opt_gqnoenforce),
+ fsparam_flag("pqnoenforce", Opt_pqnoenforce),
+ fsparam_flag("qnoenforce", Opt_qnoenforce),
+ fsparam_flag("discard", Opt_discard),
+ fsparam_flag("nodiscard", Opt_nodiscard),
+ fsparam_flag("dax", Opt_dax),
+ fsparam_enum("dax", Opt_dax_enum, dax_param_enums),
+ {}
+};
+
+struct proc_xfs_info {
+ uint64_t flag;
+ char *str;
+};
+
+static int
+xfs_fs_show_options(
+ struct seq_file *m,
+ struct dentry *root)
+{
+ static struct proc_xfs_info xfs_info_set[] = {
+ /* the few simple ones we can get from the mount struct */
+ { XFS_FEAT_IKEEP, ",ikeep" },
+ { XFS_FEAT_WSYNC, ",wsync" },
+ { XFS_FEAT_NOALIGN, ",noalign" },
+ { XFS_FEAT_SWALLOC, ",swalloc" },
+ { XFS_FEAT_NOUUID, ",nouuid" },
+ { XFS_FEAT_NORECOVERY, ",norecovery" },
+ { XFS_FEAT_ATTR2, ",attr2" },
+ { XFS_FEAT_FILESTREAMS, ",filestreams" },
+ { XFS_FEAT_GRPID, ",grpid" },
+ { XFS_FEAT_DISCARD, ",discard" },
+ { XFS_FEAT_LARGE_IOSIZE, ",largeio" },
+ { XFS_FEAT_DAX_ALWAYS, ",dax=always" },
+ { XFS_FEAT_DAX_NEVER, ",dax=never" },
+ { 0, NULL }
+ };
+ struct xfs_mount *mp = XFS_M(root->d_sb);
+ struct proc_xfs_info *xfs_infop;
+
+ for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) {
+ if (mp->m_features & xfs_infop->flag)
+ seq_puts(m, xfs_infop->str);
+ }
+
+ seq_printf(m, ",inode%d", xfs_has_small_inums(mp) ? 32 : 64);
+
+ if (xfs_has_allocsize(mp))
+ seq_printf(m, ",allocsize=%dk",
+ (1 << mp->m_allocsize_log) >> 10);
+
+ if (mp->m_logbufs > 0)
+ seq_printf(m, ",logbufs=%d", mp->m_logbufs);
+ if (mp->m_logbsize > 0)
+ seq_printf(m, ",logbsize=%dk", mp->m_logbsize >> 10);
+
+ if (mp->m_logname)
+ seq_show_option(m, "logdev", mp->m_logname);
+ if (mp->m_rtname)
+ seq_show_option(m, "rtdev", mp->m_rtname);
+
+ if (mp->m_dalign > 0)
+ seq_printf(m, ",sunit=%d",
+ (int)XFS_FSB_TO_BB(mp, mp->m_dalign));
+ if (mp->m_swidth > 0)
+ seq_printf(m, ",swidth=%d",
+ (int)XFS_FSB_TO_BB(mp, mp->m_swidth));
+
+ if (mp->m_qflags & XFS_UQUOTA_ENFD)
+ seq_puts(m, ",usrquota");
+ else if (mp->m_qflags & XFS_UQUOTA_ACCT)
+ seq_puts(m, ",uqnoenforce");
+
+ if (mp->m_qflags & XFS_PQUOTA_ENFD)
+ seq_puts(m, ",prjquota");
+ else if (mp->m_qflags & XFS_PQUOTA_ACCT)
+ seq_puts(m, ",pqnoenforce");
+
+ if (mp->m_qflags & XFS_GQUOTA_ENFD)
+ seq_puts(m, ",grpquota");
+ else if (mp->m_qflags & XFS_GQUOTA_ACCT)
+ seq_puts(m, ",gqnoenforce");
+
+ if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
+ seq_puts(m, ",noquota");
+
+ return 0;
+}
+
+/*
+ * Set parameters for inode allocation heuristics, taking into account
+ * filesystem size and inode32/inode64 mount options; i.e. specifically
+ * whether or not XFS_FEAT_SMALL_INUMS is set.
+ *
+ * Inode allocation patterns are altered only if inode32 is requested
+ * (XFS_FEAT_SMALL_INUMS), and the filesystem is sufficiently large.
+ * If altered, XFS_OPSTATE_INODE32 is set as well.
+ *
+ * An agcount independent of that in the mount structure is provided
+ * because in the growfs case, mp->m_sb.sb_agcount is not yet updated
+ * to the potentially higher ag count.
+ *
+ * Returns the maximum AG index which may contain inodes.
+ */
+xfs_agnumber_t
+xfs_set_inode_alloc(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agcount)
+{
+ xfs_agnumber_t index;
+ xfs_agnumber_t maxagi = 0;
+ xfs_sb_t *sbp = &mp->m_sb;
+ xfs_agnumber_t max_metadata;
+ xfs_agino_t agino;
+ xfs_ino_t ino;
+
+ /*
+ * Calculate how much should be reserved for inodes to meet
+ * the max inode percentage. Used only for inode32.
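+	 * The result is the number of AGs needed to hold that reservation:
+	 * sb_dblocks * sb_imax_pct / 100 blocks, rounded up to whole AGs.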
+ */
+ if (M_IGEO(mp)->maxicount) {
+ uint64_t icount;
+
+ icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+ do_div(icount, 100);
+ icount += sbp->sb_agblocks - 1;
+ do_div(icount, sbp->sb_agblocks);
+ max_metadata = icount;
+ } else {
+ max_metadata = agcount;
+ }
+
+ /* Get the last possible inode in the filesystem */
+ agino = XFS_AGB_TO_AGINO(mp, sbp->sb_agblocks - 1);
+ ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
+
+ /*
+ * If user asked for no more than 32-bit inodes, and the fs is
+ * sufficiently large, set XFS_OPSTATE_INODE32 if we must alter
+ * the allocator to accommodate the request.
+ */
+ if (xfs_has_small_inums(mp) && ino > XFS_MAXINUMBER_32)
+ set_bit(XFS_OPSTATE_INODE32, &mp->m_opstate);
+ else
+ clear_bit(XFS_OPSTATE_INODE32, &mp->m_opstate);
+
+ for (index = 0; index < agcount; index++) {
+ struct xfs_perag *pag;
+
+ ino = XFS_AGINO_TO_INO(mp, index, agino);
+
+ pag = xfs_perag_get(mp, index);
+
+ if (xfs_is_inode32(mp)) {
+ if (ino > XFS_MAXINUMBER_32) {
+ pag->pagi_inodeok = 0;
+ pag->pagf_metadata = 0;
+ } else {
+ pag->pagi_inodeok = 1;
+ maxagi++;
+ if (index < max_metadata)
+ pag->pagf_metadata = 1;
+ else
+ pag->pagf_metadata = 0;
+ }
+ } else {
+ pag->pagi_inodeok = 1;
+ pag->pagf_metadata = 0;
+ }
+
+ xfs_perag_put(pag);
+ }
+
+ return xfs_is_inode32(mp) ? maxagi : agcount;
+}
+
+static int
+xfs_setup_dax_always(
+ struct xfs_mount *mp)
+{
+ if (!mp->m_ddev_targp->bt_daxdev &&
+ (!mp->m_rtdev_targp || !mp->m_rtdev_targp->bt_daxdev)) {
+ xfs_alert(mp,
+ "DAX unsupported by block device. Turning off DAX.");
+ goto disable_dax;
+ }
+
+ if (mp->m_super->s_blocksize != PAGE_SIZE) {
+ xfs_alert(mp,
+ "DAX not supported for blocksize. Turning off DAX.");
+ goto disable_dax;
+ }
+
+ if (xfs_has_reflink(mp) &&
+ bdev_is_partition(mp->m_ddev_targp->bt_bdev)) {
+ xfs_alert(mp,
+ "DAX and reflink cannot work with multi-partitions!");
+ return -EINVAL;
+ }
+
+ xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ return 0;
+
+disable_dax:
+ xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER);
+ return 0;
+}
+
+STATIC int
+xfs_blkdev_get(
+ xfs_mount_t *mp,
+ const char *name,
+ struct block_device **bdevp)
+{
+ int error = 0;
+
+ *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+ mp);
+ if (IS_ERR(*bdevp)) {
+ error = PTR_ERR(*bdevp);
+ xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
+ }
+
+ return error;
+}
+
+STATIC void
+xfs_blkdev_put(
+ struct block_device *bdev)
+{
+ if (bdev)
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+}
+
+STATIC void
+xfs_close_devices(
+ struct xfs_mount *mp)
+{
+ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
+ struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
+
+ xfs_free_buftarg(mp->m_logdev_targp);
+ xfs_blkdev_put(logdev);
+ }
+ if (mp->m_rtdev_targp) {
+ struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
+
+ xfs_free_buftarg(mp->m_rtdev_targp);
+ xfs_blkdev_put(rtdev);
+ }
+ xfs_free_buftarg(mp->m_ddev_targp);
+}
+
+/*
+ * The file system configurations are:
+ * (1) device (partition) with data and internal log
+ * (2) logical volume with data and log subvolumes.
+ * (3) logical volume with data, log, and realtime subvolumes.
+ *
+ * We only have to handle opening the log and realtime volumes here if
+ * they are present. The data subvolume has already been opened by
+ * get_sb_bdev() and is stored in sb->s_bdev.
+ */
+STATIC int
+xfs_open_devices(
+ struct xfs_mount *mp)
+{
+ struct block_device *ddev = mp->m_super->s_bdev;
+ struct block_device *logdev = NULL, *rtdev = NULL;
+ int error;
+
+ /*
+ * Open real time and log devices - order is important.
+ */
+ if (mp->m_logname) {
+ error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
+ if (error)
+ return error;
+ }
+
+ if (mp->m_rtname) {
+ error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
+ if (error)
+ goto out_close_logdev;
+
+ if (rtdev == ddev || rtdev == logdev) {
+ xfs_warn(mp,
+ "Cannot mount filesystem with identical rtdev and ddev/logdev.");
+ error = -EINVAL;
+ goto out_close_rtdev;
+ }
+ }
+
+ /*
+ * Setup xfs_mount buffer target pointers
+ */
+ error = -ENOMEM;
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
+ if (!mp->m_ddev_targp)
+ goto out_close_rtdev;
+
+ if (rtdev) {
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
+ if (!mp->m_rtdev_targp)
+ goto out_free_ddev_targ;
+ }
+
+ if (logdev && logdev != ddev) {
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
+ if (!mp->m_logdev_targp)
+ goto out_free_rtdev_targ;
+ } else {
+ mp->m_logdev_targp = mp->m_ddev_targp;
+ }
+
+ return 0;
+
+ out_free_rtdev_targ:
+ if (mp->m_rtdev_targp)
+ xfs_free_buftarg(mp->m_rtdev_targp);
+ out_free_ddev_targ:
+ xfs_free_buftarg(mp->m_ddev_targp);
+ out_close_rtdev:
+ xfs_blkdev_put(rtdev);
+ out_close_logdev:
+ if (logdev && logdev != ddev)
+ xfs_blkdev_put(logdev);
+ return error;
+}
+
+/*
+ * Setup xfs_mount buffer target pointers based on superblock
+ */
+STATIC int
+xfs_setup_devices(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
+ if (error)
+ return error;
+
+ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
+ unsigned int log_sector_size = BBSIZE;
+
+ if (xfs_has_sector(mp))
+ log_sector_size = mp->m_sb.sb_logsectsize;
+ error = xfs_setsize_buftarg(mp->m_logdev_targp,
+ log_sector_size);
+ if (error)
+ return error;
+ }
+ if (mp->m_rtdev_targp) {
+ error = xfs_setsize_buftarg(mp->m_rtdev_targp,
+ mp->m_sb.sb_sectsize);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+STATIC int
+xfs_init_mount_workqueues(
+ struct xfs_mount *mp)
+{
+ mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 1, mp->m_super->s_id);
+ if (!mp->m_buf_workqueue)
+ goto out;
+
+ mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 0, mp->m_super->s_id);
+ if (!mp->m_unwritten_workqueue)
+ goto out_destroy_buf;
+
+ mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 0, mp->m_super->s_id);
+ if (!mp->m_reclaim_workqueue)
+ goto out_destroy_unwritten;
+
+ mp->m_blockgc_wq = alloc_workqueue("xfs-blockgc/%s",
+ XFS_WQFLAGS(WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 0, mp->m_super->s_id);
+ if (!mp->m_blockgc_wq)
+ goto out_destroy_reclaim;
+
+ mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 1, mp->m_super->s_id);
+ if (!mp->m_inodegc_wq)
+ goto out_destroy_blockgc;
+
+ mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id);
+ if (!mp->m_sync_workqueue)
+ goto out_destroy_inodegc;
+
+ return 0;
+
+out_destroy_inodegc:
+ destroy_workqueue(mp->m_inodegc_wq);
+out_destroy_blockgc:
+ destroy_workqueue(mp->m_blockgc_wq);
+out_destroy_reclaim:
+ destroy_workqueue(mp->m_reclaim_workqueue);
+out_destroy_unwritten:
+ destroy_workqueue(mp->m_unwritten_workqueue);
+out_destroy_buf:
+ destroy_workqueue(mp->m_buf_workqueue);
+out:
+ return -ENOMEM;
+}
+
+STATIC void
+xfs_destroy_mount_workqueues(
+ struct xfs_mount *mp)
+{
+ destroy_workqueue(mp->m_sync_workqueue);
+ destroy_workqueue(mp->m_blockgc_wq);
+ destroy_workqueue(mp->m_inodegc_wq);
+ destroy_workqueue(mp->m_reclaim_workqueue);
+ destroy_workqueue(mp->m_unwritten_workqueue);
+ destroy_workqueue(mp->m_buf_workqueue);
+}
+
+static void
+xfs_flush_inodes_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(work, struct xfs_mount,
+ m_flush_inodes_work);
+ struct super_block *sb = mp->m_super;
+
+ if (down_read_trylock(&sb->s_umount)) {
+ sync_inodes_sb(sb);
+ up_read(&sb->s_umount);
+ }
+}
+
+/*
+ * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
+ * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
+ * for IO to complete so that we effectively throttle multiple callers to the
+ * rate at which IO is completing.
+ */
+void
+xfs_flush_inodes(
+ struct xfs_mount *mp)
+{
+ /*
+ * If flush_work() returns true then that means we waited for a flush
+ * which was already in progress. Don't bother running another scan.
+ */
+ if (flush_work(&mp->m_flush_inodes_work))
+ return;
+
+ queue_work(mp->m_sync_workqueue, &mp->m_flush_inodes_work);
+ flush_work(&mp->m_flush_inodes_work);
+}
+
+/* Catch misguided souls that try to use this interface on XFS */
+STATIC struct inode *
+xfs_fs_alloc_inode(
+ struct super_block *sb)
+{
+ BUG();
+ return NULL;
+}
+
+/*
+ * Now that the generic code is guaranteed not to be accessing
+ * the linux inode, we can inactivate and reclaim the inode.
+ */
+STATIC void
+xfs_fs_destroy_inode(
+ struct inode *inode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ trace_xfs_destroy_inode(ip);
+
+ ASSERT(!rwsem_is_locked(&inode->i_rwsem));
+ XFS_STATS_INC(ip->i_mount, vn_rele);
+ XFS_STATS_INC(ip->i_mount, vn_remove);
+ xfs_inode_mark_reclaimable(ip);
+}
+
+static void
+xfs_fs_dirty_inode(
+ struct inode *inode,
+ int flags)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+
+ if (!(inode->i_sb->s_flags & SB_LAZYTIME))
+ return;
+
+ /*
+ * Only do the timestamp update if the inode is dirty (I_DIRTY_SYNC)
+ * and has dirty timestamp (I_DIRTY_TIME). I_DIRTY_TIME can be passed
+ * in flags possibly together with I_DIRTY_SYNC.
+ */
+ if ((flags & ~I_DIRTY_TIME) != I_DIRTY_SYNC || !(flags & I_DIRTY_TIME))
+ return;
+
+ if (xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp))
+ return;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
+ xfs_trans_commit(tp);
+}
+
+/*
+ * Slab object creation initialisation for the XFS inode.
+ * This covers only the idempotent fields in the XFS inode;
+ * all other fields need to be initialised on allocation
+ * from the slab. This avoids the need to repeatedly initialise
+ * fields in the xfs inode that are left in the initialised state
+ * when the inode is freed.
+ */
+STATIC void
+xfs_fs_inode_init_once(
+ void *inode)
+{
+ struct xfs_inode *ip = inode;
+
+ memset(ip, 0, sizeof(struct xfs_inode));
+
+ /* vfs inode */
+ inode_init_once(VFS_I(ip));
+
+ /* xfs inode */
+ atomic_set(&ip->i_pincount, 0);
+ spin_lock_init(&ip->i_flags_lock);
+
+ mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
+ "xfsino", ip->i_ino);
+}
+
+/*
+ * We do an unlocked check for XFS_IDONTCACHE here because we are already
+ * serialised against cache hits here via the inode->i_lock and igrab() in
+ * xfs_iget_cache_hit(). Hence a lookup that might clear this flag will not be
+ * racing with us, and it avoids needing to grab a spinlock here for every inode
+ * we drop the final reference on.
+ */
+STATIC int
+xfs_fs_drop_inode(
+ struct inode *inode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ /*
+ * If this unlinked inode is in the middle of recovery, don't
+ * drop the inode just yet; log recovery will take care of
+ * that. See the comment for this inode flag.
+ */
+ if (ip->i_flags & XFS_IRECOVERY) {
+ ASSERT(xlog_recovery_needed(ip->i_mount->m_log));
+ return 0;
+ }
+
+ return generic_drop_inode(inode);
+}
+
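+/* Free the device name strings and the mount structure itself. */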
+static void
+xfs_mount_free(
+ struct xfs_mount *mp)
+{
+ kfree(mp->m_rtname);
+ kfree(mp->m_logname);
+ kmem_free(mp);
+}
+
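+/*
+ * Sync the filesystem for sync(2)/syncfs(2) and the freeze path by forcing
+ * the log to disk; the async pass is a no-op.
+ */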
+STATIC int
+xfs_fs_sync_fs(
+ struct super_block *sb,
+ int wait)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ int error;
+
+ trace_xfs_fs_sync_fs(mp, __return_address);
+
+ /*
+ * Doing anything during the async pass would be counterproductive.
+ */
+ if (!wait)
+ return 0;
+
+ error = xfs_log_force(mp, XFS_LOG_SYNC);
+ if (error)
+ return error;
+
+ if (laptop_mode) {
+ /*
+ * The disk must be active because we're syncing.
+ * We schedule log work now (now that the disk is
+ * active) instead of later (when it might not be).
+ */
+ flush_delayed_work(&mp->m_log->l_work);
+ }
+
+ /*
+ * If we are called with page faults frozen out, it means we are about
+ * to freeze the transaction subsystem. Take the opportunity to shut
+ * down inodegc because once SB_FREEZE_FS is set it's too late to
+ * prevent inactivation races with freeze. The fs doesn't get called
+ * again by the freezing process until after SB_FREEZE_FS has been set,
+ * so it's now or never. Same logic applies to speculative allocation
+ * garbage collection.
+ *
+ * We don't care if this is a normal syncfs call that does this or
+ * freeze that does this - we can run this multiple times without issue
+ * and we won't race with a restart because a restart can only occur
+ * when the state is either SB_FREEZE_FS or SB_FREEZE_COMPLETE.
+ */
+ if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) {
+ xfs_inodegc_stop(mp);
+ xfs_blockgc_stop(mp);
+ }
+
+ return 0;
+}
+
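+/* Fill in statfs(2) output from the superblock and the per-cpu counters. */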
+STATIC int
+xfs_fs_statfs(
+ struct dentry *dentry,
+ struct kstatfs *statp)
+{
+ struct xfs_mount *mp = XFS_M(dentry->d_sb);
+ xfs_sb_t *sbp = &mp->m_sb;
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ uint64_t fakeinos, id;
+ uint64_t icount;
+ uint64_t ifree;
+ uint64_t fdblocks;
+ xfs_extlen_t lsize;
+ int64_t ffree;
+
+ /*
+ * Expedite background inodegc but don't wait. We do not want to block
+ * here waiting hours for a billion extent file to be truncated.
+ */
+ xfs_inodegc_push(mp);
+
+ statp->f_type = XFS_SUPER_MAGIC;
+ statp->f_namelen = MAXNAMELEN - 1;
+
+ id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
+ statp->f_fsid = u64_to_fsid(id);
+
+ icount = percpu_counter_sum(&mp->m_icount);
+ ifree = percpu_counter_sum(&mp->m_ifree);
+ fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+
+ spin_lock(&mp->m_sb_lock);
+ statp->f_bsize = sbp->sb_blocksize;
+ lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
+ statp->f_blocks = sbp->sb_dblocks - lsize;
+ spin_unlock(&mp->m_sb_lock);
+
+ /* make sure statp->f_bfree does not underflow */
+ statp->f_bfree = max_t(int64_t, 0,
+ fdblocks - xfs_fdblocks_unavailable(mp));
+ statp->f_bavail = statp->f_bfree;
+
+ fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree);
+ statp->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER);
+ if (M_IGEO(mp)->maxicount)
+ statp->f_files = min_t(typeof(statp->f_files),
+ statp->f_files,
+ M_IGEO(mp)->maxicount);
+
+ /* If sb_icount overshot maxicount, report actual allocation */
+ statp->f_files = max_t(typeof(statp->f_files),
+ statp->f_files,
+ sbp->sb_icount);
+
+ /* make sure statp->f_ffree does not underflow */
+ ffree = statp->f_files - (icount - ifree);
+ statp->f_ffree = max_t(int64_t, ffree, 0);
+
+ if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
+ ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
+ (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
+ xfs_qm_statvfs(ip, statp);
+
+ if (XFS_IS_REALTIME_MOUNT(mp) &&
+ (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
+ s64 freertx;
+
+ statp->f_blocks = sbp->sb_rblocks;
+ freertx = percpu_counter_sum_positive(&mp->m_frextents);
+ statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize;
+ }
+
+ return 0;
+}
+
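+/*
+ * Empty the reserve block pool, stashing its current size so that a later
+ * thaw or rw remount can restore it.
+ */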
+STATIC void
+xfs_save_resvblks(struct xfs_mount *mp)
+{
+ uint64_t resblks = 0;
+
+ mp->m_resblks_save = mp->m_resblks;
+ xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
+STATIC void
+xfs_restore_resvblks(struct xfs_mount *mp)
+{
+ uint64_t resblks;
+
+ if (mp->m_resblks_save) {
+ resblks = mp->m_resblks_save;
+ mp->m_resblks_save = 0;
+ } else
+ resblks = xfs_default_resblks(mp);
+
+ xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
+/*
+ * Second stage of a freeze. The data is already frozen so we only
+ * need to take care of the metadata. Once that's done sync the superblock
+ * to the log to dirty it in case of a crash while frozen. This ensures that we
+ * will recover the unlinked inode lists on the next mount.
+ */
+STATIC int
+xfs_fs_freeze(
+ struct super_block *sb)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ unsigned int flags;
+ int ret;
+
+ /*
+ * The filesystem is now frozen far enough that memory reclaim
+ * cannot safely operate on the filesystem. Hence we need to
+ * set a GFP_NOFS context here to avoid recursion deadlocks.
+ */
+ flags = memalloc_nofs_save();
+ xfs_save_resvblks(mp);
+ ret = xfs_log_quiesce(mp);
+ memalloc_nofs_restore(flags);
+
+ /*
+ * For read-write filesystems, we need to restart the inodegc on error
+ * because we stopped it at SB_FREEZE_PAGEFAULT level and a thaw is not
+ * going to be run to restart it now. We are at SB_FREEZE_FS level
+ * here, so we can restart safely without racing with a stop in
+ * xfs_fs_sync_fs().
+ */
+ if (ret && !xfs_is_readonly(mp)) {
+ xfs_blockgc_start(mp);
+ xfs_inodegc_start(mp);
+ }
+
+ return ret;
+}
+
+STATIC int
+xfs_fs_unfreeze(
+ struct super_block *sb)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ xfs_restore_resvblks(mp);
+ xfs_log_work_queue(mp);
+
+ /*
+ * Don't reactivate the inodegc worker on a readonly filesystem because
+ * inodes are sent directly to reclaim. Don't reactivate the blockgc
+ * worker because there are no speculative preallocations on a readonly
+ * filesystem.
+ */
+ if (!xfs_is_readonly(mp)) {
+ xfs_blockgc_start(mp);
+ xfs_inodegc_start(mp);
+ }
+
+ return 0;
+}
+
+/*
+ * This function fills in xfs_mount_t fields based on mount args.
+ * Note: the superblock _has_ now been read in.
+ */
+STATIC int
+xfs_finish_flags(
+ struct xfs_mount *mp)
+{
+ /* Fail a mount where the logbuf is smaller than the log stripe */
+ if (xfs_has_logv2(mp)) {
+ if (mp->m_logbsize <= 0 &&
+ mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
+ mp->m_logbsize = mp->m_sb.sb_logsunit;
+ } else if (mp->m_logbsize > 0 &&
+ mp->m_logbsize < mp->m_sb.sb_logsunit) {
+ xfs_warn(mp,
+ "logbuf size must be greater than or equal to log stripe size");
+ return -EINVAL;
+ }
+ } else {
+ /* Fail a mount if the logbuf is larger than 32K */
+ if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
+ xfs_warn(mp,
+ "logbuf size for version 1 logs must be 16K or 32K");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * V5 filesystems always use attr2 format for attributes.
+ */
+ if (xfs_has_crc(mp) && xfs_has_noattr2(mp)) {
+ xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. "
+ "attr2 is always enabled for V5 filesystems.");
+ return -EINVAL;
+ }
+
+ /*
+ * prohibit r/w mounts of read-only filesystems
+ */
+ if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !xfs_is_readonly(mp)) {
+ xfs_warn(mp,
+ "cannot mount a read-only filesystem as read-write");
+ return -EROFS;
+ }
+
+ if ((mp->m_qflags & XFS_GQUOTA_ACCT) &&
+ (mp->m_qflags & XFS_PQUOTA_ACCT) &&
+ !xfs_has_pquotino(mp)) {
+ xfs_warn(mp,
+ "Super block does not support project and group quota together");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
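+/*
+ * Set up the per-cpu counters tracking allocated inodes, free inodes, free
+ * data blocks, delalloc blocks and free realtime extents.
+ */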
+static int
+xfs_init_percpu_counters(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
+ if (error)
+ return -ENOMEM;
+
+ error = percpu_counter_init(&mp->m_ifree, 0, GFP_KERNEL);
+ if (error)
+ goto free_icount;
+
+ error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL);
+ if (error)
+ goto free_ifree;
+
+ error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL);
+ if (error)
+ goto free_fdblocks;
+
+ error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
+ if (error)
+ goto free_delalloc;
+
+ return 0;
+
+free_delalloc:
+ percpu_counter_destroy(&mp->m_delalloc_blks);
+free_fdblocks:
+ percpu_counter_destroy(&mp->m_fdblocks);
+free_ifree:
+ percpu_counter_destroy(&mp->m_ifree);
+free_icount:
+ percpu_counter_destroy(&mp->m_icount);
+ return -ENOMEM;
+}
+
+void
+xfs_reinit_percpu_counters(
+ struct xfs_mount *mp)
+{
+ percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
+ percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
+ percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
+ percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
+}
+
+static void
+xfs_destroy_percpu_counters(
+ struct xfs_mount *mp)
+{
+ percpu_counter_destroy(&mp->m_icount);
+ percpu_counter_destroy(&mp->m_ifree);
+ percpu_counter_destroy(&mp->m_fdblocks);
+ ASSERT(xfs_is_shutdown(mp) ||
+ percpu_counter_sum(&mp->m_delalloc_blks) == 0);
+ percpu_counter_destroy(&mp->m_delalloc_blks);
+ percpu_counter_destroy(&mp->m_frextents);
+}
+
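+/* Allocate the per-cpu inode inactivation queues and their worker items. */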
+static int
+xfs_inodegc_init_percpu(
+ struct xfs_mount *mp)
+{
+ struct xfs_inodegc *gc;
+ int cpu;
+
+ mp->m_inodegc = alloc_percpu(struct xfs_inodegc);
+ if (!mp->m_inodegc)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+#if defined(DEBUG) || defined(XFS_WARN)
+ gc->cpu = cpu;
+#endif
+ init_llist_head(&gc->list);
+ gc->items = 0;
+ INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
+ }
+ return 0;
+}
+
+static void
+xfs_inodegc_free_percpu(
+ struct xfs_mount *mp)
+{
+ if (!mp->m_inodegc)
+ return;
+ free_percpu(mp->m_inodegc);
+}
+
+static void
+xfs_fs_put_super(
+ struct super_block *sb)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ /* if ->fill_super failed, we have no mount to tear down */
+ if (!sb->s_fs_info)
+ return;
+
+ xfs_notice(mp, "Unmounting Filesystem");
+ xfs_filestream_unmount(mp);
+ xfs_unmountfs(mp);
+
+ xfs_freesb(mp);
+ free_percpu(mp->m_stats.xs_stats);
+ xfs_mount_list_del(mp);
+ xfs_inodegc_free_percpu(mp);
+ xfs_destroy_percpu_counters(mp);
+ xfs_destroy_mount_workqueues(mp);
+ xfs_close_devices(mp);
+
+ sb->s_fs_info = NULL;
+ xfs_mount_free(mp);
+}
+
+static long
+xfs_fs_nr_cached_objects(
+ struct super_block *sb,
+ struct shrink_control *sc)
+{
+ /* Paranoia: catch incorrect calls during mount setup or teardown */
+ if (WARN_ON_ONCE(!sb->s_fs_info))
+ return 0;
+ return xfs_reclaim_inodes_count(XFS_M(sb));
+}
+
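+/* Reclaim cached inodes on behalf of the VFS superblock shrinker. */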
+static long
+xfs_fs_free_cached_objects(
+ struct super_block *sb,
+ struct shrink_control *sc)
+{
+ return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
+}
+
+static const struct super_operations xfs_super_operations = {
+ .alloc_inode = xfs_fs_alloc_inode,
+ .destroy_inode = xfs_fs_destroy_inode,
+ .dirty_inode = xfs_fs_dirty_inode,
+ .drop_inode = xfs_fs_drop_inode,
+ .put_super = xfs_fs_put_super,
+ .sync_fs = xfs_fs_sync_fs,
+ .freeze_fs = xfs_fs_freeze,
+ .unfreeze_fs = xfs_fs_unfreeze,
+ .statfs = xfs_fs_statfs,
+ .show_options = xfs_fs_show_options,
+ .nr_cached_objects = xfs_fs_nr_cached_objects,
+ .free_cached_objects = xfs_fs_free_cached_objects,
+};
+
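+/*
+ * Parse an integer option value with an optional K/M/G binary suffix,
+ * e.g. "32k" becomes 32768. A bare number is returned unshifted.
+ */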
+static int
+suffix_kstrtoint(
+ const char *s,
+ unsigned int base,
+ int *res)
+{
+ int last, shift_left_factor = 0, _res;
+ char *value;
+ int ret = 0;
+
+ value = kstrdup(s, GFP_KERNEL);
+ if (!value)
+ return -ENOMEM;
+
+ last = strlen(value) - 1;
+ if (value[last] == 'K' || value[last] == 'k') {
+ shift_left_factor = 10;
+ value[last] = '\0';
+ }
+ if (value[last] == 'M' || value[last] == 'm') {
+ shift_left_factor = 20;
+ value[last] = '\0';
+ }
+ if (value[last] == 'G' || value[last] == 'g') {
+ shift_left_factor = 30;
+ value[last] = '\0';
+ }
+
+ if (kstrtoint(value, base, &_res))
+ ret = -EINVAL;
+ kfree(value);
+ *res = _res << shift_left_factor;
+ return ret;
+}
+
+static inline void
+xfs_fs_warn_deprecated(
+ struct fs_context *fc,
+ struct fs_parameter *param,
+ uint64_t flag,
+ bool value)
+{
+	/*
+	 * Don't print the warning if we are reconfiguring and the current
+	 * mount already has the flag set.
+	 */
+ if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) &&
+ !!(XFS_M(fc->root->d_sb)->m_features & flag) == value)
+ return;
+ xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key);
+}
+
+/*
+ * Set mount state from a mount option.
+ *
+ * NOTE: mp->m_super is NULL here!
+ */
+static int
+xfs_fs_parse_param(
+ struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct xfs_mount *parsing_mp = fc->s_fs_info;
+ struct fs_parse_result result;
+ int size = 0;
+ int opt;
+
+ opt = fs_parse(fc, xfs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_logbufs:
+ parsing_mp->m_logbufs = result.uint_32;
+ return 0;
+ case Opt_logbsize:
+ if (suffix_kstrtoint(param->string, 10, &parsing_mp->m_logbsize))
+ return -EINVAL;
+ return 0;
+ case Opt_logdev:
+ kfree(parsing_mp->m_logname);
+ parsing_mp->m_logname = kstrdup(param->string, GFP_KERNEL);
+ if (!parsing_mp->m_logname)
+ return -ENOMEM;
+ return 0;
+ case Opt_rtdev:
+ kfree(parsing_mp->m_rtname);
+ parsing_mp->m_rtname = kstrdup(param->string, GFP_KERNEL);
+ if (!parsing_mp->m_rtname)
+ return -ENOMEM;
+ return 0;
+ case Opt_allocsize:
+ if (suffix_kstrtoint(param->string, 10, &size))
+ return -EINVAL;
+ parsing_mp->m_allocsize_log = ffs(size) - 1;
+ parsing_mp->m_features |= XFS_FEAT_ALLOCSIZE;
+ return 0;
+ case Opt_grpid:
+ case Opt_bsdgroups:
+ parsing_mp->m_features |= XFS_FEAT_GRPID;
+ return 0;
+ case Opt_nogrpid:
+ case Opt_sysvgroups:
+ parsing_mp->m_features &= ~XFS_FEAT_GRPID;
+ return 0;
+ case Opt_wsync:
+ parsing_mp->m_features |= XFS_FEAT_WSYNC;
+ return 0;
+ case Opt_norecovery:
+ parsing_mp->m_features |= XFS_FEAT_NORECOVERY;
+ return 0;
+ case Opt_noalign:
+ parsing_mp->m_features |= XFS_FEAT_NOALIGN;
+ return 0;
+ case Opt_swalloc:
+ parsing_mp->m_features |= XFS_FEAT_SWALLOC;
+ return 0;
+ case Opt_sunit:
+ parsing_mp->m_dalign = result.uint_32;
+ return 0;
+ case Opt_swidth:
+ parsing_mp->m_swidth = result.uint_32;
+ return 0;
+ case Opt_inode32:
+ parsing_mp->m_features |= XFS_FEAT_SMALL_INUMS;
+ return 0;
+ case Opt_inode64:
+ parsing_mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
+ return 0;
+ case Opt_nouuid:
+ parsing_mp->m_features |= XFS_FEAT_NOUUID;
+ return 0;
+ case Opt_largeio:
+ parsing_mp->m_features |= XFS_FEAT_LARGE_IOSIZE;
+ return 0;
+ case Opt_nolargeio:
+ parsing_mp->m_features &= ~XFS_FEAT_LARGE_IOSIZE;
+ return 0;
+ case Opt_filestreams:
+ parsing_mp->m_features |= XFS_FEAT_FILESTREAMS;
+ return 0;
+ case Opt_noquota:
+ parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
+ parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
+ return 0;
+ case Opt_quota:
+ case Opt_uquota:
+ case Opt_usrquota:
+ parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ENFD);
+ return 0;
+ case Opt_qnoenforce:
+ case Opt_uqnoenforce:
+ parsing_mp->m_qflags |= XFS_UQUOTA_ACCT;
+ parsing_mp->m_qflags &= ~XFS_UQUOTA_ENFD;
+ return 0;
+ case Opt_pquota:
+ case Opt_prjquota:
+ parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ENFD);
+ return 0;
+ case Opt_pqnoenforce:
+ parsing_mp->m_qflags |= XFS_PQUOTA_ACCT;
+ parsing_mp->m_qflags &= ~XFS_PQUOTA_ENFD;
+ return 0;
+ case Opt_gquota:
+ case Opt_grpquota:
+ parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ENFD);
+ return 0;
+ case Opt_gqnoenforce:
+ parsing_mp->m_qflags |= XFS_GQUOTA_ACCT;
+ parsing_mp->m_qflags &= ~XFS_GQUOTA_ENFD;
+ return 0;
+ case Opt_discard:
+ parsing_mp->m_features |= XFS_FEAT_DISCARD;
+ return 0;
+ case Opt_nodiscard:
+ parsing_mp->m_features &= ~XFS_FEAT_DISCARD;
+ return 0;
+#ifdef CONFIG_FS_DAX
+ case Opt_dax:
+ xfs_mount_set_dax_mode(parsing_mp, XFS_DAX_ALWAYS);
+ return 0;
+ case Opt_dax_enum:
+ xfs_mount_set_dax_mode(parsing_mp, result.uint_32);
+ return 0;
+#endif
+ /* Following mount options will be removed in September 2025 */
+ case Opt_ikeep:
+ xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true);
+ parsing_mp->m_features |= XFS_FEAT_IKEEP;
+ return 0;
+ case Opt_noikeep:
+ xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false);
+ parsing_mp->m_features &= ~XFS_FEAT_IKEEP;
+ return 0;
+ case Opt_attr2:
+ xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true);
+ parsing_mp->m_features |= XFS_FEAT_ATTR2;
+ return 0;
+ case Opt_noattr2:
+ xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
+ parsing_mp->m_features |= XFS_FEAT_NOATTR2;
+ return 0;
+ default:
+ xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
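+/*
+ * Sanity check the mount options before the superblock has been read:
+ * norecovery, attr2/noattr2, stripe geometry, quota, log buffer and
+ * allocsize settings are all validated here.
+ */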
+static int
+xfs_fs_validate_params(
+ struct xfs_mount *mp)
+{
+ /* No recovery flag requires a read-only mount */
+ if (xfs_has_norecovery(mp) && !xfs_is_readonly(mp)) {
+ xfs_warn(mp, "no-recovery mounts must be read-only.");
+ return -EINVAL;
+ }
+
+ /*
+ * We have not read the superblock at this point, so only the attr2
+ * mount option can set the attr2 feature by this stage.
+ */
+ if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) {
+ xfs_warn(mp, "attr2 and noattr2 cannot both be specified.");
+ return -EINVAL;
+ }
+
+ if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) {
+ xfs_warn(mp,
+ "sunit and swidth options incompatible with the noalign option");
+ return -EINVAL;
+ }
+
+ if (!IS_ENABLED(CONFIG_XFS_QUOTA) && mp->m_qflags != 0) {
+ xfs_warn(mp, "quota support not available in this kernel.");
+ return -EINVAL;
+ }
+
+ if ((mp->m_dalign && !mp->m_swidth) ||
+ (!mp->m_dalign && mp->m_swidth)) {
+ xfs_warn(mp, "sunit and swidth must be specified together");
+ return -EINVAL;
+ }
+
+ if (mp->m_dalign && (mp->m_swidth % mp->m_dalign != 0)) {
+ xfs_warn(mp,
+ "stripe width (%d) must be a multiple of the stripe unit (%d)",
+ mp->m_swidth, mp->m_dalign);
+ return -EINVAL;
+ }
+
+ if (mp->m_logbufs != -1 &&
+ mp->m_logbufs != 0 &&
+ (mp->m_logbufs < XLOG_MIN_ICLOGS ||
+ mp->m_logbufs > XLOG_MAX_ICLOGS)) {
+ xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
+ mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
+ return -EINVAL;
+ }
+
+ if (mp->m_logbsize != -1 &&
+ mp->m_logbsize != 0 &&
+ (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
+ mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
+ !is_power_of_2(mp->m_logbsize))) {
+ xfs_warn(mp,
+ "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
+ mp->m_logbsize);
+ return -EINVAL;
+ }
+
+ if (xfs_has_allocsize(mp) &&
+ (mp->m_allocsize_log > XFS_MAX_IO_LOG ||
+ mp->m_allocsize_log < XFS_MIN_IO_LOG)) {
+ xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
+ mp->m_allocsize_log, XFS_MIN_IO_LOG, XFS_MAX_IO_LOG);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+xfs_fs_fill_super(
+ struct super_block *sb,
+ struct fs_context *fc)
+{
+ struct xfs_mount *mp = sb->s_fs_info;
+ struct inode *root;
+ int flags = 0, error;
+
+ mp->m_super = sb;
+
+ /*
+ * Copy VFS mount flags from the context now that all parameter parsing
+ * is guaranteed to have been completed by either the old mount API or
+ * the newer fsopen/fsconfig API.
+ */
+ if (fc->sb_flags & SB_RDONLY)
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+ if (fc->sb_flags & SB_DIRSYNC)
+ mp->m_features |= XFS_FEAT_DIRSYNC;
+ if (fc->sb_flags & SB_SYNCHRONOUS)
+ mp->m_features |= XFS_FEAT_WSYNC;
+
+ error = xfs_fs_validate_params(mp);
+ if (error)
+ goto out_free_names;
+
+ sb_min_blocksize(sb, BBSIZE);
+ sb->s_xattr = xfs_xattr_handlers;
+ sb->s_export_op = &xfs_export_operations;
+#ifdef CONFIG_XFS_QUOTA
+ sb->s_qcop = &xfs_quotactl_operations;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
+#endif
+ sb->s_op = &xfs_super_operations;
+
+ /*
+ * Delay mount work if the debug hook is set. This is debug
+	 * instrumentation to coordinate simulation of xfs mount failures with
+	 * VFS superblock operations.
+ */
+ if (xfs_globals.mount_delay) {
+ xfs_notice(mp, "Delaying mount for %d seconds.",
+ xfs_globals.mount_delay);
+ msleep(xfs_globals.mount_delay * 1000);
+ }
+
+ if (fc->sb_flags & SB_SILENT)
+ flags |= XFS_MFSI_QUIET;
+
+ error = xfs_open_devices(mp);
+ if (error)
+ goto out_free_names;
+
+ error = xfs_init_mount_workqueues(mp);
+ if (error)
+ goto out_close_devices;
+
+ error = xfs_init_percpu_counters(mp);
+ if (error)
+ goto out_destroy_workqueues;
+
+ error = xfs_inodegc_init_percpu(mp);
+ if (error)
+ goto out_destroy_counters;
+
+ /*
+ * All percpu data structures requiring cleanup when a cpu goes offline
+ * must be allocated before adding this @mp to the cpu-dead handler's
+ * mount list.
+ */
+ xfs_mount_list_add(mp);
+
+ /* Allocate stats memory before we do operations that might use it */
+ mp->m_stats.xs_stats = alloc_percpu(struct xfsstats);
+ if (!mp->m_stats.xs_stats) {
+ error = -ENOMEM;
+ goto out_destroy_inodegc;
+ }
+
+ error = xfs_readsb(mp, flags);
+ if (error)
+ goto out_free_stats;
+
+ error = xfs_finish_flags(mp);
+ if (error)
+ goto out_free_sb;
+
+ error = xfs_setup_devices(mp);
+ if (error)
+ goto out_free_sb;
+
+ /* V4 support is undergoing deprecation. */
+ if (!xfs_has_crc(mp)) {
+#ifdef CONFIG_XFS_SUPPORT_V4
+ xfs_warn_once(mp,
+ "Deprecated V4 format (crc=0) will not be supported after September 2030.");
+#else
+ xfs_warn(mp,
+ "Deprecated V4 format (crc=0) not supported by kernel.");
+ error = -EINVAL;
+ goto out_free_sb;
+#endif
+ }
+
+ /* Filesystem claims it needs repair, so refuse the mount. */
+ if (xfs_has_needsrepair(mp)) {
+ xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair.");
+ error = -EFSCORRUPTED;
+ goto out_free_sb;
+ }
+
+ /*
+ * Don't touch the filesystem if a user tool thinks it owns the primary
+ * superblock. mkfs doesn't clear the flag from secondary supers, so
+ * we don't check them at all.
+ */
+ if (mp->m_sb.sb_inprogress) {
+ xfs_warn(mp, "Offline file system operation in progress!");
+ error = -EFSCORRUPTED;
+ goto out_free_sb;
+ }
+
+ /*
+ * Until this is fixed only page-sized or smaller data blocks work.
+ */
+ if (mp->m_sb.sb_blocksize > PAGE_SIZE) {
+ xfs_warn(mp,
+ "File system with blocksize %d bytes. "
+ "Only pagesize (%ld) or less will currently work.",
+ mp->m_sb.sb_blocksize, PAGE_SIZE);
+ error = -ENOSYS;
+ goto out_free_sb;
+ }
+
+ /* Ensure this filesystem fits in the page cache limits */
+ if (xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_dblocks) ||
+ xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_rblocks)) {
+ xfs_warn(mp,
+ "file system too large to be mounted on this system.");
+ error = -EFBIG;
+ goto out_free_sb;
+ }
+
+ /*
+ * XFS block mappings use 54 bits to store the logical block offset.
+ * This should suffice to handle the maximum file size that the VFS
+ * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT
+ * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes
+ * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON
+ * to check this assertion.
+ *
+ * Avoid integer overflow by comparing the maximum bmbt offset to the
+ * maximum pagecache offset in units of fs blocks.
+ */
+ if (!xfs_verify_fileoff(mp, XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE))) {
+ xfs_warn(mp,
+"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!",
+ XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE),
+ XFS_MAX_FILEOFF);
+ error = -EINVAL;
+ goto out_free_sb;
+ }
+
+ error = xfs_filestream_mount(mp);
+ if (error)
+ goto out_free_sb;
+
+ /*
+	 * We must configure the block size in the superblock before we run the
+	 * full mount process, as the mount process can look up and cache inodes.
+ */
+ sb->s_magic = XFS_SUPER_MAGIC;
+ sb->s_blocksize = mp->m_sb.sb_blocksize;
+ sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_max_links = XFS_MAXLINK;
+ sb->s_time_gran = 1;
+ if (xfs_has_bigtime(mp)) {
+ sb->s_time_min = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MIN);
+ sb->s_time_max = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MAX);
+ } else {
+ sb->s_time_min = XFS_LEGACY_TIME_MIN;
+ sb->s_time_max = XFS_LEGACY_TIME_MAX;
+ }
+ trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max);
+ sb->s_iflags |= SB_I_CGROUPWB;
+
+ set_posix_acl_flag(sb);
+
+ /* version 5 superblocks support inode version counters. */
+ if (xfs_has_crc(mp))
+ sb->s_flags |= SB_I_VERSION;
+
+ if (xfs_has_dax_always(mp)) {
+ error = xfs_setup_dax_always(mp);
+ if (error)
+ goto out_filestream_unmount;
+ }
+
+ if (xfs_has_discard(mp) && !bdev_max_discard_sectors(sb->s_bdev)) {
+ xfs_warn(mp,
+ "mounting with \"discard\" option, but the device does not support discard");
+ mp->m_features &= ~XFS_FEAT_DISCARD;
+ }
+
+ if (xfs_has_reflink(mp)) {
+ if (mp->m_sb.sb_rblocks) {
+ xfs_alert(mp,
+ "reflink not compatible with realtime device!");
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
+
+ if (xfs_globals.always_cow) {
+ xfs_info(mp, "using DEBUG-only always_cow mode.");
+ mp->m_always_cow = true;
+ }
+ }
+
+ if (xfs_has_rmapbt(mp) && mp->m_sb.sb_rblocks) {
+ xfs_alert(mp,
+ "reverse mapping btree not compatible with realtime device!");
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
+
+ if (xfs_has_large_extent_counts(mp))
+ xfs_warn(mp,
+ "EXPERIMENTAL Large extent counts feature in use. Use at your own risk!");
+
+ error = xfs_mountfs(mp);
+ if (error)
+ goto out_filestream_unmount;
+
+ root = igrab(VFS_I(mp->m_rootip));
+ if (!root) {
+ error = -ENOENT;
+ goto out_unmount;
+ }
+ sb->s_root = d_make_root(root);
+ if (!sb->s_root) {
+ error = -ENOMEM;
+ goto out_unmount;
+ }
+
+ return 0;
+
+ out_filestream_unmount:
+ xfs_filestream_unmount(mp);
+ out_free_sb:
+ xfs_freesb(mp);
+ out_free_stats:
+ free_percpu(mp->m_stats.xs_stats);
+ out_destroy_inodegc:
+ xfs_mount_list_del(mp);
+ xfs_inodegc_free_percpu(mp);
+ out_destroy_counters:
+ xfs_destroy_percpu_counters(mp);
+ out_destroy_workqueues:
+ xfs_destroy_mount_workqueues(mp);
+ out_close_devices:
+ xfs_close_devices(mp);
+ out_free_names:
+ sb->s_fs_info = NULL;
+ xfs_mount_free(mp);
+ return error;
+
+ out_unmount:
+ xfs_filestream_unmount(mp);
+ xfs_unmountfs(mp);
+ goto out_free_sb;
+}
+
+static int
+xfs_fs_get_tree(
+ struct fs_context *fc)
+{
+ return get_tree_bdev(fc, xfs_fs_fill_super);
+}
+
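+/* Transition a mounted filesystem from read-only to read-write. */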
+static int
+xfs_remount_rw(
+ struct xfs_mount *mp)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+ int error;
+
+ if (xfs_has_norecovery(mp)) {
+ xfs_warn(mp,
+ "ro->rw transition prohibited on norecovery mount");
+ return -EINVAL;
+ }
+
+ if (xfs_sb_is_v5(sbp) &&
+ xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+ "ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem",
+ (sbp->sb_features_ro_compat &
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+ return -EINVAL;
+ }
+
+ clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+
+ /*
+ * If this is the first remount to writeable state we might have some
+ * superblock changes to update.
+ */
+ if (mp->m_update_sb) {
+ error = xfs_sync_sb(mp, false);
+ if (error) {
+ xfs_warn(mp, "failed to write sb changes");
+ return error;
+ }
+ mp->m_update_sb = false;
+ }
+
+ /*
+ * Fill out the reserve pool if it is empty. Use the stashed value if
+ * it is non-zero, otherwise go with the default.
+ */
+ xfs_restore_resvblks(mp);
+ xfs_log_work_queue(mp);
+ xfs_blockgc_start(mp);
+
+	/* Create the per-AG metadata reservation pool. */
+ error = xfs_fs_reserve_ag_blocks(mp);
+ if (error && error != -ENOSPC)
+ return error;
+
+ /* Re-enable the background inode inactivation worker. */
+ xfs_inodegc_start(mp);
+
+ return 0;
+}
+
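+/* Quiesce a read-write filesystem down to a clean, read-only state. */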
+static int
+xfs_remount_ro(
+ struct xfs_mount *mp)
+{
+ struct xfs_icwalk icw = {
+ .icw_flags = XFS_ICWALK_FLAG_SYNC,
+ };
+ int error;
+
+ /* Flush all the dirty data to disk. */
+ error = sync_filesystem(mp->m_super);
+ if (error)
+ return error;
+
+ /*
+ * Cancel background eofb scanning so it cannot race with the final
+ * log force+buftarg wait and deadlock the remount.
+ */
+ xfs_blockgc_stop(mp);
+
+ /*
+ * Clear out all remaining COW staging extents and speculative post-EOF
+ * preallocations so that we don't leave inodes requiring inactivation
+ * cleanups during reclaim on a read-only mount. We must process every
+ * cached inode, so this requires a synchronous cache scan.
+ */
+ error = xfs_blockgc_free_space(mp, &icw);
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+
+ /*
+ * Stop the inodegc background worker. xfs_fs_reconfigure already
+ * flushed all pending inodegc work when it sync'd the filesystem.
+ * The VFS holds s_umount, so we know that inodes cannot enter
+ * xfs_fs_destroy_inode during a remount operation. In readonly mode
+ * we send inodes straight to reclaim, so no inodes will be queued.
+ */
+ xfs_inodegc_stop(mp);
+
+ /* Free the per-AG metadata reservation pool. */
+ error = xfs_fs_unreserve_ag_blocks(mp);
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+
+ /*
+ * Before we sync the metadata, we need to free up the reserve block
+ * pool so that the used block count in the superblock on disk is
+	 * correct at the end of the remount. Stash the current reserve pool
+ * size so that if we get remounted rw, we can return it to the same
+ * size.
+ */
+ xfs_save_resvblks(mp);
+
+ xfs_log_clean(mp);
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+
+ return 0;
+}
+
+/*
+ * Logically we would return an error here to prevent users from believing
+ * they might have changed mount options using remount which can't be changed.
+ *
+ * But unfortunately mount(8) adds all options from mtab and fstab to the mount
+ * arguments in some cases so we can't blindly reject options, but have to
+ * check for each specified option if it actually differs from the currently
+ * set option and only reject it if that's the case.
+ *
+ * Until that is implemented we return success for every remount request, and
+ * silently ignore all options that we can't actually change.
+ */
+static int
+xfs_fs_reconfigure(
+ struct fs_context *fc)
+{
+ struct xfs_mount *mp = XFS_M(fc->root->d_sb);
+ struct xfs_mount *new_mp = fc->s_fs_info;
+ int flags = fc->sb_flags;
+ int error;
+
+ /* version 5 superblocks always support version counters. */
+ if (xfs_has_crc(mp))
+ fc->sb_flags |= SB_I_VERSION;
+
+ error = xfs_fs_validate_params(new_mp);
+ if (error)
+ return error;
+
+ /* inode32 -> inode64 */
+ if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
+ mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
+ mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount);
+ }
+
+ /* inode64 -> inode32 */
+ if (!xfs_has_small_inums(mp) && xfs_has_small_inums(new_mp)) {
+ mp->m_features |= XFS_FEAT_SMALL_INUMS;
+ mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount);
+ }
+
+ /* ro -> rw */
+ if (xfs_is_readonly(mp) && !(flags & SB_RDONLY)) {
+ error = xfs_remount_rw(mp);
+ if (error)
+ return error;
+ }
+
+ /* rw -> ro */
+ if (!xfs_is_readonly(mp) && (flags & SB_RDONLY)) {
+ error = xfs_remount_ro(mp);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+static void xfs_fs_free(
+ struct fs_context *fc)
+{
+ struct xfs_mount *mp = fc->s_fs_info;
+
+ /*
+ * mp is stored in the fs_context when it is initialized.
+ * mp is transferred to the superblock on a successful mount,
+ * but if an error occurs before the transfer we have to free
+ * it here.
+ */
+ if (mp)
+ xfs_mount_free(mp);
+}
+
+static const struct fs_context_operations xfs_context_ops = {
+ .parse_param = xfs_fs_parse_param,
+ .get_tree = xfs_fs_get_tree,
+ .reconfigure = xfs_fs_reconfigure,
+ .free = xfs_fs_free,
+};
+
+/*
+ * WARNING: do not initialise any parameters in this function that depend on
+ * mount option parsing having already been performed, as this can be
+ * called from fsopen() before any parameters have been set.
+ */
+static int xfs_init_fs_context(
+ struct fs_context *fc)
+{
+ struct xfs_mount *mp;
+
+ mp = kmem_alloc(sizeof(struct xfs_mount), KM_ZERO);
+ if (!mp)
+ return -ENOMEM;
+
+ spin_lock_init(&mp->m_sb_lock);
+ spin_lock_init(&mp->m_agirotor_lock);
+ INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
+ spin_lock_init(&mp->m_perag_lock);
+ mutex_init(&mp->m_growlock);
+ INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
+ INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+ mp->m_kobj.kobject.kset = xfs_kset;
+ /*
+ * We don't create the finobt per-ag space reservation until after log
+ * recovery, so we must set this to true so that an ifree transaction
+ * started during log recovery will not depend on space reservations
+ * for finobt expansion.
+ */
+ mp->m_finobt_nores = true;
+
+ /*
+ * These can be overridden by the mount option parsing.
+ */
+ mp->m_logbufs = -1;
+ mp->m_logbsize = -1;
+ mp->m_allocsize_log = 16; /* 64k */
+
+ fc->s_fs_info = mp;
+ fc->ops = &xfs_context_ops;
+
+ return 0;
+}
+
+static struct file_system_type xfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "xfs",
+ .init_fs_context = xfs_init_fs_context,
+ .parameters = xfs_fs_parameters,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+};
+MODULE_ALIAS_FS("xfs");
+
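+/*
+ * Create the slab caches backing XFS in-memory structures and log
+ * intent/done items.
+ */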
+STATIC int __init
+xfs_init_caches(void)
+{
+ int error;
+
+ xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT |
+ SLAB_MEM_SPREAD,
+ NULL);
+ if (!xfs_buf_cache)
+ goto out;
+
+ xfs_log_ticket_cache = kmem_cache_create("xfs_log_ticket",
+ sizeof(struct xlog_ticket),
+ 0, 0, NULL);
+ if (!xfs_log_ticket_cache)
+ goto out_destroy_buf_cache;
+
+ error = xfs_btree_init_cur_caches();
+ if (error)
+ goto out_destroy_log_ticket_cache;
+
+ error = xfs_defer_init_item_caches();
+ if (error)
+ goto out_destroy_btree_cur_cache;
+
+ xfs_da_state_cache = kmem_cache_create("xfs_da_state",
+ sizeof(struct xfs_da_state),
+ 0, 0, NULL);
+ if (!xfs_da_state_cache)
+ goto out_destroy_defer_item_cache;
+
+ xfs_ifork_cache = kmem_cache_create("xfs_ifork",
+ sizeof(struct xfs_ifork),
+ 0, 0, NULL);
+ if (!xfs_ifork_cache)
+ goto out_destroy_da_state_cache;
+
+ xfs_trans_cache = kmem_cache_create("xfs_trans",
+ sizeof(struct xfs_trans),
+ 0, 0, NULL);
+ if (!xfs_trans_cache)
+ goto out_destroy_ifork_cache;
+
+ /*
+ * The size of the cache-allocated buf log item is the maximum
+ * size possible under XFS. This wastes a little bit of memory,
+ * but it is much faster.
+ */
+ xfs_buf_item_cache = kmem_cache_create("xfs_buf_item",
+ sizeof(struct xfs_buf_log_item),
+ 0, 0, NULL);
+ if (!xfs_buf_item_cache)
+ goto out_destroy_trans_cache;
+
+ xfs_efd_cache = kmem_cache_create("xfs_efd_item",
+ xfs_efd_log_item_sizeof(XFS_EFD_MAX_FAST_EXTENTS),
+ 0, 0, NULL);
+ if (!xfs_efd_cache)
+ goto out_destroy_buf_item_cache;
+
+ xfs_efi_cache = kmem_cache_create("xfs_efi_item",
+ xfs_efi_log_item_sizeof(XFS_EFI_MAX_FAST_EXTENTS),
+ 0, 0, NULL);
+ if (!xfs_efi_cache)
+ goto out_destroy_efd_cache;
+
+ xfs_inode_cache = kmem_cache_create("xfs_inode",
+ sizeof(struct xfs_inode), 0,
+ (SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT |
+ SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+ xfs_fs_inode_init_once);
+ if (!xfs_inode_cache)
+ goto out_destroy_efi_cache;
+
+ xfs_ili_cache = kmem_cache_create("xfs_ili",
+ sizeof(struct xfs_inode_log_item), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!xfs_ili_cache)
+ goto out_destroy_inode_cache;
+
+ xfs_icreate_cache = kmem_cache_create("xfs_icr",
+ sizeof(struct xfs_icreate_item),
+ 0, 0, NULL);
+ if (!xfs_icreate_cache)
+ goto out_destroy_ili_cache;
+
+ xfs_rud_cache = kmem_cache_create("xfs_rud_item",
+ sizeof(struct xfs_rud_log_item),
+ 0, 0, NULL);
+ if (!xfs_rud_cache)
+ goto out_destroy_icreate_cache;
+
+ xfs_rui_cache = kmem_cache_create("xfs_rui_item",
+ xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS),
+ 0, 0, NULL);
+ if (!xfs_rui_cache)
+ goto out_destroy_rud_cache;
+
+ xfs_cud_cache = kmem_cache_create("xfs_cud_item",
+ sizeof(struct xfs_cud_log_item),
+ 0, 0, NULL);
+ if (!xfs_cud_cache)
+ goto out_destroy_rui_cache;
+
+ xfs_cui_cache = kmem_cache_create("xfs_cui_item",
+ xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS),
+ 0, 0, NULL);
+ if (!xfs_cui_cache)
+ goto out_destroy_cud_cache;
+
+ xfs_bud_cache = kmem_cache_create("xfs_bud_item",
+ sizeof(struct xfs_bud_log_item),
+ 0, 0, NULL);
+ if (!xfs_bud_cache)
+ goto out_destroy_cui_cache;
+
+ xfs_bui_cache = kmem_cache_create("xfs_bui_item",
+ xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS),
+ 0, 0, NULL);
+ if (!xfs_bui_cache)
+ goto out_destroy_bud_cache;
+
+ xfs_attrd_cache = kmem_cache_create("xfs_attrd_item",
+ sizeof(struct xfs_attrd_log_item),
+ 0, 0, NULL);
+ if (!xfs_attrd_cache)
+ goto out_destroy_bui_cache;
+
+ xfs_attri_cache = kmem_cache_create("xfs_attri_item",
+ sizeof(struct xfs_attri_log_item),
+ 0, 0, NULL);
+ if (!xfs_attri_cache)
+ goto out_destroy_attrd_cache;
+
+ xfs_iunlink_cache = kmem_cache_create("xfs_iul_item",
+ sizeof(struct xfs_iunlink_item),
+ 0, 0, NULL);
+ if (!xfs_iunlink_cache)
+ goto out_destroy_attri_cache;
+
+ return 0;
+
+ out_destroy_attri_cache:
+ kmem_cache_destroy(xfs_attri_cache);
+ out_destroy_attrd_cache:
+ kmem_cache_destroy(xfs_attrd_cache);
+ out_destroy_bui_cache:
+ kmem_cache_destroy(xfs_bui_cache);
+ out_destroy_bud_cache:
+ kmem_cache_destroy(xfs_bud_cache);
+ out_destroy_cui_cache:
+ kmem_cache_destroy(xfs_cui_cache);
+ out_destroy_cud_cache:
+ kmem_cache_destroy(xfs_cud_cache);
+ out_destroy_rui_cache:
+ kmem_cache_destroy(xfs_rui_cache);
+ out_destroy_rud_cache:
+ kmem_cache_destroy(xfs_rud_cache);
+ out_destroy_icreate_cache:
+ kmem_cache_destroy(xfs_icreate_cache);
+ out_destroy_ili_cache:
+ kmem_cache_destroy(xfs_ili_cache);
+ out_destroy_inode_cache:
+ kmem_cache_destroy(xfs_inode_cache);
+ out_destroy_efi_cache:
+ kmem_cache_destroy(xfs_efi_cache);
+ out_destroy_efd_cache:
+ kmem_cache_destroy(xfs_efd_cache);
+ out_destroy_buf_item_cache:
+ kmem_cache_destroy(xfs_buf_item_cache);
+ out_destroy_trans_cache:
+ kmem_cache_destroy(xfs_trans_cache);
+ out_destroy_ifork_cache:
+ kmem_cache_destroy(xfs_ifork_cache);
+ out_destroy_da_state_cache:
+ kmem_cache_destroy(xfs_da_state_cache);
+ out_destroy_defer_item_cache:
+ xfs_defer_destroy_item_caches();
+ out_destroy_btree_cur_cache:
+ xfs_btree_destroy_cur_caches();
+ out_destroy_log_ticket_cache:
+ kmem_cache_destroy(xfs_log_ticket_cache);
+ out_destroy_buf_cache:
+ kmem_cache_destroy(xfs_buf_cache);
+ out:
+ return -ENOMEM;
+}
+
+STATIC void
+xfs_destroy_caches(void)
+{
+ /*
+	 * Make sure all delayed RCU frees are flushed before we
+	 * destroy the caches.
+ */
+ rcu_barrier();
+ kmem_cache_destroy(xfs_iunlink_cache);
+ kmem_cache_destroy(xfs_attri_cache);
+ kmem_cache_destroy(xfs_attrd_cache);
+ kmem_cache_destroy(xfs_bui_cache);
+ kmem_cache_destroy(xfs_bud_cache);
+ kmem_cache_destroy(xfs_cui_cache);
+ kmem_cache_destroy(xfs_cud_cache);
+ kmem_cache_destroy(xfs_rui_cache);
+ kmem_cache_destroy(xfs_rud_cache);
+ kmem_cache_destroy(xfs_icreate_cache);
+ kmem_cache_destroy(xfs_ili_cache);
+ kmem_cache_destroy(xfs_inode_cache);
+ kmem_cache_destroy(xfs_efi_cache);
+ kmem_cache_destroy(xfs_efd_cache);
+ kmem_cache_destroy(xfs_buf_item_cache);
+ kmem_cache_destroy(xfs_trans_cache);
+ kmem_cache_destroy(xfs_ifork_cache);
+ kmem_cache_destroy(xfs_da_state_cache);
+ xfs_defer_destroy_item_caches();
+ xfs_btree_destroy_cur_caches();
+ kmem_cache_destroy(xfs_log_ticket_cache);
+ kmem_cache_destroy(xfs_buf_cache);
+}
+
+STATIC int __init
+xfs_init_workqueues(void)
+{
+ /*
+ * The allocation workqueue can be used in memory reclaim situations
+ * (writepage path), and parallelism is only limited by the number of
+ * AGs in all the filesystems mounted. Hence use the default large
+ * max_active value for this workqueue.
+ */
+ xfs_alloc_wq = alloc_workqueue("xfsalloc",
+ XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0);
+ if (!xfs_alloc_wq)
+ return -ENOMEM;
+
+ xfs_discard_wq = alloc_workqueue("xfsdiscard", XFS_WQFLAGS(WQ_UNBOUND),
+ 0);
+ if (!xfs_discard_wq)
+ goto out_free_alloc_wq;
+
+ return 0;
+out_free_alloc_wq:
+ destroy_workqueue(xfs_alloc_wq);
+ return -ENOMEM;
+}
+
+STATIC void
+xfs_destroy_workqueues(void)
+{
+ destroy_workqueue(xfs_discard_wq);
+ destroy_workqueue(xfs_alloc_wq);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
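+/*
+ * CPU offline callback: drain the dead CPU's inodegc queue and per-cpu CIL
+ * state for every mounted filesystem.
+ */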
+static int
+xfs_cpu_dead(
+ unsigned int cpu)
+{
+ struct xfs_mount *mp, *n;
+
+ spin_lock(&xfs_mount_list_lock);
+ list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) {
+ spin_unlock(&xfs_mount_list_lock);
+ xfs_inodegc_cpu_dead(mp, cpu);
+ xlog_cil_pcp_dead(mp->m_log, cpu);
+ spin_lock(&xfs_mount_list_lock);
+ }
+ spin_unlock(&xfs_mount_list_lock);
+ return 0;
+}
+
+static int __init
+xfs_cpu_hotplug_init(void)
+{
+ int error;
+
+ error = cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead", NULL,
+ xfs_cpu_dead);
+ if (error < 0)
+ xfs_alert(NULL,
+"Failed to initialise CPU hotplug, error %d. XFS is non-functional.",
+ error);
+ return error;
+}
+
+static void
+xfs_cpu_hotplug_destroy(void)
+{
+ cpuhp_remove_state_nocalls(CPUHP_XFS_DEAD);
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+static inline int xfs_cpu_hotplug_init(void) { return 0; }
+static inline void xfs_cpu_hotplug_destroy(void) {}
+#endif
+
+STATIC int __init
+init_xfs_fs(void)
+{
+ int error;
+
+ xfs_check_ondisk_structs();
+
+ printk(KERN_INFO XFS_VERSION_STRING " with "
+ XFS_BUILD_OPTIONS " enabled\n");
+
+ xfs_dir_startup();
+
+ error = xfs_cpu_hotplug_init();
+ if (error)
+ goto out;
+
+ error = xfs_init_caches();
+ if (error)
+ goto out_destroy_hp;
+
+ error = xfs_init_workqueues();
+ if (error)
+ goto out_destroy_caches;
+
+ error = xfs_mru_cache_init();
+ if (error)
+ goto out_destroy_wq;
+
+ error = xfs_init_procfs();
+ if (error)
+ goto out_mru_cache_uninit;
+
+ error = xfs_sysctl_register();
+ if (error)
+ goto out_cleanup_procfs;
+
+ xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
+ if (!xfs_kset) {
+ error = -ENOMEM;
+ goto out_sysctl_unregister;
+ }
+
+ xfsstats.xs_kobj.kobject.kset = xfs_kset;
+
+ xfsstats.xs_stats = alloc_percpu(struct xfsstats);
+ if (!xfsstats.xs_stats) {
+ error = -ENOMEM;
+ goto out_kset_unregister;
+ }
+
+ error = xfs_sysfs_init(&xfsstats.xs_kobj, &xfs_stats_ktype, NULL,
+ "stats");
+ if (error)
+ goto out_free_stats;
+
+#ifdef DEBUG
+ xfs_dbg_kobj.kobject.kset = xfs_kset;
+ error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug");
+ if (error)
+ goto out_remove_stats_kobj;
+#endif
+
+ error = xfs_qm_init();
+ if (error)
+ goto out_remove_dbg_kobj;
+
+ error = register_filesystem(&xfs_fs_type);
+ if (error)
+ goto out_qm_exit;
+ return 0;
+
+ out_qm_exit:
+ xfs_qm_exit();
+ out_remove_dbg_kobj:
+#ifdef DEBUG
+ xfs_sysfs_del(&xfs_dbg_kobj);
+ out_remove_stats_kobj:
+#endif
+ xfs_sysfs_del(&xfsstats.xs_kobj);
+ out_free_stats:
+ free_percpu(xfsstats.xs_stats);
+ out_kset_unregister:
+ kset_unregister(xfs_kset);
+ out_sysctl_unregister:
+ xfs_sysctl_unregister();
+ out_cleanup_procfs:
+ xfs_cleanup_procfs();
+ out_mru_cache_uninit:
+ xfs_mru_cache_uninit();
+ out_destroy_wq:
+ xfs_destroy_workqueues();
+ out_destroy_caches:
+ xfs_destroy_caches();
+ out_destroy_hp:
+ xfs_cpu_hotplug_destroy();
+ out:
+ return error;
+}
+
+STATIC void __exit
+exit_xfs_fs(void)
+{
+ xfs_qm_exit();
+ unregister_filesystem(&xfs_fs_type);
+#ifdef DEBUG
+ xfs_sysfs_del(&xfs_dbg_kobj);
+#endif
+ xfs_sysfs_del(&xfsstats.xs_kobj);
+ free_percpu(xfsstats.xs_stats);
+ kset_unregister(xfs_kset);
+ xfs_sysctl_unregister();
+ xfs_cleanup_procfs();
+ xfs_mru_cache_uninit();
+ xfs_destroy_workqueues();
+ xfs_destroy_caches();
+ xfs_uuid_table_free();
+ xfs_cpu_hotplug_destroy();
+}
+
+module_init(init_xfs_fs);
+module_exit(exit_xfs_fs);
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled");
+MODULE_LICENSE("GPL");
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
new file mode 100644
index 000000000..364e2c264
--- /dev/null
+++ b/fs/xfs/xfs_super.h
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_SUPER_H__
+#define __XFS_SUPER_H__
+
+#include <linux/exportfs.h>
+
+#ifdef CONFIG_XFS_QUOTA
+extern int xfs_qm_init(void);
+extern void xfs_qm_exit(void);
+# define XFS_QUOTA_STRING "quota, "
+#else
+# define xfs_qm_init() (0)
+# define xfs_qm_exit() do { } while (0)
+# define XFS_QUOTA_STRING
+#endif
+
+#ifdef CONFIG_XFS_POSIX_ACL
+# define XFS_ACL_STRING "ACLs, "
+# define set_posix_acl_flag(sb) ((sb)->s_flags |= SB_POSIXACL)
+#else
+# define XFS_ACL_STRING
+# define set_posix_acl_flag(sb) do { } while (0)
+#endif
+
+#define XFS_SECURITY_STRING "security attributes, "
+
+#ifdef CONFIG_XFS_RT
+# define XFS_REALTIME_STRING "realtime, "
+#else
+# define XFS_REALTIME_STRING
+#endif
+
+#ifdef CONFIG_XFS_ONLINE_SCRUB
+# define XFS_SCRUB_STRING "scrub, "
+#else
+# define XFS_SCRUB_STRING
+#endif
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+# define XFS_REPAIR_STRING "repair, "
+#else
+# define XFS_REPAIR_STRING
+#endif
+
+#ifdef CONFIG_XFS_WARN
+# define XFS_WARN_STRING "verbose warnings, "
+#else
+# define XFS_WARN_STRING
+#endif
+
+#ifdef CONFIG_XFS_ASSERT_FATAL
+# define XFS_ASSERT_FATAL_STRING "fatal assert, "
+#else
+# define XFS_ASSERT_FATAL_STRING
+#endif
+
+#ifdef DEBUG
+# define XFS_DBG_STRING "debug"
+#else
+# define XFS_DBG_STRING "no debug"
+#endif
+
+#define XFS_VERSION_STRING "SGI XFS"
+#define XFS_BUILD_OPTIONS XFS_ACL_STRING \
+ XFS_SECURITY_STRING \
+ XFS_REALTIME_STRING \
+ XFS_SCRUB_STRING \
+ XFS_REPAIR_STRING \
+ XFS_WARN_STRING \
+ XFS_QUOTA_STRING \
+ XFS_ASSERT_FATAL_STRING \
+ XFS_DBG_STRING /* DBG must be last */
+
+#ifdef DEBUG
+# define XFS_WQFLAGS(wqflags) (WQ_SYSFS | (wqflags))
+#else
+# define XFS_WQFLAGS(wqflags) (wqflags)
+#endif
+
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_buftarg;
+struct block_device;
+
+extern void xfs_flush_inodes(struct xfs_mount *mp);
+extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
+ xfs_agnumber_t agcount);
+
+extern const struct export_operations xfs_export_operations;
+extern const struct quotactl_ops xfs_quotactl_operations;
+extern const struct dax_holder_operations xfs_dax_holder_operations;
+
+extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);
+
+extern struct workqueue_struct *xfs_discard_wq;
+
+#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
+
+#endif /* __XFS_SUPER_H__ */
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
new file mode 100644
index 000000000..8389f3ef8
--- /dev/null
+++ b/fs/xfs/xfs_symlink.c
@@ -0,0 +1,512 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012-2013 Red Hat, Inc.
+ * All rights reserved.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_dir2.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_quota.h"
+#include "xfs_symlink.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_ialloc.h"
+#include "xfs_error.h"
+
+/* ----- Kernel only functions below ----- */
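+/*
+ * Copy the symlink target out of its remote blocks into @link. The caller
+ * must hold the ILOCK and supply a buffer of at least i_disk_size + 1 bytes.
+ */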
+int
+xfs_readlink_bmap_ilocked(
+ struct xfs_inode *ip,
+ char *link)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ struct xfs_buf *bp;
+ xfs_daddr_t d;
+ char *cur_chunk;
+ int pathlen = ip->i_disk_size;
+ int nmaps = XFS_SYMLINK_MAPS;
+ int byte_cnt;
+ int n;
+ int error = 0;
+ int fsblocks = 0;
+ int offset;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+
+ fsblocks = xfs_symlink_blocks(mp, pathlen);
+ error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
+ if (error)
+ goto out;
+
+ offset = 0;
+ for (n = 0; n < nmaps; n++) {
+ d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+ byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+
+ error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
+ &bp, &xfs_symlink_buf_ops);
+ if (error)
+ return error;
+ byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+ if (pathlen < byte_cnt)
+ byte_cnt = pathlen;
+
+ cur_chunk = bp->b_addr;
+ if (xfs_has_crc(mp)) {
+ if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
+ byte_cnt, bp)) {
+ error = -EFSCORRUPTED;
+ xfs_alert(mp,
+"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
+ offset, byte_cnt, ip->i_ino);
+ xfs_buf_relse(bp);
+ goto out;
+
+ }
+
+ cur_chunk += sizeof(struct xfs_dsymlink_hdr);
+ }
+
+ memcpy(link + offset, cur_chunk, byte_cnt);
+
+ pathlen -= byte_cnt;
+ offset += byte_cnt;
+
+ xfs_buf_relse(bp);
+ }
+ ASSERT(pathlen == 0);
+
+ link[ip->i_disk_size] = '\0';
+ error = 0;
+
+ out:
+ return error;
+}
+
+int
+xfs_readlink(
+ struct xfs_inode *ip,
+ char *link)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fsize_t pathlen;
+ int error = -EFSCORRUPTED;
+
+ trace_xfs_readlink(ip);
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+ pathlen = ip->i_disk_size;
+ if (!pathlen)
+ goto out;
+
+ if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) {
+ xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
+ __func__, (unsigned long long) ip->i_ino,
+ (long long) pathlen);
+ ASSERT(0);
+ goto out;
+ }
+
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ /*
+ * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED
+ * if if_data is junk.
+ */
+ if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data))
+ goto out;
+
+ memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1);
+ error = 0;
+ } else {
+ error = xfs_readlink_bmap_ilocked(ip, link);
+ }
+
+ out:
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return error;
+}
+
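+/*
+ * Create the symlink @link_name in directory @dp pointing at @target_path.
+ * Short targets are stored inline in the inode data fork; longer targets are
+ * written out to newly allocated remote blocks.
+ */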
+int
+xfs_symlink(
+ struct user_namespace *mnt_userns,
+ struct xfs_inode *dp,
+ struct xfs_name *link_name,
+ const char *target_path,
+ umode_t mode,
+ struct xfs_inode **ipp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_trans *tp = NULL;
+ struct xfs_inode *ip = NULL;
+ int error = 0;
+ int pathlen;
+ bool unlock_dp_on_error = false;
+ xfs_fileoff_t first_fsb;
+ xfs_filblks_t fs_blocks;
+ int nmaps;
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ xfs_daddr_t d;
+ const char *cur_chunk;
+ int byte_cnt;
+ int n;
+ struct xfs_buf *bp;
+ prid_t prid;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
+ uint resblks;
+ xfs_ino_t ino;
+
+ *ipp = NULL;
+
+ trace_xfs_symlink(dp, link_name);
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ /*
+ * Check component lengths of the target path name.
+ */
+ pathlen = strlen(target_path);
+ if (pathlen >= XFS_SYMLINK_MAXLEN) /* total string too long */
+ return -ENAMETOOLONG;
+ ASSERT(pathlen > 0);
+
+ prid = xfs_get_initial_prid(dp);
+
+ /*
+ * Make sure that we have allocated dquot(s) on disk.
+ */
+ error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
+ mapped_fsgid(mnt_userns, &init_user_ns), prid,
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+ &udqp, &gdqp, &pdqp);
+ if (error)
+ return error;
+
+ /*
+	 * Will the symlink fit into the inode data fork?
+ * There can't be any attributes so we get the whole variable part.
+ */
+ if (pathlen <= XFS_LITINO(mp))
+ fs_blocks = 0;
+ else
+ fs_blocks = xfs_symlink_blocks(mp, pathlen);
+ resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
+
+ error = xfs_trans_alloc_icreate(mp, &M_RES(mp)->tr_symlink, udqp, gdqp,
+ pdqp, resblks, &tp);
+ if (error)
+ goto out_release_dquots;
+
+ xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+ unlock_dp_on_error = true;
+
+ /*
+ * Check whether the directory allows new symlinks or not.
+ */
+ if (dp->i_diflags & XFS_DIFLAG_NOSYMLINKS) {
+ error = -EPERM;
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Allocate an inode for the symlink.
+ */
+ error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino);
+ if (!error)
+ error = xfs_init_new_inode(mnt_userns, tp, dp, ino,
+ S_IFLNK | (mode & ~S_IFMT), 1, 0, prid,
+ false, &ip);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * Now we join the directory inode to the transaction. We do not do it
+ * earlier because xfs_dir_ialloc might commit the previous transaction
+ * (and release all the locks). An error from here on will result in
+ * the transaction cancel unlocking dp so don't do it explicitly in the
+ * error path.
+ */
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ unlock_dp_on_error = false;
+
+ /*
+ * Also attach the dquot(s) to it, if applicable.
+ */
+ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+ resblks -= XFS_IALLOC_SPACE_RES(mp);
+ /*
+ * If the symlink will fit into the inode, write it inline.
+ */
+ if (pathlen <= xfs_inode_data_fork_size(ip)) {
+ xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
+
+ ip->i_disk_size = pathlen;
+ ip->i_df.if_format = XFS_DINODE_FMT_LOCAL;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+ } else {
+ int offset;
+
+ first_fsb = 0;
+ nmaps = XFS_SYMLINK_MAPS;
+
+ error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
+ XFS_BMAPI_METADATA, resblks, mval, &nmaps);
+ if (error)
+ goto out_trans_cancel;
+
+ resblks -= fs_blocks;
+ ip->i_disk_size = pathlen;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ cur_chunk = target_path;
+ offset = 0;
+ for (n = 0; n < nmaps; n++) {
+ char *buf;
+
+ d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+ byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ BTOBB(byte_cnt), 0, &bp);
+ if (error)
+ goto out_trans_cancel;
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+ byte_cnt = min(byte_cnt, pathlen);
+
+ buf = bp->b_addr;
+ buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset,
+ byte_cnt, bp);
+
+ memcpy(buf, cur_chunk, byte_cnt);
+
+ cur_chunk += byte_cnt;
+ pathlen -= byte_cnt;
+ offset += byte_cnt;
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
+ xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
+ (char *)bp->b_addr);
+ }
+ ASSERT(pathlen == 0);
+ }
+ i_size_write(VFS_I(ip), ip->i_disk_size);
+
+ /*
+ * Create the directory entry for the symlink.
+ */
+ error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, resblks);
+ if (error)
+ goto out_trans_cancel;
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * symlink transaction goes to disk before returning to
+ * the user.
+ */
+ if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
+ xfs_trans_set_sync(tp);
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_release_inode;
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ *ipp = ip;
+ return 0;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+out_release_inode:
+ /*
+ * Wait until after the current transaction is aborted to finish the
+ * setup of the inode and release the inode. This prevents recursive
+ * transactions and deadlocks from xfs_inactive.
+ */
+ if (ip) {
+ xfs_finish_inode_setup(ip);
+ xfs_irele(ip);
+ }
+out_release_dquots:
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ if (unlock_dp_on_error)
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Free a symlink that has blocks associated with it.
+ *
+ * Note: zero length symlinks are not allowed to exist. When we set the size to
+ * zero, also change it to a regular file so that it does not get written to
+ * disk as a zero length symlink. The inode is on the unlinked list already, so
+ * userspace cannot find this inode anymore, so this change is not user visible
+ * but allows us to catch corrupt zero-length symlinks in the verifiers.
+ */
+STATIC int
+xfs_inactive_symlink_rmt(
+ struct xfs_inode *ip)
+{
+ struct xfs_buf *bp;
+ int done;
+ int error;
+ int i;
+ xfs_mount_t *mp;
+ xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS];
+ int nmaps;
+ int size;
+ xfs_trans_t *tp;
+
+ mp = ip->i_mount;
+ ASSERT(!xfs_need_iread_extents(&ip->i_df));
+ /*
+ * We're freeing a symlink that has some blocks allocated to it. Free the
+ * blocks here. We know that we've got either 1 or 2 extents and that we
+ * can free them all in one bunmapi call.
+ */
+ ASSERT(ip->i_df.if_nextents > 0 && ip->i_df.if_nextents <= 2);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /*
+ * The inode is locked and joined to the transaction above. Fix the size
+ * and turn it into a regular file here, and hold it so that in the
+ * normal path we still have it locked for the second transaction. In
+ * the error paths we need it held so the cancel won't rele it, see
+ * below.
+ */
+ size = (int)ip->i_disk_size;
+ ip->i_disk_size = 0;
+ VFS_I(ip)->i_mode = (VFS_I(ip)->i_mode & ~S_IFMT) | S_IFREG;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ /*
+ * Find the block(s) so we can inval and unmap them.
+ */
+ done = 0;
+ nmaps = ARRAY_SIZE(mval);
+ error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size),
+ mval, &nmaps, 0);
+ if (error)
+ goto error_trans_cancel;
+ /*
+ * Invalidate the block(s). No validation is done.
+ */
+ for (i = 0; i < nmaps; i++) {
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
+ XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0,
+ &bp);
+ if (error)
+ goto error_trans_cancel;
+ xfs_trans_binval(tp, bp);
+ }
+ /*
+ * Unmap the dead block(s) to the dfops.
+ */
+ error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps, &done);
+ if (error)
+ goto error_trans_cancel;
+ ASSERT(done);
+
+ /*
+ * Commit the transaction. This first logs the EFI and the inode, then
+ * rolls and commits the transaction that frees the extents.
+ */
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ error = xfs_trans_commit(tp);
+ if (error) {
+ ASSERT(xfs_is_shutdown(mp));
+ goto error_unlock;
+ }
+
+ /*
+ * Remove the memory for extent descriptions (just bookkeeping).
+ */
+ if (ip->i_df.if_bytes)
+ xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
+ ASSERT(ip->i_df.if_bytes == 0);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+
+error_trans_cancel:
+ xfs_trans_cancel(tp);
+error_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * xfs_inactive_symlink - free a symlink
+ */
+int
+xfs_inactive_symlink(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int pathlen;
+
+ trace_xfs_inactive_symlink(ip);
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ pathlen = (int)ip->i_disk_size;
+ ASSERT(pathlen);
+
+ if (pathlen <= 0 || pathlen > XFS_SYMLINK_MAXLEN) {
+ xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
+ __func__, (unsigned long long)ip->i_ino, pathlen);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Inline fork state gets removed by xfs_difree() so we have nothing to
+ * do here in that case.
+ */
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+ }
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ /* remove the remote symlink */
+ return xfs_inactive_symlink_rmt(ip);
+}
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
new file mode 100644
index 000000000..2586b7e39
--- /dev/null
+++ b/fs/xfs/xfs_symlink.h
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2012 Red Hat, Inc. All rights reserved.
+ */
+#ifndef __XFS_SYMLINK_H
+#define __XFS_SYMLINK_H 1
+
+/* Kernel only symlink definitions */
+
+int xfs_symlink(struct user_namespace *mnt_userns, struct xfs_inode *dp,
+ struct xfs_name *link_name, const char *target_path,
+ umode_t mode, struct xfs_inode **ipp);
+int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link);
+int xfs_readlink(struct xfs_inode *ip, char *link);
+int xfs_inactive_symlink(struct xfs_inode *ip);
+
+#endif /* __XFS_SYMLINK_H */
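
For orientation, a minimal caller sketch (hypothetical helper, not part of this
patch; the real consumer is the VFS ->get_link path): the buffer handed to
xfs_readlink() is assumed to be large enough for the longest allowed target
plus its NUL terminator, and the sizing below is purely illustrative.

	static int example_get_symlink_target(struct xfs_inode *ip, char **target)
	{
		char	*link;
		int	error;

		/* illustrative sizing: longest on-disk target plus NUL */
		link = kmalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL);
		if (!link)
			return -ENOMEM;

		error = xfs_readlink(ip, link);	/* copies and terminates the target */
		if (error) {
			kfree(link);
			return error;
		}
		*target = link;
		return 0;
	}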
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
new file mode 100644
index 000000000..546a6cd96
--- /dev/null
+++ b/fs/xfs/xfs_sysctl.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2001-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_error.h"
+
+static struct ctl_table_header *xfs_table_header;
+
+#ifdef CONFIG_PROC_FS
+STATIC int
+xfs_stats_clear_proc_handler(
+ struct ctl_table *ctl,
+ int write,
+ void *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ int ret, *valp = ctl->data;
+
+ ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+
+ if (!ret && write && *valp) {
+ xfs_stats_clearall(xfsstats.xs_stats);
+ xfs_stats_clear = 0;
+ }
+
+ return ret;
+}
+
+STATIC int
+xfs_panic_mask_proc_handler(
+ struct ctl_table *ctl,
+ int write,
+ void *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ int ret, *valp = ctl->data;
+
+ ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+ if (!ret && write) {
+ xfs_panic_mask = *valp;
+#ifdef DEBUG
+ xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
+#endif
+ }
+ return ret;
+}
+#endif /* CONFIG_PROC_FS */
+
+STATIC int
+xfs_deprecated_dointvec_minmax(
+ struct ctl_table *ctl,
+ int write,
+ void *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ if (write) {
+ printk_ratelimited(KERN_WARNING
+ "XFS: %s sysctl option is deprecated.\n",
+ ctl->procname);
+ }
+ return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+}
+
+static struct ctl_table xfs_table[] = {
+ {
+ .procname = "irix_sgid_inherit",
+ .data = &xfs_params.sgid_inherit.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = xfs_deprecated_dointvec_minmax,
+ .extra1 = &xfs_params.sgid_inherit.min,
+ .extra2 = &xfs_params.sgid_inherit.max
+ },
+ {
+ .procname = "irix_symlink_mode",
+ .data = &xfs_params.symlink_mode.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = xfs_deprecated_dointvec_minmax,
+ .extra1 = &xfs_params.symlink_mode.min,
+ .extra2 = &xfs_params.symlink_mode.max
+ },
+ {
+ .procname = "panic_mask",
+ .data = &xfs_params.panic_mask.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = xfs_panic_mask_proc_handler,
+ .extra1 = &xfs_params.panic_mask.min,
+ .extra2 = &xfs_params.panic_mask.max
+ },
+
+ {
+ .procname = "error_level",
+ .data = &xfs_params.error_level.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.error_level.min,
+ .extra2 = &xfs_params.error_level.max
+ },
+ {
+ .procname = "xfssyncd_centisecs",
+ .data = &xfs_params.syncd_timer.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.syncd_timer.min,
+ .extra2 = &xfs_params.syncd_timer.max
+ },
+ {
+ .procname = "inherit_sync",
+ .data = &xfs_params.inherit_sync.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.inherit_sync.min,
+ .extra2 = &xfs_params.inherit_sync.max
+ },
+ {
+ .procname = "inherit_nodump",
+ .data = &xfs_params.inherit_nodump.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.inherit_nodump.min,
+ .extra2 = &xfs_params.inherit_nodump.max
+ },
+ {
+ .procname = "inherit_noatime",
+ .data = &xfs_params.inherit_noatim.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.inherit_noatim.min,
+ .extra2 = &xfs_params.inherit_noatim.max
+ },
+ {
+ .procname = "inherit_nosymlinks",
+ .data = &xfs_params.inherit_nosym.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.inherit_nosym.min,
+ .extra2 = &xfs_params.inherit_nosym.max
+ },
+ {
+ .procname = "rotorstep",
+ .data = &xfs_params.rotorstep.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.rotorstep.min,
+ .extra2 = &xfs_params.rotorstep.max
+ },
+ {
+ .procname = "inherit_nodefrag",
+ .data = &xfs_params.inherit_nodfrg.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.inherit_nodfrg.min,
+ .extra2 = &xfs_params.inherit_nodfrg.max
+ },
+ {
+ .procname = "filestream_centisecs",
+ .data = &xfs_params.fstrm_timer.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.fstrm_timer.min,
+ .extra2 = &xfs_params.fstrm_timer.max,
+ },
+ {
+ .procname = "speculative_prealloc_lifetime",
+ .data = &xfs_params.blockgc_timer.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.blockgc_timer.min,
+ .extra2 = &xfs_params.blockgc_timer.max,
+ },
+ {
+ .procname = "speculative_cow_prealloc_lifetime",
+ .data = &xfs_params.blockgc_timer.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = xfs_deprecated_dointvec_minmax,
+ .extra1 = &xfs_params.blockgc_timer.min,
+ .extra2 = &xfs_params.blockgc_timer.max,
+ },
+ /* please keep this the last entry */
+#ifdef CONFIG_PROC_FS
+ {
+ .procname = "stats_clear",
+ .data = &xfs_params.stats_clear.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = xfs_stats_clear_proc_handler,
+ .extra1 = &xfs_params.stats_clear.min,
+ .extra2 = &xfs_params.stats_clear.max
+ },
+#endif /* CONFIG_PROC_FS */
+
+ {}
+};
+
+static struct ctl_table xfs_dir_table[] = {
+ {
+ .procname = "xfs",
+ .mode = 0555,
+ .child = xfs_table
+ },
+ {}
+};
+
+static struct ctl_table xfs_root_table[] = {
+ {
+ .procname = "fs",
+ .mode = 0555,
+ .child = xfs_dir_table
+ },
+ {}
+};
+
+int
+xfs_sysctl_register(void)
+{
+ xfs_table_header = register_sysctl_table(xfs_root_table);
+ if (!xfs_table_header)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_sysctl_unregister(void)
+{
+ unregister_sysctl_table(xfs_table_header);
+}
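
As a rough illustration of what the nested tables above register (paths shown
assume the usual procfs sysctl layout; values and limits come from xfs_params):

	/proc/sys/fs/xfs/error_level
	/proc/sys/fs/xfs/panic_mask
	/proc/sys/fs/xfs/xfssyncd_centisecs

Writes outside the [min, max] window wired through .extra1/.extra2 are rejected
by proc_dointvec_minmax(), and the entries using
xfs_deprecated_dointvec_minmax() additionally emit a rate-limited deprecation
warning before behaving the same way.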
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
new file mode 100644
index 000000000..f78ad6b10
--- /dev/null
+++ b/fs/xfs/xfs_sysctl.h
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2001-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_SYSCTL_H__
+#define __XFS_SYSCTL_H__
+
+#include <linux/sysctl.h>
+
+/*
+ * Tunable xfs parameters
+ */
+
+typedef struct xfs_sysctl_val {
+ int min;
+ int val;
+ int max;
+} xfs_sysctl_val_t;
+
+typedef struct xfs_param {
+ xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is
+ * not a member of parent dir GID. */
+ xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */
+ xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */
+ xfs_sysctl_val_t error_level; /* Degree of reporting for problems */
+ xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */
+ xfs_sysctl_val_t stats_clear; /* Reset all XFS statistics to zero. */
+ xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */
+ xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */
+ xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */
+ xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */
+ xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */
+ xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */
+ xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
+ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
+ xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */
+ xfs_sysctl_val_t blockgc_timer; /* Interval between blockgc scans */
+} xfs_param_t;
+
+/*
+ * xfs_error_level:
+ *
+ * How much error reporting will be done when internal problems are
+ * encountered. These problems normally return an EFSCORRUPTED to their
+ * caller, with no other information reported.
+ *
+ * 0 No error reports
+ * 1 Report EFSCORRUPTED errors that will cause a filesystem shutdown
+ * 5 Report all EFSCORRUPTED errors (all of the above errors, plus any
+ * additional errors that are known to not cause shutdowns)
+ *
+ * xfs_panic_mask bit 0x8 turns the error reports into panics
+ */
+
+enum {
+ /* XFS_REFCACHE_SIZE = 1 */
+ /* XFS_REFCACHE_PURGE = 2 */
+ /* XFS_RESTRICT_CHOWN = 3 */
+ XFS_SGID_INHERIT = 4,
+ XFS_SYMLINK_MODE = 5,
+ XFS_PANIC_MASK = 6,
+ XFS_ERRLEVEL = 7,
+ XFS_SYNCD_TIMER = 8,
+ /* XFS_PROBE_DMAPI = 9 */
+ /* XFS_PROBE_IOOPS = 10 */
+ /* XFS_PROBE_QUOTA = 11 */
+ XFS_STATS_CLEAR = 12,
+ XFS_INHERIT_SYNC = 13,
+ XFS_INHERIT_NODUMP = 14,
+ XFS_INHERIT_NOATIME = 15,
+ XFS_BUF_TIMER = 16,
+ XFS_BUF_AGE = 17,
+ /* XFS_IO_BYPASS = 18 */
+ XFS_INHERIT_NOSYM = 19,
+ XFS_ROTORSTEP = 20,
+ XFS_INHERIT_NODFRG = 21,
+ XFS_FILESTREAM_TIMER = 22,
+};
+
+extern xfs_param_t xfs_params;
+
+struct xfs_globals {
+#ifdef DEBUG
+ int pwork_threads; /* parallel workqueue threads */
+ bool larp; /* log attribute replay */
+#endif
+ int log_recovery_delay; /* log recovery delay (secs) */
+ int mount_delay; /* mount setup delay (secs) */
+ bool bug_on_assert; /* BUG() the kernel on assert failure */
+ bool always_cow; /* use COW fork for all overwrites */
+};
+extern struct xfs_globals xfs_globals;
+
+#ifdef CONFIG_SYSCTL
+extern int xfs_sysctl_register(void);
+extern void xfs_sysctl_unregister(void);
+#else
+# define xfs_sysctl_register() (0)
+# define xfs_sysctl_unregister() do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
+#endif /* __XFS_SYSCTL_H__ */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
new file mode 100644
index 000000000..f7faf6e70
--- /dev/null
+++ b/fs/xfs/xfs_sysfs.c
@@ -0,0 +1,731 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sysfs.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_mount.h"
+
+struct xfs_sysfs_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct kobject *kobject, char *buf);
+ ssize_t (*store)(struct kobject *kobject, const char *buf,
+ size_t count);
+};
+
+static inline struct xfs_sysfs_attr *
+to_attr(struct attribute *attr)
+{
+ return container_of(attr, struct xfs_sysfs_attr, attr);
+}
+
+#define XFS_SYSFS_ATTR_RW(name) \
+ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name)
+#define XFS_SYSFS_ATTR_RO(name) \
+ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name)
+#define XFS_SYSFS_ATTR_WO(name) \
+ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_WO(name)
+
+#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
+
+STATIC ssize_t
+xfs_sysfs_object_show(
+ struct kobject *kobject,
+ struct attribute *attr,
+ char *buf)
+{
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->show ? xfs_attr->show(kobject, buf) : 0;
+}
+
+STATIC ssize_t
+xfs_sysfs_object_store(
+ struct kobject *kobject,
+ struct attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->store ? xfs_attr->store(kobject, buf, count) : 0;
+}
+
+static const struct sysfs_ops xfs_sysfs_ops = {
+ .show = xfs_sysfs_object_show,
+ .store = xfs_sysfs_object_store,
+};
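
A minimal sketch of how this plumbing is meant to be used (the attribute below
is purely illustrative and not part of this patch): a show/store pair takes
only the kobject and buffer, and XFS_SYSFS_ATTR_RO()/XFS_SYSFS_ATTR_RW() wrap
it in a struct xfs_sysfs_attr that xfs_sysfs_object_show()/_store() dispatch
through.

	#if 0	/* illustration only */
	STATIC ssize_t
	example_show(
		struct kobject		*kobject,
		char			*buf)
	{
		return sysfs_emit(buf, "%d\n", 42);
	}
	XFS_SYSFS_ATTR_RO(example);

	/* ...then add ATTR_LIST(example) to the relevant attrs[] array. */
	#endif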
+
+static struct attribute *xfs_mp_attrs[] = {
+ NULL,
+};
+ATTRIBUTE_GROUPS(xfs_mp);
+
+struct kobj_type xfs_mp_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_groups = xfs_mp_groups,
+};
+
+#ifdef DEBUG
+/* debug */
+
+STATIC ssize_t
+bug_on_assert_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val == 1)
+ xfs_globals.bug_on_assert = true;
+ else if (val == 0)
+ xfs_globals.bug_on_assert = false;
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+STATIC ssize_t
+bug_on_assert_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return sysfs_emit(buf, "%d\n", xfs_globals.bug_on_assert);
+}
+XFS_SYSFS_ATTR_RW(bug_on_assert);
+
+STATIC ssize_t
+log_recovery_delay_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < 0 || val > 60)
+ return -EINVAL;
+
+ xfs_globals.log_recovery_delay = val;
+
+ return count;
+}
+
+STATIC ssize_t
+log_recovery_delay_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return sysfs_emit(buf, "%d\n", xfs_globals.log_recovery_delay);
+}
+XFS_SYSFS_ATTR_RW(log_recovery_delay);
+
+STATIC ssize_t
+mount_delay_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < 0 || val > 60)
+ return -EINVAL;
+
+ xfs_globals.mount_delay = val;
+
+ return count;
+}
+
+STATIC ssize_t
+mount_delay_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return sysfs_emit(buf, "%d\n", xfs_globals.mount_delay);
+}
+XFS_SYSFS_ATTR_RW(mount_delay);
+
+static ssize_t
+always_cow_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &xfs_globals.always_cow);
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+static ssize_t
+always_cow_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return sysfs_emit(buf, "%d\n", xfs_globals.always_cow);
+}
+XFS_SYSFS_ATTR_RW(always_cow);
+
+#ifdef DEBUG
+/*
+ * Override how many threads the parallel work queue is allowed to create.
+ * This has to be a debug-only global (instead of an errortag) because one of
+ * the main users of parallel workqueues is mount time quotacheck.
+ */
+STATIC ssize_t
+pwork_threads_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < -1 || val > num_possible_cpus())
+ return -EINVAL;
+
+ xfs_globals.pwork_threads = val;
+
+ return count;
+}
+
+STATIC ssize_t
+pwork_threads_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads);
+}
+XFS_SYSFS_ATTR_RW(pwork_threads);
+
+static ssize_t
+larp_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &xfs_globals.larp);
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+STATIC ssize_t
+larp_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return sysfs_emit(buf, "%d\n", xfs_globals.larp);
+}
+XFS_SYSFS_ATTR_RW(larp);
+#endif /* DEBUG */
+
+static struct attribute *xfs_dbg_attrs[] = {
+ ATTR_LIST(bug_on_assert),
+ ATTR_LIST(log_recovery_delay),
+ ATTR_LIST(mount_delay),
+ ATTR_LIST(always_cow),
+#ifdef DEBUG
+ ATTR_LIST(pwork_threads),
+ ATTR_LIST(larp),
+#endif
+ NULL,
+};
+ATTRIBUTE_GROUPS(xfs_dbg);
+
+struct kobj_type xfs_dbg_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_groups = xfs_dbg_groups,
+};
+
+#endif /* DEBUG */
+
+/* stats */
+
+static inline struct xstats *
+to_xstats(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+
+ return container_of(kobj, struct xstats, xs_kobj);
+}
+
+STATIC ssize_t
+stats_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ struct xstats *stats = to_xstats(kobject);
+
+ return xfs_stats_format(stats->xs_stats, buf);
+}
+XFS_SYSFS_ATTR_RO(stats);
+
+STATIC ssize_t
+stats_clear_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ int val;
+ struct xstats *stats = to_xstats(kobject);
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val != 1)
+ return -EINVAL;
+
+ xfs_stats_clearall(stats->xs_stats);
+ return count;
+}
+XFS_SYSFS_ATTR_WO(stats_clear);
+
+static struct attribute *xfs_stats_attrs[] = {
+ ATTR_LIST(stats),
+ ATTR_LIST(stats_clear),
+ NULL,
+};
+ATTRIBUTE_GROUPS(xfs_stats);
+
+struct kobj_type xfs_stats_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_groups = xfs_stats_groups,
+};
+
+/* xlog */
+
+static inline struct xlog *
+to_xlog(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+
+ return container_of(kobj, struct xlog, l_kobj);
+}
+
+STATIC ssize_t
+log_head_lsn_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ int cycle;
+ int block;
+ struct xlog *log = to_xlog(kobject);
+
+ spin_lock(&log->l_icloglock);
+ cycle = log->l_curr_cycle;
+ block = log->l_curr_block;
+ spin_unlock(&log->l_icloglock);
+
+ return sysfs_emit(buf, "%d:%d\n", cycle, block);
+}
+XFS_SYSFS_ATTR_RO(log_head_lsn);
+
+STATIC ssize_t
+log_tail_lsn_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ int cycle;
+ int block;
+ struct xlog *log = to_xlog(kobject);
+
+ xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
+ return sysfs_emit(buf, "%d:%d\n", cycle, block);
+}
+XFS_SYSFS_ATTR_RO(log_tail_lsn);
+
+STATIC ssize_t
+reserve_grant_head_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ int cycle;
+ int bytes;
+ struct xlog *log = to_xlog(kobject);
+
+ xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
+ return sysfs_emit(buf, "%d:%d\n", cycle, bytes);
+}
+XFS_SYSFS_ATTR_RO(reserve_grant_head);
+
+STATIC ssize_t
+write_grant_head_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ int cycle;
+ int bytes;
+ struct xlog *log = to_xlog(kobject);
+
+ xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
+ return sysfs_emit(buf, "%d:%d\n", cycle, bytes);
+}
+XFS_SYSFS_ATTR_RO(write_grant_head);
+
+static struct attribute *xfs_log_attrs[] = {
+ ATTR_LIST(log_head_lsn),
+ ATTR_LIST(log_tail_lsn),
+ ATTR_LIST(reserve_grant_head),
+ ATTR_LIST(write_grant_head),
+ NULL,
+};
+ATTRIBUTE_GROUPS(xfs_log);
+
+struct kobj_type xfs_log_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_groups = xfs_log_groups,
+};
+
+/*
+ * Metadata IO error configuration
+ *
+ * The sysfs structure here is:
+ * ...xfs/<dev>/error/<class>/<errno>/<error_attrs>
+ *
+ * where <class> allows us to discriminate between data IO and metadata IO,
+ * and any other future type of IO (e.g. special inode or directory error
+ * handling) we care to support.
+ */
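
For concreteness (device name purely illustrative), the hierarchy described
above is expected to surface as:

	/sys/fs/xfs/sda1/error/fail_at_unmount
	/sys/fs/xfs/sda1/error/metadata/default/max_retries
	/sys/fs/xfs/sda1/error/metadata/EIO/retry_timeout_seconds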
+static inline struct xfs_error_cfg *
+to_error_cfg(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+ return container_of(kobj, struct xfs_error_cfg, kobj);
+}
+
+static inline struct xfs_mount *
+err_to_mp(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+ return container_of(kobj, struct xfs_mount, m_error_kobj);
+}
+
+static ssize_t
+max_retries_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ int retries;
+ struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+
+ if (cfg->max_retries == XFS_ERR_RETRY_FOREVER)
+ retries = -1;
+ else
+ retries = cfg->max_retries;
+
+ return sysfs_emit(buf, "%d\n", retries);
+}
+
+static ssize_t
+max_retries_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < -1)
+ return -EINVAL;
+
+ if (val == -1)
+ cfg->max_retries = XFS_ERR_RETRY_FOREVER;
+ else
+ cfg->max_retries = val;
+ return count;
+}
+XFS_SYSFS_ATTR_RW(max_retries);
+
+static ssize_t
+retry_timeout_seconds_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ int timeout;
+ struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+
+ if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER)
+ timeout = -1;
+ else
+ timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC;
+
+ return sysfs_emit(buf, "%d\n", timeout);
+}
+
+static ssize_t
+retry_timeout_seconds_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ /* 1 day timeout maximum, -1 means infinite */
+ if (val < -1 || val > 86400)
+ return -EINVAL;
+
+ if (val == -1)
+ cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+ else {
+ cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
+ ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX);
+ }
+ return count;
+}
+XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
+
+static ssize_t
+fail_at_unmount_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ struct xfs_mount *mp = err_to_mp(kobject);
+
+ return sysfs_emit(buf, "%d\n", mp->m_fail_unmount);
+}
+
+static ssize_t
+fail_at_unmount_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_mount *mp = err_to_mp(kobject);
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < 0 || val > 1)
+ return -EINVAL;
+
+ mp->m_fail_unmount = val;
+ return count;
+}
+XFS_SYSFS_ATTR_RW(fail_at_unmount);
+
+static struct attribute *xfs_error_attrs[] = {
+ ATTR_LIST(max_retries),
+ ATTR_LIST(retry_timeout_seconds),
+ NULL,
+};
+ATTRIBUTE_GROUPS(xfs_error);
+
+static struct kobj_type xfs_error_cfg_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_groups = xfs_error_groups,
+};
+
+static struct kobj_type xfs_error_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+};
+
+/*
+ * Error initialization tables. These need to be ordered in the same
+ * order as the enums used to index the array. All class init tables need to
+ * define a "default" behaviour as the first entry, all other entries can be
+ * empty.
+ */
+struct xfs_error_init {
+ char *name;
+ int max_retries;
+ int retry_timeout; /* in seconds */
+};
+
+static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
+ { .name = "default",
+ .max_retries = XFS_ERR_RETRY_FOREVER,
+ .retry_timeout = XFS_ERR_RETRY_FOREVER,
+ },
+ { .name = "EIO",
+ .max_retries = XFS_ERR_RETRY_FOREVER,
+ .retry_timeout = XFS_ERR_RETRY_FOREVER,
+ },
+ { .name = "ENOSPC",
+ .max_retries = XFS_ERR_RETRY_FOREVER,
+ .retry_timeout = XFS_ERR_RETRY_FOREVER,
+ },
+ { .name = "ENODEV",
+ .max_retries = 0, /* We can't recover from devices disappearing */
+ .retry_timeout = 0,
+ },
+};
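
For reference, the ordering of the table above is assumed to match the errno
enum used to index m_error_cfg; that enum is not part of this hunk (it is
assumed to live in xfs_mount.h) and is expected to look roughly like:

	#if 0	/* assumed shape, see xfs_error_get_cfg() below */
	enum {
		XFS_ERR_DEFAULT,
		XFS_ERR_EIO,
		XFS_ERR_ENOSPC,
		XFS_ERR_ENODEV,
		XFS_ERR_ERRNO_MAX,
	};
	#endif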
+
+static int
+xfs_error_sysfs_init_class(
+ struct xfs_mount *mp,
+ int class,
+ const char *parent_name,
+ struct xfs_kobj *parent_kobj,
+ const struct xfs_error_init init[])
+{
+ struct xfs_error_cfg *cfg;
+ int error;
+ int i;
+
+ ASSERT(class < XFS_ERR_CLASS_MAX);
+
+ error = xfs_sysfs_init(parent_kobj, &xfs_error_ktype,
+ &mp->m_error_kobj, parent_name);
+ if (error)
+ return error;
+
+ for (i = 0; i < XFS_ERR_ERRNO_MAX; i++) {
+ cfg = &mp->m_error_cfg[class][i];
+ error = xfs_sysfs_init(&cfg->kobj, &xfs_error_cfg_ktype,
+ parent_kobj, init[i].name);
+ if (error)
+ goto out_error;
+
+ cfg->max_retries = init[i].max_retries;
+ if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER)
+ cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+ else
+ cfg->retry_timeout = msecs_to_jiffies(
+ init[i].retry_timeout * MSEC_PER_SEC);
+ }
+ return 0;
+
+out_error:
+ /* unwind the entries that succeeded */
+ for (i--; i >= 0; i--) {
+ cfg = &mp->m_error_cfg[class][i];
+ xfs_sysfs_del(&cfg->kobj);
+ }
+ xfs_sysfs_del(parent_kobj);
+ return error;
+}
+
+int
+xfs_error_sysfs_init(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ /* .../xfs/<dev>/error/ */
+ error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype,
+ &mp->m_kobj, "error");
+ if (error)
+ return error;
+
+ error = sysfs_create_file(&mp->m_error_kobj.kobject,
+ ATTR_LIST(fail_at_unmount));
+
+ if (error)
+ goto out_error;
+
+ /* .../xfs/<dev>/error/metadata/ */
+ error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA,
+ "metadata", &mp->m_error_meta_kobj,
+ xfs_error_meta_init);
+ if (error)
+ goto out_error;
+
+ return 0;
+
+out_error:
+ xfs_sysfs_del(&mp->m_error_kobj);
+ return error;
+}
+
+void
+xfs_error_sysfs_del(
+ struct xfs_mount *mp)
+{
+ struct xfs_error_cfg *cfg;
+ int i, j;
+
+ for (i = 0; i < XFS_ERR_CLASS_MAX; i++) {
+ for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) {
+ cfg = &mp->m_error_cfg[i][j];
+
+ xfs_sysfs_del(&cfg->kobj);
+ }
+ }
+ xfs_sysfs_del(&mp->m_error_meta_kobj);
+ xfs_sysfs_del(&mp->m_error_kobj);
+}
+
+struct xfs_error_cfg *
+xfs_error_get_cfg(
+ struct xfs_mount *mp,
+ int error_class,
+ int error)
+{
+ struct xfs_error_cfg *cfg;
+
+ if (error < 0)
+ error = -error;
+
+ switch (error) {
+ case EIO:
+ cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];
+ break;
+ case ENOSPC:
+ cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENOSPC];
+ break;
+ case ENODEV:
+ cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENODEV];
+ break;
+ default:
+ cfg = &mp->m_error_cfg[error_class][XFS_ERR_DEFAULT];
+ break;
+ }
+
+ return cfg;
+}
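
A hypothetical usage sketch (the buffer fields and call site are assumptions;
the real consumer is the metadata buffer write-retry code elsewhere in XFS):
look up the per-errno policy and compare it against the configured limits.

	#if 0	/* illustration only */
	struct xfs_error_cfg	*cfg;

	cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
	    bp->b_retries > cfg->max_retries)
		/* stop retrying and fail the buffer */;
	#endif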
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
new file mode 100644
index 000000000..513095e35
--- /dev/null
+++ b/fs/xfs/xfs_sysfs.h
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+
+#ifndef __XFS_SYSFS_H__
+#define __XFS_SYSFS_H__
+
+extern struct kobj_type xfs_mp_ktype; /* xfs_mount */
+extern struct kobj_type xfs_dbg_ktype; /* debug */
+extern struct kobj_type xfs_log_ktype; /* xlog */
+extern struct kobj_type xfs_stats_ktype; /* stats */
+
+static inline struct xfs_kobj *
+to_kobj(struct kobject *kobject)
+{
+ return container_of(kobject, struct xfs_kobj, kobject);
+}
+
+static inline void
+xfs_sysfs_release(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+ complete(&kobj->complete);
+}
+
+static inline int
+xfs_sysfs_init(
+ struct xfs_kobj *kobj,
+ struct kobj_type *ktype,
+ struct xfs_kobj *parent_kobj,
+ const char *name)
+{
+ struct kobject *parent;
+ int err;
+
+ parent = parent_kobj ? &parent_kobj->kobject : NULL;
+ init_completion(&kobj->complete);
+ err = kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name);
+ if (err)
+ kobject_put(&kobj->kobject);
+
+ return err;
+}
+
+static inline void
+xfs_sysfs_del(
+ struct xfs_kobj *kobj)
+{
+ kobject_del(&kobj->kobject);
+ kobject_put(&kobj->kobject);
+ wait_for_completion(&kobj->complete);
+}
+
+int xfs_error_sysfs_init(struct xfs_mount *mp);
+void xfs_error_sysfs_del(struct xfs_mount *mp);
+
+#endif /* __XFS_SYSFS_H__ */
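
Lifecycle sketch (object name illustrative): an xfs_kobj embedded in some XFS
structure is brought up and torn down with the helpers above. Note that
xfs_sysfs_del() blocks on the completion that xfs_sysfs_release() signals, so
the embedding structure cannot be freed while sysfs still references the
kobject.

	#if 0	/* illustration only */
	struct xfs_kobj	example_kobj;

	error = xfs_sysfs_init(&example_kobj, &xfs_mp_ktype, NULL, "example");
	if (error)
		return error;
	/* ...attribute files are live here... */
	xfs_sysfs_del(&example_kobj);
	#endif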
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
new file mode 100644
index 000000000..d269ef57f
--- /dev/null
+++ b/fs/xfs/xfs_trace.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2009, Christoph Hellwig
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_bit.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_da_format.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_da_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_quota.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+#include "xfs_log_recover.h"
+#include "xfs_filestream.h"
+#include "xfs_fsmap.h"
+#include "xfs_btree_staging.h"
+#include "xfs_icache.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_error.h"
+
+/*
+ * We include this last to have the helpers above available for the trace
+ * event implementations.
+ */
+#define CREATE_TRACE_POINTS
+#include "xfs_trace.h"
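
This follows the standard kernel tracepoint pattern: exactly one translation
unit defines CREATE_TRACE_POINTS before including the trace header so the
event bodies are emitted once, while every other XFS source file includes
"xfs_trace.h" normally and calls the generated trace_*() hooks, roughly:

	#if 0	/* illustrative consumer, not part of this file */
	#include "xfs_trace.h"		/* no CREATE_TRACE_POINTS here */

	trace_xfs_inactive_symlink(ip);	/* generated from DEFINE_INODE_EVENT() */
	#endif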
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
new file mode 100644
index 000000000..372d871bc
--- /dev/null
+++ b/fs/xfs/xfs_trace.h
@@ -0,0 +1,4246 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2009, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * NOTE: none of these tracepoints shall be considered a stable kernel ABI
+ * as they can change at any time.
+ *
+ * Current conventions for printing numbers measuring specific units:
+ *
+ * agno: allocation group number
+ *
+ * agino: per-AG inode number
+ * ino: filesystem inode number
+ *
+ * agbno: per-AG block number in fs blocks
+ * startblock: physical block number for file mappings. This is either a
+ * segmented fsblock for data device mappings, or a rfsblock
+ * for realtime device mappings
+ * fsbcount: number of blocks in an extent, in fs blocks
+ *
+ * daddr: physical block number in 512b blocks
+ * bbcount: number of blocks in a physical extent, in 512b blocks
+ *
+ * owner: reverse-mapping owner, usually inodes
+ *
+ * fileoff: file offset, in fs blocks
+ * pos: file offset, in bytes
+ * bytecount: number of bytes
+ *
+ * disize: ondisk file size, in bytes
+ * isize: incore file size, in bytes
+ *
+ * forkoff: inode fork offset, in bytes
+ *
+ * ireccount: number of inode records
+ *
+ * Numbers describing space allocations (blocks, extents, inodes) should be
+ * formatted in hexadecimal.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM xfs
+
+#if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_XFS_H
+
+#include <linux/tracepoint.h>
+
+struct xfs_agf;
+struct xfs_alloc_arg;
+struct xfs_attr_list_context;
+struct xfs_buf_log_item;
+struct xfs_da_args;
+struct xfs_da_node_entry;
+struct xfs_dquot;
+struct xfs_log_item;
+struct xlog;
+struct xlog_ticket;
+struct xlog_recover;
+struct xlog_recover_item;
+struct xlog_rec_header;
+struct xlog_in_core;
+struct xfs_buf_log_format;
+struct xfs_inode_log_format;
+struct xfs_bmbt_irec;
+struct xfs_btree_cur;
+struct xfs_refcount_irec;
+struct xfs_fsmap;
+struct xfs_rmap_irec;
+struct xfs_icreate_log;
+struct xfs_owner_info;
+struct xfs_trans_res;
+struct xfs_inobt_rec_incore;
+union xfs_btree_ptr;
+struct xfs_dqtrx;
+struct xfs_icwalk;
+
+#define XFS_ATTR_FILTER_FLAGS \
+ { XFS_ATTR_ROOT, "ROOT" }, \
+ { XFS_ATTR_SECURE, "SECURE" }, \
+ { XFS_ATTR_INCOMPLETE, "INCOMPLETE" }
+
+DECLARE_EVENT_CLASS(xfs_attr_list_class,
+ TP_PROTO(struct xfs_attr_list_context *ctx),
+ TP_ARGS(ctx),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(u32, hashval)
+ __field(u32, blkno)
+ __field(u32, offset)
+ __field(void *, buffer)
+ __field(int, bufsize)
+ __field(int, count)
+ __field(int, firstu)
+ __field(int, dupcnt)
+ __field(unsigned int, attr_filter)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
+ __entry->ino = ctx->dp->i_ino;
+ __entry->hashval = ctx->cursor.hashval;
+ __entry->blkno = ctx->cursor.blkno;
+ __entry->offset = ctx->cursor.offset;
+ __entry->buffer = ctx->buffer;
+ __entry->bufsize = ctx->bufsize;
+ __entry->count = ctx->count;
+ __entry->firstu = ctx->firstu;
+ __entry->attr_filter = ctx->attr_filter;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
+ "buffer %p size %u count %u firstu %u filter %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->hashval,
+ __entry->blkno,
+ __entry->offset,
+ __entry->dupcnt,
+ __entry->buffer,
+ __entry->bufsize,
+ __entry->count,
+ __entry->firstu,
+ __print_flags(__entry->attr_filter, "|",
+ XFS_ATTR_FILTER_FLAGS)
+ )
+)
+
+#define DEFINE_ATTR_LIST_EVENT(name) \
+DEFINE_EVENT(xfs_attr_list_class, name, \
+ TP_PROTO(struct xfs_attr_list_context *ctx), \
+ TP_ARGS(ctx))
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
+
+TRACE_EVENT(xlog_intent_recovery_failed,
+ TP_PROTO(struct xfs_mount *mp, int error, void *function),
+ TP_ARGS(mp, error, function),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, error)
+ __field(void *, function)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->error = error;
+ __entry->function = function;
+ ),
+ TP_printk("dev %d:%d error %d function %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->error, __entry->function)
+);
+
+DECLARE_EVENT_CLASS(xfs_perag_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
+ unsigned long caller_ip),
+ TP_ARGS(mp, agno, refcount, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, refcount)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->refcount = refcount;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno 0x%x refcount %d caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->refcount,
+ (char *)__entry->caller_ip)
+);
+
+#define DEFINE_PERAG_REF_EVENT(name) \
+DEFINE_EVENT(xfs_perag_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, agno, refcount, caller_ip))
+DEFINE_PERAG_REF_EVENT(xfs_perag_get);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
+DEFINE_PERAG_REF_EVENT(xfs_perag_put);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
+
+TRACE_EVENT(xfs_inodegc_worker,
+ TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
+ TP_ARGS(mp, shrinker_hits),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, shrinker_hits)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->shrinker_hits = shrinker_hits;
+ ),
+ TP_printk("dev %d:%d shrinker_hits %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->shrinker_hits)
+);
+
+DECLARE_EVENT_CLASS(xfs_fs_class,
+ TP_PROTO(struct xfs_mount *mp, void *caller_ip),
+ TP_ARGS(mp, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long long, mflags)
+ __field(unsigned long, opstate)
+ __field(unsigned long, sbflags)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ if (mp) {
+ __entry->dev = mp->m_super->s_dev;
+ __entry->mflags = mp->m_features;
+ __entry->opstate = mp->m_opstate;
+ __entry->sbflags = mp->m_super->s_flags;
+ }
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d m_features 0x%llx opstate (%s) s_flags 0x%lx caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->mflags,
+ __print_flags(__entry->opstate, "|", XFS_OPSTATE_STRINGS),
+ __entry->sbflags,
+ __entry->caller_ip)
+);
+
+#define DEFINE_FS_EVENT(name) \
+DEFINE_EVENT(xfs_fs_class, name, \
+ TP_PROTO(struct xfs_mount *mp, void *caller_ip), \
+ TP_ARGS(mp, caller_ip))
+DEFINE_FS_EVENT(xfs_inodegc_flush);
+DEFINE_FS_EVENT(xfs_inodegc_push);
+DEFINE_FS_EVENT(xfs_inodegc_start);
+DEFINE_FS_EVENT(xfs_inodegc_stop);
+DEFINE_FS_EVENT(xfs_inodegc_queue);
+DEFINE_FS_EVENT(xfs_inodegc_throttle);
+DEFINE_FS_EVENT(xfs_fs_sync_fs);
+DEFINE_FS_EVENT(xfs_blockgc_start);
+DEFINE_FS_EVENT(xfs_blockgc_stop);
+DEFINE_FS_EVENT(xfs_blockgc_worker);
+DEFINE_FS_EVENT(xfs_blockgc_flush_all);
+
+TRACE_EVENT(xfs_inodegc_shrinker_scan,
+ TP_PROTO(struct xfs_mount *mp, struct shrink_control *sc,
+ void *caller_ip),
+ TP_ARGS(mp, sc, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, nr_to_scan)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->nr_to_scan = sc->nr_to_scan;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d nr_to_scan %lu caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->nr_to_scan,
+ __entry->caller_ip)
+);
+
+DECLARE_EVENT_CLASS(xfs_ag_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
+ TP_ARGS(mp, agno),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ ),
+ TP_printk("dev %d:%d agno 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno)
+);
+#define DEFINE_AG_EVENT(name) \
+DEFINE_EVENT(xfs_ag_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), \
+ TP_ARGS(mp, agno))
+
+DEFINE_AG_EVENT(xfs_read_agf);
+DEFINE_AG_EVENT(xfs_alloc_read_agf);
+DEFINE_AG_EVENT(xfs_read_agi);
+DEFINE_AG_EVENT(xfs_ialloc_read_agi);
+
+TRACE_EVENT(xfs_attr_list_node_descend,
+ TP_PROTO(struct xfs_attr_list_context *ctx,
+ struct xfs_da_node_entry *btree),
+ TP_ARGS(ctx, btree),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(u32, hashval)
+ __field(u32, blkno)
+ __field(u32, offset)
+ __field(void *, buffer)
+ __field(int, bufsize)
+ __field(int, count)
+ __field(int, firstu)
+ __field(int, dupcnt)
+ __field(unsigned int, attr_filter)
+ __field(u32, bt_hashval)
+ __field(u32, bt_before)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
+ __entry->ino = ctx->dp->i_ino;
+ __entry->hashval = ctx->cursor.hashval;
+ __entry->blkno = ctx->cursor.blkno;
+ __entry->offset = ctx->cursor.offset;
+ __entry->buffer = ctx->buffer;
+ __entry->bufsize = ctx->bufsize;
+ __entry->count = ctx->count;
+ __entry->firstu = ctx->firstu;
+ __entry->attr_filter = ctx->attr_filter;
+ __entry->bt_hashval = be32_to_cpu(btree->hashval);
+ __entry->bt_before = be32_to_cpu(btree->before);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
+ "buffer %p size %u count %u firstu %u filter %s "
+ "node hashval %u, node before %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->hashval,
+ __entry->blkno,
+ __entry->offset,
+ __entry->dupcnt,
+ __entry->buffer,
+ __entry->bufsize,
+ __entry->count,
+ __entry->firstu,
+ __print_flags(__entry->attr_filter, "|",
+ XFS_ATTR_FILTER_FLAGS),
+ __entry->bt_hashval,
+ __entry->bt_before)
+);
+
+DECLARE_EVENT_CLASS(xfs_bmap_class,
+ TP_PROTO(struct xfs_inode *ip, struct xfs_iext_cursor *cur, int state,
+ unsigned long caller_ip),
+ TP_ARGS(ip, cur, state, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(void *, leaf)
+ __field(int, pos)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_filblks_t, blockcount)
+ __field(xfs_exntst_t, state)
+ __field(int, bmap_state)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ struct xfs_ifork *ifp;
+ struct xfs_bmbt_irec r;
+
+ ifp = xfs_iext_state_to_fork(ip, state);
+ xfs_iext_get_extent(ifp, cur, &r);
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->leaf = cur->leaf;
+ __entry->pos = cur->pos;
+ __entry->startoff = r.br_startoff;
+ __entry->startblock = r.br_startblock;
+ __entry->blockcount = r.br_blockcount;
+ __entry->state = r.br_state;
+ __entry->bmap_state = state;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx state %s cur %p/%d "
+ "fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx flag %d caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
+ __entry->leaf,
+ __entry->pos,
+ __entry->startoff,
+ (int64_t)__entry->startblock,
+ __entry->blockcount,
+ __entry->state,
+ (char *)__entry->caller_ip)
+)
+
+#define DEFINE_BMAP_EVENT(name) \
+DEFINE_EVENT(xfs_bmap_class, name, \
+ TP_PROTO(struct xfs_inode *ip, struct xfs_iext_cursor *cur, int state, \
+ unsigned long caller_ip), \
+ TP_ARGS(ip, cur, state, caller_ip))
+DEFINE_BMAP_EVENT(xfs_iext_insert);
+DEFINE_BMAP_EVENT(xfs_iext_remove);
+DEFINE_BMAP_EVENT(xfs_bmap_pre_update);
+DEFINE_BMAP_EVENT(xfs_bmap_post_update);
+DEFINE_BMAP_EVENT(xfs_read_extent);
+DEFINE_BMAP_EVENT(xfs_write_extent);
+
+DECLARE_EVENT_CLASS(xfs_buf_class,
+ TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
+ TP_ARGS(bp, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_daddr_t, bno)
+ __field(int, nblks)
+ __field(int, hold)
+ __field(int, pincount)
+ __field(unsigned, lockval)
+ __field(unsigned, flags)
+ __field(unsigned long, caller_ip)
+ __field(const void *, buf_ops)
+ ),
+ TP_fast_assign(
+ __entry->dev = bp->b_target->bt_dev;
+ __entry->bno = xfs_buf_daddr(bp);
+ __entry->nblks = bp->b_length;
+ __entry->hold = atomic_read(&bp->b_hold);
+ __entry->pincount = atomic_read(&bp->b_pin_count);
+ __entry->lockval = bp->b_sema.count;
+ __entry->flags = bp->b_flags;
+ __entry->caller_ip = caller_ip;
+ __entry->buf_ops = bp->b_ops;
+ ),
+ TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
+ "lock %d flags %s bufops %pS caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->bno,
+ __entry->nblks,
+ __entry->hold,
+ __entry->pincount,
+ __entry->lockval,
+ __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+ __entry->buf_ops,
+ (void *)__entry->caller_ip)
+)
+
+#define DEFINE_BUF_EVENT(name) \
+DEFINE_EVENT(xfs_buf_class, name, \
+ TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \
+ TP_ARGS(bp, caller_ip))
+DEFINE_BUF_EVENT(xfs_buf_init);
+DEFINE_BUF_EVENT(xfs_buf_free);
+DEFINE_BUF_EVENT(xfs_buf_hold);
+DEFINE_BUF_EVENT(xfs_buf_rele);
+DEFINE_BUF_EVENT(xfs_buf_iodone);
+DEFINE_BUF_EVENT(xfs_buf_submit);
+DEFINE_BUF_EVENT(xfs_buf_lock);
+DEFINE_BUF_EVENT(xfs_buf_lock_done);
+DEFINE_BUF_EVENT(xfs_buf_trylock_fail);
+DEFINE_BUF_EVENT(xfs_buf_trylock);
+DEFINE_BUF_EVENT(xfs_buf_unlock);
+DEFINE_BUF_EVENT(xfs_buf_iowait);
+DEFINE_BUF_EVENT(xfs_buf_iowait_done);
+DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
+DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
+DEFINE_BUF_EVENT(xfs_buf_delwri_split);
+DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf);
+DEFINE_BUF_EVENT(xfs_buf_get_uncached);
+DEFINE_BUF_EVENT(xfs_buf_item_relse);
+DEFINE_BUF_EVENT(xfs_buf_iodone_async);
+DEFINE_BUF_EVENT(xfs_buf_error_relse);
+DEFINE_BUF_EVENT(xfs_buf_drain_buftarg);
+DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
+
+/* not really buffer traces, but the buf provides useful information */
+DEFINE_BUF_EVENT(xfs_btree_corrupt);
+DEFINE_BUF_EVENT(xfs_reset_dqcounts);
+
+/* pass flags explicitly */
+DECLARE_EVENT_CLASS(xfs_buf_flags_class,
+ TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip),
+ TP_ARGS(bp, flags, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_daddr_t, bno)
+ __field(unsigned int, length)
+ __field(int, hold)
+ __field(int, pincount)
+ __field(unsigned, lockval)
+ __field(unsigned, flags)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = bp->b_target->bt_dev;
+ __entry->bno = xfs_buf_daddr(bp);
+ __entry->length = bp->b_length;
+ __entry->flags = flags;
+ __entry->hold = atomic_read(&bp->b_hold);
+ __entry->pincount = atomic_read(&bp->b_pin_count);
+ __entry->lockval = bp->b_sema.count;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
+ "lock %d flags %s caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->bno,
+ __entry->length,
+ __entry->hold,
+ __entry->pincount,
+ __entry->lockval,
+ __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+ (void *)__entry->caller_ip)
+)
+
+#define DEFINE_BUF_FLAGS_EVENT(name) \
+DEFINE_EVENT(xfs_buf_flags_class, name, \
+ TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \
+ TP_ARGS(bp, flags, caller_ip))
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_find);
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_get);
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_read);
+
+TRACE_EVENT(xfs_buf_ioerror,
+ TP_PROTO(struct xfs_buf *bp, int error, xfs_failaddr_t caller_ip),
+ TP_ARGS(bp, error, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_daddr_t, bno)
+ __field(unsigned int, length)
+ __field(unsigned, flags)
+ __field(int, hold)
+ __field(int, pincount)
+ __field(unsigned, lockval)
+ __field(int, error)
+ __field(xfs_failaddr_t, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = bp->b_target->bt_dev;
+ __entry->bno = xfs_buf_daddr(bp);
+ __entry->length = bp->b_length;
+ __entry->hold = atomic_read(&bp->b_hold);
+ __entry->pincount = atomic_read(&bp->b_pin_count);
+ __entry->lockval = bp->b_sema.count;
+ __entry->error = error;
+ __entry->flags = bp->b_flags;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
+ "lock %d error %d flags %s caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->bno,
+ __entry->length,
+ __entry->hold,
+ __entry->pincount,
+ __entry->lockval,
+ __entry->error,
+ __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+ (void *)__entry->caller_ip)
+);
+
+DECLARE_EVENT_CLASS(xfs_buf_item_class,
+ TP_PROTO(struct xfs_buf_log_item *bip),
+ TP_ARGS(bip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_daddr_t, buf_bno)
+ __field(unsigned int, buf_len)
+ __field(int, buf_hold)
+ __field(int, buf_pincount)
+ __field(int, buf_lockval)
+ __field(unsigned, buf_flags)
+ __field(unsigned, bli_recur)
+ __field(int, bli_refcount)
+ __field(unsigned, bli_flags)
+ __field(unsigned long, li_flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = bip->bli_buf->b_target->bt_dev;
+ __entry->bli_flags = bip->bli_flags;
+ __entry->bli_recur = bip->bli_recur;
+ __entry->bli_refcount = atomic_read(&bip->bli_refcount);
+ __entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
+ __entry->buf_len = bip->bli_buf->b_length;
+ __entry->buf_flags = bip->bli_buf->b_flags;
+ __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
+ __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
+ __entry->buf_lockval = bip->bli_buf->b_sema.count;
+ __entry->li_flags = bip->bli_item.li_flags;
+ ),
+ TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
+ "lock %d flags %s recur %d refcount %d bliflags %s "
+ "liflags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->buf_bno,
+ __entry->buf_len,
+ __entry->buf_hold,
+ __entry->buf_pincount,
+ __entry->buf_lockval,
+ __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS),
+ __entry->bli_recur,
+ __entry->bli_refcount,
+ __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS),
+ __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS))
+)
+
+#define DEFINE_BUF_ITEM_EVENT(name) \
+DEFINE_EVENT(xfs_buf_item_class, name, \
+ TP_PROTO(struct xfs_buf_log_item *bip), \
+ TP_ARGS(bip))
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_release);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
+
+DECLARE_EVENT_CLASS(xfs_filestream_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno),
+ TP_ARGS(mp, ino, agno),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agnumber_t, agno)
+ __field(int, streams)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->agno = agno;
+ __entry->streams = xfs_filestream_peek_ag(mp, agno);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->agno,
+ __entry->streams)
+)
+#define DEFINE_FILESTREAM_EVENT(name) \
+DEFINE_EVENT(xfs_filestream_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno), \
+ TP_ARGS(mp, ino, agno))
+DEFINE_FILESTREAM_EVENT(xfs_filestream_free);
+DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup);
+DEFINE_FILESTREAM_EVENT(xfs_filestream_scan);
+
+TRACE_EVENT(xfs_filestream_pick,
+ TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno,
+ xfs_extlen_t free, int nscan),
+ TP_ARGS(ip, agno, free, nscan),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agnumber_t, agno)
+ __field(int, streams)
+ __field(xfs_extlen_t, free)
+ __field(int, nscan)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->agno = agno;
+ __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno);
+ __entry->free = free;
+ __entry->nscan = nscan;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d nscan %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->agno,
+ __entry->streams,
+ __entry->free,
+ __entry->nscan)
+);
+
+DECLARE_EVENT_CLASS(xfs_lock_class,
+ TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
+ unsigned long caller_ip),
+ TP_ARGS(ip, lock_flags, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, lock_flags)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->lock_flags = lock_flags;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx flags %s caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS),
+ (void *)__entry->caller_ip)
+)
+
+#define DEFINE_LOCK_EVENT(name) \
+DEFINE_EVENT(xfs_lock_class, name, \
+ TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \
+ unsigned long caller_ip), \
+ TP_ARGS(ip, lock_flags, caller_ip))
+DEFINE_LOCK_EVENT(xfs_ilock);
+DEFINE_LOCK_EVENT(xfs_ilock_nowait);
+DEFINE_LOCK_EVENT(xfs_ilock_demote);
+DEFINE_LOCK_EVENT(xfs_iunlock);
+
+DECLARE_EVENT_CLASS(xfs_inode_class,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned long, iflags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->iflags = ip->i_flags;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->iflags)
+)
+
+#define DEFINE_INODE_EVENT(name) \
+DEFINE_EVENT(xfs_inode_class, name, \
+ TP_PROTO(struct xfs_inode *ip), \
+ TP_ARGS(ip))
+DEFINE_INODE_EVENT(xfs_iget_skip);
+DEFINE_INODE_EVENT(xfs_iget_recycle);
+DEFINE_INODE_EVENT(xfs_iget_recycle_fail);
+DEFINE_INODE_EVENT(xfs_iget_hit);
+DEFINE_INODE_EVENT(xfs_iget_miss);
+
+DEFINE_INODE_EVENT(xfs_getattr);
+DEFINE_INODE_EVENT(xfs_setattr);
+DEFINE_INODE_EVENT(xfs_readlink);
+DEFINE_INODE_EVENT(xfs_inactive_symlink);
+DEFINE_INODE_EVENT(xfs_alloc_file_space);
+DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_zero_file_space);
+DEFINE_INODE_EVENT(xfs_collapse_file_space);
+DEFINE_INODE_EVENT(xfs_insert_file_space);
+DEFINE_INODE_EVENT(xfs_readdir);
+#ifdef CONFIG_XFS_POSIX_ACL
+DEFINE_INODE_EVENT(xfs_get_acl);
+#endif
+DEFINE_INODE_EVENT(xfs_vm_bmap);
+DEFINE_INODE_EVENT(xfs_file_ioctl);
+DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
+DEFINE_INODE_EVENT(xfs_ioctl_setattr);
+DEFINE_INODE_EVENT(xfs_dir_fsync);
+DEFINE_INODE_EVENT(xfs_file_fsync);
+DEFINE_INODE_EVENT(xfs_destroy_inode);
+DEFINE_INODE_EVENT(xfs_update_time);
+
+DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
+DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
+
+DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
+DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
+DEFINE_INODE_EVENT(xfs_inode_set_reclaimable);
+DEFINE_INODE_EVENT(xfs_inode_reclaiming);
+DEFINE_INODE_EVENT(xfs_inode_set_need_inactive);
+DEFINE_INODE_EVENT(xfs_inode_inactivating);
+
+/*
+ * ftrace's __print_symbolic requires that all enum values be wrapped in the
+ * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
+ * ring buffer. Somehow this was only worth mentioning in the ftrace sample
+ * code.
+ */
+TRACE_DEFINE_ENUM(PE_SIZE_PTE);
+TRACE_DEFINE_ENUM(PE_SIZE_PMD);
+TRACE_DEFINE_ENUM(PE_SIZE_PUD);
+
+TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
+TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
+
+TRACE_EVENT(xfs_filemap_fault,
+ TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
+ bool write_fault),
+ TP_ARGS(ip, pe_size, write_fault),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(enum page_entry_size, pe_size)
+ __field(bool, write_fault)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->pe_size = pe_size;
+ __entry->write_fault = write_fault;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx %s write_fault %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->pe_size,
+ { PE_SIZE_PTE, "PTE" },
+ { PE_SIZE_PMD, "PMD" },
+ { PE_SIZE_PUD, "PUD" }),
+ __entry->write_fault)
+)
+
+DECLARE_EVENT_CLASS(xfs_iref_class,
+ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
+ TP_ARGS(ip, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, count)
+ __field(int, pincount)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->count = atomic_read(&VFS_I(ip)->i_count);
+ __entry->pincount = atomic_read(&ip->i_pincount);
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->count,
+ __entry->pincount,
+ (char *)__entry->caller_ip)
+)
+
+TRACE_EVENT(xfs_iomap_prealloc_size,
+ TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t blocks, int shift,
+ unsigned int writeio_blocks),
+ TP_ARGS(ip, blocks, shift, writeio_blocks),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsblock_t, blocks)
+ __field(int, shift)
+ __field(unsigned int, writeio_blocks)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->blocks = blocks;
+ __entry->shift = shift;
+ __entry->writeio_blocks = writeio_blocks;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d "
+ "m_allocsize_blocks %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino,
+ __entry->blocks, __entry->shift, __entry->writeio_blocks)
+)
+
+TRACE_EVENT(xfs_irec_merge_pre,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask),
+ TP_ARGS(mp, agno, agino, holemask, nagino, nholemask),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(uint16_t, holemask)
+ __field(xfs_agino_t, nagino)
+ __field(uint16_t, nholemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->holemask = holemask;
+ __entry->nagino = nagino;
+ __entry->nholemask = nholemask;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x new_agino 0x%x new_holemask 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->holemask,
+ __entry->nagino,
+ __entry->nholemask)
+)
+
+TRACE_EVENT(xfs_irec_merge_post,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ uint16_t holemask),
+ TP_ARGS(mp, agno, agino, holemask),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(uint16_t, holemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->holemask = holemask;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x",
+ MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->holemask)
+)
+
+#define DEFINE_IREF_EVENT(name) \
+DEFINE_EVENT(xfs_iref_class, name, \
+ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
+ TP_ARGS(ip, caller_ip))
+DEFINE_IREF_EVENT(xfs_irele);
+DEFINE_IREF_EVENT(xfs_inode_pin);
+DEFINE_IREF_EVENT(xfs_inode_unpin);
+DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
+
+DECLARE_EVENT_CLASS(xfs_namespace_class,
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name),
+ TP_ARGS(dp, name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dp_ino)
+ __field(int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(dp)->i_sb->s_dev;
+ __entry->dp_ino = dp->i_ino;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d dp ino 0x%llx name %.*s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dp_ino,
+ __entry->namelen,
+ __get_str(name))
+)
+
+#define DEFINE_NAMESPACE_EVENT(name) \
+DEFINE_EVENT(xfs_namespace_class, name, \
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), \
+ TP_ARGS(dp, name))
+DEFINE_NAMESPACE_EVENT(xfs_remove);
+DEFINE_NAMESPACE_EVENT(xfs_link);
+DEFINE_NAMESPACE_EVENT(xfs_lookup);
+DEFINE_NAMESPACE_EVENT(xfs_create);
+DEFINE_NAMESPACE_EVENT(xfs_symlink);
+
+TRACE_EVENT(xfs_rename,
+ TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp,
+ struct xfs_name *src_name, struct xfs_name *target_name),
+ TP_ARGS(src_dp, target_dp, src_name, target_name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, src_dp_ino)
+ __field(xfs_ino_t, target_dp_ino)
+ __field(int, src_namelen)
+ __field(int, target_namelen)
+ __dynamic_array(char, src_name, src_name->len)
+ __dynamic_array(char, target_name, target_name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(src_dp)->i_sb->s_dev;
+ __entry->src_dp_ino = src_dp->i_ino;
+ __entry->target_dp_ino = target_dp->i_ino;
+ __entry->src_namelen = src_name->len;
+ __entry->target_namelen = target_name->len;
+ memcpy(__get_str(src_name), src_name->name, src_name->len);
+ memcpy(__get_str(target_name), target_name->name,
+ target_name->len);
+ ),
+ TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx"
+ " src name %.*s target name %.*s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->src_dp_ino,
+ __entry->target_dp_ino,
+ __entry->src_namelen,
+ __get_str(src_name),
+ __entry->target_namelen,
+ __get_str(target_name))
+)
+
+DECLARE_EVENT_CLASS(xfs_dquot_class,
+ TP_PROTO(struct xfs_dquot *dqp),
+ TP_ARGS(dqp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, id)
+ __field(xfs_dqtype_t, type)
+ __field(unsigned, flags)
+ __field(unsigned, nrefs)
+ __field(unsigned long long, res_bcount)
+ __field(unsigned long long, res_rtbcount)
+ __field(unsigned long long, res_icount)
+
+ __field(unsigned long long, bcount)
+ __field(unsigned long long, rtbcount)
+ __field(unsigned long long, icount)
+
+ __field(unsigned long long, blk_hardlimit)
+ __field(unsigned long long, blk_softlimit)
+ __field(unsigned long long, rtb_hardlimit)
+ __field(unsigned long long, rtb_softlimit)
+ __field(unsigned long long, ino_hardlimit)
+ __field(unsigned long long, ino_softlimit)
+ ),
+ TP_fast_assign(
+ __entry->dev = dqp->q_mount->m_super->s_dev;
+ __entry->id = dqp->q_id;
+ __entry->type = dqp->q_type;
+ __entry->flags = dqp->q_flags;
+ __entry->nrefs = dqp->q_nrefs;
+
+ __entry->res_bcount = dqp->q_blk.reserved;
+ __entry->res_rtbcount = dqp->q_rtb.reserved;
+ __entry->res_icount = dqp->q_ino.reserved;
+
+ __entry->bcount = dqp->q_blk.count;
+ __entry->rtbcount = dqp->q_rtb.count;
+ __entry->icount = dqp->q_ino.count;
+
+ __entry->blk_hardlimit = dqp->q_blk.hardlimit;
+ __entry->blk_softlimit = dqp->q_blk.softlimit;
+ __entry->rtb_hardlimit = dqp->q_rtb.hardlimit;
+ __entry->rtb_softlimit = dqp->q_rtb.softlimit;
+ __entry->ino_hardlimit = dqp->q_ino.hardlimit;
+ __entry->ino_softlimit = dqp->q_ino.softlimit;
+ ),
+ TP_printk("dev %d:%d id 0x%x type %s flags %s nrefs %u "
+ "res_bc 0x%llx res_rtbc 0x%llx res_ic 0x%llx "
+ "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
+ "rtbcnt 0x%llx rtbhardlimit 0x%llx rtbsoftlimit 0x%llx "
+ "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->id,
+ __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
+ __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS),
+ __entry->nrefs,
+ __entry->res_bcount,
+ __entry->res_rtbcount,
+ __entry->res_icount,
+ __entry->bcount,
+ __entry->blk_hardlimit,
+ __entry->blk_softlimit,
+ __entry->rtbcount,
+ __entry->rtb_hardlimit,
+ __entry->rtb_softlimit,
+ __entry->icount,
+ __entry->ino_hardlimit,
+ __entry->ino_softlimit)
+)
+
+#define DEFINE_DQUOT_EVENT(name) \
+DEFINE_EVENT(xfs_dquot_class, name, \
+ TP_PROTO(struct xfs_dquot *dqp), \
+ TP_ARGS(dqp))
+DEFINE_DQUOT_EVENT(xfs_dqadjust);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_done);
+DEFINE_DQUOT_EVENT(xfs_dqattach_found);
+DEFINE_DQUOT_EVENT(xfs_dqattach_get);
+DEFINE_DQUOT_EVENT(xfs_dqalloc);
+DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
+DEFINE_DQUOT_EVENT(xfs_dqread);
+DEFINE_DQUOT_EVENT(xfs_dqread_fail);
+DEFINE_DQUOT_EVENT(xfs_dqget_hit);
+DEFINE_DQUOT_EVENT(xfs_dqget_miss);
+DEFINE_DQUOT_EVENT(xfs_dqget_freeing);
+DEFINE_DQUOT_EVENT(xfs_dqget_dup);
+DEFINE_DQUOT_EVENT(xfs_dqput);
+DEFINE_DQUOT_EVENT(xfs_dqput_free);
+DEFINE_DQUOT_EVENT(xfs_dqrele);
+DEFINE_DQUOT_EVENT(xfs_dqflush);
+DEFINE_DQUOT_EVENT(xfs_dqflush_force);
+DEFINE_DQUOT_EVENT(xfs_dqflush_done);
+DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before);
+DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after);
+
+TRACE_EVENT(xfs_trans_mod_dquot,
+ TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp,
+ unsigned int field, int64_t delta),
+ TP_ARGS(tp, dqp, field, delta),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_dqtype_t, type)
+ __field(unsigned int, flags)
+ __field(unsigned int, dqid)
+ __field(unsigned int, field)
+ __field(int64_t, delta)
+ ),
+ TP_fast_assign(
+ __entry->dev = tp->t_mountp->m_super->s_dev;
+ __entry->type = dqp->q_type;
+ __entry->flags = dqp->q_flags;
+ __entry->dqid = dqp->q_id;
+ __entry->field = field;
+ __entry->delta = delta;
+ ),
+ TP_printk("dev %d:%d dquot id 0x%x type %s flags %s field %s delta %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dqid,
+ __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
+ __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS),
+ __print_flags(__entry->field, "|", XFS_QMOPT_FLAGS),
+ __entry->delta)
+);
+
+DECLARE_EVENT_CLASS(xfs_dqtrx_class,
+ TP_PROTO(struct xfs_dqtrx *qtrx),
+ TP_ARGS(qtrx),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_dqtype_t, type)
+ __field(unsigned int, flags)
+ __field(u32, dqid)
+
+ __field(uint64_t, blk_res)
+ __field(int64_t, bcount_delta)
+ __field(int64_t, delbcnt_delta)
+
+ __field(uint64_t, rtblk_res)
+ __field(uint64_t, rtblk_res_used)
+ __field(int64_t, rtbcount_delta)
+ __field(int64_t, delrtb_delta)
+
+ __field(uint64_t, ino_res)
+ __field(uint64_t, ino_res_used)
+ __field(int64_t, icount_delta)
+ ),
+ TP_fast_assign(
+ __entry->dev = qtrx->qt_dquot->q_mount->m_super->s_dev;
+ __entry->type = qtrx->qt_dquot->q_type;
+ __entry->flags = qtrx->qt_dquot->q_flags;
+ __entry->dqid = qtrx->qt_dquot->q_id;
+
+ __entry->blk_res = qtrx->qt_blk_res;
+ __entry->bcount_delta = qtrx->qt_bcount_delta;
+ __entry->delbcnt_delta = qtrx->qt_delbcnt_delta;
+
+ __entry->rtblk_res = qtrx->qt_rtblk_res;
+ __entry->rtblk_res_used = qtrx->qt_rtblk_res_used;
+ __entry->rtbcount_delta = qtrx->qt_rtbcount_delta;
+ __entry->delrtb_delta = qtrx->qt_delrtb_delta;
+
+ __entry->ino_res = qtrx->qt_ino_res;
+ __entry->ino_res_used = qtrx->qt_ino_res_used;
+ __entry->icount_delta = qtrx->qt_icount_delta;
+ ),
+ TP_printk("dev %d:%d dquot id 0x%x type %s flags %s "
+ "blk_res %llu bcount_delta %lld delbcnt_delta %lld "
+ "rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld "
+ "ino_res %llu ino_res_used %llu icount_delta %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dqid,
+ __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
+ __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS),
+
+ __entry->blk_res,
+ __entry->bcount_delta,
+ __entry->delbcnt_delta,
+
+ __entry->rtblk_res,
+ __entry->rtblk_res_used,
+ __entry->rtbcount_delta,
+ __entry->delrtb_delta,
+
+ __entry->ino_res,
+ __entry->ino_res_used,
+ __entry->icount_delta)
+)
+
+#define DEFINE_DQTRX_EVENT(name) \
+DEFINE_EVENT(xfs_dqtrx_class, name, \
+ TP_PROTO(struct xfs_dqtrx *qtrx), \
+ TP_ARGS(qtrx))
+DEFINE_DQTRX_EVENT(xfs_trans_apply_dquot_deltas);
+DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_before);
+DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_after);
+
+DECLARE_EVENT_CLASS(xfs_loggrant_class,
+ TP_PROTO(struct xlog *log, struct xlog_ticket *tic),
+ TP_ARGS(log, tic),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(char, ocnt)
+ __field(char, cnt)
+ __field(int, curr_res)
+ __field(int, unit_res)
+ __field(unsigned int, flags)
+ __field(int, reserveq)
+ __field(int, writeq)
+ __field(int, grant_reserve_cycle)
+ __field(int, grant_reserve_bytes)
+ __field(int, grant_write_cycle)
+ __field(int, grant_write_bytes)
+ __field(int, curr_cycle)
+ __field(int, curr_block)
+ __field(xfs_lsn_t, tail_lsn)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->ocnt = tic->t_ocnt;
+ __entry->cnt = tic->t_cnt;
+ __entry->curr_res = tic->t_curr_res;
+ __entry->unit_res = tic->t_unit_res;
+ __entry->flags = tic->t_flags;
+ __entry->reserveq = list_empty(&log->l_reserve_head.waiters);
+ __entry->writeq = list_empty(&log->l_write_head.waiters);
+ xlog_crack_grant_head(&log->l_reserve_head.grant,
+ &__entry->grant_reserve_cycle,
+ &__entry->grant_reserve_bytes);
+ xlog_crack_grant_head(&log->l_write_head.grant,
+ &__entry->grant_write_cycle,
+ &__entry->grant_write_bytes);
+ __entry->curr_cycle = log->l_curr_cycle;
+ __entry->curr_block = log->l_curr_block;
+ __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
+ ),
+ TP_printk("dev %d:%d t_ocnt %u t_cnt %u t_curr_res %u "
+ "t_unit_res %u t_flags %s reserveq %s "
+ "writeq %s grant_reserve_cycle %d "
+ "grant_reserve_bytes %d grant_write_cycle %d "
+ "grant_write_bytes %d curr_cycle %d curr_block %d "
+ "tail_cycle %d tail_block %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ocnt,
+ __entry->cnt,
+ __entry->curr_res,
+ __entry->unit_res,
+ __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
+ __entry->reserveq ? "empty" : "active",
+ __entry->writeq ? "empty" : "active",
+ __entry->grant_reserve_cycle,
+ __entry->grant_reserve_bytes,
+ __entry->grant_write_cycle,
+ __entry->grant_write_bytes,
+ __entry->curr_cycle,
+ __entry->curr_block,
+ CYCLE_LSN(__entry->tail_lsn),
+ BLOCK_LSN(__entry->tail_lsn)
+ )
+)
+
+#define DEFINE_LOGGRANT_EVENT(name) \
+DEFINE_EVENT(xfs_loggrant_class, name, \
+ TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \
+ TP_ARGS(log, tic))
+DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
+DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
+DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait);
+
+DECLARE_EVENT_CLASS(xfs_log_item_class,
+ TP_PROTO(struct xfs_log_item *lip),
+ TP_ARGS(lip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(void *, lip)
+ __field(uint, type)
+ __field(unsigned long, flags)
+ __field(xfs_lsn_t, lsn)
+ ),
+ TP_fast_assign(
+ __entry->dev = lip->li_log->l_mp->m_super->s_dev;
+ __entry->lip = lip;
+ __entry->type = lip->li_type;
+ __entry->flags = lip->li_flags;
+ __entry->lsn = lip->li_lsn;
+ ),
+ TP_printk("dev %d:%d lip %p lsn %d/%d type %s flags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->lip,
+ CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn),
+ __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+ __print_flags(__entry->flags, "|", XFS_LI_FLAGS))
+)
+
+TRACE_EVENT(xfs_log_force,
+ TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn, unsigned long caller_ip),
+ TP_ARGS(mp, lsn, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_lsn_t, lsn)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->lsn = lsn;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d lsn 0x%llx caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->lsn, (void *)__entry->caller_ip)
+)
+
+#define DEFINE_LOG_ITEM_EVENT(name) \
+DEFINE_EVENT(xfs_log_item_class, name, \
+ TP_PROTO(struct xfs_log_item *lip), \
+ TP_ARGS(lip))
+DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin);
+
+DECLARE_EVENT_CLASS(xfs_ail_class,
+ TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
+ TP_ARGS(lip, old_lsn, new_lsn),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(void *, lip)
+ __field(uint, type)
+ __field(unsigned long, flags)
+ __field(xfs_lsn_t, old_lsn)
+ __field(xfs_lsn_t, new_lsn)
+ ),
+ TP_fast_assign(
+ __entry->dev = lip->li_log->l_mp->m_super->s_dev;
+ __entry->lip = lip;
+ __entry->type = lip->li_type;
+ __entry->flags = lip->li_flags;
+ __entry->old_lsn = old_lsn;
+ __entry->new_lsn = new_lsn;
+ ),
+ TP_printk("dev %d:%d lip %p old lsn %d/%d new lsn %d/%d type %s flags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->lip,
+ CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn),
+ CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn),
+ __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+ __print_flags(__entry->flags, "|", XFS_LI_FLAGS))
+)
+
+#define DEFINE_AIL_EVENT(name) \
+DEFINE_EVENT(xfs_ail_class, name, \
+ TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), \
+ TP_ARGS(lip, old_lsn, new_lsn))
+DEFINE_AIL_EVENT(xfs_ail_insert);
+DEFINE_AIL_EVENT(xfs_ail_move);
+DEFINE_AIL_EVENT(xfs_ail_delete);
+
+TRACE_EVENT(xfs_log_assign_tail_lsn,
+ TP_PROTO(struct xlog *log, xfs_lsn_t new_lsn),
+ TP_ARGS(log, new_lsn),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_lsn_t, new_lsn)
+ __field(xfs_lsn_t, old_lsn)
+ __field(xfs_lsn_t, last_sync_lsn)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->new_lsn = new_lsn;
+ __entry->old_lsn = atomic64_read(&log->l_tail_lsn);
+ __entry->last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
+ ),
+ TP_printk("dev %d:%d new tail lsn %d/%d, old lsn %d/%d, last sync %d/%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn),
+ CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn),
+ CYCLE_LSN(__entry->last_sync_lsn), BLOCK_LSN(__entry->last_sync_lsn))
+)
+
+DECLARE_EVENT_CLASS(xfs_file_class,
+ TP_PROTO(struct kiocb *iocb, struct iov_iter *iter),
+ TP_ARGS(iocb, iter),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsize_t, size)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ ),
+ TP_fast_assign(
+ __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
+ __entry->ino = XFS_I(file_inode(iocb->ki_filp))->i_ino;
+ __entry->size = XFS_I(file_inode(iocb->ki_filp))->i_disk_size;
+ __entry->offset = iocb->ki_pos;
+ __entry->count = iov_iter_count(iter);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx pos 0x%llx bytecount 0x%zx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->offset,
+ __entry->count)
+)
+
+#define DEFINE_RW_EVENT(name) \
+DEFINE_EVENT(xfs_file_class, name, \
+ TP_PROTO(struct kiocb *iocb, struct iov_iter *iter), \
+ TP_ARGS(iocb, iter))
+DEFINE_RW_EVENT(xfs_file_buffered_read);
+DEFINE_RW_EVENT(xfs_file_direct_read);
+DEFINE_RW_EVENT(xfs_file_dax_read);
+DEFINE_RW_EVENT(xfs_file_buffered_write);
+DEFINE_RW_EVENT(xfs_file_direct_write);
+DEFINE_RW_EVENT(xfs_file_dax_write);
+DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write);
+
+DECLARE_EVENT_CLASS(xfs_imap_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
+ int whichfork, struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, offset, count, whichfork, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(loff_t, size)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_filblks_t, blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_disk_size;
+ __entry->offset = offset;
+ __entry->count = count;
+ __entry->whichfork = whichfork;
+ __entry->startoff = irec ? irec->br_startoff : 0;
+ __entry->startblock = irec ? irec->br_startblock : 0;
+ __entry->blockcount = irec ? irec->br_blockcount : 0;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx pos 0x%llx bytecount 0x%zx "
+ "fork %s startoff 0x%llx startblock 0x%llx fsbcount 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->offset,
+ __entry->count,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->startoff,
+ (int64_t)__entry->startblock,
+ __entry->blockcount)
+)
+
+#define DEFINE_IMAP_EVENT(name) \
+DEFINE_EVENT(xfs_imap_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
+ int whichfork, struct xfs_bmbt_irec *irec), \
+ TP_ARGS(ip, offset, count, whichfork, irec))
+DEFINE_IMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_found);
+
+DECLARE_EVENT_CLASS(xfs_simple_io_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
+ TP_ARGS(ip, offset, count),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(loff_t, isize)
+ __field(loff_t, disize)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->isize = VFS_I(ip)->i_size;
+ __entry->disize = ip->i_disk_size;
+ __entry->offset = offset;
+ __entry->count = count;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
+ "pos 0x%llx bytecount 0x%zx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->isize,
+ __entry->disize,
+ __entry->offset,
+ __entry->count)
+);
+
+#define DEFINE_SIMPLE_IO_EVENT(name) \
+DEFINE_EVENT(xfs_simple_io_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
+ TP_ARGS(ip, offset, count))
+DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
+DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
+DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
+DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
+
+DECLARE_EVENT_CLASS(xfs_itrunc_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
+ TP_ARGS(ip, new_size),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsize_t, size)
+ __field(xfs_fsize_t, new_size)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_disk_size;
+ __entry->new_size = new_size;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx new_size 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->new_size)
+)
+
+#define DEFINE_ITRUNC_EVENT(name) \
+DEFINE_EVENT(xfs_itrunc_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
+ TP_ARGS(ip, new_size))
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
+
+TRACE_EVENT(xfs_pagecache_inval,
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
+ TP_ARGS(ip, start, finish),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsize_t, size)
+ __field(xfs_off_t, start)
+ __field(xfs_off_t, finish)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_disk_size;
+ __entry->start = start;
+ __entry->finish = finish;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx start 0x%llx finish 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->start,
+ __entry->finish)
+);
+
+TRACE_EVENT(xfs_bunmap,
+ TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t fileoff, xfs_filblks_t len,
+ int flags, unsigned long caller_ip),
+ TP_ARGS(ip, fileoff, len, flags, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsize_t, size)
+ __field(xfs_fileoff_t, fileoff)
+ __field(xfs_filblks_t, len)
+ __field(unsigned long, caller_ip)
+ __field(int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_disk_size;
+ __entry->fileoff = fileoff;
+ __entry->len = len;
+ __entry->caller_ip = caller_ip;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx "
+ "flags %s caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->fileoff,
+ __entry->len,
+ __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS),
+ (void *)__entry->caller_ip)
+);
+
+DECLARE_EVENT_CLASS(xfs_extent_busy_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, agbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len)
+);
+#define DEFINE_BUSY_EVENT(name) \
+DEFINE_EVENT(xfs_extent_busy_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len), \
+ TP_ARGS(mp, agno, agbno, len))
+DEFINE_BUSY_EVENT(xfs_extent_busy);
+DEFINE_BUSY_EVENT(xfs_extent_busy_enomem);
+DEFINE_BUSY_EVENT(xfs_extent_busy_force);
+DEFINE_BUSY_EVENT(xfs_extent_busy_reuse);
+DEFINE_BUSY_EVENT(xfs_extent_busy_clear);
+
+TRACE_EVENT(xfs_extent_busy_trim,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len,
+ xfs_agblock_t tbno, xfs_extlen_t tlen),
+ TP_ARGS(mp, agno, agbno, len, tbno, tlen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(xfs_agblock_t, tbno)
+ __field(xfs_extlen_t, tlen)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->tbno = tbno;
+ __entry->tlen = tlen;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x found_agbno 0x%x found_fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->tbno,
+ __entry->tlen)
+);
+
+DECLARE_EVENT_CLASS(xfs_agf_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags,
+ unsigned long caller_ip),
+ TP_ARGS(mp, agf, flags, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, flags)
+ __field(__u32, length)
+ __field(__u32, bno_root)
+ __field(__u32, cnt_root)
+ __field(__u32, bno_level)
+ __field(__u32, cnt_level)
+ __field(__u32, flfirst)
+ __field(__u32, fllast)
+ __field(__u32, flcount)
+ __field(__u32, freeblks)
+ __field(__u32, longest)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = be32_to_cpu(agf->agf_seqno),
+ __entry->flags = flags;
+ __entry->length = be32_to_cpu(agf->agf_length),
+ __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]),
+ __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]),
+ __entry->bno_level =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
+ __entry->cnt_level =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
+ __entry->flfirst = be32_to_cpu(agf->agf_flfirst),
+ __entry->fllast = be32_to_cpu(agf->agf_fllast),
+ __entry->flcount = be32_to_cpu(agf->agf_flcount),
+ __entry->freeblks = be32_to_cpu(agf->agf_freeblks),
+ __entry->longest = be32_to_cpu(agf->agf_longest);
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno 0x%x flags %s length %u roots b %u c %u "
+ "levels b %u c %u flfirst %u fllast %u flcount %u "
+ "freeblks %u longest %u caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __print_flags(__entry->flags, "|", XFS_AGF_FLAGS),
+ __entry->length,
+ __entry->bno_root,
+ __entry->cnt_root,
+ __entry->bno_level,
+ __entry->cnt_level,
+ __entry->flfirst,
+ __entry->fllast,
+ __entry->flcount,
+ __entry->freeblks,
+ __entry->longest,
+ (void *)__entry->caller_ip)
+);
+#define DEFINE_AGF_EVENT(name) \
+DEFINE_EVENT(xfs_agf_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, agf, flags, caller_ip))
+DEFINE_AGF_EVENT(xfs_agf);
+DEFINE_AGF_EVENT(xfs_agfl_reset);
+
+TRACE_EVENT(xfs_free_extent,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+ xfs_extlen_t len, enum xfs_ag_resv_type resv, int haveleft,
+ int haveright),
+ TP_ARGS(mp, agno, agbno, len, resv, haveleft, haveright),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(int, resv)
+ __field(int, haveleft)
+ __field(int, haveright)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->resv = resv;
+ __entry->haveleft = haveleft;
+ __entry->haveright = haveright;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x resv %d %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->resv,
+ __entry->haveleft ?
+ (__entry->haveright ? "both" : "left") :
+ (__entry->haveright ? "right" : "none"))
+);
+
+DECLARE_EVENT_CLASS(xfs_alloc_class,
+ TP_PROTO(struct xfs_alloc_arg *args),
+ TP_ARGS(args),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, minlen)
+ __field(xfs_extlen_t, maxlen)
+ __field(xfs_extlen_t, mod)
+ __field(xfs_extlen_t, prod)
+ __field(xfs_extlen_t, minleft)
+ __field(xfs_extlen_t, total)
+ __field(xfs_extlen_t, alignment)
+ __field(xfs_extlen_t, minalignslop)
+ __field(xfs_extlen_t, len)
+ __field(short, type)
+ __field(short, otype)
+ __field(char, wasdel)
+ __field(char, wasfromfl)
+ __field(int, resv)
+ __field(int, datatype)
+ __field(xfs_fsblock_t, firstblock)
+ ),
+ TP_fast_assign(
+ __entry->dev = args->mp->m_super->s_dev;
+ __entry->agno = args->agno;
+ __entry->agbno = args->agbno;
+ __entry->minlen = args->minlen;
+ __entry->maxlen = args->maxlen;
+ __entry->mod = args->mod;
+ __entry->prod = args->prod;
+ __entry->minleft = args->minleft;
+ __entry->total = args->total;
+ __entry->alignment = args->alignment;
+ __entry->minalignslop = args->minalignslop;
+ __entry->len = args->len;
+ __entry->type = args->type;
+ __entry->otype = args->otype;
+ __entry->wasdel = args->wasdel;
+ __entry->wasfromfl = args->wasfromfl;
+ __entry->resv = args->resv;
+ __entry->datatype = args->datatype;
+ __entry->firstblock = args->tp->t_firstblock;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x minlen %u maxlen %u mod %u "
+ "prod %u minleft %u total %u alignment %u minalignslop %u "
+ "len %u type %s otype %s wasdel %d wasfromfl %d resv %d "
+ "datatype 0x%x firstblock 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->minlen,
+ __entry->maxlen,
+ __entry->mod,
+ __entry->prod,
+ __entry->minleft,
+ __entry->total,
+ __entry->alignment,
+ __entry->minalignslop,
+ __entry->len,
+ __print_symbolic(__entry->type, XFS_ALLOC_TYPES),
+ __print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
+ __entry->wasdel,
+ __entry->wasfromfl,
+ __entry->resv,
+ __entry->datatype,
+ (unsigned long long)__entry->firstblock)
+)
+
+#define DEFINE_ALLOC_EVENT(name) \
+DEFINE_EVENT(xfs_alloc_class, name, \
+ TP_PROTO(struct xfs_alloc_arg *args), \
+ TP_ARGS(args))
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur_right);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur_left);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_busy);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
+
+TRACE_EVENT(xfs_alloc_cur_check,
+ TP_PROTO(struct xfs_mount *mp, xfs_btnum_t btnum, xfs_agblock_t bno,
+ xfs_extlen_t len, xfs_extlen_t diff, bool new),
+ TP_ARGS(mp, btnum, bno, len, diff, new),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(xfs_agblock_t, bno)
+ __field(xfs_extlen_t, len)
+ __field(xfs_extlen_t, diff)
+ __field(bool, new)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->btnum = btnum;
+ __entry->bno = bno;
+ __entry->len = len;
+ __entry->diff = diff;
+ __entry->new = new;
+ ),
+ TP_printk("dev %d:%d btree %s agbno 0x%x fsbcount 0x%x diff 0x%x new %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->bno, __entry->len, __entry->diff, __entry->new)
+)
+
+DECLARE_EVENT_CLASS(xfs_da_class,
+ TP_PROTO(struct xfs_da_args *args),
+ TP_ARGS(args),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __dynamic_array(char, name, args->namelen)
+ __field(int, namelen)
+ __field(xfs_dahash_t, hashval)
+ __field(xfs_ino_t, inumber)
+ __field(uint32_t, op_flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+ __entry->ino = args->dp->i_ino;
+ if (args->namelen)
+ memcpy(__get_str(name), args->name, args->namelen);
+ __entry->namelen = args->namelen;
+ __entry->hashval = args->hashval;
+ __entry->inumber = args->inumber;
+ __entry->op_flags = args->op_flags;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x "
+ "inumber 0x%llx op_flags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->namelen,
+ __entry->namelen ? __get_str(name) : NULL,
+ __entry->namelen,
+ __entry->hashval,
+ __entry->inumber,
+ __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
+)
+
+#define DEFINE_DIR2_EVENT(name) \
+DEFINE_EVENT(xfs_da_class, name, \
+ TP_PROTO(struct xfs_da_args *args), \
+ TP_ARGS(args))
+DEFINE_DIR2_EVENT(xfs_dir2_sf_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_create);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block);
+DEFINE_DIR2_EVENT(xfs_dir2_block_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_block_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_block_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_block_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf);
+DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node);
+DEFINE_DIR2_EVENT(xfs_dir2_node_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_node_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
+
+DECLARE_EVENT_CLASS(xfs_attr_class,
+ TP_PROTO(struct xfs_da_args *args),
+ TP_ARGS(args),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __dynamic_array(char, name, args->namelen)
+ __field(int, namelen)
+ __field(int, valuelen)
+ __field(xfs_dahash_t, hashval)
+ __field(unsigned int, attr_filter)
+ __field(unsigned int, attr_flags)
+ __field(uint32_t, op_flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+ __entry->ino = args->dp->i_ino;
+ if (args->namelen)
+ memcpy(__get_str(name), args->name, args->namelen);
+ __entry->namelen = args->namelen;
+ __entry->valuelen = args->valuelen;
+ __entry->hashval = args->hashval;
+ __entry->attr_filter = args->attr_filter;
+ __entry->attr_flags = args->attr_flags;
+ __entry->op_flags = args->op_flags;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
+ "hashval 0x%x filter %s flags %s op_flags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->namelen,
+ __entry->namelen ? __get_str(name) : NULL,
+ __entry->namelen,
+ __entry->valuelen,
+ __entry->hashval,
+ __print_flags(__entry->attr_filter, "|",
+ XFS_ATTR_FILTER_FLAGS),
+ __print_flags(__entry->attr_flags, "|",
+ { XATTR_CREATE, "CREATE" },
+ { XATTR_REPLACE, "REPLACE" }),
+ __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
+)
+
+#define DEFINE_ATTR_EVENT(name) \
+DEFINE_EVENT(xfs_attr_class, name, \
+ TP_PROTO(struct xfs_da_args *args), \
+ TP_ARGS(args))
+DEFINE_ATTR_EVENT(xfs_attr_sf_add);
+DEFINE_ATTR_EVENT(xfs_attr_sf_addname);
+DEFINE_ATTR_EVENT(xfs_attr_sf_create);
+DEFINE_ATTR_EVENT(xfs_attr_sf_lookup);
+DEFINE_ATTR_EVENT(xfs_attr_sf_remove);
+DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
+
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_compact);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_get);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_replace);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_remove);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_removename);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_split);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_split_after);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_clearflag);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_setflag);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_flipflags);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
+
+DEFINE_ATTR_EVENT(xfs_attr_node_addname);
+DEFINE_ATTR_EVENT(xfs_attr_node_get);
+DEFINE_ATTR_EVENT(xfs_attr_node_replace);
+DEFINE_ATTR_EVENT(xfs_attr_node_removename);
+
+DEFINE_ATTR_EVENT(xfs_attr_fillstate);
+DEFINE_ATTR_EVENT(xfs_attr_refillstate);
+
+DEFINE_ATTR_EVENT(xfs_attr_rmtval_get);
+DEFINE_ATTR_EVENT(xfs_attr_rmtval_set);
+
+#define DEFINE_DA_EVENT(name) \
+DEFINE_EVENT(xfs_da_class, name, \
+ TP_PROTO(struct xfs_da_args *args), \
+ TP_ARGS(args))
+DEFINE_DA_EVENT(xfs_da_split);
+DEFINE_DA_EVENT(xfs_da_join);
+DEFINE_DA_EVENT(xfs_da_link_before);
+DEFINE_DA_EVENT(xfs_da_link_after);
+DEFINE_DA_EVENT(xfs_da_unlink_back);
+DEFINE_DA_EVENT(xfs_da_unlink_forward);
+DEFINE_DA_EVENT(xfs_da_root_split);
+DEFINE_DA_EVENT(xfs_da_root_join);
+DEFINE_DA_EVENT(xfs_da_node_add);
+DEFINE_DA_EVENT(xfs_da_node_create);
+DEFINE_DA_EVENT(xfs_da_node_split);
+DEFINE_DA_EVENT(xfs_da_node_remove);
+DEFINE_DA_EVENT(xfs_da_node_rebalance);
+DEFINE_DA_EVENT(xfs_da_node_unbalance);
+DEFINE_DA_EVENT(xfs_da_node_toosmall);
+DEFINE_DA_EVENT(xfs_da_swap_lastblock);
+DEFINE_DA_EVENT(xfs_da_grow_inode);
+DEFINE_DA_EVENT(xfs_da_shrink_inode);
+DEFINE_DA_EVENT(xfs_da_fixhashpath);
+DEFINE_DA_EVENT(xfs_da_path_shift);
+
+DECLARE_EVENT_CLASS(xfs_dir2_space_class,
+ TP_PROTO(struct xfs_da_args *args, int idx),
+ TP_ARGS(args, idx),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(uint32_t, op_flags)
+ __field(int, idx)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+ __entry->ino = args->dp->i_ino;
+ __entry->op_flags = args->op_flags;
+ __entry->idx = idx;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
+ __entry->idx)
+)
+
+#define DEFINE_DIR2_SPACE_EVENT(name) \
+DEFINE_EVENT(xfs_dir2_space_class, name, \
+ TP_PROTO(struct xfs_da_args *args, int idx), \
+ TP_ARGS(args, idx))
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add);
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove);
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode);
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode);
+
+TRACE_EVENT(xfs_dir2_leafn_moveents,
+ TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count),
+ TP_ARGS(args, src_idx, dst_idx, count),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(uint32_t, op_flags)
+ __field(int, src_idx)
+ __field(int, dst_idx)
+ __field(int, count)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+ __entry->ino = args->dp->i_ino;
+ __entry->op_flags = args->op_flags;
+ __entry->src_idx = src_idx;
+ __entry->dst_idx = dst_idx;
+ __entry->count = count;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx op_flags %s "
+ "src_idx %d dst_idx %d count %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
+ __entry->src_idx,
+ __entry->dst_idx,
+ __entry->count)
+);
+
+#define XFS_SWAPEXT_INODES \
+ { 0, "target" }, \
+ { 1, "temp" }
+
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_DEV);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_LOCAL);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_EXTENTS);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_BTREE);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_UUID);
+
+DECLARE_EVENT_CLASS(xfs_swap_extent_class,
+ TP_PROTO(struct xfs_inode *ip, int which),
+ TP_ARGS(ip, which),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, which)
+ __field(xfs_ino_t, ino)
+ __field(int, format)
+ __field(xfs_extnum_t, nex)
+ __field(int, broot_size)
+ __field(int, fork_off)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->which = which;
+ __entry->ino = ip->i_ino;
+ __entry->format = ip->i_df.if_format;
+ __entry->nex = ip->i_df.if_nextents;
+ __entry->broot_size = ip->i_df.if_broot_bytes;
+ __entry->fork_off = xfs_inode_fork_boff(ip);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, "
+ "broot size %d, forkoff 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
+ __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
+ __entry->nex,
+ __entry->broot_size,
+ __entry->fork_off)
+)
+
+#define DEFINE_SWAPEXT_EVENT(name) \
+DEFINE_EVENT(xfs_swap_extent_class, name, \
+ TP_PROTO(struct xfs_inode *ip, int which), \
+ TP_ARGS(ip, which))
+
+DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
+DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
+
+TRACE_EVENT(xfs_log_recover,
+ TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk),
+ TP_ARGS(log, headblk, tailblk),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_daddr_t, headblk)
+ __field(xfs_daddr_t, tailblk)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->headblk = headblk;
+ __entry->tailblk = tailblk;
+ ),
+ TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk,
+ __entry->tailblk)
+)
+
+TRACE_EVENT(xfs_log_recover_record,
+ TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass),
+ TP_ARGS(log, rhead, pass),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_lsn_t, lsn)
+ __field(int, len)
+ __field(int, num_logops)
+ __field(int, pass)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->lsn = be64_to_cpu(rhead->h_lsn);
+ __entry->len = be32_to_cpu(rhead->h_len);
+ __entry->num_logops = be32_to_cpu(rhead->h_num_logops);
+ __entry->pass = pass;
+ ),
+ TP_printk("dev %d:%d lsn 0x%llx len 0x%x num_logops 0x%x pass %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->lsn, __entry->len, __entry->num_logops,
+ __entry->pass)
+)
+
+DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
+ TP_PROTO(struct xlog *log, struct xlog_recover *trans,
+ struct xlog_recover_item *item, int pass),
+ TP_ARGS(log, trans, item, pass),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, item)
+ __field(xlog_tid_t, tid)
+ __field(xfs_lsn_t, lsn)
+ __field(int, type)
+ __field(int, pass)
+ __field(int, count)
+ __field(int, total)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->item = (unsigned long)item;
+ __entry->tid = trans->r_log_tid;
+ __entry->lsn = trans->r_lsn;
+ __entry->type = ITEM_TYPE(item);
+ __entry->pass = pass;
+ __entry->count = item->ri_cnt;
+ __entry->total = item->ri_total;
+ ),
+ TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item %p, "
+ "item type %s item region count/total %d/%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->tid,
+ __entry->lsn,
+ __entry->pass,
+ (void *)__entry->item,
+ __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+ __entry->count,
+ __entry->total)
+)
+
+#define DEFINE_LOG_RECOVER_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_item_class, name, \
+ TP_PROTO(struct xlog *log, struct xlog_recover *trans, \
+ struct xlog_recover_item *item, int pass), \
+ TP_ARGS(log, trans, item, pass))
+
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
+ TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f),
+ TP_ARGS(log, buf_f),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int64_t, blkno)
+ __field(unsigned short, len)
+ __field(unsigned short, flags)
+ __field(unsigned short, size)
+ __field(unsigned int, map_size)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->blkno = buf_f->blf_blkno;
+ __entry->len = buf_f->blf_len;
+ __entry->flags = buf_f->blf_flags;
+ __entry->size = buf_f->blf_size;
+ __entry->map_size = buf_f->blf_map_size;
+ ),
+ TP_printk("dev %d:%d daddr 0x%llx, bbcount 0x%x, flags 0x%x, size %d, "
+ "map_size %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->blkno,
+ __entry->len,
+ __entry->flags,
+ __entry->size,
+ __entry->map_size)
+)
+
+#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
+ TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f), \
+ TP_ARGS(log, buf_f))
+
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_skip);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
+ TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f),
+ TP_ARGS(log, in_f),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned short, size)
+ __field(int, fields)
+ __field(unsigned short, asize)
+ __field(unsigned short, dsize)
+ __field(int64_t, blkno)
+ __field(int, len)
+ __field(int, boffset)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->ino = in_f->ilf_ino;
+ __entry->size = in_f->ilf_size;
+ __entry->fields = in_f->ilf_fields;
+ __entry->asize = in_f->ilf_asize;
+ __entry->dsize = in_f->ilf_dsize;
+ __entry->blkno = in_f->ilf_blkno;
+ __entry->len = in_f->ilf_len;
+ __entry->boffset = in_f->ilf_boffset;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
+ "dsize %d, daddr 0x%llx, bbcount 0x%x, boffset %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->fields,
+ __entry->asize,
+ __entry->dsize,
+ __entry->blkno,
+ __entry->len,
+ __entry->boffset)
+)
+#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
+ TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), \
+ TP_ARGS(log, in_f))
+
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_icreate_item_class,
+ TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f),
+ TP_ARGS(log, in_f),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(unsigned int, count)
+ __field(unsigned int, isize)
+ __field(xfs_agblock_t, length)
+ __field(unsigned int, gen)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->agno = be32_to_cpu(in_f->icl_ag);
+ __entry->agbno = be32_to_cpu(in_f->icl_agbno);
+ __entry->count = be32_to_cpu(in_f->icl_count);
+ __entry->isize = be32_to_cpu(in_f->icl_isize);
+ __entry->length = be32_to_cpu(in_f->icl_length);
+ __entry->gen = be32_to_cpu(in_f->icl_gen);
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x ireccount %u isize %u gen 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->length,
+ __entry->count,
+ __entry->isize,
+ __entry->gen)
+)
+#define DEFINE_LOG_RECOVER_ICREATE_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_icreate_item_class, name, \
+ TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f), \
+ TP_ARGS(log, in_f))
+
+DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_cancel);
+DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_recover);
+
+DECLARE_EVENT_CLASS(xfs_discard_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, agbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len)
+)
+
+#define DEFINE_DISCARD_EVENT(name) \
+DEFINE_EVENT(xfs_discard_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len), \
+ TP_ARGS(mp, agno, agbno, len))
+DEFINE_DISCARD_EVENT(xfs_discard_extent);
+DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
+DEFINE_DISCARD_EVENT(xfs_discard_exclude);
+DEFINE_DISCARD_EVENT(xfs_discard_busy);
+
+/* btree cursor events */
+TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
+
+DECLARE_EVENT_CLASS(xfs_btree_cur_class,
+ TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
+ TP_ARGS(cur, level, bp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(int, level)
+ __field(int, nlevels)
+ __field(int, ptr)
+ __field(xfs_daddr_t, daddr)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->nlevels = cur->bc_nlevels;
+ __entry->ptr = cur->bc_levels[level].ptr;
+ __entry->daddr = bp ? xfs_buf_daddr(bp) : -1;
+ ),
+ TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->level,
+ __entry->nlevels,
+ __entry->ptr,
+ (unsigned long long)__entry->daddr)
+)
+
+#define DEFINE_BTREE_CUR_EVENT(name) \
+DEFINE_EVENT(xfs_btree_cur_class, name, \
+ TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), \
+ TP_ARGS(cur, level, bp))
+DEFINE_BTREE_CUR_EVENT(xfs_btree_updkeys);
+DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range);
+
+/* deferred ops */
+struct xfs_defer_pending;
+
+DECLARE_EVENT_CLASS(xfs_defer_class,
+ TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip),
+ TP_ARGS(tp, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(struct xfs_trans *, tp)
+ __field(char, committed)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = tp->t_mountp->m_super->s_dev;
+ __entry->tp = tp;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d tp %p caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->tp,
+ (char *)__entry->caller_ip)
+)
+#define DEFINE_DEFER_EVENT(name) \
+DEFINE_EVENT(xfs_defer_class, name, \
+ TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), \
+ TP_ARGS(tp, caller_ip))
+
+DECLARE_EVENT_CLASS(xfs_defer_error_class,
+ TP_PROTO(struct xfs_trans *tp, int error),
+ TP_ARGS(tp, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(struct xfs_trans *, tp)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->dev = tp->t_mountp->m_super->s_dev;
+ __entry->tp = tp;
+ __entry->error = error;
+ ),
+ TP_printk("dev %d:%d tp %p err %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->tp,
+ __entry->error)
+)
+#define DEFINE_DEFER_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_defer_error_class, name, \
+ TP_PROTO(struct xfs_trans *tp, int error), \
+ TP_ARGS(tp, error))
+
+DECLARE_EVENT_CLASS(xfs_defer_pending_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp),
+ TP_ARGS(mp, dfp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, type)
+ __field(void *, intent)
+ __field(char, committed)
+ __field(int, nr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp ? mp->m_super->s_dev : 0;
+ __entry->type = dfp->dfp_type;
+ __entry->intent = dfp->dfp_intent;
+ __entry->committed = dfp->dfp_done != NULL;
+ __entry->nr = dfp->dfp_count;
+ ),
+ TP_printk("dev %d:%d optype %d intent %p committed %d nr %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->intent,
+ __entry->committed,
+ __entry->nr)
+)
+#define DEFINE_DEFER_PENDING_EVENT(name) \
+DEFINE_EVENT(xfs_defer_pending_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), \
+ TP_ARGS(mp, dfp))
+
+DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int type, xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, type, agbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, type)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->type = type;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->agno,
+ __entry->agbno,
+ __entry->len)
+);
+#define DEFINE_PHYS_EXTENT_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_phys_extent_deferred_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ int type, \
+ xfs_agblock_t bno, \
+ xfs_extlen_t len), \
+ TP_ARGS(mp, agno, type, bno, len))
+
+DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int op,
+ xfs_agblock_t agbno,
+ xfs_ino_t ino,
+ int whichfork,
+ xfs_fileoff_t offset,
+ xfs_filblks_t len,
+ xfs_exntst_t state),
+ TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agblock_t, agbno)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, l_loff)
+ __field(xfs_filblks_t, l_len)
+ __field(xfs_exntst_t, l_state)
+ __field(int, op)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->ino = ino;
+ __entry->agbno = agbno;
+ __entry->whichfork = whichfork;
+ __entry->l_loff = offset;
+ __entry->l_len = len;
+ __entry->l_state = state;
+ __entry->op = op;
+ ),
+ TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->op,
+ __entry->agno,
+ __entry->agbno,
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->l_loff,
+ __entry->l_len,
+ __entry->l_state)
+);
+#define DEFINE_MAP_EXTENT_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_map_extent_deferred_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ int op, \
+ xfs_agblock_t agbno, \
+ xfs_ino_t ino, \
+ int whichfork, \
+ xfs_fileoff_t offset, \
+ xfs_filblks_t len, \
+ xfs_exntst_t state), \
+ TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state))
+
+DEFINE_DEFER_EVENT(xfs_defer_cancel);
+DEFINE_DEFER_EVENT(xfs_defer_trans_roll);
+DEFINE_DEFER_EVENT(xfs_defer_trans_abort);
+DEFINE_DEFER_EVENT(xfs_defer_finish);
+DEFINE_DEFER_EVENT(xfs_defer_finish_done);
+
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
+
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
+
+#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
+DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
+DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred);
+DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer);
+DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred);
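+/*
+ * The alias above reuses xfs_phys_extent_deferred_class, so the defer /
+ * deferred tracepoints for freed bmap extents and AGFL blocks share its
+ * "op ... agno ... agbno ... fsbcount" output format.
+ */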
+
+/* rmap tracepoints */
+DECLARE_EVENT_CLASS(xfs_rmap_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten,
+ const struct xfs_owner_info *oinfo),
+ TP_ARGS(mp, agno, agbno, len, unwritten, oinfo),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned long, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->owner = oinfo->oi_owner;
+ __entry->offset = oinfo->oi_offset;
+ __entry->flags = oinfo->oi_flags;
+ if (unwritten)
+ __entry->flags |= XFS_RMAP_UNWRITTEN;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+#define DEFINE_RMAP_EVENT(name) \
+DEFINE_EVENT(xfs_rmap_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \
+ const struct xfs_owner_info *oinfo), \
+ TP_ARGS(mp, agno, agbno, len, unwritten, oinfo))
+
+/* simple AG-based error/%ip tracepoint class */
+DECLARE_EVENT_CLASS(xfs_ag_error_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error,
+ unsigned long caller_ip),
+ TP_ARGS(mp, agno, error, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, error)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->error = error;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno 0x%x error %d caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->error,
+ (char *)__entry->caller_ip)
+);
+
+#define DEFINE_AG_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_ag_error_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, agno, error, caller_ip))
+
+DEFINE_RMAP_EVENT(xfs_rmap_unmap);
+DEFINE_RMAP_EVENT(xfs_rmap_unmap_done);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_unmap_error);
+DEFINE_RMAP_EVENT(xfs_rmap_map);
+DEFINE_RMAP_EVENT(xfs_rmap_map_done);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_map_error);
+DEFINE_RMAP_EVENT(xfs_rmap_convert);
+DEFINE_RMAP_EVENT(xfs_rmap_convert_done);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_error);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_state);
+
+DECLARE_EVENT_CLASS(xfs_rmapbt_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len,
+ uint64_t owner, uint64_t offset, unsigned int flags),
+ TP_ARGS(mp, agno, agbno, len, owner, offset, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->owner = owner;
+ __entry->offset = offset;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+#define DEFINE_RMAPBT_EVENT(name) \
+DEFINE_EVENT(xfs_rmapbt_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len, \
+ uint64_t owner, uint64_t offset, unsigned int flags), \
+ TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
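+/*
+ * xfs_rmap_class above takes an xfs_owner_info plus an unwritten flag and
+ * folds the latter into the rmap flags; xfs_rmapbt_class is the lower-level
+ * variant that is handed the raw owner/offset/flags values directly.
+ */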
+
+#define DEFINE_RMAP_DEFERRED_EVENT DEFINE_MAP_EXTENT_DEFERRED_EVENT
+DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer);
+DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred);
+
+DEFINE_BUSY_EVENT(xfs_rmapbt_alloc_block);
+DEFINE_BUSY_EVENT(xfs_rmapbt_free_block);
+DEFINE_RMAPBT_EVENT(xfs_rmap_update);
+DEFINE_RMAPBT_EVENT(xfs_rmap_insert);
+DEFINE_RMAPBT_EVENT(xfs_rmap_delete);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error);
+
+DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_candidate);
+DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_query);
+DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_candidate);
+DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range);
+DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result);
+DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
+DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
+
+/* deferred bmbt updates */
+#define DEFINE_BMAP_DEFERRED_EVENT DEFINE_RMAP_DEFERRED_EVENT
+DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer);
+DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred);
+
+/* per-AG reservation */
+DECLARE_EVENT_CLASS(xfs_ag_resv_class,
+ TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv,
+ xfs_extlen_t len),
+ TP_ARGS(pag, resv, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, resv)
+ __field(xfs_extlen_t, freeblks)
+ __field(xfs_extlen_t, flcount)
+ __field(xfs_extlen_t, reserved)
+ __field(xfs_extlen_t, asked)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ struct xfs_ag_resv *r = xfs_perag_resv(pag, resv);
+
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->resv = resv;
+ __entry->freeblks = pag->pagf_freeblks;
+ __entry->flcount = pag->pagf_flcount;
+ __entry->reserved = r ? r->ar_reserved : 0;
+ __entry->asked = r ? r->ar_asked : 0;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d agno 0x%x resv %d freeblks %u flcount %u "
+ "resv %u ask %u len %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->resv,
+ __entry->freeblks,
+ __entry->flcount,
+ __entry->reserved,
+ __entry->asked,
+ __entry->len)
+)
+#define DEFINE_AG_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_ag_resv_class, name, \
+ TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type type, \
+ xfs_extlen_t len), \
+ TP_ARGS(pag, type, len))
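+/*
+ * If xfs_perag_resv() has no reservation structure for the requested type
+ * (it can return NULL), the "r ? ... : 0" fallbacks above report zero
+ * reserved/asked blocks instead of dereferencing a NULL pointer.
+ */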
+
+/* per-AG reservation tracepoints */
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_init);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_free);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_alloc_extent);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed);
+
+DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error);
+DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
+
+/* refcount tracepoint classes */
+
+/* reuse the discard trace class for agbno/aglen-based traces */
+#define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name)
+
+/* ag btree lookup tracepoint class */
+TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi);
+TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi);
+TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi);
+DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_lookup_t dir),
+ TP_ARGS(mp, agno, agbno, dir),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_lookup_t, dir)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->dir = dir;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x cmp %s(%d)",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR),
+ __entry->dir)
+)
+
+#define DEFINE_AG_BTREE_LOOKUP_EVENT(name) \
+DEFINE_EVENT(xfs_ag_btree_lookup_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_lookup_t dir), \
+ TP_ARGS(mp, agno, agbno, dir))
+
+/* single-rcext tracepoint class */
+DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_refcount_irec *irec),
+ TP_ARGS(mp, agno, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, domain)
+ __field(xfs_agblock_t, startblock)
+ __field(xfs_extlen_t, blockcount)
+ __field(xfs_nlink_t, refcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->domain = irec->rc_domain;
+ __entry->startblock = irec->rc_startblock;
+ __entry->blockcount = irec->rc_blockcount;
+ __entry->refcount = irec->rc_refcount;
+ ),
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
+ __entry->startblock,
+ __entry->blockcount,
+ __entry->refcount)
+)
+
+#define DEFINE_REFCOUNT_EXTENT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_extent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ struct xfs_refcount_irec *irec), \
+ TP_ARGS(mp, agno, irec))
+
+/* single-rcext and an agbno tracepoint class */
+DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_refcount_irec *irec, xfs_agblock_t agbno),
+ TP_ARGS(mp, agno, irec, agbno),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, domain)
+ __field(xfs_agblock_t, startblock)
+ __field(xfs_extlen_t, blockcount)
+ __field(xfs_nlink_t, refcount)
+ __field(xfs_agblock_t, agbno)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->domain = irec->rc_domain;
+ __entry->startblock = irec->rc_startblock;
+ __entry->blockcount = irec->rc_blockcount;
+ __entry->refcount = irec->rc_refcount;
+ __entry->agbno = agbno;
+ ),
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
+ __entry->startblock,
+ __entry->blockcount,
+ __entry->refcount,
+ __entry->agbno)
+)
+
+#define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_extent_at_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ struct xfs_refcount_irec *irec, xfs_agblock_t agbno), \
+ TP_ARGS(mp, agno, irec, agbno))
+
+/* double-rcext tracepoint class */
+DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2),
+ TP_ARGS(mp, agno, i1, i2),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, i1_domain)
+ __field(xfs_agblock_t, i1_startblock)
+ __field(xfs_extlen_t, i1_blockcount)
+ __field(xfs_nlink_t, i1_refcount)
+ __field(enum xfs_refc_domain, i2_domain)
+ __field(xfs_agblock_t, i2_startblock)
+ __field(xfs_extlen_t, i2_blockcount)
+ __field(xfs_nlink_t, i2_refcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->i1_domain = i1->rc_domain;
+ __entry->i1_startblock = i1->rc_startblock;
+ __entry->i1_blockcount = i1->rc_blockcount;
+ __entry->i1_refcount = i1->rc_refcount;
+ __entry->i2_domain = i2->rc_domain;
+ __entry->i2_startblock = i2->rc_startblock;
+ __entry->i2_blockcount = i2->rc_blockcount;
+ __entry->i2_refcount = i2->rc_refcount;
+ ),
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
+ __entry->i1_startblock,
+ __entry->i1_blockcount,
+ __entry->i1_refcount,
+ __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS),
+ __entry->i2_startblock,
+ __entry->i2_blockcount,
+ __entry->i2_refcount)
+)
+
+#define DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_double_extent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), \
+ TP_ARGS(mp, agno, i1, i2))
+
+/* double-rcext and an agbno tracepoint class */
+DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2,
+ xfs_agblock_t agbno),
+ TP_ARGS(mp, agno, i1, i2, agbno),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, i1_domain)
+ __field(xfs_agblock_t, i1_startblock)
+ __field(xfs_extlen_t, i1_blockcount)
+ __field(xfs_nlink_t, i1_refcount)
+ __field(enum xfs_refc_domain, i2_domain)
+ __field(xfs_agblock_t, i2_startblock)
+ __field(xfs_extlen_t, i2_blockcount)
+ __field(xfs_nlink_t, i2_refcount)
+ __field(xfs_agblock_t, agbno)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->i1_domain = i1->rc_domain;
+ __entry->i1_startblock = i1->rc_startblock;
+ __entry->i1_blockcount = i1->rc_blockcount;
+ __entry->i1_refcount = i1->rc_refcount;
+ __entry->i2_domain = i2->rc_domain;
+ __entry->i2_startblock = i2->rc_startblock;
+ __entry->i2_blockcount = i2->rc_blockcount;
+ __entry->i2_refcount = i2->rc_refcount;
+ __entry->agbno = agbno;
+ ),
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
+ __entry->i1_startblock,
+ __entry->i1_blockcount,
+ __entry->i1_refcount,
+ __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS),
+ __entry->i2_startblock,
+ __entry->i2_blockcount,
+ __entry->i2_refcount,
+ __entry->agbno)
+)
+
+#define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \
+ xfs_agblock_t agbno), \
+ TP_ARGS(mp, agno, i1, i2, agbno))
+
+/* triple-rcext tracepoint class */
+DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2,
+ struct xfs_refcount_irec *i3),
+ TP_ARGS(mp, agno, i1, i2, i3),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, i1_domain)
+ __field(xfs_agblock_t, i1_startblock)
+ __field(xfs_extlen_t, i1_blockcount)
+ __field(xfs_nlink_t, i1_refcount)
+ __field(enum xfs_refc_domain, i2_domain)
+ __field(xfs_agblock_t, i2_startblock)
+ __field(xfs_extlen_t, i2_blockcount)
+ __field(xfs_nlink_t, i2_refcount)
+ __field(enum xfs_refc_domain, i3_domain)
+ __field(xfs_agblock_t, i3_startblock)
+ __field(xfs_extlen_t, i3_blockcount)
+ __field(xfs_nlink_t, i3_refcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->i1_domain = i1->rc_domain;
+ __entry->i1_startblock = i1->rc_startblock;
+ __entry->i1_blockcount = i1->rc_blockcount;
+ __entry->i1_refcount = i1->rc_refcount;
+ __entry->i2_domain = i2->rc_domain;
+ __entry->i2_startblock = i2->rc_startblock;
+ __entry->i2_blockcount = i2->rc_blockcount;
+ __entry->i2_refcount = i2->rc_refcount;
+ __entry->i3_domain = i3->rc_domain;
+ __entry->i3_startblock = i3->rc_startblock;
+ __entry->i3_blockcount = i3->rc_blockcount;
+ __entry->i3_refcount = i3->rc_refcount;
+ ),
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
+ __entry->i1_startblock,
+ __entry->i1_blockcount,
+ __entry->i1_refcount,
+ __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS),
+ __entry->i2_startblock,
+ __entry->i2_blockcount,
+ __entry->i2_refcount,
+ __print_symbolic(__entry->i3_domain, XFS_REFC_DOMAIN_STRINGS),
+ __entry->i3_startblock,
+ __entry->i3_blockcount,
+ __entry->i3_refcount)
+);
+
+#define DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \
+ struct xfs_refcount_irec *i3), \
+ TP_ARGS(mp, agno, i1, i2, i3))
+
+/* refcount btree tracepoints */
+DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block);
+DEFINE_BUSY_EVENT(xfs_refcountbt_free_block);
+DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_insert);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_delete);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_insert_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_delete_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_update_error);
+
+/* refcount adjustment tracepoints */
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_decrease);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_increase);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_decrease);
+DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_recover_extent);
+DEFINE_REFCOUNT_EXTENT_AT_EVENT(xfs_refcount_split_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_left_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_right_extent);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_cow_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_center_extents_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_modify_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_split_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_left_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_right_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_find_left_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error);
+
+/* reflink helpers */
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error);
+#define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
+DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer);
+DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred);
+
+TRACE_EVENT(xfs_refcount_finish_one_leftover,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int type, xfs_agblock_t agbno, xfs_extlen_t len,
+ xfs_agblock_t new_agbno, xfs_extlen_t new_len),
+ TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, type)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(xfs_agblock_t, new_agbno)
+ __field(xfs_extlen_t, new_len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->type = type;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->new_agbno = new_agbno;
+ __entry->new_len = new_len;
+ ),
+ TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x new_agbno 0x%x new_fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->new_agbno,
+ __entry->new_len)
+);
+
+/* simple inode-based error/%ip tracepoint class */
+DECLARE_EVENT_CLASS(xfs_inode_error_class,
+ TP_PROTO(struct xfs_inode *ip, int error, unsigned long caller_ip),
+ TP_ARGS(ip, error, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, error)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->error = error;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx error %d caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->error,
+ (char *)__entry->caller_ip)
+);
+
+#define DEFINE_INODE_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_inode_error_class, name, \
+ TP_PROTO(struct xfs_inode *ip, int error, \
+ unsigned long caller_ip), \
+ TP_ARGS(ip, error, caller_ip))
+
+/* reflink tracepoint classes */
+
+/* two-file io tracepoint class */
+DECLARE_EVENT_CLASS(xfs_double_io_class,
+ TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len,
+ struct xfs_inode *dest, xfs_off_t doffset),
+ TP_ARGS(src, soffset, len, dest, doffset),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, src_ino)
+ __field(loff_t, src_isize)
+ __field(loff_t, src_disize)
+ __field(loff_t, src_offset)
+ __field(long long, len)
+ __field(xfs_ino_t, dest_ino)
+ __field(loff_t, dest_isize)
+ __field(loff_t, dest_disize)
+ __field(loff_t, dest_offset)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(src)->i_sb->s_dev;
+ __entry->src_ino = src->i_ino;
+ __entry->src_isize = VFS_I(src)->i_size;
+ __entry->src_disize = src->i_disk_size;
+ __entry->src_offset = soffset;
+ __entry->len = len;
+ __entry->dest_ino = dest->i_ino;
+ __entry->dest_isize = VFS_I(dest)->i_size;
+ __entry->dest_disize = dest->i_disk_size;
+ __entry->dest_offset = doffset;
+ ),
+ TP_printk("dev %d:%d bytecount 0x%llx "
+ "ino 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx -> "
+ "ino 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->len,
+ __entry->src_ino,
+ __entry->src_isize,
+ __entry->src_disize,
+ __entry->src_offset,
+ __entry->dest_ino,
+ __entry->dest_isize,
+ __entry->dest_disize,
+ __entry->dest_offset)
+)
+
+#define DEFINE_DOUBLE_IO_EVENT(name) \
+DEFINE_EVENT(xfs_double_io_class, name, \
+ TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len, \
+ struct xfs_inode *dest, xfs_off_t doffset), \
+ TP_ARGS(src, soffset, len, dest, doffset))
+
+/* inode/irec events */
+DECLARE_EVENT_CLASS(xfs_inode_irec_class,
+ TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fileoff_t, lblk)
+ __field(xfs_extlen_t, len)
+ __field(xfs_fsblock_t, pblk)
+ __field(int, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->lblk = irec->br_startoff;
+ __entry->len = irec->br_blockcount;
+ __entry->pblk = irec->br_startblock;
+ __entry->state = irec->br_state;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx fsbcount 0x%x startblock 0x%llx st %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->lblk,
+ __entry->len,
+ __entry->pblk,
+ __entry->state)
+);
+#define DEFINE_INODE_IREC_EVENT(name) \
+DEFINE_EVENT(xfs_inode_irec_class, name, \
+ TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \
+ TP_ARGS(ip, irec))
+
+/* refcount/reflink tracepoint definitions */
+
+/* reflink tracepoints */
+DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
+DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
+DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
+TRACE_EVENT(xfs_reflink_remap_blocks,
+ TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
+ xfs_filblks_t len, struct xfs_inode *dest,
+ xfs_fileoff_t doffset),
+ TP_ARGS(src, soffset, len, dest, doffset),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, src_ino)
+ __field(xfs_fileoff_t, src_lblk)
+ __field(xfs_filblks_t, len)
+ __field(xfs_ino_t, dest_ino)
+ __field(xfs_fileoff_t, dest_lblk)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(src)->i_sb->s_dev;
+ __entry->src_ino = src->i_ino;
+ __entry->src_lblk = soffset;
+ __entry->len = len;
+ __entry->dest_ino = dest->i_ino;
+ __entry->dest_lblk = doffset;
+ ),
+ TP_printk("dev %d:%d fsbcount 0x%llx "
+ "ino 0x%llx fileoff 0x%llx -> ino 0x%llx fileoff 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->len,
+ __entry->src_ino,
+ __entry->src_lblk,
+ __entry->dest_ino,
+ __entry->dest_lblk)
+);
+DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest);
+
+/* dedupe tracepoints */
+DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error);
+
+/* ioctl tracepoints */
+TRACE_EVENT(xfs_ioctl_clone,
+ TP_PROTO(struct inode *src, struct inode *dest),
+ TP_ARGS(src, dest),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, src_ino)
+ __field(loff_t, src_isize)
+ __field(unsigned long, dest_ino)
+ __field(loff_t, dest_isize)
+ ),
+ TP_fast_assign(
+ __entry->dev = src->i_sb->s_dev;
+ __entry->src_ino = src->i_ino;
+ __entry->src_isize = i_size_read(src);
+ __entry->dest_ino = dest->i_ino;
+ __entry->dest_isize = i_size_read(dest);
+ ),
+ TP_printk("dev %d:%d ino 0x%lx isize 0x%llx -> ino 0x%lx isize 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->src_ino,
+ __entry->src_isize,
+ __entry->dest_ino,
+ __entry->dest_isize)
+);
+
+/* unshare tracepoints */
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
+
+/* copy on write */
+DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
+
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
+
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
+
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow);
+
+/* rmap swapext tracepoints */
+DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);
+DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece);
+DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error);
+
+/* fsmap traces */
+DECLARE_EVENT_CLASS(xfs_fsmap_class,
+ TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno,
+ const struct xfs_rmap_irec *rmap),
+ TP_ARGS(mp, keydev, agno, rmap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, keydev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_fsblock_t, bno)
+ __field(xfs_filblks_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->keydev = new_decode_dev(keydev);
+ __entry->agno = agno;
+ __entry->bno = rmap->rm_startblock;
+ __entry->len = rmap->rm_blockcount;
+ __entry->owner = rmap->rm_owner;
+ __entry->offset = rmap->rm_offset;
+ __entry->flags = rmap->rm_flags;
+ ),
+ TP_printk("dev %d:%d keydev %d:%d agno 0x%x startblock 0x%llx fsbcount 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->keydev), MINOR(__entry->keydev),
+ __entry->agno,
+ __entry->bno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+)
+#define DEFINE_FSMAP_EVENT(name) \
+DEFINE_EVENT(xfs_fsmap_class, name, \
+ TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \
+ const struct xfs_rmap_irec *rmap), \
+ TP_ARGS(mp, keydev, agno, rmap))
+DEFINE_FSMAP_EVENT(xfs_fsmap_low_key);
+DEFINE_FSMAP_EVENT(xfs_fsmap_high_key);
+DEFINE_FSMAP_EVENT(xfs_fsmap_mapping);
+
+DECLARE_EVENT_CLASS(xfs_getfsmap_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap),
+ TP_ARGS(mp, fsmap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, keydev)
+ __field(xfs_daddr_t, block)
+ __field(xfs_daddr_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(uint64_t, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->keydev = new_decode_dev(fsmap->fmr_device);
+ __entry->block = fsmap->fmr_physical;
+ __entry->len = fsmap->fmr_length;
+ __entry->owner = fsmap->fmr_owner;
+ __entry->offset = fsmap->fmr_offset;
+ __entry->flags = fsmap->fmr_flags;
+ ),
+ TP_printk("dev %d:%d keydev %d:%d daddr 0x%llx bbcount 0x%llx owner 0x%llx fileoff_daddr 0x%llx flags 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->keydev), MINOR(__entry->keydev),
+ __entry->block,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+)
+#define DEFINE_GETFSMAP_EVENT(name) \
+DEFINE_EVENT(xfs_getfsmap_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), \
+ TP_ARGS(mp, fsmap))
+DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key);
+DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key);
+DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping);
+
+DECLARE_EVENT_CLASS(xfs_trans_resv_class,
+ TP_PROTO(struct xfs_mount *mp, unsigned int type,
+ struct xfs_trans_res *res),
+ TP_ARGS(mp, type, res),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, type)
+ __field(uint, logres)
+ __field(int, logcount)
+ __field(int, logflags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->type = type;
+ __entry->logres = res->tr_logres;
+ __entry->logcount = res->tr_logcount;
+ __entry->logflags = res->tr_logflags;
+ ),
+ TP_printk("dev %d:%d type %d logres %u logcount %d flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->logres,
+ __entry->logcount,
+ __entry->logflags)
+)
+
+#define DEFINE_TRANS_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_trans_resv_class, name, \
+ TP_PROTO(struct xfs_mount *mp, unsigned int type, \
+ struct xfs_trans_res *res), \
+ TP_ARGS(mp, type, res))
+DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc);
+DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc_minlogsize);
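+/*
+ * xfs_trans_resv_calc is emitted for each precomputed reservation at mount
+ * time by xfs_trans_trace_reservations() in xfs_trans.c, added later in
+ * this patch.
+ */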
+
+TRACE_EVENT(xfs_log_get_max_trans_res,
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_trans_res *res),
+ TP_ARGS(mp, res),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint, logres)
+ __field(int, logcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->logres = res->tr_logres;
+ __entry->logcount = res->tr_logcount;
+ ),
+ TP_printk("dev %d:%d logres %u logcount %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->logres,
+ __entry->logcount)
+);
+
+DECLARE_EVENT_CLASS(xfs_trans_class,
+ TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip),
+ TP_ARGS(tp, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint32_t, tid)
+ __field(uint32_t, flags)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = tp->t_mountp->m_super->s_dev;
+ __entry->tid = 0;
+ if (tp->t_ticket)
+ __entry->tid = tp->t_ticket->t_tid;
+ __entry->flags = tp->t_flags;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d trans %x flags 0x%x caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->tid,
+ __entry->flags,
+ (char *)__entry->caller_ip)
+)
+
+#define DEFINE_TRANS_EVENT(name) \
+DEFINE_EVENT(xfs_trans_class, name, \
+ TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), \
+ TP_ARGS(tp, caller_ip))
+DEFINE_TRANS_EVENT(xfs_trans_alloc);
+DEFINE_TRANS_EVENT(xfs_trans_cancel);
+DEFINE_TRANS_EVENT(xfs_trans_commit);
+DEFINE_TRANS_EVENT(xfs_trans_dup);
+DEFINE_TRANS_EVENT(xfs_trans_free);
+DEFINE_TRANS_EVENT(xfs_trans_roll);
+DEFINE_TRANS_EVENT(xfs_trans_add_item);
+DEFINE_TRANS_EVENT(xfs_trans_commit_items);
+DEFINE_TRANS_EVENT(xfs_trans_free_items);
+
+TRACE_EVENT(xfs_iunlink_update_bucket,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
+ xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+ TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, old_ptr)
+ __field(xfs_agino_t, new_ptr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->bucket = bucket;
+ __entry->old_ptr = old_ptr;
+ __entry->new_ptr = new_ptr;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u old 0x%x new 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->old_ptr,
+ __entry->new_ptr)
+);
+
+TRACE_EVENT(xfs_iunlink_update_dinode,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+ TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, old_ptr)
+ __field(xfs_agino_t, new_ptr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->old_ptr = old_ptr;
+ __entry->new_ptr = new_ptr;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x old 0x%x new 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->old_ptr,
+ __entry->new_ptr)
+);
+
+DECLARE_EVENT_CLASS(xfs_ag_inode_class,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ ),
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno, __entry->agino)
+)
+
+#define DEFINE_AGINODE_EVENT(name) \
+DEFINE_EVENT(xfs_ag_inode_class, name, \
+ TP_PROTO(struct xfs_inode *ip), \
+ TP_ARGS(ip))
+DEFINE_AGINODE_EVENT(xfs_iunlink);
+DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
+
+DECLARE_EVENT_CLASS(xfs_fs_corrupt_class,
+ TP_PROTO(struct xfs_mount *mp, unsigned int flags),
+ TP_ARGS(mp, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->flags)
+);
+#define DEFINE_FS_CORRUPT_EVENT(name) \
+DEFINE_EVENT(xfs_fs_corrupt_class, name, \
+ TP_PROTO(struct xfs_mount *mp, unsigned int flags), \
+ TP_ARGS(mp, flags))
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption);
+
+DECLARE_EVENT_CLASS(xfs_ag_corrupt_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags),
+ TP_ARGS(mp, agno, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d agno 0x%x flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno, __entry->flags)
+);
+#define DEFINE_AG_CORRUPT_EVENT(name) \
+DEFINE_EVENT(xfs_ag_corrupt_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ unsigned int flags), \
+ TP_ARGS(mp, agno, flags))
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption);
+
+DECLARE_EVENT_CLASS(xfs_inode_corrupt_class,
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags),
+ TP_ARGS(ip, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino, __entry->flags)
+);
+#define DEFINE_INODE_CORRUPT_EVENT(name) \
+DEFINE_EVENT(xfs_inode_corrupt_class, name, \
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags), \
+ TP_ARGS(ip, flags))
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
+
+TRACE_EVENT(xfs_iwalk_ag,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t startino),
+ TP_ARGS(mp, agno, startino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = startino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x startino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+ __entry->startino)
+)
+
+TRACE_EVENT(xfs_iwalk_ag_rec,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_inobt_rec_incore *irec),
+ TP_ARGS(mp, agno, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(uint64_t, freemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = irec->ir_startino;
+ __entry->freemask = irec->ir_free;
+ ),
+ TP_printk("dev %d:%d agno 0x%x startino 0x%x freemask 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+ __entry->startino, __entry->freemask)
+)
+
+TRACE_EVENT(xfs_pwork_init,
+ TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid),
+ TP_ARGS(mp, nr_threads, pid),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, nr_threads)
+ __field(pid_t, pid)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->nr_threads = nr_threads;
+ __entry->pid = pid;
+ ),
+ TP_printk("dev %d:%d nr_threads %u pid %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->nr_threads, __entry->pid)
+)
+
+DECLARE_EVENT_CLASS(xfs_kmem_class,
+ TP_PROTO(ssize_t size, int flags, unsigned long caller_ip),
+ TP_ARGS(size, flags, caller_ip),
+ TP_STRUCT__entry(
+ __field(ssize_t, size)
+ __field(int, flags)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->size = size;
+ __entry->flags = flags;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("size %zd flags 0x%x caller %pS",
+ __entry->size,
+ __entry->flags,
+ (char *)__entry->caller_ip)
+)
+
+#define DEFINE_KMEM_EVENT(name) \
+DEFINE_EVENT(xfs_kmem_class, name, \
+ TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \
+ TP_ARGS(size, flags, caller_ip))
+DEFINE_KMEM_EVENT(kmem_alloc);
+
+TRACE_EVENT(xfs_check_new_dalign,
+ TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),
+ TP_ARGS(mp, new_dalign, calc_rootino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, new_dalign)
+ __field(xfs_ino_t, sb_rootino)
+ __field(xfs_ino_t, calc_rootino)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->new_dalign = new_dalign;
+ __entry->sb_rootino = mp->m_sb.sb_rootino;
+ __entry->calc_rootino = calc_rootino;
+ ),
+ TP_printk("dev %d:%d new_dalign %d sb_rootino 0x%llx calc_rootino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->new_dalign, __entry->sb_rootino,
+ __entry->calc_rootino)
+)
+
+TRACE_EVENT(xfs_btree_commit_afakeroot,
+ TP_PROTO(struct xfs_btree_cur *cur),
+ TP_ARGS(cur),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(unsigned int, levels)
+ __field(unsigned int, blocks)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agbno = cur->bc_ag.afake->af_root;
+ __entry->levels = cur->bc_ag.afake->af_levels;
+ __entry->blocks = cur->bc_ag.afake->af_blocks;
+ ),
+ TP_printk("dev %d:%d btree %s agno 0x%x levels %u blocks %u root %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->agno,
+ __entry->levels,
+ __entry->blocks,
+ __entry->agbno)
+)
+
+TRACE_EVENT(xfs_btree_commit_ifakeroot,
+ TP_PROTO(struct xfs_btree_cur *cur),
+ TP_ARGS(cur),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(unsigned int, levels)
+ __field(unsigned int, blocks)
+ __field(int, whichfork)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->agno = XFS_INO_TO_AGNO(cur->bc_mp,
+ cur->bc_ino.ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(cur->bc_mp,
+ cur->bc_ino.ip->i_ino);
+ __entry->levels = cur->bc_ino.ifake->if_levels;
+ __entry->blocks = cur->bc_ino.ifake->if_blocks;
+ __entry->whichfork = cur->bc_ino.whichfork;
+ ),
+ TP_printk("dev %d:%d btree %s agno 0x%x agino 0x%x whichfork %s levels %u blocks %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->agno,
+ __entry->agino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->levels,
+ __entry->blocks)
+)
+
+TRACE_EVENT(xfs_btree_bload_level_geometry,
+ TP_PROTO(struct xfs_btree_cur *cur, unsigned int level,
+ uint64_t nr_this_level, unsigned int nr_per_block,
+ unsigned int desired_npb, uint64_t blocks,
+ uint64_t blocks_with_extra),
+ TP_ARGS(cur, level, nr_this_level, nr_per_block, desired_npb, blocks,
+ blocks_with_extra),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(unsigned int, level)
+ __field(unsigned int, nlevels)
+ __field(uint64_t, nr_this_level)
+ __field(unsigned int, nr_per_block)
+ __field(unsigned int, desired_npb)
+ __field(unsigned long long, blocks)
+ __field(unsigned long long, blocks_with_extra)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->nlevels = cur->bc_nlevels;
+ __entry->nr_this_level = nr_this_level;
+ __entry->nr_per_block = nr_per_block;
+ __entry->desired_npb = desired_npb;
+ __entry->blocks = blocks;
+ __entry->blocks_with_extra = blocks_with_extra;
+ ),
+ TP_printk("dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->level,
+ __entry->nlevels,
+ __entry->nr_this_level,
+ __entry->nr_per_block,
+ __entry->desired_npb,
+ __entry->blocks,
+ __entry->blocks_with_extra)
+)
+
+TRACE_EVENT(xfs_btree_bload_block,
+ TP_PROTO(struct xfs_btree_cur *cur, unsigned int level,
+ uint64_t block_idx, uint64_t nr_blocks,
+ union xfs_btree_ptr *ptr, unsigned int nr_records),
+ TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(unsigned int, level)
+ __field(unsigned long long, block_idx)
+ __field(unsigned long long, nr_blocks)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(unsigned int, nr_records)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->block_idx = block_idx;
+ __entry->nr_blocks = nr_blocks;
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ xfs_fsblock_t fsb = be64_to_cpu(ptr->l);
+
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb);
+ __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb);
+ } else {
+ __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agbno = be32_to_cpu(ptr->s);
+ }
+ __entry->nr_records = nr_records;
+ ),
+ TP_printk("dev %d:%d btree %s level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->level,
+ __entry->block_idx,
+ __entry->nr_blocks,
+ __entry->agno,
+ __entry->agbno,
+ __entry->nr_records)
+)
+
+DECLARE_EVENT_CLASS(xfs_timestamp_range_class,
+ TP_PROTO(struct xfs_mount *mp, time64_t min, time64_t max),
+ TP_ARGS(mp, min, max),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(long long, min)
+ __field(long long, max)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->min = min;
+ __entry->max = max;
+ ),
+ TP_printk("dev %d:%d min %lld max %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->min,
+ __entry->max)
+)
+
+#define DEFINE_TIMESTAMP_RANGE_EVENT(name) \
+DEFINE_EVENT(xfs_timestamp_range_class, name, \
+	TP_PROTO(struct xfs_mount *mp, time64_t min, time64_t max), \
+ TP_ARGS(mp, min, max))
+DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range);
+DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range);
+
+DECLARE_EVENT_CLASS(xfs_icwalk_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_icwalk *icw,
+ unsigned long caller_ip),
+ TP_ARGS(mp, icw, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(__u32, flags)
+ __field(uint32_t, uid)
+ __field(uint32_t, gid)
+ __field(prid_t, prid)
+ __field(__u64, min_file_size)
+ __field(long, scan_limit)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->flags = icw ? icw->icw_flags : 0;
+ __entry->uid = icw ? from_kuid(mp->m_super->s_user_ns,
+ icw->icw_uid) : 0;
+ __entry->gid = icw ? from_kgid(mp->m_super->s_user_ns,
+ icw->icw_gid) : 0;
+ __entry->prid = icw ? icw->icw_prid : 0;
+ __entry->min_file_size = icw ? icw->icw_min_file_size : 0;
+ __entry->scan_limit = icw ? icw->icw_scan_limit : 0;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu scan_limit %ld caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->flags,
+ __entry->uid,
+ __entry->gid,
+ __entry->prid,
+ __entry->min_file_size,
+ __entry->scan_limit,
+ (char *)__entry->caller_ip)
+);
+#define DEFINE_ICWALK_EVENT(name) \
+DEFINE_EVENT(xfs_icwalk_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_icwalk *icw, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, icw, caller_ip))
+DEFINE_ICWALK_EVENT(xfs_ioc_free_eofblocks);
+DEFINE_ICWALK_EVENT(xfs_blockgc_free_space);
+
+TRACE_DEFINE_ENUM(XLOG_STATE_ACTIVE);
+TRACE_DEFINE_ENUM(XLOG_STATE_WANT_SYNC);
+TRACE_DEFINE_ENUM(XLOG_STATE_SYNCING);
+TRACE_DEFINE_ENUM(XLOG_STATE_DONE_SYNC);
+TRACE_DEFINE_ENUM(XLOG_STATE_CALLBACK);
+TRACE_DEFINE_ENUM(XLOG_STATE_DIRTY);
+
+DECLARE_EVENT_CLASS(xlog_iclog_class,
+ TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip),
+ TP_ARGS(iclog, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint32_t, state)
+ __field(int32_t, refcount)
+ __field(uint32_t, offset)
+ __field(uint32_t, flags)
+ __field(unsigned long long, lsn)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = iclog->ic_log->l_mp->m_super->s_dev;
+ __entry->state = iclog->ic_state;
+ __entry->refcount = atomic_read(&iclog->ic_refcnt);
+ __entry->offset = iclog->ic_offset;
+ __entry->flags = iclog->ic_flags;
+ __entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx flags %s caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->state, XLOG_STATE_STRINGS),
+ __entry->refcount,
+ __entry->offset,
+ __entry->lsn,
+ __print_flags(__entry->flags, "|", XLOG_ICL_STRINGS),
+ (char *)__entry->caller_ip)
+);
+
+#define DEFINE_ICLOG_EVENT(name) \
+DEFINE_EVENT(xlog_iclog_class, name, \
+ TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip), \
+ TP_ARGS(iclog, caller_ip))
+
+DEFINE_ICLOG_EVENT(xlog_iclog_activate);
+DEFINE_ICLOG_EVENT(xlog_iclog_clean);
+DEFINE_ICLOG_EVENT(xlog_iclog_callback);
+DEFINE_ICLOG_EVENT(xlog_iclog_callbacks_start);
+DEFINE_ICLOG_EVENT(xlog_iclog_callbacks_done);
+DEFINE_ICLOG_EVENT(xlog_iclog_force);
+DEFINE_ICLOG_EVENT(xlog_iclog_force_lsn);
+DEFINE_ICLOG_EVENT(xlog_iclog_get_space);
+DEFINE_ICLOG_EVENT(xlog_iclog_release);
+DEFINE_ICLOG_EVENT(xlog_iclog_switch);
+DEFINE_ICLOG_EVENT(xlog_iclog_sync);
+DEFINE_ICLOG_EVENT(xlog_iclog_syncing);
+DEFINE_ICLOG_EVENT(xlog_iclog_sync_done);
+DEFINE_ICLOG_EVENT(xlog_iclog_want_sync);
+DEFINE_ICLOG_EVENT(xlog_iclog_wait_on);
+DEFINE_ICLOG_EVENT(xlog_iclog_write);
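+/*
+ * The iclog tracepoints take a caller_ip so the trace shows which function
+ * drove the iclog state change; callers typically pass _RET_IP_.
+ */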
+
+TRACE_DEFINE_ENUM(XFS_DAS_UNINIT);
+TRACE_DEFINE_ENUM(XFS_DAS_SF_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_SF_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_OLD);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_ATTR);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_SET_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REPLACE);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_OLD);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_ATTR);
+TRACE_DEFINE_ENUM(XFS_DAS_DONE);
+
+DECLARE_EVENT_CLASS(xfs_das_state_class,
+ TP_PROTO(int das, struct xfs_inode *ip),
+ TP_ARGS(das, ip),
+ TP_STRUCT__entry(
+ __field(int, das)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->das = das;
+ __entry->ino = ip->i_ino;
+ ),
+ TP_printk("state change %s ino 0x%llx",
+ __print_symbolic(__entry->das, XFS_DAS_STRINGS),
+ __entry->ino)
+)
+
+#define DEFINE_DAS_STATE_EVENT(name) \
+DEFINE_EVENT(xfs_das_state_class, name, \
+ TP_PROTO(int das, struct xfs_inode *ip), \
+ TP_ARGS(das, ip))
+DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc);
+DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove);
+
+TRACE_EVENT(xfs_force_shutdown,
+ TP_PROTO(struct xfs_mount *mp, int ptag, int flags, const char *fname,
+ int line_num),
+ TP_ARGS(mp, ptag, flags, fname, line_num),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, ptag)
+ __field(int, flags)
+ __string(fname, fname)
+ __field(int, line_num)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ptag = ptag;
+ __entry->flags = flags;
+ __assign_str(fname, fname);
+ __entry->line_num = line_num;
+ ),
+ TP_printk("dev %d:%d tag %s flags %s file %s line_num %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_flags(__entry->ptag, "|", XFS_PTAG_STRINGS),
+ __print_flags(__entry->flags, "|", XFS_SHUTDOWN_STRINGS),
+ __get_str(fname),
+ __entry->line_num)
+);
+
+#endif /* _TRACE_XFS_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE xfs_trace
+#include <trace/define_trace.h>
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
new file mode 100644
index 000000000..7bd16fbff
--- /dev/null
+++ b/fs/xfs/xfs_trans.c
@@ -0,0 +1,1433 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_extent_busy.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+#include "xfs_icache.h"
+
+struct kmem_cache *xfs_trans_cache;
+
+#if defined(CONFIG_TRACEPOINTS)
+static void
+xfs_trans_trace_reservations(
+ struct xfs_mount *mp)
+{
+ struct xfs_trans_res *res;
+ struct xfs_trans_res *end_res;
+ int i;
+
+ res = (struct xfs_trans_res *)M_RES(mp);
+ end_res = (struct xfs_trans_res *)(M_RES(mp) + 1);
+ for (i = 0; res < end_res; i++, res++)
+ trace_xfs_trans_resv_calc(mp, i, res);
+}
+#else
+# define xfs_trans_trace_reservations(mp)
+#endif
+
+/*
+ * Initialize the precomputed transaction reservation values
+ * in the mount structure.
+ */
+void
+xfs_trans_init(
+ struct xfs_mount *mp)
+{
+ xfs_trans_resv_calc(mp, M_RES(mp));
+ xfs_trans_trace_reservations(mp);
+}
+
+/*
+ * Free the transaction structure. If there is more clean up
+ * to do when the structure is freed, add it here.
+ */
+STATIC void
+xfs_trans_free(
+ struct xfs_trans *tp)
+{
+ xfs_extent_busy_sort(&tp->t_busy);
+ xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
+
+ trace_xfs_trans_free(tp, _RET_IP_);
+ xfs_trans_clear_context(tp);
+ if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
+ sb_end_intwrite(tp->t_mountp->m_super);
+ xfs_trans_free_dqinfo(tp);
+ kmem_cache_free(xfs_trans_cache, tp);
+}
+
+/*
+ * This is called to create a new transaction which will share the
+ * permanent log reservation of the given transaction. The remaining
+ * unused block and rt extent reservations are also inherited. This
+ * implies that the original transaction is no longer allowed to allocate
+ * blocks. Locks and log items, however, are not inherited. They must
+ * be added to the new transaction explicitly.
+ */
+STATIC struct xfs_trans *
+xfs_trans_dup(
+ struct xfs_trans *tp)
+{
+ struct xfs_trans *ntp;
+
+ trace_xfs_trans_dup(tp, _RET_IP_);
+
+ ntp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
+
+ /*
+ * Initialize the new transaction structure.
+ */
+ ntp->t_magic = XFS_TRANS_HEADER_MAGIC;
+ ntp->t_mountp = tp->t_mountp;
+ INIT_LIST_HEAD(&ntp->t_items);
+ INIT_LIST_HEAD(&ntp->t_busy);
+ INIT_LIST_HEAD(&ntp->t_dfops);
+ ntp->t_firstblock = NULLFSBLOCK;
+
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ ASSERT(tp->t_ticket != NULL);
+
+ ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
+ (tp->t_flags & XFS_TRANS_RESERVE) |
+ (tp->t_flags & XFS_TRANS_NO_WRITECOUNT) |
+ (tp->t_flags & XFS_TRANS_RES_FDBLKS);
+ /* We gave our writer reference to the new transaction */
+ tp->t_flags |= XFS_TRANS_NO_WRITECOUNT;
+ ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
+
+ ASSERT(tp->t_blk_res >= tp->t_blk_res_used);
+ ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
+ tp->t_blk_res = tp->t_blk_res_used;
+
+ ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
+ tp->t_rtx_res = tp->t_rtx_res_used;
+
+ xfs_trans_switch_context(tp, ntp);
+
+ /* move deferred ops over to the new tp */
+ xfs_defer_move(ntp, tp);
+
+ xfs_trans_dup_dqinfo(tp, ntp);
+ return ntp;
+}
+
+/*
+ * This is called to reserve free disk blocks and log space for the
+ * given transaction. This must be done before allocating any resources
+ * within the transaction.
+ *
+ * This will return ENOSPC if there are not enough blocks available.
+ * It will sleep waiting for available log space.
+ * A permanent log reservation is indicated by XFS_TRANS_PERM_LOG_RES in
+ * resp->tr_logflags and is used by long running transactions. If any one of
+ * the reservations fails then they will all be backed out.
+ *
+ * This does not do quota reservations. That typically is done by the
+ * caller afterwards.
+ */
+static int
+xfs_trans_reserve(
+ struct xfs_trans *tp,
+ struct xfs_trans_res *resp,
+ uint blocks,
+ uint rtextents)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ int error = 0;
+ bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+
+ /*
+ * Attempt to reserve the needed disk blocks by decrementing
+ * the number needed from the number available. This will
+ * fail if the count would go below zero.
+ */
+ if (blocks > 0) {
+ error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
+ if (error != 0)
+ return -ENOSPC;
+ tp->t_blk_res += blocks;
+ }
+
+ /*
+ * Reserve the log space needed for this transaction.
+ */
+ if (resp->tr_logres > 0) {
+ bool permanent = false;
+
+ ASSERT(tp->t_log_res == 0 ||
+ tp->t_log_res == resp->tr_logres);
+ ASSERT(tp->t_log_count == 0 ||
+ tp->t_log_count == resp->tr_logcount);
+
+ if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
+ tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
+ permanent = true;
+ } else {
+ ASSERT(tp->t_ticket == NULL);
+ ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
+ }
+
+ if (tp->t_ticket != NULL) {
+ ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
+ error = xfs_log_regrant(mp, tp->t_ticket);
+ } else {
+ error = xfs_log_reserve(mp, resp->tr_logres,
+ resp->tr_logcount,
+ &tp->t_ticket, permanent);
+ }
+
+ if (error)
+ goto undo_blocks;
+
+ tp->t_log_res = resp->tr_logres;
+ tp->t_log_count = resp->tr_logcount;
+ }
+
+ /*
+ * Attempt to reserve the needed realtime extents by decrementing
+ * the number needed from the number available. This will
+ * fail if the count would go below zero.
+ */
+ if (rtextents > 0) {
+ error = xfs_mod_frextents(mp, -((int64_t)rtextents));
+ if (error) {
+ error = -ENOSPC;
+ goto undo_log;
+ }
+ tp->t_rtx_res += rtextents;
+ }
+
+ return 0;
+
+ /*
+ * Error cases jump to one of these labels to undo any
+ * reservations which have already been performed.
+ */
+undo_log:
+ if (resp->tr_logres > 0) {
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
+ tp->t_ticket = NULL;
+ tp->t_log_res = 0;
+ tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
+ }
+
+undo_blocks:
+ if (blocks > 0) {
+ xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
+ tp->t_blk_res = 0;
+ }
+ return error;
+}
+
+int
+xfs_trans_alloc(
+ struct xfs_mount *mp,
+ struct xfs_trans_res *resp,
+ uint blocks,
+ uint rtextents,
+ uint flags,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *tp;
+ bool want_retry = true;
+ int error;
+
+ /*
+ * Allocate the handle before we do our freeze accounting and setting up
+ * GFP_NOFS allocation context so that we avoid lockdep false positives
+ * by doing GFP_KERNEL allocations inside sb_start_intwrite().
+ */
+retry:
+ tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
+ if (!(flags & XFS_TRANS_NO_WRITECOUNT))
+ sb_start_intwrite(mp->m_super);
+ xfs_trans_set_context(tp);
+
+ /*
+ * Zero-reservation ("empty") transactions can't modify anything, so
+ * they're allowed to run while we're frozen.
+ */
+ WARN_ON(resp->tr_logres > 0 &&
+ mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
+ ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) ||
+ xfs_has_lazysbcount(mp));
+
+ tp->t_magic = XFS_TRANS_HEADER_MAGIC;
+ tp->t_flags = flags;
+ tp->t_mountp = mp;
+ INIT_LIST_HEAD(&tp->t_items);
+ INIT_LIST_HEAD(&tp->t_busy);
+ INIT_LIST_HEAD(&tp->t_dfops);
+ tp->t_firstblock = NULLFSBLOCK;
+
+ error = xfs_trans_reserve(tp, resp, blocks, rtextents);
+ if (error == -ENOSPC && want_retry) {
+ xfs_trans_cancel(tp);
+
+ /*
+ * We weren't able to reserve enough space for the transaction.
+ * Flush the other speculative space allocations to free space.
+ * Do not perform a synchronous scan because callers can hold
+ * other locks.
+ */
+ xfs_blockgc_flush_all(mp);
+ want_retry = false;
+ goto retry;
+ }
+ if (error) {
+ xfs_trans_cancel(tp);
+ return error;
+ }
+
+ trace_xfs_trans_alloc(tp, _RET_IP_);
+
+ *tpp = tp;
+ return 0;
+}
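+
+/*
+ * Sketch of the usual xfs_trans_alloc() calling convention: reserve up
+ * front, make the modifications, then commit or cancel exactly once. The
+ * helper name and reservation values below are illustrative only.
+ */
+static inline int
+xfs_example_simple_update(
+ struct xfs_mount *mp,
+ struct xfs_trans_res *resv,
+ unsigned int blocks)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc(mp, resv, blocks, 0, 0, &tp);
+ if (error)
+ return error;
+
+ /* ... join objects and log changes against tp here ... */
+
+ return xfs_trans_commit(tp);
+}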
+
+/*
+ * Create an empty transaction with no reservation. This is a defensive
+ * mechanism for routines that query metadata without actually modifying them --
+ * if the metadata being queried is somehow cross-linked (think a btree block
+ * pointer that points higher in the tree), we risk deadlock. However, blocks
+ * grabbed as part of a transaction can be re-grabbed. The verifiers will
+ * notice the corrupt block and the operation will fail back to userspace
+ * without deadlocking.
+ *
+ * Note the zero-length reservation; this transaction MUST be cancelled without
+ * any dirty data.
+ *
+ * Callers should obtain freeze protection to avoid a conflict with fs freezing
+ * where we can be grabbing buffers at the same time that freeze is trying to
+ * drain the buffer LRU list.
+ */
+int
+xfs_trans_alloc_empty(
+ struct xfs_mount *mp,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans_res resv = {0};
+
+ return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp);
+}
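+
+/*
+ * Sketch: an empty transaction wraps a read-only metadata walk and must
+ * always be torn down with xfs_trans_cancel(), since nothing may be dirtied
+ * under it. The query helper below is hypothetical.
+ */
+static inline int
+xfs_example_query_metadata(
+ struct xfs_mount *mp)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ /* ... read-only lookups that may re-grab already locked buffers ... */
+
+ xfs_trans_cancel(tp);
+ return 0;
+}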
+
+/*
+ * Record the indicated change to the given field for application
+ * to the file system's superblock when the transaction commits.
+ * For now, just store the change in the transaction structure.
+ *
+ * Mark the transaction structure to indicate that the superblock
+ * needs to be updated before committing.
+ *
+ * Because we may not be keeping track of allocated/free inodes and
+ * used filesystem blocks in the superblock, we do not mark the
+ * superblock dirty in this transaction if we modify these fields.
+ * We still need to update the transaction deltas so that they get
+ * applied to the incore superblock, but we don't want them to
+ * cause the superblock to get locked and logged if these are the
+ * only fields in the superblock that the transaction modifies.
+ */
+void
+xfs_trans_mod_sb(
+ xfs_trans_t *tp,
+ uint field,
+ int64_t delta)
+{
+ uint32_t flags = (XFS_TRANS_DIRTY|XFS_TRANS_SB_DIRTY);
+ xfs_mount_t *mp = tp->t_mountp;
+
+ switch (field) {
+ case XFS_TRANS_SB_ICOUNT:
+ tp->t_icount_delta += delta;
+ if (xfs_has_lazysbcount(mp))
+ flags &= ~XFS_TRANS_SB_DIRTY;
+ break;
+ case XFS_TRANS_SB_IFREE:
+ tp->t_ifree_delta += delta;
+ if (xfs_has_lazysbcount(mp))
+ flags &= ~XFS_TRANS_SB_DIRTY;
+ break;
+ case XFS_TRANS_SB_FDBLOCKS:
+ /*
+ * Track the number of blocks allocated in the transaction.
+ * Make sure it does not exceed the number reserved. If so,
+ * shutdown as this can lead to accounting inconsistency.
+ */
+ if (delta < 0) {
+ tp->t_blk_res_used += (uint)-delta;
+ if (tp->t_blk_res_used > tp->t_blk_res)
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ } else if (delta > 0 && (tp->t_flags & XFS_TRANS_RES_FDBLKS)) {
+ int64_t blkres_delta;
+
+ /*
+ * Return freed blocks directly to the reservation
+ * instead of the global pool, being careful not to
+ * overflow the trans counter. This is used to preserve
+ * reservation across chains of transaction rolls that
+ * repeatedly free and allocate blocks.
+ */
+ blkres_delta = min_t(int64_t, delta,
+ UINT_MAX - tp->t_blk_res);
+ tp->t_blk_res += blkres_delta;
+ delta -= blkres_delta;
+ }
+ tp->t_fdblocks_delta += delta;
+ if (xfs_has_lazysbcount(mp))
+ flags &= ~XFS_TRANS_SB_DIRTY;
+ break;
+ case XFS_TRANS_SB_RES_FDBLOCKS:
+ /*
+ * The allocation has already been applied to the
+ * in-core superblock's counter. This should only
+ * be applied to the on-disk superblock.
+ */
+ tp->t_res_fdblocks_delta += delta;
+ if (xfs_has_lazysbcount(mp))
+ flags &= ~XFS_TRANS_SB_DIRTY;
+ break;
+ case XFS_TRANS_SB_FREXTENTS:
+ /*
+ * Track the number of realtime extents allocated in the
+ * transaction. Make sure it does not exceed the
+ * number reserved.
+ */
+ if (delta < 0) {
+ tp->t_rtx_res_used += (uint)-delta;
+ ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res);
+ }
+ tp->t_frextents_delta += delta;
+ break;
+ case XFS_TRANS_SB_RES_FREXTENTS:
+ /*
+ * The allocation has already been applied to the
+ * in-core superblock's counter. This should only
+ * be applied to the on-disk superblock.
+ */
+ ASSERT(delta < 0);
+ tp->t_res_frextents_delta += delta;
+ break;
+ case XFS_TRANS_SB_DBLOCKS:
+ tp->t_dblocks_delta += delta;
+ break;
+ case XFS_TRANS_SB_AGCOUNT:
+ ASSERT(delta > 0);
+ tp->t_agcount_delta += delta;
+ break;
+ case XFS_TRANS_SB_IMAXPCT:
+ tp->t_imaxpct_delta += delta;
+ break;
+ case XFS_TRANS_SB_REXTSIZE:
+ tp->t_rextsize_delta += delta;
+ break;
+ case XFS_TRANS_SB_RBMBLOCKS:
+ tp->t_rbmblocks_delta += delta;
+ break;
+ case XFS_TRANS_SB_RBLOCKS:
+ tp->t_rblocks_delta += delta;
+ break;
+ case XFS_TRANS_SB_REXTENTS:
+ tp->t_rextents_delta += delta;
+ break;
+ case XFS_TRANS_SB_REXTSLOG:
+ tp->t_rextslog_delta += delta;
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ tp->t_flags |= flags;
+}
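+
+/*
+ * Sketch: allocation paths account consumed free space against the
+ * transaction by passing a negative delta, and freeing paths pass a
+ * positive one. The wrapper below is illustrative only.
+ */
+static inline void
+xfs_example_account_extent(
+ struct xfs_trans *tp,
+ xfs_extlen_t len,
+ bool freeing)
+{
+ int64_t delta = freeing ? (int64_t)len : -(int64_t)len;
+
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, delta);
+}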
+
+/*
+ * xfs_trans_apply_sb_deltas() is called from the commit code
+ * to bring the superblock buffer into the current transaction
+ * and modify it as requested by earlier calls to xfs_trans_mod_sb().
+ *
+ * For now we just look at each field allowed to change and change
+ * it if necessary.
+ */
+STATIC void
+xfs_trans_apply_sb_deltas(
+ xfs_trans_t *tp)
+{
+ struct xfs_dsb *sbp;
+ struct xfs_buf *bp;
+ int whole = 0;
+
+ bp = xfs_trans_getsb(tp);
+ sbp = bp->b_addr;
+
+ /*
+ * Only update the superblock counters if we are logging them
+ */
+ if (!xfs_has_lazysbcount((tp->t_mountp))) {
+ if (tp->t_icount_delta)
+ be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta);
+ if (tp->t_ifree_delta)
+ be64_add_cpu(&sbp->sb_ifree, tp->t_ifree_delta);
+ if (tp->t_fdblocks_delta)
+ be64_add_cpu(&sbp->sb_fdblocks, tp->t_fdblocks_delta);
+ if (tp->t_res_fdblocks_delta)
+ be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta);
+ }
+
+ /*
+ * Updating frextents requires careful handling because it does not
+ * behave like the lazysb counters because we cannot rely on log
+ * recovery in older kernels to recompute the value from the rtbitmap.
+ * This means that the ondisk frextents must be consistent with the
+ * rtbitmap.
+ *
+ * Therefore, log the frextents change to the ondisk superblock and
+ * update the incore superblock so that future calls to xfs_log_sb
+ * write the correct value ondisk.
+ *
+ * Don't touch m_frextents because it includes incore reservations,
+ * and those are handled by the unreserve function.
+ */
+ if (tp->t_frextents_delta || tp->t_res_frextents_delta) {
+ struct xfs_mount *mp = tp->t_mountp;
+ int64_t rtxdelta;
+
+ rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta;
+
+ spin_lock(&mp->m_sb_lock);
+ be64_add_cpu(&sbp->sb_frextents, rtxdelta);
+ mp->m_sb.sb_frextents += rtxdelta;
+ spin_unlock(&mp->m_sb_lock);
+ }
+
+ if (tp->t_dblocks_delta) {
+ be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
+ whole = 1;
+ }
+ if (tp->t_agcount_delta) {
+ be32_add_cpu(&sbp->sb_agcount, tp->t_agcount_delta);
+ whole = 1;
+ }
+ if (tp->t_imaxpct_delta) {
+ sbp->sb_imax_pct += tp->t_imaxpct_delta;
+ whole = 1;
+ }
+ if (tp->t_rextsize_delta) {
+ be32_add_cpu(&sbp->sb_rextsize, tp->t_rextsize_delta);
+ whole = 1;
+ }
+ if (tp->t_rbmblocks_delta) {
+ be32_add_cpu(&sbp->sb_rbmblocks, tp->t_rbmblocks_delta);
+ whole = 1;
+ }
+ if (tp->t_rblocks_delta) {
+ be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta);
+ whole = 1;
+ }
+ if (tp->t_rextents_delta) {
+ be64_add_cpu(&sbp->sb_rextents, tp->t_rextents_delta);
+ whole = 1;
+ }
+ if (tp->t_rextslog_delta) {
+ sbp->sb_rextslog += tp->t_rextslog_delta;
+ whole = 1;
+ }
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+ if (whole)
+ /*
+ * Log the whole thing, the fields are noncontiguous.
+ */
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
+ else
+ /*
+ * Since all the modifiable fields are contiguous, we
+ * can get away with this.
+ */
+ xfs_trans_log_buf(tp, bp, offsetof(struct xfs_dsb, sb_icount),
+ offsetof(struct xfs_dsb, sb_frextents) +
+ sizeof(sbp->sb_frextents) - 1);
+}
+
+/*
+ * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations and
+ * apply superblock counter changes to the in-core superblock. The
+ * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT
+ * applied to the in-core superblock because those updates have already been
+ * made.
+ *
+ * If we are not logging superblock counters, then the inode allocated/free and
+ * used block counts are not updated in the on disk superblock. In this case,
+ * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
+ * still need to update the incore superblock with the changes.
+ *
+ * Deltas for the inode count are +/-64, hence we use a large batch size of 128
+ * so we don't need to take the counter lock on every update.
+ */
+#define XFS_ICOUNT_BATCH 128
+
+void
+xfs_trans_unreserve_and_mod_sb(
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+ int64_t blkdelta = 0;
+ int64_t rtxdelta = 0;
+ int64_t idelta = 0;
+ int64_t ifreedelta = 0;
+ int error;
+
+ /* calculate deltas */
+ if (tp->t_blk_res > 0)
+ blkdelta = tp->t_blk_res;
+ if ((tp->t_fdblocks_delta != 0) &&
+ (xfs_has_lazysbcount(mp) ||
+ (tp->t_flags & XFS_TRANS_SB_DIRTY)))
+ blkdelta += tp->t_fdblocks_delta;
+
+ if (tp->t_rtx_res > 0)
+ rtxdelta = tp->t_rtx_res;
+ if ((tp->t_frextents_delta != 0) &&
+ (tp->t_flags & XFS_TRANS_SB_DIRTY))
+ rtxdelta += tp->t_frextents_delta;
+
+ if (xfs_has_lazysbcount(mp) ||
+ (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
+ idelta = tp->t_icount_delta;
+ ifreedelta = tp->t_ifree_delta;
+ }
+
+ /* apply the per-cpu counters */
+ if (blkdelta) {
+ error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
+ ASSERT(!error);
+ }
+
+ if (idelta)
+ percpu_counter_add_batch(&mp->m_icount, idelta,
+ XFS_ICOUNT_BATCH);
+
+ if (ifreedelta)
+ percpu_counter_add(&mp->m_ifree, ifreedelta);
+
+ if (rtxdelta) {
+ error = xfs_mod_frextents(mp, rtxdelta);
+ ASSERT(!error);
+ }
+
+ if (!(tp->t_flags & XFS_TRANS_SB_DIRTY))
+ return;
+
+ /* apply remaining deltas */
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta;
+ mp->m_sb.sb_icount += idelta;
+ mp->m_sb.sb_ifree += ifreedelta;
+ /*
+ * Do not touch sb_frextents here because we are dealing with incore
+ * reservation. sb_frextents is not part of the lazy sb counters so it
+ * must be consistent with the ondisk rtbitmap and must never include
+ * incore reservations.
+ */
+ mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
+ mp->m_sb.sb_agcount += tp->t_agcount_delta;
+ mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;
+ mp->m_sb.sb_rextsize += tp->t_rextsize_delta;
+ mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta;
+ mp->m_sb.sb_rblocks += tp->t_rblocks_delta;
+ mp->m_sb.sb_rextents += tp->t_rextents_delta;
+ mp->m_sb.sb_rextslog += tp->t_rextslog_delta;
+ spin_unlock(&mp->m_sb_lock);
+
+ /*
+ * Debug checks outside of the spinlock so they don't lock up the
+ * machine if they fail.
+ */
+ ASSERT(mp->m_sb.sb_imax_pct >= 0);
+ ASSERT(mp->m_sb.sb_rextslog >= 0);
+ return;
+}
+
+/* Add the given log item to the transaction's list of log items. */
+void
+xfs_trans_add_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *lip)
+{
+ ASSERT(lip->li_log == tp->t_mountp->m_log);
+ ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
+ ASSERT(list_empty(&lip->li_trans));
+ ASSERT(!test_bit(XFS_LI_DIRTY, &lip->li_flags));
+
+ list_add_tail(&lip->li_trans, &tp->t_items);
+ trace_xfs_trans_add_item(tp, _RET_IP_);
+}
+
+/*
+ * Unlink the log item from the transaction. The log item is no longer
+ * considered dirty in this transaction, as the linked transaction has
+ * finished, either by abort or commit completion.
+ */
+void
+xfs_trans_del_item(
+ struct xfs_log_item *lip)
+{
+ clear_bit(XFS_LI_DIRTY, &lip->li_flags);
+ list_del_init(&lip->li_trans);
+}
+
+/* Detach and unlock all of the items in a transaction */
+static void
+xfs_trans_free_items(
+ struct xfs_trans *tp,
+ bool abort)
+{
+ struct xfs_log_item *lip, *next;
+
+ trace_xfs_trans_free_items(tp, _RET_IP_);
+
+ list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+ xfs_trans_del_item(lip);
+ if (abort)
+ set_bit(XFS_LI_ABORTED, &lip->li_flags);
+ if (lip->li_ops->iop_release)
+ lip->li_ops->iop_release(lip);
+ }
+}
+
+static inline void
+xfs_log_item_batch_insert(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ struct xfs_log_item **log_items,
+ int nr_items,
+ xfs_lsn_t commit_lsn)
+{
+ int i;
+
+ spin_lock(&ailp->ail_lock);
+ /* xfs_trans_ail_update_bulk drops ailp->ail_lock */
+ xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn);
+
+ for (i = 0; i < nr_items; i++) {
+ struct xfs_log_item *lip = log_items[i];
+
+ if (lip->li_ops->iop_unpin)
+ lip->li_ops->iop_unpin(lip, 0);
+ }
+}
+
+/*
+ * Bulk operation version of xfs_trans_committed that takes a log vector of
+ * items to insert into the AIL. This uses bulk AIL insertion techniques to
+ * minimise lock traffic.
+ *
+ * If we are called with the aborted flag set, it is because a log write during
+ * a CIL checkpoint commit has failed. In this case, all the items in the
+ * checkpoint have already gone through iop_committed and iop_committing, which
+ * means that checkpoint commit abort handling is treated exactly the same
+ * as an iclog write error even though we haven't started any IO yet. Hence in
+ * this case all we need to do is iop_committed processing, followed by an
+ * iop_unpin(aborted) call.
+ *
+ * The AIL cursor is used to optimise the insert process. If commit_lsn is not
+ * at the end of the AIL, the insert cursor avoids the need to walk
+ * the AIL to find the insertion point on every xfs_log_item_batch_insert()
+ * call. This saves a lot of needless list walking and is a net win, even
+ * though it slightly increases the amount of AIL lock traffic to set it up
+ * and tear it down.
+ */
+void
+xfs_trans_committed_bulk(
+ struct xfs_ail *ailp,
+ struct list_head *lv_chain,
+ xfs_lsn_t commit_lsn,
+ bool aborted)
+{
+#define LOG_ITEM_BATCH_SIZE 32
+ struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
+ struct xfs_log_vec *lv;
+ struct xfs_ail_cursor cur;
+ int i = 0;
+
+ spin_lock(&ailp->ail_lock);
+ xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn);
+ spin_unlock(&ailp->ail_lock);
+
+ /* unpin all the log items */
+ list_for_each_entry(lv, lv_chain, lv_list) {
+ struct xfs_log_item *lip = lv->lv_item;
+ xfs_lsn_t item_lsn;
+
+ if (aborted)
+ set_bit(XFS_LI_ABORTED, &lip->li_flags);
+
+ if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) {
+ lip->li_ops->iop_release(lip);
+ continue;
+ }
+
+ if (lip->li_ops->iop_committed)
+ item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
+ else
+ item_lsn = commit_lsn;
+
+ /* item_lsn of -1 means the item needs no further processing */
+ if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
+ continue;
+
+ /*
+ * if we are aborting the operation, no point in inserting the
+ * object into the AIL as we are in a shutdown situation.
+ */
+ if (aborted) {
+ ASSERT(xlog_is_shutdown(ailp->ail_log));
+ if (lip->li_ops->iop_unpin)
+ lip->li_ops->iop_unpin(lip, 1);
+ continue;
+ }
+
+ if (item_lsn != commit_lsn) {
+
+ /*
+ * Not a bulk update option due to unusual item_lsn.
+ * Push into AIL immediately, rechecking the lsn once
+ * we have the ail lock. Then unpin the item. This does
+ * not affect the AIL cursor the bulk insert path is
+ * using.
+ */
+ spin_lock(&ailp->ail_lock);
+ if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
+ xfs_trans_ail_update(ailp, lip, item_lsn);
+ else
+ spin_unlock(&ailp->ail_lock);
+ if (lip->li_ops->iop_unpin)
+ lip->li_ops->iop_unpin(lip, 0);
+ continue;
+ }
+
+ /* Item is a candidate for bulk AIL insert. */
+ log_items[i++] = lv->lv_item;
+ if (i >= LOG_ITEM_BATCH_SIZE) {
+ xfs_log_item_batch_insert(ailp, &cur, log_items,
+ LOG_ITEM_BATCH_SIZE, commit_lsn);
+ i = 0;
+ }
+ }
+
+ /* make sure we insert the remainder! */
+ if (i)
+ xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn);
+
+ spin_lock(&ailp->ail_lock);
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->ail_lock);
+}
+
+/*
+ * Sort transaction items prior to running precommit operations. This will
+ * attempt to order the items such that they will always be locked in the same
+ * order. Items that have no sort function are moved to the end of the list
+ * and so are locked last.
+ *
+ * This may need refinement as different types of objects add sort functions.
+ *
+ * Function is more complex than it needs to be because we are comparing 64 bit
+ * values and the function only returns 32 bit values.
+ */
+static int
+xfs_trans_precommit_sort(
+ void *unused_arg,
+ const struct list_head *a,
+ const struct list_head *b)
+{
+ struct xfs_log_item *lia = container_of(a,
+ struct xfs_log_item, li_trans);
+ struct xfs_log_item *lib = container_of(b,
+ struct xfs_log_item, li_trans);
+ int64_t diff;
+
+ /*
+ * If both items are non-sortable, leave them alone. If only one is
+ * sortable, move the non-sortable item towards the end of the list.
+ */
+ if (!lia->li_ops->iop_sort && !lib->li_ops->iop_sort)
+ return 0;
+ if (!lia->li_ops->iop_sort)
+ return 1;
+ if (!lib->li_ops->iop_sort)
+ return -1;
+
+ diff = lia->li_ops->iop_sort(lia) - lib->li_ops->iop_sort(lib);
+ if (diff < 0)
+ return -1;
+ if (diff > 0)
+ return 1;
+ return 0;
+}
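+
+/*
+ * Sketch of the truncation hazard the explicit comparisons above avoid:
+ * casting the 64 bit difference straight to int can drop or flip the sign,
+ * e.g. a difference of 1ULL << 32 would truncate to 0 and report "equal".
+ */
+static inline int
+xfs_example_truncating_cmp(
+ uint64_t a,
+ uint64_t b)
+{
+ return (int)(a - b); /* do not do this */
+}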
+
+/*
+ * Run transaction precommit functions.
+ *
+ * If there is an error in any of the callouts, then stop immediately and
+ * trigger a shutdown to abort the transaction. There is no recovery possible
+ * from errors at this point as the transaction is dirty....
+ */
+static int
+xfs_trans_run_precommits(
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_log_item *lip, *n;
+ int error = 0;
+
+ /*
+ * Sort the item list to avoid ABBA deadlocks with other transactions
+ * running precommit operations that lock multiple shared items such as
+ * inode cluster buffers.
+ */
+ list_sort(NULL, &tp->t_items, xfs_trans_precommit_sort);
+
+ /*
+ * Precommit operations can remove the log item from the transaction
+ * if the log item exists purely to delay modifications until they
+ * can be ordered against other operations. Hence we have to use
+ * list_for_each_entry_safe() here.
+ */
+ list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
+ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
+ continue;
+ if (lip->li_ops->iop_precommit) {
+ error = lip->li_ops->iop_precommit(tp, lip);
+ if (error)
+ break;
+ }
+ }
+ if (error)
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+}
+
+/*
+ * Commit the given transaction to the log.
+ *
+ * XFS disk error handling mechanism is not based on a typical
+ * transaction abort mechanism. Logically after the filesystem
+ * gets marked 'SHUTDOWN', we can't let any new transactions
+ * be durable - ie. committed to disk - because some metadata might
+ * be inconsistent. In such cases, this returns an error, and the
+ * caller may assume that all locked objects joined to the transaction
+ * have already been unlocked as if the commit had succeeded.
+ * Do not reference the transaction structure after this call.
+ */
+static int
+__xfs_trans_commit(
+ struct xfs_trans *tp,
+ bool regrant)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xlog *log = mp->m_log;
+ xfs_csn_t commit_seq = 0;
+ int error = 0;
+ int sync = tp->t_flags & XFS_TRANS_SYNC;
+
+ trace_xfs_trans_commit(tp, _RET_IP_);
+
+ error = xfs_trans_run_precommits(tp);
+ if (error) {
+ if (tp->t_flags & XFS_TRANS_PERM_LOG_RES)
+ xfs_defer_cancel(tp);
+ goto out_unreserve;
+ }
+
+ /*
+ * Finish deferred items on final commit. Only permanent transactions
+ * should ever have deferred ops.
+ */
+ WARN_ON_ONCE(!list_empty(&tp->t_dfops) &&
+ !(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
+ if (!regrant && (tp->t_flags & XFS_TRANS_PERM_LOG_RES)) {
+ error = xfs_defer_finish_noroll(&tp);
+ if (error)
+ goto out_unreserve;
+ }
+
+ /*
+ * If there is nothing to be logged by the transaction,
+ * then unlock all of the items associated with the
+ * transaction and free the transaction structure.
+ * Also make sure to return any reserved blocks to
+ * the free pool.
+ */
+ if (!(tp->t_flags & XFS_TRANS_DIRTY))
+ goto out_unreserve;
+
+ /*
+ * We must check against log shutdown here because we cannot abort log
+ * items and leave them dirty, inconsistent and unpinned in memory while
+ * the log is active. This leaves them open to being written back to
+ * disk, and that will lead to on-disk corruption.
+ */
+ if (xlog_is_shutdown(log)) {
+ error = -EIO;
+ goto out_unreserve;
+ }
+
+ ASSERT(tp->t_ticket != NULL);
+
+ /*
+ * If we need to update the superblock, then do it now.
+ */
+ if (tp->t_flags & XFS_TRANS_SB_DIRTY)
+ xfs_trans_apply_sb_deltas(tp);
+ xfs_trans_apply_dquot_deltas(tp);
+
+ xlog_cil_commit(log, tp, &commit_seq, regrant);
+
+ xfs_trans_free(tp);
+
+ /*
+ * If the transaction needs to be synchronous, then force the
+ * log out now and wait for it.
+ */
+ if (sync) {
+ error = xfs_log_force_seq(mp, commit_seq, XFS_LOG_SYNC, NULL);
+ XFS_STATS_INC(mp, xs_trans_sync);
+ } else {
+ XFS_STATS_INC(mp, xs_trans_async);
+ }
+
+ return error;
+
+out_unreserve:
+ xfs_trans_unreserve_and_mod_sb(tp);
+
+ /*
+ * It is indeed possible for the transaction to be not dirty but
+ * the dqinfo portion to be. All that means is that we have some
+ * (non-persistent) quota reservations that need to be unreserved.
+ */
+ xfs_trans_unreserve_and_mod_dquots(tp);
+ if (tp->t_ticket) {
+ if (regrant && !xlog_is_shutdown(log))
+ xfs_log_ticket_regrant(log, tp->t_ticket);
+ else
+ xfs_log_ticket_ungrant(log, tp->t_ticket);
+ tp->t_ticket = NULL;
+ }
+ xfs_trans_free_items(tp, !!error);
+ xfs_trans_free(tp);
+
+ XFS_STATS_INC(mp, xs_trans_empty);
+ return error;
+}
+
+int
+xfs_trans_commit(
+ struct xfs_trans *tp)
+{
+ return __xfs_trans_commit(tp, false);
+}
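+
+/*
+ * Sketch: a caller that needs the commit to be durable before returning
+ * marks the transaction synchronous first, which makes __xfs_trans_commit()
+ * force and wait for the log. The helper name is illustrative.
+ */
+static inline int
+xfs_example_commit_sync(
+ struct xfs_trans *tp)
+{
+ xfs_trans_set_sync(tp);
+ return xfs_trans_commit(tp);
+}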
+
+/*
+ * Unlock all of the transaction's items and free the transaction. If the
+ * transaction is dirty, we must shut down the filesystem because there is no
+ * way to restore them to their previous state.
+ *
+ * If the transaction has made a log reservation, make sure to release it as
+ * well.
+ *
+ * This is a high level function (equivalent to xfs_trans_commit()) and so can
+ * be called after the transaction has effectively been aborted due to the mount
+ * being shut down. However, if the mount has not been shut down and the
+ * transaction is dirty we will shut the mount down and, in doing so, that
+ * guarantees that the log is shut down, too. Hence we don't need to be as
+ * careful with shutdown state and dirty items here as we need to be in
+ * xfs_trans_commit().
+ */
+void
+xfs_trans_cancel(
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xlog *log = mp->m_log;
+ bool dirty = (tp->t_flags & XFS_TRANS_DIRTY);
+
+ trace_xfs_trans_cancel(tp, _RET_IP_);
+
+ /*
+ * It's never valid to cancel a transaction with deferred ops attached,
+ * because the transaction is effectively dirty. Complain about this
+ * loudly before freeing the in-memory defer items.
+ */
+ if (!list_empty(&tp->t_dfops)) {
+ ASSERT(xfs_is_shutdown(mp) || list_empty(&tp->t_dfops));
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ dirty = true;
+ xfs_defer_cancel(tp);
+ }
+
+ /*
+ * See if the caller is relying on us to shut down the filesystem. We
+ * only want an error report if there isn't already a shutdown in
+ * progress, so we only need to check against the mount shutdown state
+ * here.
+ */
+ if (dirty && !xfs_is_shutdown(mp)) {
+ XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+#ifdef DEBUG
+ /* Log items need to be consistent until the log is shut down. */
+ if (!dirty && !xlog_is_shutdown(log)) {
+ struct xfs_log_item *lip;
+
+ list_for_each_entry(lip, &tp->t_items, li_trans)
+ ASSERT(!xlog_item_is_intent_done(lip));
+ }
+#endif
+ xfs_trans_unreserve_and_mod_sb(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp);
+
+ if (tp->t_ticket) {
+ xfs_log_ticket_ungrant(log, tp->t_ticket);
+ tp->t_ticket = NULL;
+ }
+
+ xfs_trans_free_items(tp, dirty);
+ xfs_trans_free(tp);
+}
+
+/*
+ * Roll from one trans in the sequence of PERMANENT transactions to
+ * the next: permanent transactions are only flushed out when
+ * committed with xfs_trans_commit(), but we still want to let chunks of it
+ * go to the log as soon as possible. So we commit the
+ * chunk we've been working on and get a new transaction to continue.
+ */
+int
+xfs_trans_roll(
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *trans = *tpp;
+ struct xfs_trans_res tres;
+ int error;
+
+ trace_xfs_trans_roll(trans, _RET_IP_);
+
+ /*
+ * Copy the critical parameters from one trans to the next.
+ */
+ tres.tr_logres = trans->t_log_res;
+ tres.tr_logcount = trans->t_log_count;
+
+ *tpp = xfs_trans_dup(trans);
+
+ /*
+ * Commit the current transaction.
+ * If this commit failed, then it'd just unlock those items that
+ * are not marked ihold. That also means that a filesystem shutdown
+ * is in progress. The caller takes the responsibility to cancel
+ * the duplicate transaction that gets returned.
+ */
+ error = __xfs_trans_commit(trans, true);
+ if (error)
+ return error;
+
+ /*
+ * Reserve space in the log for the next transaction.
+ * This also pushes items in the "AIL", the list of logged items,
+ * out to disk if they are taking up space at the tail of the log
+ * that we want to use. This requires that either nothing be locked
+ * across this call, or that anything that is locked be logged in
+ * the prior and the next transactions.
+ */
+ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ return xfs_trans_reserve(*tpp, &tres, 0, 0);
+}
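+
+/*
+ * Sketch: rolling callers must re-join any inode they want to keep locked,
+ * because log items are not carried across the roll. This is essentially
+ * the pattern xfs_trans_roll_inode() provides; the helper below is
+ * illustrative only.
+ */
+static inline int
+xfs_example_roll_keep_inode(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip)
+{
+ int error;
+
+ error = xfs_trans_roll(tpp);
+ if (error)
+ return error;
+
+ /* the new transaction did not inherit the inode; join it again */
+ xfs_trans_ijoin(*tpp, ip, 0);
+ return 0;
+}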
+
+/*
+ * Allocate a transaction, lock and join the inode to it, and reserve quota.
+ *
+ * The caller must ensure that the on-disk dquots attached to this inode have
+ * already been allocated and initialized. The caller is responsible for
+ * releasing ILOCK_EXCL if a new transaction is returned.
+ */
+int
+xfs_trans_alloc_inode(
+ struct xfs_inode *ip,
+ struct xfs_trans_res *resv,
+ unsigned int dblocks,
+ unsigned int rblocks,
+ bool force,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = ip->i_mount;
+ bool retried = false;
+ int error;
+
+retry:
+ error = xfs_trans_alloc(mp, resv, dblocks,
+ rblocks / mp->m_sb.sb_rextsize,
+ force ? XFS_TRANS_RESERVE : 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, force);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_blockgc_free_quota(ip, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error)
+ goto out_cancel;
+
+ *tpp = tp;
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Allocate an transaction in preparation for inode creation by reserving quota
+ * against the given dquots. Callers are not required to hold any inode locks.
+ */
+int
+xfs_trans_alloc_icreate(
+ struct xfs_mount *mp,
+ struct xfs_trans_res *resv,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ unsigned int dblocks,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *tp;
+ bool retried = false;
+ int error;
+
+retry:
+ error = xfs_trans_alloc(mp, resv, dblocks, 0, 0, &tp);
+ if (error)
+ return error;
+
+ error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, dblocks);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error) {
+ xfs_trans_cancel(tp);
+ return error;
+ }
+
+ *tpp = tp;
+ return 0;
+}
+
+/*
+ * Allocate a transaction, lock and join the inode to it, and reserve quota
+ * in preparation for inode attribute changes that include uid, gid, or prid
+ * changes.
+ *
+ * The caller must ensure that the on-disk dquots attached to this inode have
+ * already been allocated and initialized. The ILOCK will be dropped when the
+ * transaction is committed or cancelled.
+ */
+int
+xfs_trans_alloc_ichange(
+ struct xfs_inode *ip,
+ struct xfs_dquot *new_udqp,
+ struct xfs_dquot *new_gdqp,
+ struct xfs_dquot *new_pdqp,
+ bool force,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_dquot *udqp;
+ struct xfs_dquot *gdqp;
+ struct xfs_dquot *pdqp;
+ bool retried = false;
+ int error;
+
+retry:
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ /*
+ * For each quota type, skip quota reservations if the inode's dquots
+ * now match the ones that came from the caller, or the caller didn't
+ * pass one in. The inode's dquots can change if we drop the ILOCK to
+ * perform a blockgc scan, so we must preserve the caller's arguments.
+ */
+ udqp = (new_udqp != ip->i_udquot) ? new_udqp : NULL;
+ gdqp = (new_gdqp != ip->i_gdquot) ? new_gdqp : NULL;
+ pdqp = (new_pdqp != ip->i_pdquot) ? new_pdqp : NULL;
+ if (udqp || gdqp || pdqp) {
+ unsigned int qflags = XFS_QMOPT_RES_REGBLKS;
+
+ if (force)
+ qflags |= XFS_QMOPT_FORCE_RES;
+
+ /*
+ * Reserve enough quota to handle blocks on disk and reserved
+ * for a delayed allocation. We'll actually transfer the
+ * delalloc reservation between dquots at chown time, even
+ * though that part is only semi-transactional.
+ */
+ error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp,
+ pdqp, ip->i_nblocks + ip->i_delayed_blks,
+ 1, qflags);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error)
+ goto out_cancel;
+ }
+
+ *tpp = tp;
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+/*
+ * Allocate a transaction, lock and join the directory and child inodes to it,
+ * and reserve quota for a directory update. If there isn't sufficient space,
+ * @dblocks will be set to zero for a reservationless directory update and
+ * @nospace_error will be set to a negative errno describing the space
+ * constraint we hit.
+ *
+ * The caller must ensure that the on-disk dquots attached to this inode have
+ * already been allocated and initialized. The ILOCKs will be dropped when the
+ * transaction is committed or cancelled.
+ */
+int
+xfs_trans_alloc_dir(
+ struct xfs_inode *dp,
+ struct xfs_trans_res *resv,
+ struct xfs_inode *ip,
+ unsigned int *dblocks,
+ struct xfs_trans **tpp,
+ int *nospace_error)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int resblks;
+ bool retried = false;
+ int error;
+
+retry:
+ *nospace_error = 0;
+ resblks = *dblocks;
+ error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
+ if (error == -ENOSPC) {
+ *nospace_error = error;
+ resblks = 0;
+ error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
+ }
+ if (error)
+ return error;
+
+ xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
+
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ error = xfs_qm_dqattach_locked(dp, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ if (resblks == 0)
+ goto done;
+
+ error = xfs_trans_reserve_quota_nblks(tp, dp, resblks, 0, false);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ if (!retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_quota(dp, 0);
+ retried = true;
+ goto retry;
+ }
+
+ *nospace_error = error;
+ resblks = 0;
+ error = 0;
+ }
+ if (error)
+ goto out_cancel;
+
+done:
+ *tpp = tp;
+ *dblocks = resblks;
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
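+
+/*
+ * Sketch: a caller of xfs_trans_alloc_dir() proceeds with whatever block
+ * reservation it was given back and only surfaces @nospace_error if the
+ * update turns out to need space. Names below are illustrative.
+ */
+static inline int
+xfs_example_dir_update(
+ struct xfs_inode *dp,
+ struct xfs_inode *ip,
+ struct xfs_trans_res *resv,
+ unsigned int resblks)
+{
+ struct xfs_trans *tp;
+ int nospace_error = 0;
+ int error;
+
+ error = xfs_trans_alloc_dir(dp, resv, ip, &resblks, &tp, &nospace_error);
+ if (error)
+ return error;
+
+ /*
+ * ... attempt the directory change; if it needs blocks and
+ * resblks == 0, cancel the transaction and return nospace_error ...
+ */
+
+ return xfs_trans_commit(tp);
+}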
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
new file mode 100644
index 000000000..558197859
--- /dev/null
+++ b/fs/xfs/xfs_trans.h
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_TRANS_H__
+#define __XFS_TRANS_H__
+
+/* kernel only transaction subsystem defines */
+
+struct xlog;
+struct xfs_buf;
+struct xfs_buftarg;
+struct xfs_efd_log_item;
+struct xfs_efi_log_item;
+struct xfs_inode;
+struct xfs_item_ops;
+struct xfs_log_iovec;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_trans_res;
+struct xfs_dquot_acct;
+struct xfs_rud_log_item;
+struct xfs_rui_log_item;
+struct xfs_btree_cur;
+struct xfs_cui_log_item;
+struct xfs_cud_log_item;
+struct xfs_bui_log_item;
+struct xfs_bud_log_item;
+
+struct xfs_log_item {
+ struct list_head li_ail; /* AIL pointers */
+ struct list_head li_trans; /* transaction list */
+ xfs_lsn_t li_lsn; /* last on-disk lsn */
+ struct xlog *li_log;
+ struct xfs_ail *li_ailp; /* ptr to AIL */
+ uint li_type; /* item type */
+ unsigned long li_flags; /* misc flags */
+ struct xfs_buf *li_buf; /* real buffer pointer */
+ struct list_head li_bio_list; /* buffer item list */
+ const struct xfs_item_ops *li_ops; /* function list */
+
+ /* delayed logging */
+ struct list_head li_cil; /* CIL pointers */
+ struct xfs_log_vec *li_lv; /* active log vector */
+ struct xfs_log_vec *li_lv_shadow; /* standby vector */
+ xfs_csn_t li_seq; /* CIL commit seq */
+ uint32_t li_order_id; /* CIL commit order */
+};
+
+/*
+ * li_flags use the (set/test/clear)_bit atomic interfaces because updates can
+ * race with each other and we don't want to have to use the AIL lock to
+ * serialise all updates.
+ */
+#define XFS_LI_IN_AIL 0
+#define XFS_LI_ABORTED 1
+#define XFS_LI_FAILED 2
+#define XFS_LI_DIRTY 3
+#define XFS_LI_WHITEOUT 4
+
+#define XFS_LI_FLAGS \
+ { (1u << XFS_LI_IN_AIL), "IN_AIL" }, \
+ { (1u << XFS_LI_ABORTED), "ABORTED" }, \
+ { (1u << XFS_LI_FAILED), "FAILED" }, \
+ { (1u << XFS_LI_DIRTY), "DIRTY" }, \
+ { (1u << XFS_LI_WHITEOUT), "WHITEOUT" }
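+
+/*
+ * Sketch: because li_flags is an unsigned long manipulated with the atomic
+ * bitops, individual flags can be tested without taking the AIL lock, e.g.:
+ */
+static inline bool
+xfs_example_item_in_ail(
+ struct xfs_log_item *lip)
+{
+ return test_bit(XFS_LI_IN_AIL, &lip->li_flags);
+}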
+
+struct xfs_item_ops {
+ unsigned flags;
+ void (*iop_size)(struct xfs_log_item *, int *, int *);
+ void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *);
+ void (*iop_pin)(struct xfs_log_item *);
+ void (*iop_unpin)(struct xfs_log_item *, int remove);
+ uint64_t (*iop_sort)(struct xfs_log_item *lip);
+ int (*iop_precommit)(struct xfs_trans *tp, struct xfs_log_item *lip);
+ void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq);
+ xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
+ uint (*iop_push)(struct xfs_log_item *, struct list_head *);
+ void (*iop_release)(struct xfs_log_item *);
+ int (*iop_recover)(struct xfs_log_item *lip,
+ struct list_head *capture_list);
+ bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
+ struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
+ struct xfs_trans *tp);
+ struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done);
+};
+
+/*
+ * Log item ops flags
+ */
+/*
+ * Release the log item when the journal commits instead of inserting into the
+ * AIL for writeback tracking and/or log tail pinning.
+ */
+#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0)
+#define XFS_ITEM_INTENT (1 << 1)
+#define XFS_ITEM_INTENT_DONE (1 << 2)
+
+static inline bool
+xlog_item_is_intent(struct xfs_log_item *lip)
+{
+ return lip->li_ops->flags & XFS_ITEM_INTENT;
+}
+
+static inline bool
+xlog_item_is_intent_done(struct xfs_log_item *lip)
+{
+ return lip->li_ops->flags & XFS_ITEM_INTENT_DONE;
+}
+
+void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
+ int type, const struct xfs_item_ops *ops);
+
+/*
+ * Return values for the iop_push() routines.
+ */
+#define XFS_ITEM_SUCCESS 0
+#define XFS_ITEM_PINNED 1
+#define XFS_ITEM_LOCKED 2
+#define XFS_ITEM_FLUSHING 3
+
+/*
+ * This is the structure maintained for every active transaction.
+ */
+typedef struct xfs_trans {
+ unsigned int t_magic; /* magic number */
+ unsigned int t_log_res; /* amt of log space resvd */
+ unsigned int t_log_count; /* count for perm log res */
+ unsigned int t_blk_res; /* # of blocks resvd */
+ unsigned int t_blk_res_used; /* # of resvd blocks used */
+ unsigned int t_rtx_res; /* # of rt extents resvd */
+ unsigned int t_rtx_res_used; /* # of resvd rt extents used */
+ unsigned int t_flags; /* misc flags */
+ xfs_fsblock_t t_firstblock; /* first block allocated */
+ struct xlog_ticket *t_ticket; /* log mgr ticket */
+ struct xfs_mount *t_mountp; /* ptr to fs mount struct */
+ struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
+ int64_t t_icount_delta; /* superblock icount change */
+ int64_t t_ifree_delta; /* superblock ifree change */
+ int64_t t_fdblocks_delta; /* superblock fdblocks chg */
+ int64_t t_res_fdblocks_delta; /* on-disk only chg */
+ int64_t t_frextents_delta;/* superblock freextents chg*/
+ int64_t t_res_frextents_delta; /* on-disk only chg */
+ int64_t t_dblocks_delta;/* superblock dblocks change */
+ int64_t t_agcount_delta;/* superblock agcount change */
+ int64_t t_imaxpct_delta;/* superblock imaxpct change */
+ int64_t t_rextsize_delta;/* superblock rextsize chg */
+ int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */
+ int64_t t_rblocks_delta;/* superblock rblocks change */
+ int64_t t_rextents_delta;/* superblock rextents chg */
+ int64_t t_rextslog_delta;/* superblock rextslog chg */
+ struct list_head t_items; /* log item descriptors */
+ struct list_head t_busy; /* list of busy extents */
+ struct list_head t_dfops; /* deferred operations */
+ unsigned long t_pflags; /* saved process flags state */
+} xfs_trans_t;
+
+/*
+ * XFS transaction mechanism exported interfaces that are
+ * actually macros.
+ */
+#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
+
+/*
+ * XFS transaction mechanism exported interfaces.
+ */
+int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
+ uint blocks, uint rtextents, uint flags,
+ struct xfs_trans **tpp);
+int xfs_trans_alloc_empty(struct xfs_mount *mp,
+ struct xfs_trans **tpp);
+void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
+
+int xfs_trans_get_buf_map(struct xfs_trans *tp, struct xfs_buftarg *target,
+ struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags,
+ struct xfs_buf **bpp);
+
+static inline int
+xfs_trans_get_buf(
+ struct xfs_trans *tp,
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ int numblks,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+ return xfs_trans_get_buf_map(tp, target, &map, 1, flags, bpp);
+}
+
+int xfs_trans_read_buf_map(struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buftarg *target,
+ struct xfs_buf_map *map, int nmaps,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops);
+
+static inline int
+xfs_trans_read_buf(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ int numblks,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+ return xfs_trans_read_buf_map(mp, tp, target, &map, 1,
+ flags, bpp, ops);
+}
+
+struct xfs_buf *xfs_trans_getsb(struct xfs_trans *);
+
+void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
+bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
+void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
+void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
+void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
+ uint);
+void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
+bool xfs_trans_buf_is_dirty(struct xfs_buf *bp);
+void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
+
+int xfs_trans_commit(struct xfs_trans *);
+int xfs_trans_roll(struct xfs_trans **);
+int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *);
+void xfs_trans_cancel(xfs_trans_t *);
+int xfs_trans_ail_init(struct xfs_mount *);
+void xfs_trans_ail_destroy(struct xfs_mount *);
+
+void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
+ enum xfs_blft);
+void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
+ struct xfs_buf *src_bp);
+
+extern struct kmem_cache *xfs_trans_cache;
+
+static inline struct xfs_log_item *
+xfs_trans_item_relog(
+ struct xfs_log_item *lip,
+ struct xfs_trans *tp)
+{
+ return lip->li_ops->iop_relog(lip, tp);
+}
+
+struct xfs_dquot;
+
+int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv,
+ unsigned int dblocks, unsigned int rblocks, bool force,
+ struct xfs_trans **tpp);
+int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv,
+ struct xfs_dquot *udqp, struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp, unsigned int dblocks,
+ struct xfs_trans **tpp);
+int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
+ struct xfs_trans **tpp);
+int xfs_trans_alloc_dir(struct xfs_inode *dp, struct xfs_trans_res *resv,
+ struct xfs_inode *ip, unsigned int *dblocks,
+ struct xfs_trans **tpp, int *nospace_error);
+
+static inline void
+xfs_trans_set_context(
+ struct xfs_trans *tp)
+{
+ ASSERT(current->journal_info == NULL);
+ tp->t_pflags = memalloc_nofs_save();
+ current->journal_info = tp;
+}
+
+static inline void
+xfs_trans_clear_context(
+ struct xfs_trans *tp)
+{
+ if (current->journal_info == tp) {
+ memalloc_nofs_restore(tp->t_pflags);
+ current->journal_info = NULL;
+ }
+}
+
+static inline void
+xfs_trans_switch_context(
+ struct xfs_trans *old_tp,
+ struct xfs_trans *new_tp)
+{
+ ASSERT(current->journal_info == old_tp);
+ new_tp->t_pflags = old_tp->t_pflags;
+ old_tp->t_pflags = 0;
+ current->journal_info = new_tp;
+}
+
+#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
new file mode 100644
index 000000000..f51df7d94
--- /dev/null
+++ b/fs/xfs/xfs_trans_ail.c
@@ -0,0 +1,934 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2008 Dave Chinner
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_trace.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+
+#ifdef DEBUG
+/*
+ * Check that the list is sorted as it should be.
+ *
+ * Called with the ail lock held, but we don't want to assert fail with it
+ * held otherwise we'll lock everything up and won't be able to debug the
+ * cause. Hence we sample and check the state under the AIL lock and return if
+ * everything is fine, otherwise we drop the lock and run the ASSERT checks.
+ * Asserts may not be fatal, so pick the lock back up and continue onwards.
+ */
+STATIC void
+xfs_ail_check(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+ __must_hold(&ailp->ail_lock)
+{
+ struct xfs_log_item *prev_lip;
+ struct xfs_log_item *next_lip;
+ xfs_lsn_t prev_lsn = NULLCOMMITLSN;
+ xfs_lsn_t next_lsn = NULLCOMMITLSN;
+ xfs_lsn_t lsn;
+ bool in_ail;
+
+
+ if (list_empty(&ailp->ail_head))
+ return;
+
+ /*
+ * Sample then check the next and previous entries are valid.
+ */
+ in_ail = test_bit(XFS_LI_IN_AIL, &lip->li_flags);
+ prev_lip = list_entry(lip->li_ail.prev, struct xfs_log_item, li_ail);
+ if (&prev_lip->li_ail != &ailp->ail_head)
+ prev_lsn = prev_lip->li_lsn;
+ next_lip = list_entry(lip->li_ail.next, struct xfs_log_item, li_ail);
+ if (&next_lip->li_ail != &ailp->ail_head)
+ next_lsn = next_lip->li_lsn;
+ lsn = lip->li_lsn;
+
+ if (in_ail &&
+ (prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0) &&
+ (next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0))
+ return;
+
+ spin_unlock(&ailp->ail_lock);
+ ASSERT(in_ail);
+ ASSERT(prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0);
+ ASSERT(next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0);
+ spin_lock(&ailp->ail_lock);
+}
+#else /* !DEBUG */
+#define xfs_ail_check(a,l)
+#endif /* DEBUG */
+
+/*
+ * Return a pointer to the last item in the AIL. If the AIL is empty, then
+ * return NULL.
+ */
+static struct xfs_log_item *
+xfs_ail_max(
+ struct xfs_ail *ailp)
+{
+ if (list_empty(&ailp->ail_head))
+ return NULL;
+
+ return list_entry(ailp->ail_head.prev, struct xfs_log_item, li_ail);
+}
+
+/*
+ * Return a pointer to the item which follows the given item in the AIL. If
+ * the given item is the last item in the list, then return NULL.
+ */
+static struct xfs_log_item *
+xfs_ail_next(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ if (lip->li_ail.next == &ailp->ail_head)
+ return NULL;
+
+ return list_first_entry(&lip->li_ail, struct xfs_log_item, li_ail);
+}
+
+/*
+ * This is called by the log manager code to determine the LSN of the tail of
+ * the log. This is exactly the LSN of the first item in the AIL. If the AIL
+ * is empty, then this function returns 0.
+ *
+ * We need the AIL lock in order to get a coherent read of the lsn of the first
+ * item in the AIL.
+ */
+static xfs_lsn_t
+__xfs_ail_min_lsn(
+ struct xfs_ail *ailp)
+{
+ struct xfs_log_item *lip = xfs_ail_min(ailp);
+
+ if (lip)
+ return lip->li_lsn;
+ return 0;
+}
+
+xfs_lsn_t
+xfs_ail_min_lsn(
+ struct xfs_ail *ailp)
+{
+ xfs_lsn_t lsn;
+
+ spin_lock(&ailp->ail_lock);
+ lsn = __xfs_ail_min_lsn(ailp);
+ spin_unlock(&ailp->ail_lock);
+
+ return lsn;
+}
+
+/*
+ * Return the maximum lsn held in the AIL, or zero if the AIL is empty.
+ */
+static xfs_lsn_t
+xfs_ail_max_lsn(
+ struct xfs_ail *ailp)
+{
+ xfs_lsn_t lsn = 0;
+ struct xfs_log_item *lip;
+
+ spin_lock(&ailp->ail_lock);
+ lip = xfs_ail_max(ailp);
+ if (lip)
+ lsn = lip->li_lsn;
+ spin_unlock(&ailp->ail_lock);
+
+ return lsn;
+}
+
+/*
+ * The cursor keeps track of where our current traversal is up to by tracking
+ * the next item in the list for us. However, for this to be safe, removing an
+ * object from the AIL needs to invalidate any cursor that points to it. Hence
+ * the traversal cursor needs to be linked to the struct xfs_ail so that
+ * deletion can search all the active cursors for invalidation.
+ */
+STATIC void
+xfs_trans_ail_cursor_init(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur)
+{
+ cur->item = NULL;
+ list_add_tail(&cur->list, &ailp->ail_cursors);
+}
+
+/*
+ * Get the next item in the traversal and advance the cursor. If the cursor
+ * was invalidated (indicated by a lip of 1), restart the traversal.
+ */
+struct xfs_log_item *
+xfs_trans_ail_cursor_next(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur)
+{
+ struct xfs_log_item *lip = cur->item;
+
+ if ((uintptr_t)lip & 1)
+ lip = xfs_ail_min(ailp);
+ if (lip)
+ cur->item = xfs_ail_next(ailp, lip);
+ return lip;
+}
+
+/*
+ * When the traversal is complete, we need to remove the cursor from the list
+ * of traversing cursors.
+ */
+void
+xfs_trans_ail_cursor_done(
+ struct xfs_ail_cursor *cur)
+{
+ cur->item = NULL;
+ list_del_init(&cur->list);
+}
+
+/*
+ * Invalidate any cursor that is pointing to this item. This is called when an
+ * item is removed from the AIL. Any cursor pointing to this object is now
+ * invalid and the traversal needs to be terminated so it doesn't reference a
+ * freed object. We set the low bit of the cursor item pointer so we can
+ * distinguish between an invalidation and the end of the list when getting the
+ * next item from the cursor.
+ */
+STATIC void
+xfs_trans_ail_cursor_clear(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_ail_cursor *cur;
+
+ list_for_each_entry(cur, &ailp->ail_cursors, list) {
+ if (cur->item == lip)
+ cur->item = (struct xfs_log_item *)
+ ((uintptr_t)cur->item | 1);
+ }
+}
+
+/*
+ * Find the first item in the AIL with the given @lsn by searching in ascending
+ * LSN order and initialise the cursor to point to the next item for an
+ * ascending traversal. Pass a @lsn of zero to initialise the cursor to the
+ * first item in the AIL. Returns NULL if the list is empty.
+ */
+struct xfs_log_item *
+xfs_trans_ail_cursor_first(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ xfs_lsn_t lsn)
+{
+ struct xfs_log_item *lip;
+
+ xfs_trans_ail_cursor_init(ailp, cur);
+
+ if (lsn == 0) {
+ lip = xfs_ail_min(ailp);
+ goto out;
+ }
+
+ list_for_each_entry(lip, &ailp->ail_head, li_ail) {
+ if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
+ goto out;
+ }
+ return NULL;
+
+out:
+ if (lip)
+ cur->item = xfs_ail_next(ailp, lip);
+ return lip;
+}
+
+static struct xfs_log_item *
+__xfs_trans_ail_cursor_last(
+ struct xfs_ail *ailp,
+ xfs_lsn_t lsn)
+{
+ struct xfs_log_item *lip;
+
+ list_for_each_entry_reverse(lip, &ailp->ail_head, li_ail) {
+ if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0)
+ return lip;
+ }
+ return NULL;
+}
+
+/*
+ * Find the last item in the AIL with the given @lsn by searching in descending
+ * LSN order and initialise the cursor to point to that item. If there is no
+ * item with the value of @lsn, then it sets the cursor to the last item with an
+ * LSN lower than @lsn. Returns NULL if the list is empty.
+ */
+struct xfs_log_item *
+xfs_trans_ail_cursor_last(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ xfs_lsn_t lsn)
+{
+ xfs_trans_ail_cursor_init(ailp, cur);
+ cur->item = __xfs_trans_ail_cursor_last(ailp, lsn);
+ return cur->item;
+}
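+
+/*
+ * Typical cursor usage, as in xfsaild_push() below: initialise the cursor at
+ * a starting LSN, walk forwards until done, then tear the cursor down. A
+ * rough sketch (start_lsn is whatever LSN the caller wants to resume from):
+ *
+ *	struct xfs_ail_cursor	cur;
+ *	struct xfs_log_item	*lip;
+ *
+ *	spin_lock(&ailp->ail_lock);
+ *	lip = xfs_trans_ail_cursor_first(ailp, &cur, start_lsn);
+ *	while (lip) {
+ *		... process lip, possibly dropping ail_lock ...
+ *		lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ *	}
+ *	xfs_trans_ail_cursor_done(&cur);
+ *	spin_unlock(&ailp->ail_lock);
+ */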
+
+/*
+ * Splice the log item list into the AIL at the given LSN. We splice to the
+ * tail of the given LSN to maintain insert order for push traversals. The
+ * cursor is optional, allowing repeated updates to the same LSN to avoid
+ * repeated traversals. This should not be called with an empty list.
+ */
+static void
+xfs_ail_splice(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ struct list_head *list,
+ xfs_lsn_t lsn)
+{
+ struct xfs_log_item *lip;
+
+ ASSERT(!list_empty(list));
+
+ /*
+ * Use the cursor to determine the insertion point if one is
+ * provided. If not, or if the one we got is not valid,
+ * find the place in the AIL where the items belong.
+ */
+ lip = cur ? cur->item : NULL;
+ if (!lip || (uintptr_t)lip & 1)
+ lip = __xfs_trans_ail_cursor_last(ailp, lsn);
+
+ /*
+ * If a cursor is provided, we know we're processing the AIL
+ * in lsn order, and future items to be spliced in will
+ * follow the last one being inserted now. Update the
+ * cursor to point to that last item, now while we have a
+ * reliable pointer to it.
+ */
+ if (cur)
+ cur->item = list_entry(list->prev, struct xfs_log_item, li_ail);
+
+ /*
+ * Finally perform the splice. Unless the AIL was empty,
+ * lip points to the item in the AIL _after_ which the new
+ * items should go. If lip is null the AIL was empty, so
+ * the new items go at the head of the AIL.
+ */
+ if (lip)
+ list_splice(list, &lip->li_ail);
+ else
+ list_splice(list, &ailp->ail_head);
+}
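+
+/*
+ * For example, if the AIL holds items at LSNs 10, 20, 20 and 30, splicing a
+ * list in at LSN 20 inserts it after the second item at LSN 20 and before
+ * the item at LSN 30. Repeated splices at the same LSN therefore preserve
+ * insertion order for the push traversal.
+ */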
+
+/*
+ * Delete the given item from the AIL and invalidate any cursors that point
+ * to it.
+ */
+static void
+xfs_ail_delete(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ xfs_ail_check(ailp, lip);
+ list_del(&lip->li_ail);
+ xfs_trans_ail_cursor_clear(ailp, lip);
+}
+
+/*
+ * Requeue a failed buffer for writeback.
+ *
+ * We clear the log item failed state here as well, but we have to be careful
+ * about reference counts because the only active reference counts on the buffer
+ * may be the failed log items. Hence if we clear the log item failed state
+ * before queuing the buffer for IO we can release all active references to
+ * the buffer and free it, leading to use after free problems in
+ * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which
+ * order we process them in - the buffer is locked, and we own the buffer list
+ * so nothing on them is going to change while we are performing this action.
+ *
+ * Hence we can safely queue the buffer for IO before we clear the failed log
+ * item state, therefore always having an active reference to the buffer and
+ * avoiding the transient zero-reference state that leads to use-after-free.
+ */
+static inline int
+xfsaild_resubmit_item(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ struct xfs_buf *bp = lip->li_buf;
+
+ if (!xfs_buf_trylock(bp))
+ return XFS_ITEM_LOCKED;
+
+ if (!xfs_buf_delwri_queue(bp, buffer_list)) {
+ xfs_buf_unlock(bp);
+ return XFS_ITEM_FLUSHING;
+ }
+
+ /* protected by ail_lock */
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
+ if (bp->b_flags & _XBF_INODES)
+ clear_bit(XFS_LI_FAILED, &lip->li_flags);
+ else
+ xfs_clear_li_failed(lip);
+ }
+
+ xfs_buf_unlock(bp);
+ return XFS_ITEM_SUCCESS;
+}
+
+static inline uint
+xfsaild_push_item(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ /*
+ * If log item pinning is enabled, skip the push and track the item as
+ * pinned. This can help induce head-behind-tail conditions.
+ */
+ if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
+ return XFS_ITEM_PINNED;
+
+ /*
+ * Consider the item pinned if a push callback is not defined so the
+ * caller will force the log. This should only happen for intent items
+ * as they are unpinned once the associated done item is committed to
+ * the on-disk log.
+ */
+ if (!lip->li_ops->iop_push)
+ return XFS_ITEM_PINNED;
+ if (test_bit(XFS_LI_FAILED, &lip->li_flags))
+ return xfsaild_resubmit_item(lip, &ailp->ail_buf_list);
+ return lip->li_ops->iop_push(lip, &ailp->ail_buf_list);
+}
+
+static long
+xfsaild_push(
+ struct xfs_ail *ailp)
+{
+ struct xfs_mount *mp = ailp->ail_log->l_mp;
+ struct xfs_ail_cursor cur;
+ struct xfs_log_item *lip;
+ xfs_lsn_t lsn;
+ xfs_lsn_t target;
+ long tout;
+ int stuck = 0;
+ int flushing = 0;
+ int count = 0;
+
+ /*
+ * If we encountered pinned items or did not finish writing out all
+ * buffers the last time we ran, force a background CIL push to get the
+ * items unpinned in the near future. We do not wait on the CIL push as
+ * that could stall us for seconds if there is enough background IO
+ * load. Stalling for that long when the tail of the log is pinned and
+ * needs flushing will hard stop the transaction subsystem when log
+ * space runs out.
+ */
+ if (ailp->ail_log_flush && ailp->ail_last_pushed_lsn == 0 &&
+ (!list_empty_careful(&ailp->ail_buf_list) ||
+ xfs_ail_min_lsn(ailp))) {
+ ailp->ail_log_flush = 0;
+
+ XFS_STATS_INC(mp, xs_push_ail_flush);
+ xlog_cil_flush(ailp->ail_log);
+ }
+
+ spin_lock(&ailp->ail_lock);
+
+ /*
+ * If we have a sync push waiter, we always have to push till the AIL is
+ * empty. Update the target to point to the end of the AIL so that we capture
+ * updates that occur after the sync push waiter has gone to sleep.
+ */
+ if (waitqueue_active(&ailp->ail_empty)) {
+ lip = xfs_ail_max(ailp);
+ if (lip)
+ target = lip->li_lsn;
+ } else {
+ /* barrier matches the ail_target update in xfs_ail_push() */
+ smp_rmb();
+ target = ailp->ail_target;
+ ailp->ail_target_prev = target;
+ }
+
+ /* we're done if the AIL is empty or our push has reached the end */
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn);
+ if (!lip)
+ goto out_done;
+
+ XFS_STATS_INC(mp, xs_push_ail);
+
+ lsn = lip->li_lsn;
+ while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
+ int lock_result;
+
+ /*
+ * Note that iop_push may unlock and reacquire the AIL lock. We
+ * rely on the AIL cursor implementation to be able to deal with
+ * the dropped lock.
+ */
+ lock_result = xfsaild_push_item(ailp, lip);
+ switch (lock_result) {
+ case XFS_ITEM_SUCCESS:
+ XFS_STATS_INC(mp, xs_push_ail_success);
+ trace_xfs_ail_push(lip);
+
+ ailp->ail_last_pushed_lsn = lsn;
+ break;
+
+ case XFS_ITEM_FLUSHING:
+ /*
+ * The item or its backing buffer is already being
+ * flushed. The typical reason for that is that an
+ * inode buffer is locked because we already pushed the
+ * updates to it as part of inode clustering.
+ *
+ * We do not want to stop flushing just because lots
+ * of items are already being flushed, but we need to
+ * re-try the flushing relatively soon if most of the
+ * AIL is being flushed.
+ */
+ XFS_STATS_INC(mp, xs_push_ail_flushing);
+ trace_xfs_ail_flushing(lip);
+
+ flushing++;
+ ailp->ail_last_pushed_lsn = lsn;
+ break;
+
+ case XFS_ITEM_PINNED:
+ XFS_STATS_INC(mp, xs_push_ail_pinned);
+ trace_xfs_ail_pinned(lip);
+
+ stuck++;
+ ailp->ail_log_flush++;
+ break;
+ case XFS_ITEM_LOCKED:
+ XFS_STATS_INC(mp, xs_push_ail_locked);
+ trace_xfs_ail_locked(lip);
+
+ stuck++;
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ count++;
+
+ /*
+ * Are there too many items we can't do anything with?
+ *
+ * If we are skipping too many items because we can't flush
+ * them or they are already being flushed, we back off and
+ * give them time to complete whatever operation is being
+ * done. i.e. remove pressure from the AIL while we can't make
+ * progress so traversals don't slow down further inserts and
+ * removals to/from the AIL.
+ *
+ * The value of 100 is an arbitrary magic number based on
+ * observation.
+ */
+ if (stuck > 100)
+ break;
+
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ if (lip == NULL)
+ break;
+ lsn = lip->li_lsn;
+ }
+
+out_done:
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->ail_lock);
+
+ if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list))
+ ailp->ail_log_flush++;
+
+ if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
+ /*
+ * We reached the target or the AIL is empty, so wait a bit
+ * longer for I/O to complete and remove pushed items from the
+ * AIL before we start the next scan from the start of the AIL.
+ */
+ tout = 50;
+ ailp->ail_last_pushed_lsn = 0;
+ } else if (((stuck + flushing) * 100) / count > 90) {
+ /*
+ * Either there is a lot of contention on the AIL or we are
+ * stuck due to operations in progress. "Stuck" in this case
+ * is defined as >90% of the items we tried to push were stuck.
+ *
+ * Back off a bit more to allow some I/O to complete before
+ * restarting from the start of the AIL. This prevents us from
+ * spinning on the same items, and if they are pinned it allows
+ * the restart to issue a log force to unpin the stuck items.
+ */
+ tout = 20;
+ ailp->ail_last_pushed_lsn = 0;
+ } else {
+ /*
+ * Assume we have more work to do in a short while.
+ */
+ tout = 10;
+ }
+
+ return tout;
+}
+
+static int
+xfsaild(
+ void *data)
+{
+ struct xfs_ail *ailp = data;
+ long tout = 0; /* milliseconds */
+ unsigned int noreclaim_flag;
+
+ noreclaim_flag = memalloc_noreclaim_save();
+ set_freezable();
+
+ while (1) {
+ if (tout && tout <= 20)
+ set_current_state(TASK_KILLABLE|TASK_FREEZABLE);
+ else
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+
+ /*
+ * Check kthread_should_stop() after we set the task state to
+ * guarantee that we either see the stop bit and exit or the
+ * task state is reset to runnable such that it's not scheduled
+ * out indefinitely and detects the stop bit at next iteration.
+ * A memory barrier is included in the above task state set to
+ * serialize against kthread_stop().
+ */
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * The caller forces out the AIL before stopping the
+ * thread in the common case, which means the delwri
+ * queue is drained. In the shutdown case, the queue may
+ * still hold relogged buffers that haven't been
+ * submitted because they were pinned since added to the
+ * queue.
+ *
+ * Log I/O error processing stales the underlying buffer
+ * and clears the delwri state, expecting the buf to be
+ * removed on the next submission attempt. That won't
+ * happen if we're shutting down, so this is the last
+ * opportunity to release such buffers from the queue.
+ */
+ ASSERT(list_empty(&ailp->ail_buf_list) ||
+ xlog_is_shutdown(ailp->ail_log));
+ xfs_buf_delwri_cancel(&ailp->ail_buf_list);
+ break;
+ }
+
+ spin_lock(&ailp->ail_lock);
+
+ /*
+ * Idle if the AIL is empty and we are not racing with a target
+ * update. We check the AIL after we set the task to a sleep
+ * state to guarantee that we either catch an ail_target update
+ * or that a wake_up resets the state to TASK_RUNNING.
+ * Otherwise, we run the risk of sleeping indefinitely.
+ *
+ * The barrier matches the ail_target update in xfs_ail_push().
+ */
+ smp_rmb();
+ if (!xfs_ail_min(ailp) &&
+ ailp->ail_target == ailp->ail_target_prev &&
+ list_empty(&ailp->ail_buf_list)) {
+ spin_unlock(&ailp->ail_lock);
+ schedule();
+ tout = 0;
+ continue;
+ }
+ spin_unlock(&ailp->ail_lock);
+
+ if (tout)
+ schedule_timeout(msecs_to_jiffies(tout));
+
+ __set_current_state(TASK_RUNNING);
+
+ try_to_freeze();
+
+ tout = xfsaild_push(ailp);
+ }
+
+ memalloc_noreclaim_restore(noreclaim_flag);
+ return 0;
+}
+
+/*
+ * This routine is called to move the tail of the AIL forward. It does this by
+ * trying to flush items in the AIL whose lsns are below the given
+ * threshold_lsn.
+ *
+ * The push is run asynchronously by the xfsaild thread, which means the caller
+ * needs to handle waiting on the async flush for space to become available.
+ * We don't want to interrupt a push that is already in progress, so we simply
+ * update the push target and wake the thread.
+ *
+ * We do this unlocked - we only need to know whether there is anything in the
+ * AIL at the time we are called. We don't need to access the contents of
+ * any of the objects, so the lock is not needed.
+ */
+void
+xfs_ail_push(
+ struct xfs_ail *ailp,
+ xfs_lsn_t threshold_lsn)
+{
+ struct xfs_log_item *lip;
+
+ lip = xfs_ail_min(ailp);
+ if (!lip || xlog_is_shutdown(ailp->ail_log) ||
+ XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0)
+ return;
+
+ /*
+ * Ensure that the new target is noticed by the push code before it goes back
+ * to sleep; this pairs with the smp_rmb() in xfsaild_push() and xfsaild().
+ */
+ smp_wmb();
+ xfs_trans_ail_copy_lsn(ailp, &ailp->ail_target, &threshold_lsn);
+ smp_wmb();
+
+ wake_up_process(ailp->ail_task);
+}
+
+/*
+ * Push out all items in the AIL immediately
+ */
+void
+xfs_ail_push_all(
+ struct xfs_ail *ailp)
+{
+ xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp);
+
+ if (threshold_lsn)
+ xfs_ail_push(ailp, threshold_lsn);
+}
+
+/*
+ * Push out all items in the AIL immediately and wait until the AIL is empty.
+ */
+void
+xfs_ail_push_all_sync(
+ struct xfs_ail *ailp)
+{
+ DEFINE_WAIT(wait);
+
+ spin_lock(&ailp->ail_lock);
+ while (xfs_ail_max(ailp) != NULL) {
+ prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE);
+ wake_up_process(ailp->ail_task);
+ spin_unlock(&ailp->ail_lock);
+ schedule();
+ spin_lock(&ailp->ail_lock);
+ }
+ spin_unlock(&ailp->ail_lock);
+
+ finish_wait(&ailp->ail_empty, &wait);
+}
+
+void
+xfs_ail_update_finish(
+ struct xfs_ail *ailp,
+ xfs_lsn_t old_lsn) __releases(ailp->ail_lock)
+{
+ struct xlog *log = ailp->ail_log;
+
+ /* if the tail lsn hasn't changed, don't do updates or wakeups. */
+ if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) {
+ spin_unlock(&ailp->ail_lock);
+ return;
+ }
+
+ if (!xlog_is_shutdown(log))
+ xlog_assign_tail_lsn_locked(log->l_mp);
+
+ if (list_empty(&ailp->ail_head))
+ wake_up_all(&ailp->ail_empty);
+ spin_unlock(&ailp->ail_lock);
+ xfs_log_space_wake(log->l_mp);
+}
+
+/*
+ * xfs_trans_ail_update - bulk AIL insertion operation.
+ *
+ * @xfs_trans_ail_update takes an array of log items that all need to be
+ * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
+ * be added. Otherwise, it will be repositioned by removing it and re-adding
+ * it to the AIL. If we move the first item in the AIL, update the log tail to
+ * match the new minimum LSN in the AIL.
+ *
+ * This function executes the update operations on all the items in the array
+ * under a single hold of the AIL lock. Once we have the AIL lock, we check
+ * each log item LSN to confirm it actually needs to be moved forward in the
+ * AIL before repositioning it.
+ *
+ * To optimise the insert operation, we delete all the items from the AIL in
+ * the first pass, moving them into a temporary list, then splice the temporary
+ * list into the correct position in the AIL. This avoids needing to do an
+ * insert operation on every item.
+ *
+ * This function must be called with the AIL lock held. The lock is dropped
+ * before returning.
+ */
+void
+xfs_trans_ail_update_bulk(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ struct xfs_log_item **log_items,
+ int nr_items,
+ xfs_lsn_t lsn) __releases(ailp->ail_lock)
+{
+ struct xfs_log_item *mlip;
+ xfs_lsn_t tail_lsn = 0;
+ int i;
+ LIST_HEAD(tmp);
+
+ ASSERT(nr_items > 0); /* Not required, but true. */
+ mlip = xfs_ail_min(ailp);
+
+ for (i = 0; i < nr_items; i++) {
+ struct xfs_log_item *lip = log_items[i];
+ if (test_and_set_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
+ /* check if we really need to move the item */
+ if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
+ continue;
+
+ trace_xfs_ail_move(lip, lip->li_lsn, lsn);
+ if (mlip == lip && !tail_lsn)
+ tail_lsn = lip->li_lsn;
+
+ xfs_ail_delete(ailp, lip);
+ } else {
+ trace_xfs_ail_insert(lip, 0, lsn);
+ }
+ lip->li_lsn = lsn;
+ list_add(&lip->li_ail, &tmp);
+ }
+
+ if (!list_empty(&tmp))
+ xfs_ail_splice(ailp, cur, &tmp, lsn);
+
+ xfs_ail_update_finish(ailp, tail_lsn);
+}
+
+/* Insert a log item into the AIL. */
+void
+xfs_trans_ail_insert(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ spin_lock(&ailp->ail_lock);
+ xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
+}
+
+/*
+ * Delete one log item from the AIL.
+ *
+ * If this item was at the tail of the AIL, return the LSN of the log item so
+ * that we can use it to check if the LSN of the tail of the log has moved
+ * when finishing up the AIL delete process in xfs_ail_update_finish().
+ */
+xfs_lsn_t
+xfs_ail_delete_one(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_log_item *mlip = xfs_ail_min(ailp);
+ xfs_lsn_t lsn = lip->li_lsn;
+
+ trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
+ xfs_ail_delete(ailp, lip);
+ clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
+ lip->li_lsn = 0;
+
+ if (mlip == lip)
+ return lsn;
+ return 0;
+}
+
+void
+xfs_trans_ail_delete(
+ struct xfs_log_item *lip,
+ int shutdown_type)
+{
+ struct xfs_ail *ailp = lip->li_ailp;
+ struct xlog *log = ailp->ail_log;
+ xfs_lsn_t tail_lsn;
+
+ spin_lock(&ailp->ail_lock);
+ if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
+ spin_unlock(&ailp->ail_lock);
+ if (shutdown_type && !xlog_is_shutdown(log)) {
+ xfs_alert_tag(log->l_mp, XFS_PTAG_AILDELETE,
+ "%s: attempting to delete a log item that is not in the AIL",
+ __func__);
+ xlog_force_shutdown(log, shutdown_type);
+ }
+ return;
+ }
+
+ /* xfs_ail_update_finish() drops the AIL lock */
+ xfs_clear_li_failed(lip);
+ tail_lsn = xfs_ail_delete_one(ailp, lip);
+ xfs_ail_update_finish(ailp, tail_lsn);
+}
+
+int
+xfs_trans_ail_init(
+ xfs_mount_t *mp)
+{
+ struct xfs_ail *ailp;
+
+ ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
+ if (!ailp)
+ return -ENOMEM;
+
+ ailp->ail_log = mp->m_log;
+ INIT_LIST_HEAD(&ailp->ail_head);
+ INIT_LIST_HEAD(&ailp->ail_cursors);
+ spin_lock_init(&ailp->ail_lock);
+ INIT_LIST_HEAD(&ailp->ail_buf_list);
+ init_waitqueue_head(&ailp->ail_empty);
+
+ ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
+ mp->m_super->s_id);
+ if (IS_ERR(ailp->ail_task))
+ goto out_free_ailp;
+
+ mp->m_ail = ailp;
+ return 0;
+
+out_free_ailp:
+ kmem_free(ailp);
+ return -ENOMEM;
+}
+
+void
+xfs_trans_ail_destroy(
+ xfs_mount_t *mp)
+{
+ struct xfs_ail *ailp = mp->m_ail;
+
+ kthread_stop(ailp->ail_task);
+ kmem_free(ailp);
+}
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
new file mode 100644
index 000000000..6549e50d8
--- /dev/null
+++ b/fs/xfs/xfs_trans_buf.c
@@ -0,0 +1,768 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_trace.h"
+
+/*
+ * Check to see if a buffer matching the given parameters is already
+ * a part of the given transaction.
+ */
+STATIC struct xfs_buf *
+xfs_trans_buf_item_match(
+ struct xfs_trans *tp,
+ struct xfs_buftarg *target,
+ struct xfs_buf_map *map,
+ int nmaps)
+{
+ struct xfs_log_item *lip;
+ struct xfs_buf_log_item *blip;
+ int len = 0;
+ int i;
+
+ for (i = 0; i < nmaps; i++)
+ len += map[i].bm_len;
+
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
+ blip = (struct xfs_buf_log_item *)lip;
+ if (blip->bli_item.li_type == XFS_LI_BUF &&
+ blip->bli_buf->b_target == target &&
+ xfs_buf_daddr(blip->bli_buf) == map[0].bm_bn &&
+ blip->bli_buf->b_length == len) {
+ ASSERT(blip->bli_buf->b_map_count == nmaps);
+ return blip->bli_buf;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Add the locked buffer to the transaction.
+ *
+ * The buffer must be locked, and it cannot be associated with any
+ * transaction.
+ *
+ * If the buffer does not yet have a buf log item associated with it,
+ * then allocate one for it. Then add the buf item to the transaction.
+ */
+STATIC void
+_xfs_trans_bjoin(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ int reset_recur)
+{
+ struct xfs_buf_log_item *bip;
+
+ ASSERT(bp->b_transp == NULL);
+
+ /*
+ * The xfs_buf_log_item pointer is stored in b_log_item. If
+ * it doesn't have one yet, then allocate one and initialize it.
+ * The checks to see if one is there are in xfs_buf_item_init().
+ */
+ xfs_buf_item_init(bp, tp->t_mountp);
+ bip = bp->b_log_item;
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
+ ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+ if (reset_recur)
+ bip->bli_recur = 0;
+
+ /*
+ * Take a reference for this transaction on the buf item.
+ */
+ atomic_inc(&bip->bli_refcount);
+
+ /*
+ * Attach the item to the transaction so we can find it in
+ * xfs_trans_get_buf() and friends.
+ */
+ xfs_trans_add_item(tp, &bip->bli_item);
+ bp->b_transp = tp;
+
+}
+
+void
+xfs_trans_bjoin(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ _xfs_trans_bjoin(tp, bp, 0);
+ trace_xfs_trans_bjoin(bp->b_log_item);
+}
+
+/*
+ * Get and lock the buffer for the caller if it is not already
+ * locked within the given transaction. If it is already locked
+ * within the transaction, just increment its lock recursion count
+ * and return a pointer to it.
+ *
+ * If the transaction pointer is NULL, make this just a normal
+ * get_buf() call.
+ */
+int
+xfs_trans_get_buf_map(
+ struct xfs_trans *tp,
+ struct xfs_buftarg *target,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
+{
+ struct xfs_buf *bp;
+ struct xfs_buf_log_item *bip;
+ int error;
+
+ *bpp = NULL;
+ if (!tp)
+ return xfs_buf_get_map(target, map, nmaps, flags, bpp);
+
+ /*
+ * If we find the buffer in the cache with this transaction
+ * pointer in its b_fsprivate2 field, then we know we already
+ * have it locked. In this case we just increment the lock
+ * recursion count and return the buffer to the caller.
+ */
+ bp = xfs_trans_buf_item_match(tp, target, map, nmaps);
+ if (bp != NULL) {
+ ASSERT(xfs_buf_islocked(bp));
+ if (xfs_is_shutdown(tp->t_mountp)) {
+ xfs_buf_stale(bp);
+ bp->b_flags |= XBF_DONE;
+ }
+
+ ASSERT(bp->b_transp == tp);
+ bip = bp->b_log_item;
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ bip->bli_recur++;
+ trace_xfs_trans_get_buf_recur(bip);
+ *bpp = bp;
+ return 0;
+ }
+
+ error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
+ if (error)
+ return error;
+
+ ASSERT(!bp->b_error);
+
+ _xfs_trans_bjoin(tp, bp, 1);
+ trace_xfs_trans_get_buf(bp->b_log_item);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Get and lock the superblock buffer for the given transaction.
+ */
+struct xfs_buf *
+xfs_trans_getsb(
+ struct xfs_trans *tp)
+{
+ struct xfs_buf *bp = tp->t_mountp->m_sb_bp;
+
+ /*
+ * Just increment the lock recursion count if the buffer is already
+ * attached to this transaction.
+ */
+ if (bp->b_transp == tp) {
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ bip->bli_recur++;
+
+ trace_xfs_trans_getsb_recur(bip);
+ } else {
+ xfs_buf_lock(bp);
+ xfs_buf_hold(bp);
+ _xfs_trans_bjoin(tp, bp, 1);
+
+ trace_xfs_trans_getsb(bp->b_log_item);
+ }
+
+ return bp;
+}
+
+/*
+ * Get and lock the buffer for the caller if it is not already
+ * locked within the given transaction. If it has not yet been
+ * read in, read it from disk. If it is already locked
+ * within the transaction and already read in, just increment its
+ * lock recursion count and return a pointer to it.
+ *
+ * If the transaction pointer is NULL, make this just a normal
+ * read_buf() call.
+ */
+int
+xfs_trans_read_buf_map(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buftarg *target,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp = NULL;
+ struct xfs_buf_log_item *bip;
+ int error;
+
+ *bpp = NULL;
+ /*
+ * If we find the buffer in the cache with this transaction
+ * pointer in its b_transp field, then we know we already
+ * have it locked. If it is already read in we just increment
+ * the lock recursion count and return the buffer to the caller.
+ * If the buffer is not yet read in, then we read it in, increment
+ * the lock recursion count, and return it to the caller.
+ */
+ if (tp)
+ bp = xfs_trans_buf_item_match(tp, target, map, nmaps);
+ if (bp) {
+ ASSERT(xfs_buf_islocked(bp));
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bp->b_log_item != NULL);
+ ASSERT(!bp->b_error);
+ ASSERT(bp->b_flags & XBF_DONE);
+
+ /*
+ * We never locked this buf ourselves, so we shouldn't
+ * brelse it either. Just get out.
+ */
+ if (xfs_is_shutdown(mp)) {
+ trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
+ return -EIO;
+ }
+
+ /*
+ * Check if the caller is trying to read a buffer that is
+ * already attached to the transaction yet has no buffer ops
+ * assigned. Ops are usually attached when the buffer is
+ * attached to the transaction, or by the read caller in
+ * special circumstances. That didn't happen, which is not
+ * how this is supposed to go.
+ *
+ * If the buffer passes verification we'll let this go, but if
+ * not we have to shut down. Let the transaction cleanup code
+ * release this buffer when it kills the transaction.
+ */
+ ASSERT(bp->b_ops != NULL);
+ error = xfs_buf_reverify(bp, ops);
+ if (error) {
+ xfs_buf_ioerror_alert(bp, __return_address);
+
+ if (tp->t_flags & XFS_TRANS_DIRTY)
+ xfs_force_shutdown(tp->t_mountp,
+ SHUTDOWN_META_IO_ERROR);
+
+ /* bad CRC means corrupted metadata */
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
+ return error;
+ }
+
+ bip = bp->b_log_item;
+ bip->bli_recur++;
+
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ trace_xfs_trans_read_buf_recur(bip);
+ ASSERT(bp->b_ops != NULL || ops == NULL);
+ *bpp = bp;
+ return 0;
+ }
+
+ error = xfs_buf_read_map(target, map, nmaps, flags, &bp, ops,
+ __return_address);
+ switch (error) {
+ case 0:
+ break;
+ default:
+ if (tp && (tp->t_flags & XFS_TRANS_DIRTY))
+ xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
+ fallthrough;
+ case -ENOMEM:
+ case -EAGAIN:
+ return error;
+ }
+
+ if (xfs_is_shutdown(mp)) {
+ xfs_buf_relse(bp);
+ trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
+ return -EIO;
+ }
+
+ if (tp) {
+ _xfs_trans_bjoin(tp, bp, 1);
+ trace_xfs_trans_read_buf(bp->b_log_item);
+ }
+ ASSERT(bp->b_ops != NULL || ops == NULL);
+ *bpp = bp;
+ return 0;
+
+}
+
+/* Has this buffer been dirtied by anyone? */
+bool
+xfs_trans_buf_is_dirty(
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ if (!bip)
+ return false;
+ ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+ return test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags);
+}
+
+/*
+ * Release a buffer previously joined to the transaction. If the buffer is
+ * modified within this transaction, decrement the recursion count but do not
+ * release the buffer even if the count goes to 0. If the buffer is not modified
+ * within the transaction, decrement the recursion count and release the buffer
+ * if the recursion count goes to 0.
+ *
+ * If the buffer is to be released and it was not already dirty before this
+ * transaction began, then also free the buf_log_item associated with it.
+ *
+ * If the transaction pointer is NULL, this is a normal xfs_buf_relse() call.
+ */
+void
+xfs_trans_brelse(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ ASSERT(bp->b_transp == tp);
+
+ if (!tp) {
+ xfs_buf_relse(bp);
+ return;
+ }
+
+ trace_xfs_trans_brelse(bip);
+ ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ /*
+ * If the release is for a recursive lookup, then decrement the count
+ * and return.
+ */
+ if (bip->bli_recur > 0) {
+ bip->bli_recur--;
+ return;
+ }
+
+ /*
+ * If the buffer is invalidated or dirty in this transaction, we can't
+ * release it until we commit.
+ */
+ if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags))
+ return;
+ if (bip->bli_flags & XFS_BLI_STALE)
+ return;
+
+ /*
+ * Unlink the log item from the transaction and clear the hold flag, if
+ * set. We wouldn't want the next user of the buffer to get confused.
+ */
+ ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+ xfs_trans_del_item(&bip->bli_item);
+ bip->bli_flags &= ~XFS_BLI_HOLD;
+
+ /* drop the reference to the bli */
+ xfs_buf_item_put(bip);
+
+ bp->b_transp = NULL;
+ xfs_buf_relse(bp);
+}
+
+/*
+ * Mark the buffer as not needing to be unlocked when the buf item's
+ * iop_committing() routine is called. The buffer must already be locked
+ * and associated with the given transaction.
+ */
+/* ARGSUSED */
+void
+xfs_trans_bhold(
+ xfs_trans_t *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ bip->bli_flags |= XFS_BLI_HOLD;
+ trace_xfs_trans_bhold(bip);
+}
+
+/*
+ * Cancel the previous buffer hold request made on this buffer
+ * for this transaction.
+ */
+void
+xfs_trans_bhold_release(
+ xfs_trans_t *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ ASSERT(bip->bli_flags & XFS_BLI_HOLD);
+
+ bip->bli_flags &= ~XFS_BLI_HOLD;
+ trace_xfs_trans_bhold_release(bip);
+}
+
+/*
+ * Mark a buffer dirty in the transaction.
+ */
+void
+xfs_trans_dirty_buf(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+
+ /*
+ * Mark the buffer as needing to be written out eventually,
+ * and set its iodone function to remove the buffer's buf log
+ * item from the AIL and free it when the buffer is flushed
+ * to disk.
+ */
+ bp->b_flags |= XBF_DONE;
+
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ /*
+ * If we invalidated the buffer within this transaction, then
+ * cancel the invalidation now that we're dirtying the buffer
+ * again. There are no races with the code in xfs_buf_item_unpin(),
+ * because we have a reference to the buffer this entire time.
+ */
+ if (bip->bli_flags & XFS_BLI_STALE) {
+ bip->bli_flags &= ~XFS_BLI_STALE;
+ ASSERT(bp->b_flags & XBF_STALE);
+ bp->b_flags &= ~XBF_STALE;
+ bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
+ }
+ bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags);
+}
+
+/*
+ * This is called to mark bytes first through last inclusive of the given
+ * buffer as needing to be logged when the transaction is committed.
+ * The buffer must already be associated with the given transaction.
+ *
+ * First and last are numbers relative to the beginning of this buffer,
+ * so the first byte in the buffer is numbered 0 regardless of the
+ * value of b_blkno.
+ */
+void
+xfs_trans_log_buf(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint first,
+ uint last)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ ASSERT(first <= last && last < BBTOB(bp->b_length));
+ ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED));
+
+ xfs_trans_dirty_buf(tp, bp);
+
+ trace_xfs_trans_log_buf(bip);
+ xfs_buf_item_log(bip, first, last);
+}
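+
+/*
+ * The usual pattern for modifying metadata through these helpers is to get
+ * the buffer inside a transaction, modify it, and then log the byte range
+ * that changed. A rough sketch, assuming the xfs_trans_get_buf() wrapper and
+ * with error handling elided:
+ *
+ *	struct xfs_buf	*bp;
+ *	int		error;
+ *
+ *	error = xfs_trans_get_buf(tp, target, blkno, numblks, 0, &bp);
+ *	if (!error) {
+ *		... modify bytes first through last of bp->b_addr ...
+ *		xfs_trans_log_buf(tp, bp, first, last);
+ *	}
+ *
+ * The buffer stays locked and attached to the transaction until commit, at
+ * which point the logged range is handed to the log and the buffer is
+ * unlocked; writeback to its final location happens later via the AIL.
+ */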
+
+
+/*
+ * Invalidate a buffer that is being used within a transaction.
+ *
+ * Typically this is because the blocks in the buffer are being freed, so we
+ * need to prevent it from being written out when we're done. Allowing it
+ * to be written again might overwrite data in the free blocks if they are
+ * reallocated to a file.
+ *
+ * We prevent the buffer from being written out by marking it stale. We can't
+ * get rid of the buf log item at this point because the buffer may still be
+ * pinned by another transaction. If that is the case, then we'll wait until
+ * the buffer is committed to disk for the last time (we can tell by the ref
+ * count) and free it in xfs_buf_item_unpin(). Until that happens we will
+ * keep the buffer locked so that the buffer and buf log item are not reused.
+ *
+ * We also set the XFS_BLF_CANCEL flag in the buf log format structure and log
+ * the buf item. This will be used at recovery time to determine that copies
+ * of the buffer in the log before this should not be replayed.
+ *
+ * We mark the item descriptor and the transaction dirty so that we'll hold
+ * the buffer until after the commit.
+ *
+ * Since we're invalidating the buffer, we also clear the state about which
+ * parts of the buffer have been logged. We also clear the flag indicating
+ * that this is an inode buffer since the data in the buffer will no longer
+ * be valid.
+ *
+ * We set the stale bit in the buffer as well since we're getting rid of it.
+ */
+void
+xfs_trans_binval(
+ xfs_trans_t *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ int i;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ trace_xfs_trans_binval(bip);
+
+ if (bip->bli_flags & XFS_BLI_STALE) {
+ /*
+ * If the buffer is already invalidated, then
+ * just return.
+ */
+ ASSERT(bp->b_flags & XBF_STALE);
+ ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
+ ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF));
+ ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK));
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+ ASSERT(test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags));
+ ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
+ return;
+ }
+
+ xfs_buf_stale(bp);
+
+ bip->bli_flags |= XFS_BLI_STALE;
+ bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
+ bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
+ bip->__bli_format.blf_flags |= XFS_BLF_CANCEL;
+ bip->__bli_format.blf_flags &= ~XFS_BLFT_MASK;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ memset(bip->bli_formats[i].blf_data_map, 0,
+ (bip->bli_formats[i].blf_map_size * sizeof(uint)));
+ }
+ set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags);
+ tp->t_flags |= XFS_TRANS_DIRTY;
+}
+
+/*
+ * This call is used to indicate that the buffer contains on-disk inodes which
+ * must be handled specially during recovery. They require special handling
+ * because only the di_next_unlinked from the inodes in the buffer should be
+ * recovered. The rest of the data in the buffer is logged via the inodes
+ * themselves.
+ *
+ * All we do is set the XFS_BLI_INODE_BUF flag in the item's flags so it can be
+ * transferred to the buffer's log format structure so that we'll know what to
+ * do at recovery time.
+ */
+void
+xfs_trans_inode_buf(
+ xfs_trans_t *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ bip->bli_flags |= XFS_BLI_INODE_BUF;
+ bp->b_flags |= _XBF_INODES;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
+}
+
+/*
+ * This call is used to indicate that the buffer is going to
+ * be staled and was an inode buffer. This means it gets
+ * special processing during unpin - where any inodes
+ * associated with the buffer should be removed from the AIL.
+ * There is also special processing during recovery:
+ * any replay of the inodes in the buffer needs to be
+ * prevented as the buffer may have been reused.
+ */
+void
+xfs_trans_stale_inode_buf(
+ xfs_trans_t *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ bip->bli_flags |= XFS_BLI_STALE_INODE;
+ bp->b_flags |= _XBF_INODES;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
+}
+
+/*
+ * Mark the buffer as being one which contains newly allocated
+ * inodes. We need to make sure that even if this buffer is
+ * relogged as an 'inode buf' we still recover all of the inode
+ * images in the face of a crash. This works in coordination with
+ * xfs_buf_item_committed() to ensure that the buffer remains in the
+ * AIL at its original location even after it has been relogged.
+ */
+/* ARGSUSED */
+void
+xfs_trans_inode_alloc_buf(
+ xfs_trans_t *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
+ bp->b_flags |= _XBF_INODES;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
+}
+
+/*
+ * Mark the buffer as ordered for this transaction. This means that the contents
+ * of the buffer are not recorded in the transaction but it is tracked in the
+ * AIL as though it was. This allows us to record logical changes in
+ * transactions rather than the physical changes we make to the buffer without
+ * changing writeback ordering constraints of metadata buffers.
+ */
+bool
+xfs_trans_ordered_buf(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ if (xfs_buf_item_dirty_format(bip))
+ return false;
+
+ bip->bli_flags |= XFS_BLI_ORDERED;
+ trace_xfs_buf_item_ordered(bip);
+
+ /*
+ * We don't log a dirty range of an ordered buffer, but it still needs
+ * to be marked dirty and flagged as logged.
+ */
+ xfs_trans_dirty_buf(tp, bp);
+ return true;
+}
+
+/*
+ * Set the type of the buffer for log recovery so that it can correctly identify
+ * and hence attach the correct buffer ops to the buffer after replay.
+ */
+void
+xfs_trans_buf_set_type(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ enum xfs_blft type)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ if (!tp)
+ return;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ xfs_blft_to_flags(&bip->__bli_format, type);
+}
+
+void
+xfs_trans_buf_copy_type(
+ struct xfs_buf *dst_bp,
+ struct xfs_buf *src_bp)
+{
+ struct xfs_buf_log_item *sbip = src_bp->b_log_item;
+ struct xfs_buf_log_item *dbip = dst_bp->b_log_item;
+ enum xfs_blft type;
+
+ type = xfs_blft_from_flags(&sbip->__bli_format);
+ xfs_blft_to_flags(&dbip->__bli_format, type);
+}
+
+/*
+ * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of
+ * dquots. However, unlike in inode buffer recovery, dquot buffers get
+ * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag).
+ * The only thing that makes dquot buffers different from regular
+ * buffers is that we must not replay dquot bufs when recovering
+ * if a _corresponding_ quotaoff has happened. We also have to distinguish
+ * between usr dquot bufs and grp dquot bufs, because usr and grp quotas
+ * can be turned off independently.
+ */
+/* ARGSUSED */
+void
+xfs_trans_dquot_buf(
+ xfs_trans_t *tp,
+ struct xfs_buf *bp,
+ uint type)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ ASSERT(type == XFS_BLF_UDQUOT_BUF ||
+ type == XFS_BLF_PDQUOT_BUF ||
+ type == XFS_BLF_GDQUOT_BUF);
+
+ bip->__bli_format.blf_flags |= type;
+
+ switch (type) {
+ case XFS_BLF_UDQUOT_BUF:
+ type = XFS_BLFT_UDQUOT_BUF;
+ break;
+ case XFS_BLF_PDQUOT_BUF:
+ type = XFS_BLFT_PDQUOT_BUF;
+ break;
+ case XFS_BLF_GDQUOT_BUF:
+ type = XFS_BLFT_GDQUOT_BUF;
+ break;
+ default:
+ type = XFS_BLFT_UNKNOWN_BUF;
+ break;
+ }
+
+ bp->b_flags |= _XBF_DQUOTS;
+ xfs_trans_buf_set_type(tp, bp, type);
+}
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
new file mode 100644
index 000000000..aa00cf67a
--- /dev/null
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -0,0 +1,859 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+
+STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
+
+/*
+ * Add the locked dquot to the transaction.
+ * The dquot must be locked, and it cannot be associated with any
+ * transaction.
+ */
+void
+xfs_trans_dqjoin(
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
+{
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ ASSERT(dqp->q_logitem.qli_dquot == dqp);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &dqp->q_logitem.qli_item);
+}
+
+/*
+ * This is called to mark the dquot as needing
+ * to be logged when the transaction is committed. The dquot must
+ * already be associated with the given transaction.
+ * Note that it marks the entire transaction as dirty. In the ordinary
+ * case, this gets called via xfs_trans_commit, after the transaction
+ * is already dirty. However, there's nothing to stop this from getting
+ * called directly, as done by xfs_qm_scall_setqlim. Hence, the TRANS_DIRTY
+ * flag.
+ */
+void
+xfs_trans_log_dquot(
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
+{
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+ /* Upgrade the dquot to bigtime format if possible. */
+ if (dqp->q_id != 0 &&
+ xfs_has_bigtime(tp->t_mountp) &&
+ !(dqp->q_type & XFS_DQTYPE_BIGTIME))
+ dqp->q_type |= XFS_DQTYPE_BIGTIME;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags);
+}
+
+/*
+ * Carry forward whatever is left of the quota blk reservation to
+ * the spanky new transaction
+ */
+void
+xfs_trans_dup_dqinfo(
+ struct xfs_trans *otp,
+ struct xfs_trans *ntp)
+{
+ struct xfs_dqtrx *oq, *nq;
+ int i, j;
+ struct xfs_dqtrx *oqa, *nqa;
+ uint64_t blk_res_used;
+
+ if (!otp->t_dqinfo)
+ return;
+
+ xfs_trans_alloc_dqinfo(ntp);
+
+ for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+ oqa = otp->t_dqinfo->dqs[j];
+ nqa = ntp->t_dqinfo->dqs[j];
+ for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+ blk_res_used = 0;
+
+ if (oqa[i].qt_dquot == NULL)
+ break;
+ oq = &oqa[i];
+ nq = &nqa[i];
+
+ if (oq->qt_blk_res && oq->qt_bcount_delta > 0)
+ blk_res_used = oq->qt_bcount_delta;
+
+ nq->qt_dquot = oq->qt_dquot;
+ nq->qt_bcount_delta = nq->qt_icount_delta = 0;
+ nq->qt_rtbcount_delta = 0;
+
+ /*
+ * Transfer whatever is left of the reservations.
+ */
+ nq->qt_blk_res = oq->qt_blk_res - blk_res_used;
+ oq->qt_blk_res = blk_res_used;
+
+ nq->qt_rtblk_res = oq->qt_rtblk_res -
+ oq->qt_rtblk_res_used;
+ oq->qt_rtblk_res = oq->qt_rtblk_res_used;
+
+ nq->qt_ino_res = oq->qt_ino_res - oq->qt_ino_res_used;
+ oq->qt_ino_res = oq->qt_ino_res_used;
+
+ }
+ }
+}
+
+/*
+ * Wrap around mod_dquot to account for both user and group quotas.
+ */
+void
+xfs_trans_mod_dquot_byino(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip,
+ uint field,
+ int64_t delta)
+{
+ xfs_mount_t *mp = tp->t_mountp;
+
+ if (!XFS_IS_QUOTA_ON(mp) ||
+ xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
+ return;
+
+ if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
+ (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
+ if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
+ (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+ if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot)
+ (void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta);
+}
+
+STATIC struct xfs_dqtrx *
+xfs_trans_get_dqtrx(
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
+{
+ int i;
+ struct xfs_dqtrx *qa;
+
+ switch (xfs_dquot_type(dqp)) {
+ case XFS_DQTYPE_USER:
+ qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
+ break;
+ case XFS_DQTYPE_GROUP:
+ qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
+ break;
+ case XFS_DQTYPE_PROJ:
+ qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ];
+ break;
+ default:
+ return NULL;
+ }
+
+ for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+ if (qa[i].qt_dquot == NULL ||
+ qa[i].qt_dquot == dqp)
+ return &qa[i];
+ }
+
+ return NULL;
+}
+
+/*
+ * Make the changes in the transaction structure.
+ * The moral equivalent to xfs_trans_mod_sb().
+ * We don't touch any fields in the dquot, so we don't care
+ * if it's locked or not (most of the time it won't be).
+ */
+void
+xfs_trans_mod_dquot(
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp,
+ uint field,
+ int64_t delta)
+{
+ struct xfs_dqtrx *qtrx;
+
+ ASSERT(tp);
+ ASSERT(XFS_IS_QUOTA_ON(tp->t_mountp));
+ qtrx = NULL;
+
+ if (!delta)
+ return;
+
+ if (tp->t_dqinfo == NULL)
+ xfs_trans_alloc_dqinfo(tp);
+ /*
+ * Find either the first free slot or the slot that belongs
+ * to this dquot.
+ */
+ qtrx = xfs_trans_get_dqtrx(tp, dqp);
+ ASSERT(qtrx);
+ if (qtrx->qt_dquot == NULL)
+ qtrx->qt_dquot = dqp;
+
+ trace_xfs_trans_mod_dquot_before(qtrx);
+ trace_xfs_trans_mod_dquot(tp, dqp, field, delta);
+
+ switch (field) {
+ /* regular disk blk reservation */
+ case XFS_TRANS_DQ_RES_BLKS:
+ qtrx->qt_blk_res += delta;
+ break;
+
+ /* inode reservation */
+ case XFS_TRANS_DQ_RES_INOS:
+ qtrx->qt_ino_res += delta;
+ break;
+
+ /* disk blocks used. */
+ case XFS_TRANS_DQ_BCOUNT:
+ qtrx->qt_bcount_delta += delta;
+ break;
+
+ case XFS_TRANS_DQ_DELBCOUNT:
+ qtrx->qt_delbcnt_delta += delta;
+ break;
+
+ /* Inode Count */
+ case XFS_TRANS_DQ_ICOUNT:
+ if (qtrx->qt_ino_res && delta > 0) {
+ qtrx->qt_ino_res_used += delta;
+ ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
+ }
+ qtrx->qt_icount_delta += delta;
+ break;
+
+ /* rtblk reservation */
+ case XFS_TRANS_DQ_RES_RTBLKS:
+ qtrx->qt_rtblk_res += delta;
+ break;
+
+ /* rtblk count */
+ case XFS_TRANS_DQ_RTBCOUNT:
+ if (qtrx->qt_rtblk_res && delta > 0) {
+ qtrx->qt_rtblk_res_used += delta;
+ ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
+ }
+ qtrx->qt_rtbcount_delta += delta;
+ break;
+
+ case XFS_TRANS_DQ_DELRTBCOUNT:
+ qtrx->qt_delrtb_delta += delta;
+ break;
+
+ default:
+ ASSERT(0);
+ }
+
+ trace_xfs_trans_mod_dquot_after(qtrx);
+}
+
+
+/*
+ * Given an array of dqtrx structures, lock all the dquots associated and join
+ * them to the transaction, provided they have been modified. We know that the
+ * highest number of dquots of any one type - usr, grp or prj - involved in a
+ * transaction is 2, so we don't need to make this very generic.
+ */
+STATIC void
+xfs_trans_dqlockedjoin(
+ struct xfs_trans *tp,
+ struct xfs_dqtrx *q)
+{
+ ASSERT(q[0].qt_dquot != NULL);
+ if (q[1].qt_dquot == NULL) {
+ xfs_dqlock(q[0].qt_dquot);
+ xfs_trans_dqjoin(tp, q[0].qt_dquot);
+ } else {
+ ASSERT(XFS_QM_TRANS_MAXDQS == 2);
+ xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot);
+ xfs_trans_dqjoin(tp, q[0].qt_dquot);
+ xfs_trans_dqjoin(tp, q[1].qt_dquot);
+ }
+}
+
+/* Apply dqtrx changes to the quota reservation counters. */
+static inline void
+xfs_apply_quota_reservation_deltas(
+ struct xfs_dquot_res *res,
+ uint64_t reserved,
+ int64_t res_used,
+ int64_t count_delta)
+{
+ if (reserved != 0) {
+ /*
+ * Subtle math here: If reserved > res_used (the normal case),
+ * we're simply subtracting the unused transaction quota
+ * reservation from the dquot reservation.
+ *
+ * If, however, res_used > reserved, then we have allocated
+ * more quota blocks than were reserved for the transaction.
+ * We must add that excess to the dquot reservation since it
+ * tracks (usage + resv) and by definition we didn't reserve
+ * that excess.
+ */
+ res->reserved -= abs(reserved - res_used);
+ } else if (count_delta != 0) {
+ /*
+ * These blks were never reserved, either inside a transaction
+ * or outside one (in a delayed allocation). Also, this isn't
+ * always a negative number since we sometimes deliberately
+ * skip quota reservations.
+ */
+ res->reserved += count_delta;
+ }
+}
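+
+/*
+ * Worked example for the common case: a transaction reserves 10 blocks
+ * against a dquot (reserved = 10) but only ends up allocating 6
+ * (res_used = 6). The block count itself is bumped by 6 in
+ * xfs_trans_apply_dquot_deltas() below, and this helper hands back the
+ * unused portion of the reservation: res->reserved -= abs(10 - 6), i.e. 4,
+ * leaving res->reserved equal to the new usage plus whatever reservations
+ * are still held elsewhere.
+ */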
+
+/*
+ * Called by xfs_trans_commit() and similar in spirit to
+ * xfs_trans_apply_sb_deltas().
+ * Go through all the dquots belonging to this transaction and modify the
+ * INCORE dquots to reflect the actual usages.
+ * Unreserve just the reservations done by this transaction.
+ * The dquots are still left locked at exit.
+ */
+void
+xfs_trans_apply_dquot_deltas(
+ struct xfs_trans *tp)
+{
+ int i, j;
+ struct xfs_dquot *dqp;
+ struct xfs_dqtrx *qtrx, *qa;
+ int64_t totalbdelta;
+ int64_t totalrtbdelta;
+
+ if (!tp->t_dqinfo)
+ return;
+
+ ASSERT(tp->t_dqinfo);
+ for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+ qa = tp->t_dqinfo->dqs[j];
+ if (qa[0].qt_dquot == NULL)
+ continue;
+
+ /*
+ * Lock all of the dquots and join them to the transaction.
+ */
+ xfs_trans_dqlockedjoin(tp, qa);
+
+ for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+ uint64_t blk_res_used;
+
+ qtrx = &qa[i];
+ /*
+ * The array of dquots is filled
+ * sequentially, not sparsely.
+ */
+ if ((dqp = qtrx->qt_dquot) == NULL)
+ break;
+
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+ /*
+ * adjust the actual number of blocks used
+ */
+
+ /*
+ * The issue here is - sometimes we don't make a blkquota
+ * reservation intentionally to be fair to users
+ * (when the amount is small). On the other hand,
+ * delayed allocs do make reservations, but that's
+ * outside of a transaction, so we have no
+ * idea how much was really reserved.
+ * So, here we've accumulated delayed allocation blks and
+ * non-delay blks. The assumption is that the
+ * delayed ones are always reserved (outside of a
+ * transaction), and the others may or may not have
+ * quota reservations.
+ */
+ totalbdelta = qtrx->qt_bcount_delta +
+ qtrx->qt_delbcnt_delta;
+ totalrtbdelta = qtrx->qt_rtbcount_delta +
+ qtrx->qt_delrtb_delta;
+
+ if (totalbdelta != 0 || totalrtbdelta != 0 ||
+ qtrx->qt_icount_delta != 0) {
+ trace_xfs_trans_apply_dquot_deltas_before(dqp);
+ trace_xfs_trans_apply_dquot_deltas(qtrx);
+ }
+
+#ifdef DEBUG
+ if (totalbdelta < 0)
+ ASSERT(dqp->q_blk.count >= -totalbdelta);
+
+ if (totalrtbdelta < 0)
+ ASSERT(dqp->q_rtb.count >= -totalrtbdelta);
+
+ if (qtrx->qt_icount_delta < 0)
+ ASSERT(dqp->q_ino.count >= -qtrx->qt_icount_delta);
+#endif
+ if (totalbdelta)
+ dqp->q_blk.count += totalbdelta;
+
+ if (qtrx->qt_icount_delta)
+ dqp->q_ino.count += qtrx->qt_icount_delta;
+
+ if (totalrtbdelta)
+ dqp->q_rtb.count += totalrtbdelta;
+
+ if (totalbdelta != 0 || totalrtbdelta != 0 ||
+ qtrx->qt_icount_delta != 0)
+ trace_xfs_trans_apply_dquot_deltas_after(dqp);
+
+ /*
+ * Get any default limits in use.
+ * Start/reset the timer(s) if needed.
+ */
+ if (dqp->q_id) {
+ xfs_qm_adjust_dqlimits(dqp);
+ xfs_qm_adjust_dqtimers(dqp);
+ }
+
+ dqp->q_flags |= XFS_DQFLAG_DIRTY;
+ /*
+ * add this to the list of items to get logged
+ */
+ xfs_trans_log_dquot(tp, dqp);
+ /*
+ * Take off what's left of the original reservation.
+ * In case of delayed allocations, there's no
+ * reservation that a transaction structure knows of.
+ */
+ blk_res_used = max_t(int64_t, 0, qtrx->qt_bcount_delta);
+ xfs_apply_quota_reservation_deltas(&dqp->q_blk,
+ qtrx->qt_blk_res, blk_res_used,
+ qtrx->qt_bcount_delta);
+
+ /*
+ * Adjust the RT reservation.
+ */
+ xfs_apply_quota_reservation_deltas(&dqp->q_rtb,
+ qtrx->qt_rtblk_res,
+ qtrx->qt_rtblk_res_used,
+ qtrx->qt_rtbcount_delta);
+
+ /*
+ * Adjust the inode reservation.
+ */
+ ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
+ xfs_apply_quota_reservation_deltas(&dqp->q_ino,
+ qtrx->qt_ino_res,
+ qtrx->qt_ino_res_used,
+ qtrx->qt_icount_delta);
+
+ ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
+ ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
+ ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
+ }
+ }
+}
+
+/*
+ * Release the reservations, and adjust the dquots accordingly.
+ * This is called only when the transaction is being aborted. If by
+ * any chance we have done dquot modifications incore (ie. deltas) already,
+ * we simply throw those away, since that's the expected behavior
+ * when a transaction is curtailed without a commit.
+ */
+void
+xfs_trans_unreserve_and_mod_dquots(
+ struct xfs_trans *tp)
+{
+ int i, j;
+ struct xfs_dquot *dqp;
+ struct xfs_dqtrx *qtrx, *qa;
+ bool locked;
+
+ if (!tp->t_dqinfo)
+ return;
+
+ for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+ qa = tp->t_dqinfo->dqs[j];
+
+ for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+ qtrx = &qa[i];
+ /*
+ * We assume that the array of dquots is filled
+ * sequentially, not sparsely.
+ */
+ if ((dqp = qtrx->qt_dquot) == NULL)
+ break;
+ /*
+ * Unreserve the original reservation. We don't care
+ * about the number of blocks used field, or deltas.
+ * Also we don't bother to zero the fields.
+ */
+ locked = false;
+ if (qtrx->qt_blk_res) {
+ xfs_dqlock(dqp);
+ locked = true;
+ dqp->q_blk.reserved -=
+ (xfs_qcnt_t)qtrx->qt_blk_res;
+ }
+ if (qtrx->qt_ino_res) {
+ if (!locked) {
+ xfs_dqlock(dqp);
+ locked = true;
+ }
+ dqp->q_ino.reserved -=
+ (xfs_qcnt_t)qtrx->qt_ino_res;
+ }
+
+ if (qtrx->qt_rtblk_res) {
+ if (!locked) {
+ xfs_dqlock(dqp);
+ locked = true;
+ }
+ dqp->q_rtb.reserved -=
+ (xfs_qcnt_t)qtrx->qt_rtblk_res;
+ }
+ if (locked)
+ xfs_dqunlock(dqp);
+
+ }
+ }
+}
+
+STATIC void
+xfs_quota_warn(
+ struct xfs_mount *mp,
+ struct xfs_dquot *dqp,
+ int type)
+{
+ enum quota_type qtype;
+
+ switch (xfs_dquot_type(dqp)) {
+ case XFS_DQTYPE_PROJ:
+ qtype = PRJQUOTA;
+ break;
+ case XFS_DQTYPE_USER:
+ qtype = USRQUOTA;
+ break;
+ case XFS_DQTYPE_GROUP:
+ qtype = GRPQUOTA;
+ break;
+ default:
+ return;
+ }
+
+ quota_send_warning(make_kqid(&init_user_ns, qtype, dqp->q_id),
+ mp->m_super->s_dev, type);
+}
+
+/*
+ * Decide if we can make an additional reservation against a quota resource.
+ * Returns an inode QUOTA_NL_ warning code and whether or not it's fatal.
+ *
+ * Note that we assume that the numeric difference between the inode and block
+ * warning codes will always be 3, since it's userspace ABI now. This code
+ * never decreases a quota reservation, so the *BELOW messages are irrelevant.
+ */
+static inline int
+xfs_dqresv_check(
+ struct xfs_dquot_res *res,
+ struct xfs_quota_limits *qlim,
+ int64_t delta,
+ bool *fatal)
+{
+ xfs_qcnt_t hardlimit = res->hardlimit;
+ xfs_qcnt_t softlimit = res->softlimit;
+ xfs_qcnt_t total_count = res->reserved + delta;
+
+ BUILD_BUG_ON(QUOTA_NL_BHARDWARN != QUOTA_NL_IHARDWARN + 3);
+ BUILD_BUG_ON(QUOTA_NL_BSOFTLONGWARN != QUOTA_NL_ISOFTLONGWARN + 3);
+ BUILD_BUG_ON(QUOTA_NL_BSOFTWARN != QUOTA_NL_ISOFTWARN + 3);
+
+ *fatal = false;
+ if (delta <= 0)
+ return QUOTA_NL_NOWARN;
+
+ if (!hardlimit)
+ hardlimit = qlim->hard;
+ if (!softlimit)
+ softlimit = qlim->soft;
+
+ if (hardlimit && total_count > hardlimit) {
+ *fatal = true;
+ return QUOTA_NL_IHARDWARN;
+ }
+
+ if (softlimit && total_count > softlimit) {
+ time64_t now = ktime_get_real_seconds();
+
+ if (res->timer != 0 && now > res->timer) {
+ *fatal = true;
+ return QUOTA_NL_ISOFTLONGWARN;
+ }
+
+ return QUOTA_NL_ISOFTWARN;
+ }
+
+ return QUOTA_NL_NOWARN;
+}
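+
+/*
+ * For example, a block reservation that trips the hard limit comes back from
+ * this helper as QUOTA_NL_IHARDWARN; xfs_trans_dqresv() then adds 3 to turn
+ * it into QUOTA_NL_BHARDWARN before handing it to xfs_quota_warn(), relying
+ * on the fixed offset asserted by the BUILD_BUG_ON()s above.
+ */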
+
+/*
+ * This reserves disk blocks and inodes against a dquot.
+ * Flags indicate if the dquot is to be locked here and also
+ * if the blk reservation is for RT or regular blocks.
+ * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check.
+ */
+STATIC int
+xfs_trans_dqresv(
+ struct xfs_trans *tp,
+ struct xfs_mount *mp,
+ struct xfs_dquot *dqp,
+ int64_t nblks,
+ long ninos,
+ uint flags)
+{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ struct xfs_def_quota *defq;
+ struct xfs_dquot_res *blkres;
+ struct xfs_quota_limits *qlim;
+
+ xfs_dqlock(dqp);
+
+ defq = xfs_get_defquota(q, xfs_dquot_type(dqp));
+
+ if (flags & XFS_TRANS_DQ_RES_BLKS) {
+ blkres = &dqp->q_blk;
+ qlim = &defq->blk;
+ } else {
+ blkres = &dqp->q_rtb;
+ qlim = &defq->rtb;
+ }
+
+ if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_id &&
+ xfs_dquot_is_enforced(dqp)) {
+ int quota_nl;
+ bool fatal;
+
+ /*
+ * dquot is locked already. See if we'd go over the hardlimit
+ * or exceed the timelimit if we'd reserve resources.
+ */
+ quota_nl = xfs_dqresv_check(blkres, qlim, nblks, &fatal);
+ if (quota_nl != QUOTA_NL_NOWARN) {
+ /*
+ * Quota block warning codes are 3 more than the inode
+ * codes, which we check above.
+ */
+ xfs_quota_warn(mp, dqp, quota_nl + 3);
+ if (fatal)
+ goto error_return;
+ }
+
+ quota_nl = xfs_dqresv_check(&dqp->q_ino, &defq->ino, ninos,
+ &fatal);
+ if (quota_nl != QUOTA_NL_NOWARN) {
+ xfs_quota_warn(mp, dqp, quota_nl);
+ if (fatal)
+ goto error_return;
+ }
+ }
+
+ /*
+ * Change the reservation, but not the actual usage.
+ * Note that q_blk.reserved = q_blk.count + resv
+ */
+ blkres->reserved += (xfs_qcnt_t)nblks;
+ dqp->q_ino.reserved += (xfs_qcnt_t)ninos;
+
+ /*
+ * Note the reservation amount in the trans struct too,
+ * so that the transaction knows how much was reserved by
+ * it against this particular dquot.
+ * We don't do this when we are reserving for a delayed allocation,
+ * because we don't have the luxury of a transaction envelope then.
+ */
+ if (tp) {
+ ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
+ xfs_trans_mod_dquot(tp, dqp, flags & XFS_QMOPT_RESBLK_MASK,
+ nblks);
+ xfs_trans_mod_dquot(tp, dqp, XFS_TRANS_DQ_RES_INOS, ninos);
+ }
+
+ if (XFS_IS_CORRUPT(mp, dqp->q_blk.reserved < dqp->q_blk.count) ||
+ XFS_IS_CORRUPT(mp, dqp->q_rtb.reserved < dqp->q_rtb.count) ||
+ XFS_IS_CORRUPT(mp, dqp->q_ino.reserved < dqp->q_ino.count))
+ goto error_corrupt;
+
+ xfs_dqunlock(dqp);
+ return 0;
+
+error_return:
+ xfs_dqunlock(dqp);
+ if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ)
+ return -ENOSPC;
+ return -EDQUOT;
+error_corrupt:
+ xfs_dqunlock(dqp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return -EFSCORRUPTED;
+}
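+
+/*
+ * A minimal call sketch with hypothetical values, assuming the inode's user
+ * dquot is attached: reserve ten regular data blocks and one inode while
+ * bypassing enforcement, as a chown-style caller would:
+ *
+ *     error = xfs_trans_dqresv(tp, mp, ip->i_udquot, 10, 1,
+ *                     XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES);
+ */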
+
+
+/*
+ * Given dquot(s), make disk block and/or inode reservations against them.
+ * The fact that this does the reservation against user, group and
+ * project quotas is important, because this follows an all-or-nothing
+ * approach.
+ *
+ * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
+ * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota.
+ * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
+ * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
+ * dquots are unlocked on return, if they were not locked by caller.
+ */
+int
+xfs_trans_reserve_quota_bydquots(
+ struct xfs_trans *tp,
+ struct xfs_mount *mp,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ int64_t nblks,
+ long ninos,
+ uint flags)
+{
+ int error;
+
+ if (!XFS_IS_QUOTA_ON(mp))
+ return 0;
+
+ ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
+
+ if (udqp) {
+ error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags);
+ if (error)
+ return error;
+ }
+
+ if (gdqp) {
+ error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
+ if (error)
+ goto unwind_usr;
+ }
+
+ if (pdqp) {
+ error = xfs_trans_dqresv(tp, mp, pdqp, nblks, ninos, flags);
+ if (error)
+ goto unwind_grp;
+ }
+
+ /*
+ * Didn't change anything critical, so no need to log.
+ */
+ return 0;
+
+unwind_grp:
+ flags |= XFS_QMOPT_FORCE_RES;
+ if (gdqp)
+ xfs_trans_dqresv(tp, mp, gdqp, -nblks, -ninos, flags);
+unwind_usr:
+ flags |= XFS_QMOPT_FORCE_RES;
+ if (udqp)
+ xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags);
+ return error;
+}
+
+
+/*
+ * Lock the dquot and change the reservation if we can.
+ * This doesn't change the actual usage, just the reservation.
+ * The inode sent in is locked.
+ */
+int
+xfs_trans_reserve_quota_nblks(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int64_t dblocks,
+ int64_t rblocks,
+ bool force)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int qflags = 0;
+ int error;
+
+ if (!XFS_IS_QUOTA_ON(mp))
+ return 0;
+
+ ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (force)
+ qflags |= XFS_QMOPT_FORCE_RES;
+
+ /* Reserve data device quota against the inode's dquots. */
+ error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot,
+ ip->i_gdquot, ip->i_pdquot, dblocks, 0,
+ XFS_QMOPT_RES_REGBLKS | qflags);
+ if (error)
+ return error;
+
+ /* Do the same but for realtime blocks. */
+ error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot,
+ ip->i_gdquot, ip->i_pdquot, rblocks, 0,
+ XFS_QMOPT_RES_RTBLKS | qflags);
+ if (error) {
+ xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot,
+ ip->i_gdquot, ip->i_pdquot, -dblocks, 0,
+ XFS_QMOPT_RES_REGBLKS);
+ return error;
+ }
+
+ return 0;
+}
+
+/* Change the quota reservations for an inode creation activity. */
+int
+xfs_trans_reserve_quota_icreate(
+ struct xfs_trans *tp,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ int64_t dblocks)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (!XFS_IS_QUOTA_ON(mp))
+ return 0;
+
+ return xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp,
+ dblocks, 1, XFS_QMOPT_RES_REGBLKS);
+}
+
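+/* Allocate the per-transaction dquot change tracking structure. */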
+STATIC void
+xfs_trans_alloc_dqinfo(
+ xfs_trans_t *tp)
+{
+ tp->t_dqinfo = kmem_cache_zalloc(xfs_dqtrx_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+}
+
+void
+xfs_trans_free_dqinfo(
+ xfs_trans_t *tp)
+{
+ if (!tp->t_dqinfo)
+ return;
+ kmem_cache_free(xfs_dqtrx_cache, tp->t_dqinfo);
+ tp->t_dqinfo = NULL;
+}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
new file mode 100644
index 000000000..d54001503
--- /dev/null
+++ b/fs/xfs/xfs_trans_priv.h
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_TRANS_PRIV_H__
+#define __XFS_TRANS_PRIV_H__
+
+struct xlog;
+struct xfs_log_item;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_ail;
+struct xfs_log_vec;
+
+
+void xfs_trans_init(struct xfs_mount *);
+void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
+void xfs_trans_del_item(struct xfs_log_item *);
+void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
+
+void xfs_trans_committed_bulk(struct xfs_ail *ailp,
+ struct list_head *lv_chain,
+ xfs_lsn_t commit_lsn, bool aborted);
+/*
+ * AIL traversal cursor.
+ *
+ * Rather than using a generation number for detecting changes in the ail, use
+ * a cursor that is protected by the ail lock. The aild cursor exists in the
+ * struct xfs_ail, but other traversals can declare it on the stack and link it
+ * to the ail list.
+ *
+ * When an object is deleted from or moved in the AIL, the cursor list is
+ * searched to see if the object is a designated cursor item. If it is, it is
+ * deleted from the cursor so that the next time the cursor is used traversal
+ * will return to the start.
+ *
+ * This means that only a traversal colliding with a removal will cause a
+ * restart of the list scan; insertions and deletions elsewhere in the list do
+ * not. The
+ * low bit of the item pointer is set if the cursor has been invalidated so
+ * that we can tell the difference between invalidation and reaching the end
+ * of the list to trigger traversal restarts.
+ */
+struct xfs_ail_cursor {
+ struct list_head list;
+ struct xfs_log_item *item;
+};
+
+/*
+ * Private AIL structures.
+ *
+ * Eventually we need to drive the locking in here as well.
+ */
+struct xfs_ail {
+ struct xlog *ail_log;
+ struct task_struct *ail_task;
+ struct list_head ail_head;
+ xfs_lsn_t ail_target;
+ xfs_lsn_t ail_target_prev;
+ struct list_head ail_cursors;
+ spinlock_t ail_lock;
+ xfs_lsn_t ail_last_pushed_lsn;
+ int ail_log_flush;
+ struct list_head ail_buf_list;
+ wait_queue_head_t ail_empty;
+};
+
+/*
+ * From xfs_trans_ail.c
+ */
+void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ struct xfs_log_item **log_items, int nr_items,
+ xfs_lsn_t lsn) __releases(ailp->ail_lock);
+/*
+ * Return a pointer to the first item in the AIL. If the AIL is empty, then
+ * return NULL.
+ */
+static inline struct xfs_log_item *
+xfs_ail_min(
+ struct xfs_ail *ailp)
+{
+ return list_first_entry_or_null(&ailp->ail_head, struct xfs_log_item,
+ li_ail);
+}
+
+static inline void
+xfs_trans_ail_update(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn) __releases(ailp->ail_lock)
+{
+ xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
+}
+
+void xfs_trans_ail_insert(struct xfs_ail *ailp, struct xfs_log_item *lip,
+ xfs_lsn_t lsn);
+
+xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn)
+ __releases(ailp->ail_lock);
+void xfs_trans_ail_delete(struct xfs_log_item *lip, int shutdown_type);
+
+void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
+void xfs_ail_push_all(struct xfs_ail *);
+void xfs_ail_push_all_sync(struct xfs_ail *);
+struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp);
+xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp);
+
+struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ xfs_lsn_t lsn);
+struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ xfs_lsn_t lsn);
+struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur);
+void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur);
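+
+/*
+ * A minimal traversal sketch (hypothetical caller-side code), assuming the
+ * cursor lives on the caller's stack and the AIL lock is held across the
+ * whole walk, as the cursor comment above describes; examine_item() stands
+ * in for whatever per-item work the caller does:
+ *
+ *     struct xfs_ail_cursor  cur;
+ *     struct xfs_log_item    *lip;
+ *
+ *     spin_lock(&ailp->ail_lock);
+ *     for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL;
+ *          lip = xfs_trans_ail_cursor_next(ailp, &cur))
+ *             examine_item(lip);
+ *     xfs_trans_ail_cursor_done(&cur);
+ *     spin_unlock(&ailp->ail_lock);
+ */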
+
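+/*
+ * A 64-bit LSN cannot be read or written atomically on 32-bit platforms, so
+ * the copy must be done under the AIL lock there; 64-bit platforms can copy
+ * the value directly.
+ */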
+#if BITS_PER_LONG != 64
+static inline void
+xfs_trans_ail_copy_lsn(
+ struct xfs_ail *ailp,
+ xfs_lsn_t *dst,
+ xfs_lsn_t *src)
+{
+ ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
+ spin_lock(&ailp->ail_lock);
+ *dst = *src;
+ spin_unlock(&ailp->ail_lock);
+}
+#else
+static inline void
+xfs_trans_ail_copy_lsn(
+ struct xfs_ail *ailp,
+ xfs_lsn_t *dst,
+ xfs_lsn_t *src)
+{
+ ASSERT(sizeof(xfs_lsn_t) == 8);
+ *dst = *src;
+}
+#endif
+
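+/*
+ * Clear the write-failure state of a log item that is still in the AIL and
+ * drop the buffer reference that was taken when the failure was recorded.
+ * The AIL lock keeps the failed bit and li_buf coherent.
+ */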
+static inline void
+xfs_clear_li_failed(
+ struct xfs_log_item *lip)
+{
+ struct xfs_buf *bp = lip->li_buf;
+
+ ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
+ lockdep_assert_held(&lip->li_ailp->ail_lock);
+
+ if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) {
+ lip->li_buf = NULL;
+ xfs_buf_rele(bp);
+ }
+}
+
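+/*
+ * Mark a log item as having failed writeback and hold a reference to the
+ * backing buffer so that a later retry (or xfs_clear_li_failed) can find it
+ * through li_buf.
+ */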
+static inline void
+xfs_set_li_failed(
+ struct xfs_log_item *lip,
+ struct xfs_buf *bp)
+{
+ lockdep_assert_held(&lip->li_ailp->ail_lock);
+
+ if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) {
+ xfs_buf_hold(bp);
+ lip->li_buf = bp;
+ }
+}
+
+#endif /* __XFS_TRANS_PRIV_H__ */
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
new file mode 100644
index 000000000..c325a28b8
--- /dev/null
+++ b/fs/xfs/xfs_xattr.c
@@ -0,0 +1,308 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2008 Christoph Hellwig.
+ * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc.
+ */
+
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_da_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_acl.h"
+#include "xfs_log.h"
+#include "xfs_xattr.h"
+
+#include <linux/posix_acl_xattr.h>
+
+/*
+ * Get permission to use log-assisted updates of extended attributes.
+ *
+ * Callers must not be running any transactions or hold any inode locks, and
+ * they must release the permission by calling xlog_drop_incompat_feat
+ * when they're done.
+ */
+static inline int
+xfs_attr_grab_log_assist(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+
+ /*
+ * Protect ourselves from an idle log clearing the logged xattrs log
+ * incompat feature bit.
+ */
+ xlog_use_incompat_feat(mp->m_log);
+
+ /*
+ * If log-assisted xattrs are already enabled, the caller can use the
+ * log assisted swap functions with the log-incompat reference we got.
+ */
+ if (xfs_sb_version_haslogxattrs(&mp->m_sb))
+ return 0;
+
+ /* Enable log-assisted xattrs. */
+ error = xfs_add_incompat_log_feature(mp,
+ XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
+ if (error)
+ goto drop_incompat;
+
+ xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP,
+ "EXPERIMENTAL logged extended attributes feature in use. Use at your own risk!");
+
+ return 0;
+drop_incompat:
+ xlog_drop_incompat_feat(mp->m_log);
+ return error;
+}
+
+static inline void
+xfs_attr_rele_log_assist(
+ struct xfs_mount *mp)
+{
+ xlog_drop_incompat_feat(mp->m_log);
+}
+
+static inline bool
+xfs_attr_want_log_assist(
+ struct xfs_mount *mp)
+{
+#ifdef DEBUG
+ /* Logged xattrs require a V5 super for log_incompat */
+ return xfs_has_crc(mp) && xfs_globals.larp;
+#else
+ return false;
+#endif
+}
+
+/*
+ * Set or remove an xattr, having grabbed the appropriate logging resources
+ * prior to calling libxfs.
+ */
+int
+xfs_attr_change(
+ struct xfs_da_args *args)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+ bool use_logging = false;
+ int error;
+
+ ASSERT(!(args->op_flags & XFS_DA_OP_LOGGED));
+
+ if (xfs_attr_want_log_assist(mp)) {
+ error = xfs_attr_grab_log_assist(mp);
+ if (error)
+ return error;
+
+ args->op_flags |= XFS_DA_OP_LOGGED;
+ use_logging = true;
+ }
+
+ error = xfs_attr_set(args);
+
+ if (use_logging)
+ xfs_attr_rele_log_assist(mp);
+ return error;
+}
+
+
+static int
+xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
+ struct inode *inode, const char *name, void *value, size_t size)
+{
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = handler->flags,
+ .name = name,
+ .namelen = strlen(name),
+ .value = value,
+ .valuelen = size,
+ };
+ int error;
+
+ error = xfs_attr_get(&args);
+ if (error)
+ return error;
+ return args.valuelen;
+}
+
+static int
+xfs_xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns, struct dentry *unused,
+ struct inode *inode, const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = handler->flags,
+ .attr_flags = flags,
+ .name = name,
+ .namelen = strlen(name),
+ .value = (void *)value,
+ .valuelen = size,
+ };
+ int error;
+
+ error = xfs_attr_change(&args);
+ if (!error && (handler->flags & XFS_ATTR_ROOT))
+ xfs_forget_acl(inode, name);
+ return error;
+}
+
+static const struct xattr_handler xfs_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .flags = 0, /* no flags implies user namespace */
+ .get = xfs_xattr_get,
+ .set = xfs_xattr_set,
+};
+
+static const struct xattr_handler xfs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .flags = XFS_ATTR_ROOT,
+ .get = xfs_xattr_get,
+ .set = xfs_xattr_set,
+};
+
+static const struct xattr_handler xfs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .flags = XFS_ATTR_SECURE,
+ .get = xfs_xattr_get,
+ .set = xfs_xattr_set,
+};
+
+const struct xattr_handler *xfs_xattr_handlers[] = {
+ &xfs_xattr_user_handler,
+ &xfs_xattr_trusted_handler,
+ &xfs_xattr_security_handler,
+#ifdef CONFIG_XFS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ NULL
+};
+
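+/*
+ * Emit one attribute name into the listxattr buffer as a NUL-terminated
+ * "<prefix><name>" string, or only account for the space it would need when
+ * the caller asked for the required size.  A negative context->count means
+ * an earlier entry already overflowed the buffer.
+ */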
+static void
+__xfs_xattr_put_listent(
+ struct xfs_attr_list_context *context,
+ char *prefix,
+ int prefix_len,
+ unsigned char *name,
+ int namelen)
+{
+ char *offset;
+ int arraytop;
+
+ if (context->count < 0 || context->seen_enough)
+ return;
+
+ if (!context->buffer)
+ goto compute_size;
+
+ arraytop = context->count + prefix_len + namelen + 1;
+ if (arraytop > context->firstu) {
+ context->count = -1; /* insufficient space */
+ context->seen_enough = 1;
+ return;
+ }
+ offset = context->buffer + context->count;
+ strncpy(offset, prefix, prefix_len);
+ offset += prefix_len;
+ strncpy(offset, (char *)name, namelen); /* real name */
+ offset += namelen;
+ *offset = '\0';
+
+compute_size:
+ context->count += prefix_len + namelen + 1;
+ return;
+}
+
+static void
+xfs_xattr_put_listent(
+ struct xfs_attr_list_context *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ int valuelen)
+{
+ char *prefix;
+ int prefix_len;
+
+ ASSERT(context->count >= 0);
+
+ if (flags & XFS_ATTR_ROOT) {
+#ifdef CONFIG_XFS_POSIX_ACL
+ if (namelen == SGI_ACL_FILE_SIZE &&
+ strncmp(name, SGI_ACL_FILE,
+ SGI_ACL_FILE_SIZE) == 0) {
+ __xfs_xattr_put_listent(
+ context, XATTR_SYSTEM_PREFIX,
+ XATTR_SYSTEM_PREFIX_LEN,
+ XATTR_POSIX_ACL_ACCESS,
+ strlen(XATTR_POSIX_ACL_ACCESS));
+ } else if (namelen == SGI_ACL_DEFAULT_SIZE &&
+ strncmp(name, SGI_ACL_DEFAULT,
+ SGI_ACL_DEFAULT_SIZE) == 0) {
+ __xfs_xattr_put_listent(
+ context, XATTR_SYSTEM_PREFIX,
+ XATTR_SYSTEM_PREFIX_LEN,
+ XATTR_POSIX_ACL_DEFAULT,
+ strlen(XATTR_POSIX_ACL_DEFAULT));
+ }
+#endif
+
+ /*
+ * Only show root namespace entries if we are actually allowed to
+ * see them.
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return;
+
+ prefix = XATTR_TRUSTED_PREFIX;
+ prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+ } else if (flags & XFS_ATTR_SECURE) {
+ prefix = XATTR_SECURITY_PREFIX;
+ prefix_len = XATTR_SECURITY_PREFIX_LEN;
+ } else {
+ prefix = XATTR_USER_PREFIX;
+ prefix_len = XATTR_USER_PREFIX_LEN;
+ }
+
+ __xfs_xattr_put_listent(context, prefix, prefix_len, name,
+ namelen);
+ return;
+}
+
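+/*
+ * List all extended attribute names of an inode for listxattr(2).  With no
+ * buffer supplied this only counts the bytes needed, so callers can use the
+ * usual two-step pattern of querying the size first and then calling again
+ * with a buffer of that size; -ERANGE is returned if the supplied buffer is
+ * too small.
+ */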
+ssize_t
+xfs_vn_listxattr(
+ struct dentry *dentry,
+ char *data,
+ size_t size)
+{
+ struct xfs_attr_list_context context;
+ struct inode *inode = d_inode(dentry);
+ int error;
+
+ /*
+ * First read the regular on-disk attributes.
+ */
+ memset(&context, 0, sizeof(context));
+ context.dp = XFS_I(inode);
+ context.resynch = 1;
+ context.buffer = size ? data : NULL;
+ context.bufsize = size;
+ context.firstu = context.bufsize;
+ context.put_listent = xfs_xattr_put_listent;
+
+ error = xfs_attr_list(&context);
+ if (error)
+ return error;
+ if (context.count < 0)
+ return -ERANGE;
+
+ return context.count;
+}
diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h
new file mode 100644
index 000000000..2b09133b1
--- /dev/null
+++ b/fs/xfs/xfs_xattr.h
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_XATTR_H__
+#define __XFS_XATTR_H__
+
+int xfs_attr_change(struct xfs_da_args *args);
+
+extern const struct xattr_handler *xfs_xattr_handlers[];
+
+#endif /* __XFS_XATTR_H__ */