-rw-r--r--  src/backend/access/nbtree/Makefile       |   28
-rw-r--r--  src/backend/access/nbtree/README         | 1056
-rw-r--r--  src/backend/access/nbtree/nbtcompare.c   |  335
-rw-r--r--  src/backend/access/nbtree/nbtdedup.c     | 1098
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c    | 3009
-rw-r--r--  src/backend/access/nbtree/nbtpage.c      | 3073
-rw-r--r--  src/backend/access/nbtree/nbtree.c       | 1446
-rw-r--r--  src/backend/access/nbtree/nbtsearch.c    | 2501
-rw-r--r--  src/backend/access/nbtree/nbtsort.c      | 2016
-rw-r--r--  src/backend/access/nbtree/nbtsplitloc.c  | 1190
-rw-r--r--  src/backend/access/nbtree/nbtutils.c     | 2751
-rw-r--r--  src/backend/access/nbtree/nbtvalidate.c  |  380
-rw-r--r--  src/backend/access/nbtree/nbtxlog.c      | 1126
13 files changed, 20009 insertions, 0 deletions
diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile
new file mode 100644
index 0000000..d69808e
--- /dev/null
+++ b/src/backend/access/nbtree/Makefile
@@ -0,0 +1,28 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for access/nbtree
+#
+# IDENTIFICATION
+# src/backend/access/nbtree/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/nbtree
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ nbtcompare.o \
+ nbtdedup.o \
+ nbtinsert.o \
+ nbtpage.o \
+ nbtree.o \
+ nbtsearch.o \
+ nbtsort.o \
+ nbtsplitloc.o \
+ nbtutils.o \
+ nbtvalidate.o \
+ nbtxlog.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
new file mode 100644
index 0000000..bfe33b6
--- /dev/null
+++ b/src/backend/access/nbtree/README
@@ -0,0 +1,1056 @@
+src/backend/access/nbtree/README
+
+Btree Indexing
+==============
+
+This directory contains a correct implementation of Lehman and Yao's
+high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
+Efficient Locking for Concurrent Operations on B-Trees, ACM Transactions
+on Database Systems, Vol 6, No. 4, December 1981, pp 650-670). We also
+use a simplified version of the deletion logic described in Lanin and
+Shasha (V. Lanin and D. Shasha, A Symmetric Concurrent B-Tree Algorithm,
+Proceedings of 1986 Fall Joint Computer Conference, pp 380-389).
+
+The basic Lehman & Yao Algorithm
+--------------------------------
+
+Compared to a classic B-tree, L&Y adds a right-link pointer to each page,
+to the page's right sibling. It also adds a "high key" to each page, which
+is an upper bound on the keys that are allowed on that page. These two
+additions make it possible to detect a concurrent page split, which allows the
+tree to be searched without holding any read locks (except to keep a single
+page from being modified while reading it).
+
+When a search follows a downlink to a child page, it compares the page's
+high key with the search key. If the search key is greater than the high
+key, the page must've been split concurrently, and you must follow the
+right-link to find the new page containing the key range you're looking
+for. This might need to be repeated, if the page has been split more than
+once.
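+
+To make the shape of this check concrete, here is a minimal standalone C
+sketch of the move-right loop. The type and function names here are
+invented for illustration; they are not the actual nbtree data structures:
+
+    #include <stdbool.h>
+
+    typedef struct ToyPage
+    {
+        int     high_key;       /* upper bound on keys allowed on this page */
+        bool    is_rightmost;   /* rightmost page on its level has no high key */
+        struct ToyPage *right;  /* right sibling (the L&Y right-link) */
+    } ToyPage;
+
+    /*
+     * After following a downlink, keep moving right for as long as the
+     * search key exceeds the page's high key, which indicates that the
+     * page was split concurrently.
+     */
+    static ToyPage *
+    toy_moveright(ToyPage *page, int search_key)
+    {
+        while (!page->is_rightmost && search_key > page->high_key)
+            page = page->right;     /* the real code locks/unlocks pages here */
+        return page;
+    }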
+
+Lehman and Yao talk about alternating "separator" keys and downlinks in
+internal pages rather than tuples or records. We use the term "pivot"
+tuple to refer to tuples that don't point to heap tuples and are used
+only for tree navigation. All tuples on non-leaf pages and high keys on
+leaf pages are pivot tuples. Since pivot tuples are only used to represent
+which part of the key space belongs on each page, they can have attribute
+values copied from non-pivot tuples that were deleted and killed by VACUUM
+some time ago. A pivot tuple may contain a "separator" key and downlink,
+just a separator key (i.e. the downlink value is implicitly undefined), or
+just a downlink (i.e. all attributes are truncated away).
+
+The requirement that all btree keys be unique is satisfied by treating heap
+TID as a tiebreaker attribute. Logical duplicates are sorted in heap TID
+order. This is necessary because Lehman and Yao also require that the key
+range for a subtree S is described by Ki < v <= Ki+1 where Ki and Ki+1 are
+the adjacent keys in the parent page (Ki must be _strictly_ less than v,
+which is assured by having reliably unique keys). Keys are always unique
+on their level, with the exception of a leaf page's high key, which can be
+fully equal to the last item on the page.
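+
+A toy 3-way comparison illustrates the tiebreaker rule. The struct and
+function below are invented for the example; the real code compares index
+tuples attribute by attribute using per-datatype support functions:
+
+    #include <stdint.h>
+
+    /* Toy key: one user-visible key plus the heap TID used as tiebreaker */
+    typedef struct ToyIndexKey
+    {
+        int32_t  user_key;
+        uint32_t heap_block;    /* table block number */
+        uint16_t heap_offset;   /* line pointer offset within that block */
+    } ToyIndexKey;
+
+    /* 3-way comparison: user key first, then heap TID, so keys are unique */
+    static int
+    toy_key_cmp(const ToyIndexKey *a, const ToyIndexKey *b)
+    {
+        if (a->user_key != b->user_key)
+            return (a->user_key < b->user_key) ? -1 : 1;
+        if (a->heap_block != b->heap_block)
+            return (a->heap_block < b->heap_block) ? -1 : 1;
+        if (a->heap_offset != b->heap_offset)
+            return (a->heap_offset < b->heap_offset) ? -1 : 1;
+        return 0;
+    }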
+
+The Postgres implementation of suffix truncation must make sure that the
+Lehman and Yao invariants hold, and represents that absent/truncated
+attributes in pivot tuples have the sentinel value "minus infinity". The
+later section on suffix truncation will be helpful if it's unclear how the
+Lehman & Yao invariants work with a real world example.
+
+Differences to the Lehman & Yao algorithm
+-----------------------------------------
+
+We have made the following changes in order to incorporate the L&Y algorithm
+into Postgres:
+
+Lehman and Yao don't require read locks, but assume that in-memory
+copies of tree pages are unshared. Postgres shares in-memory buffers
+among backends. As a result, we do page-level read locking on btree
+pages in order to guarantee that no record is modified while we are
+examining it. This reduces concurrency but guarantees correct
+behavior.
+
+We support the notion of an ordered "scan" of an index as well as
+insertions, deletions, and simple lookups. A scan in the forward
+direction is no problem; we just use the right-sibling pointers that
+L&Y require anyway. (Thus, once we have descended the tree to the
+correct start point for the scan, the scan looks only at leaf pages
+and never at higher tree levels.) To support scans in the backward
+direction, we also store a "left sibling" link much like the "right
+sibling". (This adds an extra step to the L&Y split algorithm: while
+holding the write lock on the page being split, we also lock its former
+right sibling to update that page's left-link. This is safe since no
+writer of that page can be interested in acquiring a write lock on our
+page.) A backwards scan has one additional bit of complexity: after
+following the left-link we must account for the possibility that the
+left sibling page got split before we could read it. So, we have to
+move right until we find a page whose right-link matches the page we
+came from. (Actually, it's even harder than that; see page deletion
+discussion below.)
+
+Page read locks are held only for as long as a scan is examining a page.
+To minimize lock/unlock traffic, an index scan always searches a leaf page
+to identify all the matching items at once, copying their heap tuple IDs
+into backend-local storage. The heap tuple IDs are then processed while
+not holding any page lock within the index. We do continue to hold a pin
+on the leaf page in some circumstances, to protect against concurrent
+deletions (see below). In this state the scan is effectively stopped
+"between" pages, either before or after the page it has pinned. This is
+safe in the presence of concurrent insertions and even page splits, because
+items are never moved across pre-existing page boundaries --- so the scan
+cannot miss any items it should have seen, nor accidentally return the same
+item twice. The scan must remember the page's right-link at the time it
+was scanned, since that is the page to move right to; if we move right to
+the current right-link then we'd re-scan any items moved by a page split.
+We don't similarly remember the left-link, since it's best to use the most
+up-to-date left-link when trying to move left (see detailed move-left
+algorithm below).
+
+In most cases we release our lock and pin on a page before attempting
+to acquire pin and lock on the page we are moving to. In a few places
+it is necessary to lock the next page before releasing the current one.
+This is safe when moving right or up, but not when moving left or down
+(else we'd create the possibility of deadlocks).
+
+Lehman and Yao fail to discuss what must happen when the root page
+becomes full and must be split. Our implementation is to split the
+root in the same way that any other page would be split, then construct
+a new root page holding pointers to both of the resulting pages (which
+now become siblings on the next level of the tree). The new root page
+is then installed by altering the root pointer in the meta-data page (see
+below). This works because the root is not treated specially in any
+other way --- in particular, searches will move right using its link
+pointer if the link is set. Therefore, searches will find the data
+that's been moved into the right sibling even if they read the meta-data
+page before it got updated. This is the same reasoning that makes a
+split of a non-root page safe. The locking considerations are similar too.
+
+When an inserter recurses up the tree, splitting internal pages to insert
+links to pages inserted on the level below, it is possible that it will
+need to access a page above the level that was the root when it began its
+descent (or more accurately, the level that was the root when it read the
+meta-data page). In this case the stack it made while descending does not
+help for finding the correct page. When this happens, we find the correct
+place by re-descending the tree until we reach the level one above the
+level we need to insert a link to, and then moving right as necessary.
+(Typically this will take only two fetches, the meta-data page and the new
+root, but in principle there could have been more than one root split
+since we saw the root. We can identify the correct tree level by means of
+the level numbers stored in each page. The situation is rare enough that
+we do not need a more efficient solution.)
+
+Lehman and Yao must couple/chain locks as part of moving right when
+relocating a child page's downlink during an ascent of the tree. This is
+the only point where Lehman and Yao have to simultaneously hold three
+locks (a lock on the child, the original parent, and the original parent's
+right sibling). We don't need to couple internal page locks for pages on
+the same level, though. We match a child's block number to a downlink
+from a pivot tuple one level up, whereas Lehman and Yao match on the
+separator key associated with the downlink that was followed during the
+initial descent. We can release the lock on the original parent page
+before acquiring a lock on its right sibling, since there is never any
+need to deal with the case where the separator key that we must relocate
+becomes the original parent's high key. Lanin and Shasha don't couple
+locks here either, though they also don't couple locks between levels
+during ascents. They are willing to "wait and try again" to avoid races.
+Their algorithm is optimistic, which means that "an insertion holds no
+more than one write lock at a time during its ascent". We more or less
+stick with Lehman and Yao's approach of conservatively coupling parent and
+child locks when ascending the tree, since it's far simpler.
+
+Lehman and Yao assume fixed-size keys, but we must deal with
+variable-size keys. Therefore there is not a fixed maximum number of
+keys per page; we just stuff in as many as will fit. When we split a
+page, we try to equalize the number of bytes, not items, assigned to
+pages (though suffix truncation is also considered). Note we must include
+the incoming item in this calculation, otherwise it is possible to find
+that the incoming item doesn't fit on the split page where it needs to go!
+
+Deleting index tuples during VACUUM
+-----------------------------------
+
+Before deleting a leaf item, we get a super-exclusive lock on the target
+page, so that no other backend has a pin on the page when the deletion
+starts. This is not necessary for correctness in terms of the btree index
+operations themselves; as explained above, index scans logically stop
+"between" pages and so can't lose their place. The reason we do it is to
+provide an interlock between VACUUM and indexscans. Since VACUUM deletes
+index entries before reclaiming heap tuple line pointers, the
+super-exclusive lock guarantees that VACUUM can't reclaim for re-use a
+line pointer that an indexscanning process might be about to visit. This
+guarantee works only for simple indexscans that visit the heap in sync
+with the index scan, not for bitmap scans. We only need the guarantee
+when using non-MVCC snapshot rules; when using an MVCC snapshot, it
+doesn't matter if the heap tuple is replaced with an unrelated tuple at
+the same TID, because the new tuple won't be visible to our scan anyway.
+Therefore, a scan using an MVCC snapshot which has no other confounding
+factors will not hold the pin after the page contents are read. The
+current reasons for exceptions, where a pin is still needed, are if the
+index is not WAL-logged or if the scan is an index-only scan. If later
+work allows the pin to be dropped for all cases we will be able to
+simplify the vacuum code, since the concept of a super-exclusive lock
+for btree indexes will no longer be needed.
+
+Because a pin is not always held, and a page can be split even while
+someone does hold a pin on it, it is possible that an indexscan will
+return items that are no longer stored on the page it has a pin on, but
+rather somewhere to the right of that page. To ensure that VACUUM can't
+prematurely remove such heap tuples, we require btbulkdelete to obtain a
+super-exclusive lock on every leaf page in the index, even pages that
+don't contain any deletable tuples. Any scan which could yield incorrect
+results if the tuple at a TID matching the scan's range and filter
+conditions were replaced by a different tuple while the scan is in
+progress must hold the pin on each index page until all index entries read
+from the page have been processed. This guarantees that the btbulkdelete
+call cannot return while any indexscan is still holding a copy of a
+deleted index tuple if the scan could be confused by that. Note that this
+requirement does not say that btbulkdelete must visit the pages in any
+particular order. (See also simple deletion and bottom-up deletion,
+below.)
+
+There is no such interlocking for deletion of items in internal pages,
+since backends keep no lock nor pin on a page they have descended past.
+Hence, when a backend is ascending the tree using its stack, it must
+be prepared for the possibility that the item it wants is to the left of
+the recorded position (but it can't have moved left out of the recorded
+page). Since we hold a lock on the lower page (per L&Y) until we have
+re-found the parent item that links to it, we can be assured that the
+parent item does still exist and can't have been deleted.
+
+VACUUM's linear scan, concurrent page splits
+--------------------------------------------
+
+VACUUM accesses the index by doing a linear scan to search for deletable
+TIDs, while considering the possibility of deleting empty pages in
+passing. This is in physical/block order, not logical/keyspace order.
+The tricky part of this is avoiding missing any deletable tuples in the
+presence of concurrent page splits: a page split could easily move some
+tuples from a page not yet passed over by the sequential scan to a
+lower-numbered page already passed over.
+
+To implement this, we provide a "vacuum cycle ID" mechanism that makes it
+possible to determine whether a page has been split since the current
+btbulkdelete cycle started. If btbulkdelete finds a page that has been
+split since it started, and has a right-link pointing to a lower page
+number, then it temporarily suspends its sequential scan and visits that
+page instead. It must continue to follow right-links and vacuum dead
+tuples until reaching a page that either hasn't been split since
+btbulkdelete started, or is above the location of the outer sequential
+scan. Then it can resume the sequential scan. This ensures that all
+tuples are visited. It may be that some tuples are visited twice, but
+that has no worse effect than an inaccurate index tuple count (and we
+can't guarantee an accurate count anyway in the face of concurrent
+activity). Note that this still works if the has-been-recently-split test
+has a small probability of false positives, so long as it never gives a
+false negative. This makes it possible to implement the test with a small
+counter value stored on each index page.
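+
+The check itself is cheap. A standalone sketch, with invented names
+standing in for the real page fields, might look like this:
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    typedef uint32_t ToyBlockNumber;
+
+    typedef struct ToyLeafPage
+    {
+        uint16_t       split_cycleid;   /* stamped by a recent split, 0 if none */
+        ToyBlockNumber right_link;      /* block number of the right sibling */
+    } ToyLeafPage;
+
+    /*
+     * During btbulkdelete's block-order scan, temporarily chase right-links
+     * from this page if it was split during the current cycle and the split
+     * moved tuples to a lower-numbered block that the scan already passed.
+     */
+    static bool
+    toy_must_follow_right_link(const ToyLeafPage *page, uint16_t current_cycleid,
+                               ToyBlockNumber current_scan_block)
+    {
+        return page->split_cycleid == current_cycleid &&
+               page->right_link < current_scan_block;
+    }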
+
+Deleting entire pages during VACUUM
+-----------------------------------
+
+We consider deleting an entire page from the btree only when it's become
+completely empty of items. (Merging partly-full pages would allow better
+space reuse, but it seems impractical to move existing data items left or
+right to make this happen --- a scan moving in the opposite direction
+might miss the items if so.) Also, we *never* delete the rightmost page
+on a tree level (this restriction simplifies the traversal algorithms, as
+explained below). Page deletion always begins from an empty leaf page. An
+internal page can only be deleted as part of deleting an entire subtree.
+This is always a "skinny" subtree consisting of a "chain" of internal pages
+plus a single leaf page. There is one page on each level of the subtree,
+and each level/page covers the same key space.
+
+Deleting a leaf page is a two-stage process. In the first stage, the page
+is unlinked from its parent, and marked as half-dead. The parent page must
+be found using the same type of search as used to find the parent during an
+insertion split. We lock the target and the parent pages, change the
+target's downlink to point to the right sibling, and remove its old
+downlink. This causes the target page's key space to effectively belong to
+its right sibling. (Neither the left nor right sibling pages need to
+change their "high key" if any; so there is no problem with possibly not
+having enough space to replace a high key.) At the same time, we mark the
+target page as half-dead, which causes any subsequent searches to ignore it
+and move right (or left, in a backwards scan). This leaves the tree in a
+similar state as during a page split: the page has no downlink pointing to
+it, but it's still linked to its siblings.
+
+(Note: Lanin and Shasha prefer to make the key space move left, but their
+argument for doing so hinges on not having left-links, which we have
+anyway. So we simplify the algorithm by moving the key space right. This
+is only possible because we don't match on a separator key when ascending
+the tree during a page split, unlike Lehman and Yao/Lanin and Shasha -- it
+doesn't matter if the downlink is re-found in a pivot tuple whose separator
+key does not match the one encountered when the inserter initially descended
+the tree.)
+
+To preserve consistency on the parent level, we cannot merge the key space
+of a page into its right sibling unless the right sibling is a child of
+the same parent --- otherwise, the parent's key space assignment changes
+too, meaning we'd have to make bounding-key updates in its parent, and
+perhaps all the way up the tree. Since we can't possibly do that
+atomically, we forbid this case. That means that the rightmost child of a
+parent node can't be deleted unless it's the only remaining child, in which
+case we will delete the parent too (see below).
+
+In the second-stage, the half-dead leaf page is unlinked from its siblings.
+We first lock the left sibling (if any) of the target, the target page
+itself, and its right sibling (there must be one) in that order. Then we
+update the side-links in the siblings, and mark the target page deleted.
+
+When we're about to delete the last remaining child of a parent page, things
+are slightly more complicated. In the first stage, we leave the immediate
+parent of the leaf page alone, and remove the downlink to the parent page
+instead, from the grandparent. If it's the last child of the grandparent
+too, we recurse up until we find a parent with more than one child, and
+remove the downlink of that page. The leaf page is marked as half-dead, and
+the block number of the page whose downlink was removed is stashed in the
+half-dead leaf page. This leaves us with a chain of internal pages, with
+one downlink each, leading to the half-dead leaf page, and no downlink
+pointing to the topmost page in the chain.
+
+While we recurse up to find the topmost parent in the chain, we keep the
+leaf page locked, but don't need to hold locks on the intermediate pages
+between the leaf and the topmost parent -- insertions into upper tree levels
+happen only as a result of splits of child pages, and that can't happen as
+long as we're keeping the leaf locked. The internal pages in the chain
+cannot acquire new children afterwards either, because the leaf page is
+marked as half-dead and won't be split.
+
+Removing the downlink to the top of the to-be-deleted subtree/chain
+effectively transfers the key space to the right sibling for all the
+intermediate levels too, in one atomic operation. A concurrent search might
+still visit the intermediate pages, but it will move right when it reaches
+the half-dead page at the leaf level. In particular, the search will move to
+the subtree to the right of the half-dead leaf page/to-be-deleted subtree,
+since the half-dead leaf page's right sibling must be a "cousin" page, not a
+"true" sibling page (or a second cousin page when the to-be-deleted chain
+starts at leaf page's grandparent page, and so on).
+
+In the second stage, the topmost page in the chain is unlinked from its
+siblings, and the half-dead leaf page is updated to point to the next page
+down in the chain. This is repeated until there are no internal pages left
+in the chain. Finally, the half-dead leaf page itself is unlinked from its
+siblings.
+
+A deleted page cannot be recycled immediately, since there may be other
+processes waiting to reference it (ie, search processes that just left the
+parent, or scans moving right or left from one of the siblings). These
+processes must be able to observe a deleted page for some time after the
+deletion operation, in order to be able to at least recover from it (they
+recover by moving right, as with concurrent page splits). Searchers never
+have to worry about concurrent page recycling.
+
+See "Placing deleted pages in the FSM" section below for a description of
+when and how deleted pages become safe for VACUUM to make recyclable.
+
+Page deletion and backwards scans
+---------------------------------
+
+Moving left in a backward scan is complicated because we must consider
+the possibility that the left sibling was just split (meaning we must find
+the rightmost page derived from the left sibling), plus the possibility
+that the page we were just on has now been deleted and hence isn't in the
+sibling chain at all anymore. So the move-left algorithm becomes:
+
+0. Remember the page we are on as the "original page".
+1. Follow the original page's left-link (we're done if this is zero).
+2. If the current page is live and its right-link matches the "original
+ page", we are done.
+3. Otherwise, move right one or more times looking for a live page whose
+ right-link matches the "original page". If found, we are done. (In
+ principle we could scan all the way to the right end of the index, but
+ in practice it seems better to give up after a small number of tries.
+ It's unlikely the original page's sibling split more than a few times
+ while we were in flight to it; if we do not find a matching link in a
+ few tries, then most likely the original page is deleted.)
+4. Return to the "original page". If it is still live, return to step 1
+ (we guessed wrong about it being deleted, and should restart with its
+ current left-link). If it is dead, move right until a non-dead page
+ is found (there must be one, since rightmost pages are never deleted),
+ mark that as the new "original page", and return to step 1.
+
+This algorithm is correct because the live page found by step 4 will have
+the same left keyspace boundary as the page we started from. Therefore,
+when we ultimately exit, it must be on a page whose right keyspace
+boundary matches the left boundary of where we started --- which is what
+we need to be sure we don't miss or re-scan any items.
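+
+The same steps, written as a standalone C sketch over a toy in-memory page
+structure (the names are invented; the real code works with buffers, locks,
+and block numbers rather than pointers):
+
+    #include <stdbool.h>
+    #include <stddef.h>
+
+    typedef struct ToyLinkedPage
+    {
+        bool   live;                    /* false once the page is (half-)dead */
+        struct ToyLinkedPage *left_link;
+        struct ToyLinkedPage *right_link;
+    } ToyLinkedPage;
+
+    #define TOY_MAX_RIGHT_TRIES 4       /* give up after a few tries (step 3) */
+
+    /* Returns the left sibling of 'orig', or NULL if 'orig' is leftmost. */
+    static ToyLinkedPage *
+    toy_move_left(ToyLinkedPage *orig)
+    {
+        for (;;)
+        {
+            ToyLinkedPage *cur = orig->left_link;       /* step 1 */
+
+            if (cur == NULL)
+                return NULL;                            /* no left sibling */
+
+            for (int tries = 0;
+                 cur != NULL && tries < TOY_MAX_RIGHT_TRIES;
+                 tries++)
+            {
+                if (cur->live && cur->right_link == orig)
+                    return cur;                         /* steps 2 and 3 */
+                cur = cur->right_link;
+            }
+
+            /* step 4: maybe we guessed wrong about 'orig' being deleted */
+            while (!orig->live)
+                orig = orig->right_link;    /* rightmost pages never die */
+            /* ... and restart from the (possibly new) original page */
+        }
+    }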
+
+Page deletion and tree height
+-----------------------------
+
+Because we never delete the rightmost page of any level (and in particular
+never delete the root), it's impossible for the height of the tree to
+decrease. After massive deletions we might have a scenario in which the
+tree is "skinny", with several single-page levels below the root.
+Operations will still be correct in this case, but we'd waste cycles
+descending through the single-page levels. To handle this we use an idea
+from Lanin and Shasha: we keep track of the "fast root" level, which is
+the lowest single-page level. The meta-data page keeps a pointer to this
+level as well as the true root. All ordinary operations initiate their
+searches at the fast root not the true root. When we split a page that is
+alone on its level or delete the next-to-last page on a level (both cases
+are easily detected), we have to make sure that the fast root pointer is
+adjusted appropriately. In the split case, we do this work as part of the
+atomic update for the insertion into the parent level; in the delete case
+as part of the atomic update for the delete (either way, the metapage has
+to be the last page locked in the update to avoid deadlock risks). This
+avoids race conditions if two such operations are executing concurrently.
+
+Placing deleted pages in the FSM
+--------------------------------
+
+Recycling a page is decoupled from page deletion. A deleted page can only
+be put in the FSM to be recycled once there is no possible scan or search
+that has a reference to it; until then, it must stay in place with its
+sibling links undisturbed, as a tombstone that allows concurrent searches
+to detect and then recover from concurrent deletions (which are rather
+like concurrent page splits to searchers). This design is an
+implementation of what Lanin and Shasha call "the drain technique".
+
+We implement the technique by waiting until all active snapshots and
+registered snapshots as of the page deletion are gone, which is overly
+strong, but is simple to implement within Postgres. When marked fully
+dead, a deleted page is labeled with the next-transaction counter value.
+VACUUM can reclaim the page for re-use when the stored XID is guaranteed
+to be "visible to everyone". As collateral damage, we wait for snapshots
+taken until the next transaction to allocate an XID commits. We also wait
+for running XIDs with no snapshots.
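+
+Expressed as a standalone sketch (invented names; in the real code the
+stored value is a 64-bit transaction ID, and the horizon comes from the
+usual notion of the oldest transaction that could still be running or
+appear in a snapshot):
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    typedef uint64_t ToyFullXid;
+
+    typedef struct ToyDeletedPage
+    {
+        bool       deleted;
+        ToyFullXid safexid;     /* XID counter value stamped at deletion time */
+    } ToyDeletedPage;
+
+    /*
+     * A deleted page may be placed in the FSM only once the XID stamped at
+     * deletion time is older than every XID that could still be running or
+     * could still appear in any snapshot.
+     */
+    static bool
+    toy_page_recyclable(const ToyDeletedPage *page, ToyFullXid oldest_unremovable)
+    {
+        return page->deleted && page->safexid < oldest_unremovable;
+    }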
+
+Prior to PostgreSQL 14, VACUUM would only place _old_ deleted pages that
+it encounters during its linear scan (pages deleted by a previous VACUUM
+operation) in the FSM. Newly deleted pages were never placed in the FSM,
+because that was assumed to _always_ be unsafe. That assumption was
+unnecessarily pessimistic in practice, though -- it often doesn't take
+very long for newly deleted pages to become safe to place in the FSM.
+There is no truly principled way to predict when deleted pages will become
+safe to place in the FSM for recycling -- it might become safe almost
+immediately (long before the current VACUUM completes), or it might not
+even be safe by the time the next VACUUM takes place. Recycle safety is
+purely a question of maintaining the consistency (or at least the apparent
+consistency) of a physical data structure. The state within the backend
+running VACUUM is simply not relevant.
+
+PostgreSQL 14 added the ability for VACUUM to consider if it's possible to
+recycle newly deleted pages at the end of the full index scan where the
+page deletion took place. It is convenient to check if it's safe at that
+point. This does require that VACUUM keep around a little bookkeeping
+information about newly deleted pages, but that's very cheap. Using
+in-memory state for this avoids the need to revisit newly deleted pages a
+second time later on -- we can just use safexid values from the local
+bookkeeping state to determine recycle safety in a deferred fashion.
+
+The need for additional FSM indirection after a page deletion operation
+takes place is a natural consequence of the highly permissive rules for
+index scans with Lehman and Yao's design. In general an index scan
+doesn't have to hold a lock or even a pin on any page when it descends the
+tree (nothing that you'd usually think of as an interlock is held "between
+levels"). At the same time, index scans cannot be allowed to land on a
+truly unrelated page due to concurrent recycling (not to be confused with
+concurrent deletion), because that results in wrong answers to queries.
+Simpler approaches to page deletion that don't need to defer recycling are
+possible, but none seem compatible with Lehman and Yao's design.
+
+Placing an already-deleted page in the FSM to be recycled when needed
+doesn't actually change the state of the page. The page will be changed
+whenever it is subsequently taken from the FSM for reuse. The deleted
+page's contents will be overwritten by the split operation (it will become
+the new right sibling page).
+
+Fastpath For Index Insertion
+----------------------------
+
+We optimize for a common case of insertion of increasing index key
+values by caching the last page to which this backend inserted the last
+value, if this page was the rightmost leaf page. For the next insert, we
+can then quickly check if the cached page is still the rightmost leaf
+page and also the correct place to hold the current value. We can avoid
+the cost of walking down the tree in such common cases.
+
+The optimization works on the assumption that there can only be one
+non-ignorable leaf rightmost page, and so not even a visible-to-everyone
+style interlock is required. We cannot fail to detect that our hint was
+invalidated, because there can only be one such page in the B-Tree at
+any time. It's possible that the page will be deleted and recycled
+without a backend's cached page also being detected as invalidated, but
+only when we happen to recycle a block that once again gets recycled as the
+rightmost leaf page.
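+
+A toy sketch of the check, with invented names (the real code caches a
+block number per backend and re-verifies everything only after locking
+that block):
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    typedef uint32_t ToyBlockNumber;
+    #define TOY_INVALID_BLOCK ((ToyBlockNumber) 0xFFFFFFFF)
+
+    typedef struct ToyCachedTarget
+    {
+        ToyBlockNumber cached_block;    /* per backend: last rightmost leaf used */
+    } ToyCachedTarget;
+
+    typedef struct ToyLeafState        /* as observed after locking the block */
+    {
+        bool rightmost;                /* still the rightmost leaf page? */
+        bool has_free_space;           /* room for the incoming tuple? */
+        int  last_key;                 /* largest key currently on the page */
+    } ToyLeafState;
+
+    /*
+     * Use the cached page only if, after locking it, it is still the
+     * rightmost leaf, has room, and the new key sorts after everything
+     * already on the page; otherwise descend from the (fast) root as usual.
+     */
+    static bool
+    toy_fastpath_ok(const ToyCachedTarget *cache, const ToyLeafState *page,
+                    int new_key)
+    {
+        if (cache->cached_block == TOY_INVALID_BLOCK)
+            return false;
+        return page->rightmost && page->has_free_space &&
+               new_key > page->last_key;
+    }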
+
+Simple deletion
+---------------
+
+If a process visits a heap tuple and finds that it's dead and removable
+(ie, dead to all open transactions, not only that process), then we can
+return to the index and mark the corresponding index entry "known dead",
+allowing subsequent index scans to skip visiting the heap tuple. The
+"known dead" marking works by setting the index item's lp_flags state
+to LP_DEAD. This is currently only done in plain indexscans, not bitmap
+scans, because only plain scans visit the heap and index "in sync" and so
+there's not a convenient way to do it for bitmap scans. Note also that
+LP_DEAD bits are often set when checking a unique index for conflicts on
+insert (this is simpler because it takes place when we hold an exclusive
+lock on the leaf page).
+
+Once an index tuple has been marked LP_DEAD it can actually be deleted
+from the index immediately; since index scans only stop "between" pages,
+no scan can lose its place from such a deletion. We separate the steps
+because we allow LP_DEAD to be set with only a share lock (it's exactly
+like a hint bit for a heap tuple), but physically removing tuples requires
+exclusive lock. Also, delaying the deletion often allows us to pick up
+extra index tuples that weren't initially safe for index scans to mark
+LP_DEAD. We do this with index tuples whose TIDs point to the same table
+blocks as an LP_DEAD-marked tuple. They're practically free to check in
+passing, and have a pretty good chance of being safe to delete due to
+various locality effects.
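+
+A toy sketch of how the candidate list might be built (invented names; the
+real code works with item pointers on the leaf page and hands the
+candidates to the table AM to decide what is actually deletable):
+
+    #include <stdbool.h>
+    #include <stddef.h>
+    #include <stdint.h>
+
+    typedef struct ToyLeafItem
+    {
+        uint32_t heap_block;    /* table block the item's TID points into */
+        bool     lp_dead;       /* LP_DEAD hint bit already set? */
+    } ToyLeafItem;
+
+    /*
+     * Build the candidate list for simple deletion: every LP_DEAD item, plus
+     * any other item whose TID points into a table block that some LP_DEAD
+     * item also points into.
+     */
+    static size_t
+    toy_collect_candidates(const ToyLeafItem *items, size_t nitems,
+                           size_t *candidates)      /* out: item indexes */
+    {
+        size_t ncandidates = 0;
+
+        for (size_t i = 0; i < nitems; i++)
+        {
+            bool promising = items[i].lp_dead;
+
+            for (size_t j = 0; !promising && j < nitems; j++)
+            {
+                if (items[j].lp_dead && items[j].heap_block == items[i].heap_block)
+                    promising = true;
+            }
+
+            if (promising)
+                candidates[ncandidates++] = i;
+        }
+
+        return ncandidates;
+    }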
+
+We only try to delete LP_DEAD tuples (and nearby tuples) when we are
+otherwise faced with having to split a page to do an insertion (and hence
+have exclusive lock on it already). Deduplication and bottom-up index
+deletion can also prevent a page split, but simple deletion is always our
+preferred approach. (Note that posting list tuples can only have their
+LP_DEAD bit set when every table TID within the posting list is known
+dead. This isn't much of a problem in practice because LP_DEAD bits are
+just a starting point for simple deletion -- we still manage to perform
+granular deletes of posting list TIDs quite often.)
+
+It's sufficient to have an exclusive lock on the index page, not a
+super-exclusive lock, to do deletion of LP_DEAD items. It might seem
+that this breaks the interlock between VACUUM and indexscans, but that is
+not so: as long as an indexscanning process has a pin on the page where
+the index item used to be, VACUUM cannot complete its btbulkdelete scan
+and so cannot remove the heap tuple. This is another reason why
+btbulkdelete has to get a super-exclusive lock on every leaf page, not only
+the ones where it actually sees items to delete.
+
+LP_DEAD setting by index scans cannot be sure that a TID whose index tuple
+it had planned on LP_DEAD-setting has not been recycled by VACUUM if it
+drops its pin in the meantime. It must conservatively also remember the
+LSN of the page, and only act to set LP_DEAD bits when the LSN has not
+changed at all. (Avoiding dropping the pin entirely also makes it safe, of
+course.)
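+
+In outline (invented names; the real code remembers the page's LSN
+alongside the TIDs that it intends to mark):
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    typedef uint64_t ToyLSN;
+
+    /*
+     * If the scan dropped its pin after reading the page, it may set LP_DEAD
+     * bits later only if the page's LSN is unchanged; in a WAL-logged index
+     * any modification advances the LSN, so an unchanged LSN means VACUUM
+     * cannot have removed and recycled the TIDs we remembered.
+     */
+    static bool
+    toy_safe_to_set_lp_dead(bool dropped_pin, ToyLSN lsn_at_read, ToyLSN lsn_now)
+    {
+        if (!dropped_pin)
+            return true;        /* pin held throughout: btbulkdelete was blocked */
+        return lsn_at_read == lsn_now;
+    }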
+
+Bottom-Up deletion
+------------------
+
+We attempt to delete whatever duplicates happen to be present on the page
+when the duplicates are suspected to be caused by version churn from
+successive UPDATEs. This only happens when we receive an executor hint
+indicating that optimizations like heapam's HOT have not worked out for
+the index -- the incoming tuple must be a logically unchanged duplicate
+which is needed for MVCC purposes, suggesting that version churn might well be the
+dominant source of new index tuples on the leaf page in question. (Also,
+bottom-up deletion is triggered within unique indexes in cases with
+continual INSERT and DELETE related churn, since that is easy to detect
+without any external hint.)
+
+Simple deletion will already have failed to prevent a page split when a
+bottom-up deletion pass takes place (often because no LP_DEAD bits were
+ever set on the page). The two mechanisms have closely related
+implementations. The same WAL records are used for each operation, and
+the same tableam infrastructure is used to determine what TIDs/tuples are
+actually safe to delete. The implementations only differ in how they pick
+TIDs to consider for deletion, and whether or not the tableam will give up
+before accessing all table blocks (bottom-up deletion lives with the
+uncertainty of its success by keeping the cost of failure low). Even
+still, the two mechanisms are clearly distinct at the conceptual level.
+
+Bottom-up index deletion is driven entirely by heuristics (whereas simple
+deletion is guaranteed to delete at least those index tuples that are
+already LP_DEAD marked -- there must be at least one). We have no
+certainty that we'll find even one index tuple to delete. That's why we
+closely cooperate with the tableam to keep the costs it pays in balance
+with the benefits we receive. The interface that we use for this is
+described in detail in access/tableam.h.
+
+Bottom-up index deletion can be thought of as a backstop mechanism against
+unnecessary version-driven page splits. It is based in part on an idea
+from generational garbage collection: the "generational hypothesis". This
+is the empirical observation that "most objects die young". Within
+nbtree, new index tuples often quickly appear in the same place, and then
+quickly become garbage. There can be intense concentrations of garbage in
+relatively few leaf pages with certain workloads (or there could be in
+earlier versions of PostgreSQL without bottom-up index deletion, at
+least). See doc/src/sgml/btree.sgml for a high-level description of the
+design principles behind bottom-up index deletion in nbtree, including
+details of how it complements VACUUM.
+
+We expect to find a reasonably large number of tuples that are safe to
+delete within each bottom-up pass. If we don't then we won't need to
+consider the question of bottom-up deletion for the same leaf page for
+quite a while (usually because the page splits, which resolves the
+situation for the time being). We expect to perform regular bottom-up
+deletion operations against pages that are at constant risk of unnecessary
+page splits caused only by version churn. When the mechanism works well
+we'll constantly be "on the verge" of having version-churn-driven page
+splits, but never actually have even one.
+
+Our duplicate heuristics work well despite being fairly simple.
+Unnecessary page splits only occur when there are truly pathological
+levels of version churn (in theory a small amount of version churn could
+make a page split occur earlier than strictly necessary, but that's pretty
+harmless). We don't have to understand the underlying workload; we only
+have to understand the general nature of the pathology that we target.
+Version churn is easy to spot when it is truly pathological. Affected
+leaf pages are fairly homogeneous.
+
+WAL Considerations
+------------------
+
+The insertion and deletion algorithms in themselves don't guarantee btree
+consistency after a crash. To provide robustness, we depend on WAL
+replay. A single WAL entry is effectively an atomic action --- we can
+redo it from the log if it fails to complete.
+
+Ordinary item insertions (that don't force a page split) are of course
+single WAL entries, since they only affect one page. The same for
+leaf-item deletions (if the deletion brings the leaf page to zero items,
+it is now a candidate to be deleted, but that is a separate action).
+
+An insertion that causes a page split is logged as a single WAL entry for
+the changes occurring on the insertion's level --- including update of the
+right sibling's left-link --- followed by a second WAL entry for the
+insertion on the parent level (which might itself be a page split, requiring
+an additional insertion above that, etc).
+
+For a root split, the follow-on WAL entry is a "new root" entry rather than
+an "insertion" entry, but details are otherwise much the same.
+
+Because splitting involves multiple atomic actions, it's possible that the
+system crashes between splitting a page and inserting the downlink for the
+new half to the parent. After recovery, the downlink for the new page will
+be missing. The search algorithm works correctly, as the page will be found
+by following the right-link from its left sibling, although if a lot of
+downlinks in the tree are missing, performance will suffer. A more serious
+consequence is that if the page without a downlink gets split again, the
+insertion algorithm will fail to find the location in the parent level to
+insert the downlink.
+
+Our approach is to create any missing downlinks on-the-fly, when searching
+the tree for a new insertion. It could be done during searches, too, but
+it seems best not to put any extra updates in what would otherwise be a
+read-only operation (updating is not possible in hot standby mode anyway).
+It would seem natural to add the missing downlinks in VACUUM, but since
+inserting a downlink might require splitting a page, it might fail if you
+run out of disk space. That would be bad during VACUUM - the reason for
+running VACUUM in the first place might be that you run out of disk space,
+and now VACUUM won't finish because you're out of disk space. In contrast,
+an insertion can require enlarging the physical file anyway. There is one
+minor exception: VACUUM finishes interrupted splits of internal pages when
+deleting their children. This allows the code for re-finding parent items
+to be used by both page splits and page deletion.
+
+To identify missing downlinks, when a page is split, the left page is
+flagged to indicate that the split is not yet complete (INCOMPLETE_SPLIT).
+When the downlink is inserted to the parent, the flag is cleared atomically
+with the insertion. The child page is kept locked until the insertion in
+the parent is finished and the flag in the child cleared, but can be
+released immediately after that, before recursing up the tree if the parent
+also needs to be split. This ensures that incompletely split pages should
+not be seen under normal circumstances; only if insertion to the parent
+has failed for some reason. (It's also possible for a reader to observe
+a page with the incomplete split flag set during recovery; see later
+section on "Scans during Recovery" for details.)
+
+We flag the left page, even though it's the right page that's missing the
+downlink, because it's more convenient to know already when following the
+right-link from the left page to the right page that it will need to have
+its downlink inserted to the parent.
+
+When splitting a non-root page that is alone on its level, the required
+metapage update (of the "fast root" link) is performed and logged as part
+of the insertion into the parent level. When splitting the root page, the
+metapage update is handled as part of the "new root" action.
+
+Each step in page deletion is logged as a separate WAL entry: marking the
+leaf as half-dead and removing the downlink is one record, and unlinking a
+page is a second record. If vacuum is interrupted for some reason, or the
+system crashes, the tree is consistent for searches and insertions. The
+next VACUUM will find the half-dead leaf page and continue the deletion.
+
+Before 9.4, we used to keep track of incomplete splits and page deletions
+during recovery and finish them immediately at end of recovery, instead of
+doing it lazily at the next insertion or vacuum. However, that made the
+recovery much more complicated, and only fixed the problem when crash
+recovery was performed. An incomplete split can also occur if an otherwise
+recoverable error, like out-of-memory or out-of-disk-space, happens while
+inserting the downlink to the parent.
+
+Scans during Recovery
+---------------------
+
+nbtree indexes support read queries in Hot Standby mode. Every atomic
+action/WAL record makes isolated changes that leave the tree in a
+consistent state for readers. Readers lock pages according to the same
+rules that readers follow on the primary. (Readers may have to move
+right to recover from a "concurrent" page split or page deletion, just
+like on the primary.)
+
+However, there are a couple of differences in how pages are locked by
+replay/the startup process as compared to the original write operation
+on the primary. The exceptions involve page splits and page deletions.
+The first phase and second phase of a page split are processed
+independently during replay, since they are independent atomic actions.
+We do not attempt to recreate the coupling of parent and child page
+write locks that took place on the primary. This is safe because readers
+never care about the incomplete split flag anyway. Holding on to an
+extra write lock on the primary is only necessary so that a second
+writer cannot observe the incomplete split flag before the first writer
+finishes the split. If we let concurrent writers on the primary observe
+an incomplete split flag on the same page, each writer would attempt to
+complete the unfinished split, corrupting the parent page. (Similarly,
+replay of page deletion records does not hold a write lock on the target
+leaf page throughout; only the primary needs to block out concurrent
+writers that insert on to the page being deleted.)
+
+WAL replay holds same-level locks in a way that matches the approach
+taken during original execution, though. This prevents readers from
+observing same-level inconsistencies. It's probably possible to be more
+lax about how same-level locks are acquired during recovery (most kinds
+of readers could still move right to recover if we didn't couple
+same-level locks), but we prefer to be conservative here.
+
+During recovery all index scans start with ignore_killed_tuples = false
+and we never set kill_prior_tuple. We do this because the oldest xmin
+on the standby server can be older than the oldest xmin on the primary
+server, which means tuples can be marked LP_DEAD even when they are
+still visible on the standby. We don't WAL log tuple LP_DEAD bits, but
+they can still appear in the standby because of full page writes. So
+we must always ignore them in standby, and that means it's not worth
+setting them either. (When LP_DEAD-marked tuples are eventually deleted
+on the primary, the deletion is WAL-logged. Queries that run on a
+standby therefore get much of the benefit of any LP_DEAD setting that
+takes place on the primary.)
+
+Note that we talk about scans that are started during recovery. We go to
+a little trouble to allow a scan to start during recovery and end during
+normal running after recovery has completed. This is a key capability
+because it allows running applications to continue while the standby
+changes state into a normally running server.
+
+The interlocking required to avoid returning incorrect results from
+non-MVCC scans is not required on standby nodes. We still get a
+super-exclusive lock ("cleanup lock") when replaying VACUUM records
+during recovery, but recovery does not need to lock every leaf page
+(only those leaf pages that have items to delete). That is safe because
+HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesSelf(),
+HeapTupleSatisfiesDirty() and HeapTupleSatisfiesVacuum() are only ever
+used during write transactions, which cannot exist on the standby. MVCC
+scans are already protected by definition, so HeapTupleSatisfiesMVCC()
+is not a problem. The optimizer looks at the boundaries of value ranges
+using HeapTupleSatisfiesNonVacuumable() with an index-only scan, which
+is also safe. That leaves concern only for HeapTupleSatisfiesToast().
+
+HeapTupleSatisfiesToast() doesn't use MVCC semantics, though that's
+because it doesn't need to - if the main heap row is visible then the
+toast rows will also be visible. So as long as we follow a toast
+pointer from a visible (live) tuple the corresponding toast rows
+will also be visible, so we do not need to recheck MVCC on them.
+
+Other Things That Are Handy to Know
+-----------------------------------
+
+Page zero of every btree is a meta-data page. This page stores the
+location of the root page --- both the true root and the current effective
+root ("fast" root). To avoid fetching the metapage for every single index
+search, we cache a copy of the meta-data information in the index's
+relcache entry (rd_amcache). This is a bit ticklish since using the cache
+implies following a root page pointer that could be stale. However, a
+backend following a cached pointer can sufficiently verify whether it
+reached the intended page; either by checking the is-root flag when it
+is going to the true root, or by checking that the page has no siblings
+when going to the fast root. At worst, this could result in descending
+some extra tree levels if we have a cached pointer to a fast root that is
+now above the real fast root. Such cases shouldn't arise often enough to
+be worth optimizing; and in any case we can expect a relcache flush will
+discard the cached metapage before long, since a VACUUM that's moved the
+fast root pointer can be expected to issue a statistics update for the
+index.
+
+The algorithm assumes we can fit at least three items per page
+(a "high key" and two real data items). Therefore it's unsafe
+to accept items larger than 1/3rd page size. Larger items would
+work sometimes, but could cause failures later on depending on
+what else gets put on their page.
+
+"ScanKey" data structures are used in two fundamentally different ways
+in this code, which we describe as "search" scankeys and "insertion"
+scankeys. A search scankey is the kind passed to btbeginscan() or
+btrescan() from outside the btree code. The sk_func pointers in a search
+scankey point to comparison functions that return boolean, such as int4lt.
+There might be more than one scankey entry for a given index column, or
+none at all. (We require the keys to appear in index column order, but
+the order of multiple keys for a given column is unspecified.) An
+insertion scankey ("BTScanInsert" data structure) uses a similar
+array-of-ScanKey data structure, but the sk_func pointers point to btree
+comparison support functions (ie, 3-way comparators that return int4 values
+interpreted as <0, =0, >0). In an insertion scankey there is at most one
+entry per index column. There is also other data about the rules used to
+locate where to begin the scan, such as whether or not the scan is a
+"nextkey" scan. Insertion scankeys are built within the btree code (eg, by
+_bt_mkscankey()) and are used to locate the starting point of a scan, as
+well as for locating the place to insert a new index tuple. (Note: in the
+case of an insertion scankey built from a search scankey or built from a
+truncated pivot tuple, there might be fewer keys than index columns,
+indicating that we have no constraints for the remaining index columns.)
+After we have located the starting point of a scan, the original search
+scankey is consulted as each index entry is sequentially scanned to decide
+whether to return the entry and whether the scan can stop (see
+_bt_checkkeys()).
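+
+The difference in comparator style can be shown with toy standalone
+functions; these are not the actual catalog functions, just illustrations
+of the two calling conventions:
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    /* Search-scankey style: a boolean operator, in the manner of int4lt */
+    static bool
+    toy_int4lt(int32_t a, int32_t b)
+    {
+        return a < b;
+    }
+
+    /* Insertion-scankey style: a 3-way comparison support function */
+    static int32_t
+    toy_int4cmp(int32_t a, int32_t b)
+    {
+        if (a < b)
+            return -1;
+        if (a > b)
+            return 1;
+        return 0;
+    }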
+
+Notes about suffix truncation
+-----------------------------
+
+We truncate away suffix key attributes that are not needed for a page high
+key during a leaf page split. The remaining attributes must distinguish
+the last index tuple on the post-split left page as belonging on the left
+page, and the first index tuple on the post-split right page as belonging
+on the right page. Tuples logically retain truncated key attributes,
+though they implicitly have "negative infinity" as their value, and have no
+storage overhead. Since the high key is subsequently reused as the
+downlink in the parent page for the new right page, suffix truncation makes
+pivot tuples short. INCLUDE indexes are guaranteed to have non-key
+attributes truncated at the time of a leaf page split, but may also have
+some key attributes truncated away, based on the usual criteria for key
+attributes. They are not a special case, since non-key attributes are
+merely payload to B-Tree searches.
+
+The goal of suffix truncation of key attributes is to improve index
+fan-out. The technique was first described by Bayer and Unterauer (R. Bayer
+and K. Unterauer, Prefix B-Trees, ACM Transactions on Database Systems, Vol
+2, No. 1, March 1977, pp 11-26). The Postgres implementation is loosely
+based on their paper. Note that Postgres only implements what the paper
+refers to as simple prefix B-Trees. Note also that the paper assumes that
+the tree has keys that consist of single strings that maintain the "prefix
+property", much like strings that are stored in a suffix tree (comparisons
+of earlier bytes must always be more significant than comparisons of later
+bytes, and, in general, the strings must compare in a way that doesn't
+break transitive consistency as they're split into pieces). Suffix
+truncation in Postgres currently only works at the whole-attribute
+granularity, but it would be straightforward to invent opclass
+infrastructure that manufactures a smaller attribute value in the case of
+variable-length types, such as text. An opclass support function could
+manufacture the shortest possible key value that still correctly separates
+each half of a leaf page split.
+
+There are sophisticated criteria for choosing a leaf page split point. The
+general idea is to make suffix truncation effective without unduly
+influencing the balance of space for each half of the page split. The
+choice of leaf split point can be thought of as a choice among points
+*between* items on the page to be split, at least if you pretend that the
+incoming tuple was placed on the page already (you have to pretend because
+there won't actually be enough space for it on the page). Choosing the
+split point between two index tuples where the first non-equal attribute
+appears as early as possible results in truncating away as many suffix
+attributes as possible. Evenly balancing space among each half of the
+split is usually the first concern, but even small adjustments in the
+precise split point can allow truncation to be far more effective.
+
+Suffix truncation is primarily valuable because it makes pivot tuples
+smaller, which delays splits of internal pages, but that isn't the only
+reason why it's effective. Even truncation that doesn't make pivot tuples
+smaller due to alignment still prevents pivot tuples from being more
+restrictive than truly necessary in how they describe which values belong
+on which pages.
+
+While it's not possible to correctly perform suffix truncation during
+internal page splits, it's still useful to be discriminating when splitting
+an internal page. We choose the split point whose implied downlink (to be
+inserted in the parent) is the smallest one available within an acceptable
+range of the fillfactor-wise optimal split point. This idea also comes
+from the Prefix B-Tree paper. This process has much in common with what
+happens at the leaf level to make suffix truncation effective. The overall
+effect is that suffix truncation tends to produce smaller, more
+discriminating pivot tuples, especially early in the lifetime of the index,
+while biasing internal page splits makes the earlier, smaller pivot tuples
+end up in the root page, delaying root page splits.
+
+Logical duplicates are given special consideration. The logic for
+selecting a split point goes to great lengths to avoid having duplicates
+span more than one page, and almost always manages to pick a split point
+between two user-key-distinct tuples, accepting a completely lopsided split
+if it must. When a page that's already full of duplicates must be split,
+the fallback strategy assumes that duplicates are mostly inserted in
+ascending heap TID order. The page is split in a way that leaves the left
+half of the page mostly full, and the right half of the page mostly empty.
+The overall effect is that leaf page splits gracefully adapt to inserts of
+large groups of duplicates, maximizing space utilization. Note also that
+"trapping" large groups of duplicates on the same leaf page like this makes
+deduplication more efficient. Deduplication can be performed infrequently,
+without merging together existing posting list tuples too often.
+
+Notes about deduplication
+-------------------------
+
+We deduplicate non-pivot tuples in non-unique indexes to reduce storage
+overhead, and to avoid (or at least delay) page splits. Note that the
+goals for deduplication in unique indexes are rather different; see later
+section for details. Deduplication alters the physical representation of
+tuples without changing the logical contents of the index, and without
+adding overhead to read queries. Non-pivot tuples are merged together
+into a single physical tuple with a posting list (a simple array of heap
+TIDs with the standard item pointer format). Deduplication is always
+applied lazily, at the point where it would otherwise be necessary to
+perform a page split. It occurs only when LP_DEAD items have been
+removed, as our last line of defense against splitting a leaf page
+(bottom-up index deletion may be attempted first, as our second-to-last line
+of defense). We can set the LP_DEAD bit with posting list tuples, though
+only when all TIDs are known dead.
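+
+A toy sketch of the posting list representation (invented names; real
+posting lists are arrays of ItemPointerData stored within an IndexTuple):
+
+    #include <stdint.h>
+    #include <stdlib.h>
+    #include <string.h>
+
+    /* Toy item pointer: table block plus line pointer offset */
+    typedef struct ToyTid
+    {
+        uint32_t block;
+        uint16_t offset;
+    } ToyTid;
+
+    /* Toy posting list tuple: one key value, many heap TIDs in sorted order */
+    typedef struct ToyPostingTuple
+    {
+        int32_t key;
+        int     ntids;
+        ToyTid  tids[];         /* flexible array member, ascending TID order */
+    } ToyPostingTuple;
+
+    /* Merge duplicates with the same key into a single posting list tuple. */
+    static ToyPostingTuple *
+    toy_make_posting_tuple(int32_t key, const ToyTid *tids, int ntids)
+    {
+        ToyPostingTuple *tup;
+
+        tup = malloc(sizeof(ToyPostingTuple) + ntids * sizeof(ToyTid));
+        if (tup == NULL)
+            return NULL;
+        tup->key = key;
+        tup->ntids = ntids;
+        memcpy(tup->tids, tids, ntids * sizeof(ToyTid));
+        return tup;
+    }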
+
+Our lazy approach to deduplication allows the page space accounting used
+during page splits to have absolutely minimal special case logic for
+posting lists. Posting lists can be thought of as extra payload that
+suffix truncation will reliably truncate away as needed during page
+splits, just like non-key columns from an INCLUDE index tuple.
+Incoming/new tuples can generally be treated as non-overlapping plain
+items (though see section on posting list splits for information about how
+overlapping new/incoming items are really handled).
+
+The representation of posting lists is almost identical to the posting
+lists used by GIN, so it would be straightforward to apply GIN's varbyte
+encoding compression scheme to individual posting lists. Posting list
+compression would break the assumptions made by posting list splits about
+page space accounting (see later section), so it's not clear how
+compression could be integrated with nbtree. Besides, posting list
+compression does not offer a compelling trade-off for nbtree, since in
+general nbtree is optimized for consistent performance with many
+concurrent readers and writers. Compression would also make the deletion
+of a subset of TIDs from a posting list slow and complicated, which would
+be a big problem for workloads that depend heavily on bottom-up index
+deletion.
+
+A major goal of our lazy approach to deduplication is to limit the
+performance impact of deduplication with random updates. Even concurrent
+append-only inserts of the same key value will tend to have inserts of
+individual index tuples in an order that doesn't quite match heap TID
+order. Delaying deduplication minimizes page level fragmentation.
+
+Deduplication in unique indexes
+-------------------------------
+
+Very often, the number of distinct values that can ever be placed on
+almost any given leaf page in a unique index is fixed and permanent. For
+example, a primary key on an identity column will usually only have leaf
+page splits caused by the insertion of new logical rows within the
+rightmost leaf page. If there is a split of a non-rightmost leaf page,
+then the split must have been triggered by inserts associated with UPDATEs
+of existing logical rows. Splitting a leaf page purely to store multiple
+versions is a false economy. In effect, we're permanently degrading the
+index structure just to absorb a temporary burst of duplicates.
+
+Deduplication in unique indexes helps to prevent these pathological page
+splits. Storing duplicates in a space efficient manner is not the goal,
+since in the long run there won't be any duplicates anyway. Rather, we're
+buying time for standard garbage collection mechanisms to run before a
+page split is needed.
+
+Unique index leaf pages only get a deduplication pass when an insertion
+(that might have to split the page) observed an existing duplicate on the
+page in passing. This is based on the assumption that deduplication will
+only work out when _all_ new insertions are duplicates from UPDATEs. This
+may mean that we miss an opportunity to delay a page split, but that's
+okay because our ultimate goal is to delay leaf page splits _indefinitely_
+(i.e. to prevent them altogether). There is little point in trying to
+delay a split that is probably inevitable anyway. This allows us to avoid
+the overhead of attempting to deduplicate with unique indexes that always
+have few or no duplicates.
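+
+A condensed sketch of the gating rule (illustrative control flow only; the
+real decision is made in nbtinsert.c's _bt_delete_or_dedup_one_page(), whose
+checkingunique and uniquedup flags are borrowed here):
+
+    if (checkingunique && !uniquedup)
+        return;                 /* no duplicate was seen on page: skip */
+
+    /* otherwise a deduplication pass may be worth the cost */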
+
+Note: Avoiding "unnecessary" page splits driven by version churn is also
+the goal of bottom-up index deletion, which was added to PostgreSQL 14.
+Bottom-up index deletion is now the preferred way to deal with this
+problem (with all kinds of indexes, though especially with unique
+indexes). Still, deduplication can sometimes augment bottom-up index
+deletion. When deletion cannot free tuples (due to an old snapshot
+holding up cleanup), falling back on deduplication provides additional
+capacity. Delaying the page split by deduplicating can allow a future
+bottom-up deletion pass of the same page to succeed.
+
+Posting list splits
+-------------------
+
+When the incoming tuple happens to overlap with an existing posting list,
+a posting list split is performed. Like a page split, a posting list
+split resolves a situation where a new/incoming item "won't fit", while
+inserting the incoming item in passing (i.e. as part of the same atomic
+action). It's possible (though not particularly likely) that an insert of
+a new item on to an almost-full page will overlap with a posting list,
+resulting in both a posting list split and a page split. Even then, the
+atomic action that splits the posting list also inserts the new item
+(since page splits always insert the new item in passing). Including the
+posting list split in the same atomic action as the insert avoids problems
+caused by concurrent inserts into the same posting list -- the exact
+details of how we change the posting list depend upon the new item, and
+vice-versa. A single atomic action also minimizes the volume of extra
+WAL required for a posting list split, since we don't have to explicitly
+WAL-log the original posting list tuple.
+
+Despite piggy-backing on the same atomic action that inserts a new tuple,
+posting list splits can be thought of as a separate, extra action to the
+insert itself (or to the page split itself). Posting list splits
+conceptually "rewrite" an insert that overlaps with an existing posting
+list into an insert that adds its final new item just to the right of the
+posting list instead. The size of the posting list won't change, and so
+page space accounting code does not need to care about posting list splits
+at all. This is an important upside of our design; the page split point
+choice logic is very subtle even without it needing to deal with posting
+list splits.
+
+Only a few isolated extra steps are required to preserve the illusion that
+the new item never overlapped with an existing posting list in the first
+place: the incoming tuple has its heap TID replaced with the rightmost/max
+heap TID from the existing/originally overlapping posting list. Similarly,
+the original incoming item's TID is relocated to the
+appropriate offset in the posting list (we usually shift TIDs out of the
+way to make a hole for it). Finally, the posting-split-with-page-split
+case must generate a new high key based on an imaginary version of the
+original page that has both the final new item and the after-list-split
+posting tuple (page splits usually just operate against an imaginary
+version that contains the new item/item that won't fit).
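+
+As a schematic example (the actual code is _bt_swap_posting() in
+nbtdedup.c): suppose the overlapping posting list holds TIDs t0..t4 in
+ascending order, and the incoming tuple's heap TID tn falls between t1 and
+t2.  After the posting list split:
+
+    before:  posting list = [t0 t1 t2 t3 t4]    incoming tuple TID = tn
+    after:   posting list = [t0 t1 tn t2 t3]    incoming tuple TID = t4
+
+The rewritten posting list stays exactly the same size, and the item that is
+actually inserted (now carrying t4, the old maximum) sorts just to the right
+of the posting list, as described above.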
+
+This approach avoids inventing an "eager" atomic posting split operation
+that splits the posting list without simultaneously finishing the insert
+of the incoming item. This alternative design might seem cleaner, but it
+creates subtle problems for page space accounting. In general, there
+might not be enough free space on the page to split a posting list such
+that the incoming/new item no longer overlaps with either posting list
+half --- the operation could fail before the actual retail insert of the
+new item even begins. We'd end up having to handle posting list splits
+that need a page split anyway. Besides, supporting variable "split points"
+while splitting posting lists won't actually improve overall space
+utilization.
+
+Notes About Data Representation
+-------------------------------
+
+The right-sibling link required by L&Y is kept in the page "opaque
+data" area, as is the left-sibling link, the page level, and some flags.
+The page level counts upwards from zero at the leaf level, to the tree
+depth minus 1 at the root. (Counting up from the leaves ensures that we
+don't need to renumber any existing pages when splitting the root.)
+
+The Postgres disk block data format (an array of items) doesn't fit
+Lehman and Yao's alternating-keys-and-pointers notion of a disk page,
+so we have to play some games. (The alternating-keys-and-pointers
+notion is important for internal page splits, which conceptually split
+at the middle of an existing pivot tuple -- the tuple's "separator" key
+goes on the left side of the split as the left side's new high key,
+while the tuple's pointer/downlink goes on the right side as the
+first/minus infinity downlink.)
+
+On a page that is not rightmost in its tree level, the "high key" is
+kept in the page's first item, and real data items start at item 2.
+The link portion of the "high key" item goes unused. A page that is
+rightmost has no "high key" (it's implicitly positive infinity), so
+data items start with the first item. Putting the high key at the
+left, rather than the right, may seem odd, but it avoids moving the
+high key as we add data items.
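+
+In code terms, routines in this directory find the first data item with the
+P_FIRSTDATAKEY() macro, which (per access/nbtree.h) amounts to roughly the
+following:
+
+    OffsetNumber    off;
+    OffsetNumber    firstdata;
+
+    /* offset 1 holds the high key, except on the rightmost page */
+    firstdata = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+    for (off = firstdata; off <= PageGetMaxOffsetNumber(page); off++)
+    {
+        /* each 'off' here is a real data item (never the high key) */
+    }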
+
+On a leaf page, the data items are simply links to (TIDs of) tuples
+in the relation being indexed, with the associated key values.
+
+On a non-leaf page, the data items are down-links to child pages with
+bounding keys. The key in each data item is a strict lower bound for
+keys on that child page, so logically the key is to the left of that
+downlink. The high key (if present) is the upper bound for the last
+downlink. The first data item on each such page has no lower bound
+--- or lower bound of minus infinity, if you prefer. The comparison
+routines must treat it accordingly. The actual key stored in the
+item is irrelevant, and need not be stored at all. This arrangement
+corresponds to the fact that an L&Y non-leaf page has one more pointer
+than key. Suffix truncation's negative infinity attributes behave in
+the same way.
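+
+A sketch of how that special case can be handled by a comparison routine (in
+the spirit of the logic in _bt_compare() in nbtsearch.c):
+
+    /* First data item on an internal page compares as minus infinity */
+    if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
+        return 1;       /* every scan key is greater than minus infinity */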
diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c
new file mode 100644
index 0000000..7ac73cb
--- /dev/null
+++ b/src/backend/access/nbtree/nbtcompare.c
@@ -0,0 +1,335 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtcompare.c
+ * Comparison functions for btree access method.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtcompare.c
+ *
+ * NOTES
+ *
+ * These functions are stored in pg_amproc. For each operator class
+ * defined on btrees, they compute
+ *
+ * compare(a, b):
+ * < 0 if a < b,
+ * = 0 if a == b,
+ * > 0 if a > b.
+ *
+ * The result is always an int32 regardless of the input datatype.
+ *
+ * Although any negative int32 is acceptable for reporting "<",
+ * and any positive int32 is acceptable for reporting ">", routines
+ * that work on 32-bit or wider datatypes can't just return "a - b".
+ * That could overflow and give the wrong answer.
+ *
+ * NOTE: it is critical that the comparison function impose a total order
+ * on all non-NULL values of the data type, and that the datatype's
+ * boolean comparison operators (= < >= etc) yield results consistent
+ * with the comparison routine. Otherwise bad behavior may ensue.
+ * (For example, the comparison operators must NOT punt when faced with
+ * NAN or other funny values; you must devise some collation sequence for
+ * all such values.) If the datatype is not trivial, this is most
+ * reliably done by having the boolean operators invoke the same
+ * three-way comparison code that the btree function does. Therefore,
+ * this file contains only btree support for "trivial" datatypes ---
+ * all others are in the /utils/adt/ files that implement their datatypes.
+ *
+ * NOTE: these routines must not leak memory, since memory allocated
+ * during an index access won't be recovered till end of query. This
+ * primarily affects comparison routines for toastable datatypes;
+ * they have to be careful to free any detoasted copy of an input datum.
+ *
+ * NOTE: we used to forbid comparison functions from returning INT_MIN,
+ * but that proves to be too error-prone because some platforms' versions
+ * of memcmp() etc can return INT_MIN. As a means of stress-testing
+ * callers, this file can be compiled with STRESS_SORT_INT_MIN defined
+ * to cause many of these functions to return INT_MIN or INT_MAX instead of
+ * their customary -1/+1. For production, though, that's not a good idea
+ * since users or third-party code might expect the traditional results.
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "utils/builtins.h"
+#include "utils/sortsupport.h"
+
+#ifdef STRESS_SORT_INT_MIN
+#define A_LESS_THAN_B INT_MIN
+#define A_GREATER_THAN_B INT_MAX
+#else
+#define A_LESS_THAN_B (-1)
+#define A_GREATER_THAN_B 1
+#endif
+
+
+Datum
+btboolcmp(PG_FUNCTION_ARGS)
+{
+ bool a = PG_GETARG_BOOL(0);
+ bool b = PG_GETARG_BOOL(1);
+
+ PG_RETURN_INT32((int32) a - (int32) b);
+}
+
+Datum
+btint2cmp(PG_FUNCTION_ARGS)
+{
+ int16 a = PG_GETARG_INT16(0);
+ int16 b = PG_GETARG_INT16(1);
+
+ PG_RETURN_INT32((int32) a - (int32) b);
+}
+
+static int
+btint2fastcmp(Datum x, Datum y, SortSupport ssup)
+{
+ int16 a = DatumGetInt16(x);
+ int16 b = DatumGetInt16(y);
+
+ return (int) a - (int) b;
+}
+
+Datum
+btint2sortsupport(PG_FUNCTION_ARGS)
+{
+ SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
+
+ ssup->comparator = btint2fastcmp;
+ PG_RETURN_VOID();
+}
+
+Datum
+btint4cmp(PG_FUNCTION_ARGS)
+{
+ int32 a = PG_GETARG_INT32(0);
+ int32 b = PG_GETARG_INT32(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+static int
+btint4fastcmp(Datum x, Datum y, SortSupport ssup)
+{
+ int32 a = DatumGetInt32(x);
+ int32 b = DatumGetInt32(y);
+
+ if (a > b)
+ return A_GREATER_THAN_B;
+ else if (a == b)
+ return 0;
+ else
+ return A_LESS_THAN_B;
+}
+
+Datum
+btint4sortsupport(PG_FUNCTION_ARGS)
+{
+ SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
+
+ ssup->comparator = btint4fastcmp;
+ PG_RETURN_VOID();
+}
+
+Datum
+btint8cmp(PG_FUNCTION_ARGS)
+{
+ int64 a = PG_GETARG_INT64(0);
+ int64 b = PG_GETARG_INT64(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+static int
+btint8fastcmp(Datum x, Datum y, SortSupport ssup)
+{
+ int64 a = DatumGetInt64(x);
+ int64 b = DatumGetInt64(y);
+
+ if (a > b)
+ return A_GREATER_THAN_B;
+ else if (a == b)
+ return 0;
+ else
+ return A_LESS_THAN_B;
+}
+
+Datum
+btint8sortsupport(PG_FUNCTION_ARGS)
+{
+ SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
+
+ ssup->comparator = btint8fastcmp;
+ PG_RETURN_VOID();
+}
+
+Datum
+btint48cmp(PG_FUNCTION_ARGS)
+{
+ int32 a = PG_GETARG_INT32(0);
+ int64 b = PG_GETARG_INT64(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+Datum
+btint84cmp(PG_FUNCTION_ARGS)
+{
+ int64 a = PG_GETARG_INT64(0);
+ int32 b = PG_GETARG_INT32(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+Datum
+btint24cmp(PG_FUNCTION_ARGS)
+{
+ int16 a = PG_GETARG_INT16(0);
+ int32 b = PG_GETARG_INT32(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+Datum
+btint42cmp(PG_FUNCTION_ARGS)
+{
+ int32 a = PG_GETARG_INT32(0);
+ int16 b = PG_GETARG_INT16(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+Datum
+btint28cmp(PG_FUNCTION_ARGS)
+{
+ int16 a = PG_GETARG_INT16(0);
+ int64 b = PG_GETARG_INT64(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+Datum
+btint82cmp(PG_FUNCTION_ARGS)
+{
+ int64 a = PG_GETARG_INT64(0);
+ int16 b = PG_GETARG_INT16(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+Datum
+btoidcmp(PG_FUNCTION_ARGS)
+{
+ Oid a = PG_GETARG_OID(0);
+ Oid b = PG_GETARG_OID(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+static int
+btoidfastcmp(Datum x, Datum y, SortSupport ssup)
+{
+ Oid a = DatumGetObjectId(x);
+ Oid b = DatumGetObjectId(y);
+
+ if (a > b)
+ return A_GREATER_THAN_B;
+ else if (a == b)
+ return 0;
+ else
+ return A_LESS_THAN_B;
+}
+
+Datum
+btoidsortsupport(PG_FUNCTION_ARGS)
+{
+ SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
+
+ ssup->comparator = btoidfastcmp;
+ PG_RETURN_VOID();
+}
+
+Datum
+btoidvectorcmp(PG_FUNCTION_ARGS)
+{
+ oidvector *a = (oidvector *) PG_GETARG_POINTER(0);
+ oidvector *b = (oidvector *) PG_GETARG_POINTER(1);
+ int i;
+
+ /* We arbitrarily choose to sort first by vector length */
+ if (a->dim1 != b->dim1)
+ PG_RETURN_INT32(a->dim1 - b->dim1);
+
+ for (i = 0; i < a->dim1; i++)
+ {
+ if (a->values[i] != b->values[i])
+ {
+ if (a->values[i] > b->values[i])
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+ }
+ }
+ PG_RETURN_INT32(0);
+}
+
+Datum
+btcharcmp(PG_FUNCTION_ARGS)
+{
+ char a = PG_GETARG_CHAR(0);
+ char b = PG_GETARG_CHAR(1);
+
+ /* Be careful to compare chars as unsigned */
+ PG_RETURN_INT32((int32) ((uint8) a) - (int32) ((uint8) b));
+}
diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c
new file mode 100644
index 0000000..1cd1b59
--- /dev/null
+++ b/src/backend/access/nbtree/nbtdedup.c
@@ -0,0 +1,1098 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtdedup.c
+ * Deduplicate or bottom-up delete items in Postgres btrees.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtdedup.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/nbtxlog.h"
+#include "miscadmin.h"
+#include "utils/rel.h"
+
+static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
+ TM_IndexDeleteOp *delstate);
+static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
+ OffsetNumber minoff, IndexTuple newitem);
+static void _bt_singleval_fillfactor(Page page, BTDedupState state,
+ Size newitemsz);
+#ifdef USE_ASSERT_CHECKING
+static bool _bt_posting_valid(IndexTuple posting);
+#endif
+
+/*
+ * Perform a deduplication pass.
+ *
+ * The general approach taken here is to perform as much deduplication as
+ * possible to free as much space as possible. Note, however, that "single
+ * value" strategy is used for !bottomupdedup callers when the page is full of
+ * tuples of a single value. Deduplication passes that apply the strategy
+ * will leave behind a few untouched tuples at the end of the page, preparing
+ * the page for an anticipated page split that uses nbtsplitloc.c's own single
+ * value strategy. Our high level goal is to delay merging the untouched
+ * tuples until after the page splits.
+ *
+ * When a call to _bt_bottomupdel_pass() just took place (and failed), our
+ * high level goal is to prevent a page split entirely by buying more time.
+ * We still hope that a page split can be avoided altogether. That's why
+ * single value strategy is not even considered for bottomupdedup callers.
+ *
+ * The page will have to be split if we cannot successfully free at least
+ * newitemsz (we also need space for newitem's line pointer, which isn't
+ * included in caller's newitemsz).
+ *
+ * Note: Caller should have already deleted all existing items with their
+ * LP_DEAD bits set.
+ */
+void
+_bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem,
+ Size newitemsz, bool bottomupdedup)
+{
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ Page page = BufferGetPage(buf);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Page newpage;
+ BTDedupState state;
+ Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
+ bool singlevalstrat = false;
+ int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+
+ /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
+ newitemsz += sizeof(ItemIdData);
+
+ /*
+ * Initialize deduplication state.
+ *
+ * It would be possible for maxpostingsize (limit on posting list tuple
+ * size) to be set to one third of the page. However, it seems like a
+ * good idea to limit the size of posting lists to one sixth of a page.
+ * That ought to leave us with a good split point when pages full of
+ * duplicates can be split several times.
+ */
+ state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+ state->deduplicate = true;
+ state->nmaxitems = 0;
+ state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
+ /* Metadata about base tuple of current pending posting list */
+ state->base = NULL;
+ state->baseoff = InvalidOffsetNumber;
+ state->basetupsize = 0;
+ /* Metadata about current pending posting list TIDs */
+ state->htids = palloc(state->maxpostingsize);
+ state->nhtids = 0;
+ state->nitems = 0;
+ /* Size of all physical tuples to be replaced by pending posting list */
+ state->phystupsize = 0;
+ /* nintervals should be initialized to zero */
+ state->nintervals = 0;
+
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * Consider applying "single value" strategy, though only if the page
+ * seems likely to be split in the near future
+ */
+ if (!bottomupdedup)
+ singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
+
+ /*
+ * Deduplicate items from page, and write them to newpage.
+ *
+ * Copy the original page's LSN into newpage copy. This will become the
+ * updated version of the page. We need this because XLogInsert will
+ * examine the LSN and possibly dump it in a page image.
+ */
+ newpage = PageGetTempPageCopySpecial(page);
+ PageSetLSN(newpage, PageGetLSN(page));
+
+ /* Copy high key, if any */
+ if (!P_RIGHTMOST(opaque))
+ {
+ ItemId hitemid = PageGetItemId(page, P_HIKEY);
+ Size hitemsz = ItemIdGetLength(hitemid);
+ IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
+
+ if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add highkey");
+ }
+
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+
+ Assert(!ItemIdIsDead(itemid));
+
+ if (offnum == minoff)
+ {
+ /*
+ * No previous/base tuple for the data item -- use the data item
+ * as base tuple of pending posting list
+ */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ else if (state->deduplicate &&
+ _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
+ _bt_dedup_save_htid(state, itup))
+ {
+ /*
+ * Tuple is equal to base tuple of pending posting list. Heap
+ * TID(s) for itup have been saved in state.
+ */
+ }
+ else
+ {
+ /*
+ * Tuple is not equal to pending posting list tuple, or
+ * _bt_dedup_save_htid() opted to not merge current item into
+ * pending posting list for some other reason (e.g., adding more
+ * TIDs would have caused posting list to exceed current
+ * maxpostingsize).
+ *
+ * If state contains pending posting list with more than one item,
+ * form new posting tuple, and actually update the page. Else
+ * reset the state and move on without modifying the page.
+ */
+ pagesaving += _bt_dedup_finish_pending(newpage, state);
+
+ if (singlevalstrat)
+ {
+ /*
+ * Single value strategy's extra steps.
+ *
+ * Lower maxpostingsize for sixth and final large posting list
+ * tuple at the point where 5 maxpostingsize-capped tuples
+ * have either been formed or observed.
+ *
+ * When a sixth maxpostingsize-capped item is formed/observed,
+ * stop merging together tuples altogether. The few tuples
+ * that remain at the end of the page won't be merged together
+ * at all (at least not until after a future page split takes
+ * place).
+ */
+ if (state->nmaxitems == 5)
+ _bt_singleval_fillfactor(page, state, newitemsz);
+ else if (state->nmaxitems == 6)
+ {
+ state->deduplicate = false;
+ singlevalstrat = false; /* won't be back here */
+ }
+ }
+
+ /* itup starts new pending posting list */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ }
+
+ /* Handle the last item */
+ pagesaving += _bt_dedup_finish_pending(newpage, state);
+
+ /*
+ * If no items suitable for deduplication were found, newpage must be
+ * exactly the same as the original page, so just return from function.
+ *
+ * We could determine whether or not to proceed on the basis the space
+ * savings being sufficient to avoid an immediate page split instead. We
+ * don't do that because there is some small value in nbtsplitloc.c always
+ * operating against a page that is fully deduplicated (apart from
+ * newitem). Besides, most of the cost has already been paid.
+ */
+ if (state->nintervals == 0)
+ {
+ /* cannot leak memory here */
+ pfree(newpage);
+ pfree(state->htids);
+ pfree(state);
+ return;
+ }
+
+ /*
+ * By here, it's clear that deduplication will definitely go ahead.
+ *
+ * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
+ * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
+ * But keep things tidy.
+ */
+ if (P_HAS_GARBAGE(opaque))
+ {
+ BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
+
+ nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+ }
+
+ START_CRIT_SECTION();
+
+ PageRestoreTempPage(newpage, page);
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+ xl_btree_dedup xlrec_dedup;
+
+ xlrec_dedup.nintervals = state->nintervals;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
+
+ /*
+ * The intervals array is not in the buffer, but pretend that it is.
+ * When XLogInsert stores the whole buffer, the array need not be
+ * stored too.
+ */
+ XLogRegisterBufData(0, (char *) state->intervals,
+ state->nintervals * sizeof(BTDedupInterval));
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
+
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* Local space accounting should agree with page accounting */
+ Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
+
+ /* cannot leak memory here */
+ pfree(state->htids);
+ pfree(state);
+}
+
+/*
+ * Perform bottom-up index deletion pass.
+ *
+ * See if duplicate index tuples (plus certain nearby tuples) are eligible to
+ * be deleted via bottom-up index deletion. The high level goal here is to
+ * entirely prevent "unnecessary" page splits caused by MVCC version churn
+ * from UPDATEs (when the UPDATEs don't logically modify any of the columns
+ * covered by the 'rel' index). This is qualitative, not quantitative -- we
+ * do not particularly care about once-off opportunities to delete many index
+ * tuples together.
+ *
+ * See nbtree/README for details on the design of nbtree bottom-up deletion.
+ * See access/tableam.h for a description of how we're expected to cooperate
+ * with the tableam.
+ *
+ * Returns true on success, in which case caller can assume page split will be
+ * avoided for a reasonable amount of time. Returns false when caller should
+ * deduplicate the page (if possible at all).
+ *
+ * Note: Occasionally we return true despite failing to delete enough items to
+ * avoid a split. This makes caller skip deduplication and go split the page
+ * right away. Our return value is always just advisory information.
+ *
+ * Note: Caller should have already deleted all existing items with their
+ * LP_DEAD bits set.
+ */
+bool
+_bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
+ Size newitemsz)
+{
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ Page page = BufferGetPage(buf);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ BTDedupState state;
+ TM_IndexDeleteOp delstate;
+ bool neverdedup;
+ int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+
+ /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
+ newitemsz += sizeof(ItemIdData);
+
+ /* Initialize deduplication state */
+ state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+ state->deduplicate = true;
+ state->nmaxitems = 0;
+ state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
+ state->base = NULL;
+ state->baseoff = InvalidOffsetNumber;
+ state->basetupsize = 0;
+ state->htids = palloc(state->maxpostingsize);
+ state->nhtids = 0;
+ state->nitems = 0;
+ state->phystupsize = 0;
+ state->nintervals = 0;
+
+ /*
+ * Initialize tableam state that describes bottom-up index deletion
+ * operation.
+ *
+ * We'll go on to ask the tableam to search for TIDs whose index tuples we
+ * can safely delete. The tableam will search until our leaf page space
+ * target is satisfied, or until the cost of continuing with the tableam
+ * operation seems too high. It focuses its efforts on TIDs associated
+ * with duplicate index tuples that we mark "promising".
+ *
+ * This space target is a little arbitrary. The tableam must be able to
+ * keep the costs and benefits in balance. We provide the tableam with
+ * exhaustive information about what might work, without directly
+ * concerning ourselves with avoiding work during the tableam call. Our
+ * role in costing the bottom-up deletion process is strictly advisory.
+ */
+ delstate.bottomup = true;
+ delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
+ delstate.ndeltids = 0;
+ delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
+ delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
+
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+
+ Assert(!ItemIdIsDead(itemid));
+
+ if (offnum == minoff)
+ {
+ /* itup starts first pending interval */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
+ _bt_dedup_save_htid(state, itup))
+ {
+ /* Tuple is equal; just added its TIDs to pending interval */
+ }
+ else
+ {
+ /* Finalize interval -- move its TIDs to delete state */
+ _bt_bottomupdel_finish_pending(page, state, &delstate);
+
+ /* itup starts new pending interval */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ }
+ /* Finalize final interval -- move its TIDs to delete state */
+ _bt_bottomupdel_finish_pending(page, state, &delstate);
+
+ /*
+ * We don't give up now in the event of having few (or even zero)
+ * promising tuples for the tableam because it's not up to us as the index
+ * AM to manage costs (note that the tableam might have heuristics of its
+ * own that work out what to do). We should at least avoid having our
+ * caller do a useless deduplication pass after we return in the event of
+ * zero promising tuples, though.
+ */
+ neverdedup = false;
+ if (state->nintervals == 0)
+ neverdedup = true;
+
+ pfree(state->htids);
+ pfree(state);
+
+ /* Ask tableam which TIDs are deletable, then physically delete them */
+ _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
+
+ pfree(delstate.deltids);
+ pfree(delstate.status);
+
+ /* Report "success" to caller unconditionally to avoid deduplication */
+ if (neverdedup)
+ return true;
+
+ /* Don't dedup when we won't end up back here any time soon anyway */
+ return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
+}
+
+/*
+ * Create a new pending posting list tuple based on caller's base tuple.
+ *
+ * Every tuple processed by deduplication either becomes the base tuple for a
+ * posting list, or gets its heap TID(s) accepted into a pending posting list.
+ * A tuple that starts out as the base tuple for a posting list will only
+ * actually be rewritten within _bt_dedup_finish_pending() when it turns out
+ * that there are duplicates that can be merged into the base tuple.
+ */
+void
+_bt_dedup_start_pending(BTDedupState state, IndexTuple base,
+ OffsetNumber baseoff)
+{
+ Assert(state->nhtids == 0);
+ Assert(state->nitems == 0);
+ Assert(!BTreeTupleIsPivot(base));
+
+ /*
+ * Copy heap TID(s) from new base tuple for new candidate posting list
+ * into working state's array
+ */
+ if (!BTreeTupleIsPosting(base))
+ {
+ memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
+ state->nhtids = 1;
+ state->basetupsize = IndexTupleSize(base);
+ }
+ else
+ {
+ int nposting;
+
+ nposting = BTreeTupleGetNPosting(base);
+ memcpy(state->htids, BTreeTupleGetPosting(base),
+ sizeof(ItemPointerData) * nposting);
+ state->nhtids = nposting;
+ /* basetupsize should not include existing posting list */
+ state->basetupsize = BTreeTupleGetPostingOffset(base);
+ }
+
+ /*
+ * Save new base tuple itself -- it'll be needed if we actually create a
+ * new posting list from new pending posting list.
+ *
+ * Must maintain physical size of all existing tuples (including line
+ * pointer overhead) so that we can calculate space savings on page.
+ */
+ state->nitems = 1;
+ state->base = base;
+ state->baseoff = baseoff;
+ state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
+ /* Also save baseoff in pending state for interval */
+ state->intervals[state->nintervals].baseoff = state->baseoff;
+}
+
+/*
+ * Save itup heap TID(s) into pending posting list where possible.
+ *
+ * Returns bool indicating if the pending posting list managed by state now
+ * includes itup's heap TID(s).
+ */
+bool
+_bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
+{
+ int nhtids;
+ ItemPointer htids;
+ Size mergedtupsz;
+
+ Assert(!BTreeTupleIsPivot(itup));
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ nhtids = 1;
+ htids = &itup->t_tid;
+ }
+ else
+ {
+ nhtids = BTreeTupleGetNPosting(itup);
+ htids = BTreeTupleGetPosting(itup);
+ }
+
+ /*
+ * Don't append (have caller finish pending posting list as-is) if
+ * appending heap TID(s) from itup would put us over maxpostingsize limit.
+ *
+ * This calculation needs to match the code used within _bt_form_posting()
+ * for new posting list tuples.
+ */
+ mergedtupsz = MAXALIGN(state->basetupsize +
+ (state->nhtids + nhtids) * sizeof(ItemPointerData));
+
+ if (mergedtupsz > state->maxpostingsize)
+ {
+ /*
+ * Count this as an oversized item for single value strategy, though
+ * only when there are 50 TIDs in the final posting list tuple. This
+ * limit (which is fairly arbitrary) avoids confusion about how many
+ * 1/6 of a page tuples have been encountered/created by the current
+ * deduplication pass.
+ *
+ * Note: We deliberately don't consider which deduplication pass
+ * merged together tuples to create this item (could be a previous
+ * deduplication pass, or current pass). See _bt_do_singleval()
+ * comments.
+ */
+ if (state->nhtids > 50)
+ state->nmaxitems++;
+
+ return false;
+ }
+
+ /*
+ * Save heap TIDs to pending posting list tuple -- itup can be merged into
+ * pending posting list
+ */
+ state->nitems++;
+ memcpy(state->htids + state->nhtids, htids,
+ sizeof(ItemPointerData) * nhtids);
+ state->nhtids += nhtids;
+ state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
+
+ return true;
+}
+
+/*
+ * Finalize pending posting list tuple, and add it to the page. Final tuple
+ * is based on saved base tuple, and saved list of heap TIDs.
+ *
+ * Returns space saving from deduplicating to make a new posting list tuple.
+ * Note that this includes line pointer overhead. This is zero in the case
+ * where no deduplication was possible.
+ */
+Size
+_bt_dedup_finish_pending(Page newpage, BTDedupState state)
+{
+ OffsetNumber tupoff;
+ Size tuplesz;
+ Size spacesaving;
+
+ Assert(state->nitems > 0);
+ Assert(state->nitems <= state->nhtids);
+ Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
+
+ tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
+ if (state->nitems == 1)
+ {
+ /* Use original, unchanged base tuple */
+ tuplesz = IndexTupleSize(state->base);
+ if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add tuple to page");
+
+ spacesaving = 0;
+ }
+ else
+ {
+ IndexTuple final;
+
+ /* Form a tuple with a posting list */
+ final = _bt_form_posting(state->base, state->htids, state->nhtids);
+ tuplesz = IndexTupleSize(final);
+ Assert(tuplesz <= state->maxpostingsize);
+
+ /* Save final number of items for posting list */
+ state->intervals[state->nintervals].nitems = state->nitems;
+
+ Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
+ if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
+ false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add tuple to page");
+
+ pfree(final);
+ spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
+ /* Increment nintervals, since we wrote a new posting list tuple */
+ state->nintervals++;
+ Assert(spacesaving > 0 && spacesaving < BLCKSZ);
+ }
+
+ /* Reset state for next pending posting list */
+ state->nhtids = 0;
+ state->nitems = 0;
+ state->phystupsize = 0;
+
+ return spacesaving;
+}
+
+/*
+ * Finalize interval during bottom-up index deletion.
+ *
+ * During a bottom-up pass we expect that TIDs will be recorded in dedup state
+ * first, and then get moved over to delstate (in variable-sized batches) by
+ * calling here. Call here happens when the number of TIDs in a dedup
+ * interval is known, and interval gets finalized (i.e. when caller sees next
+ * tuple on the page is not a duplicate, or when caller runs out of tuples to
+ * process from leaf page).
+ *
+ * This is where bottom-up deletion determines and remembers which entries are
+ * duplicates. This will be important information to the tableam delete
+ * infrastructure later on. Plain index tuple duplicates are marked
+ * "promising" here, per tableam contract.
+ *
+ * Our approach to marking entries whose TIDs come from posting lists is more
+ * complicated. Posting lists can only be formed by a deduplication pass (or
+ * during an index build), so recent version churn affecting the pointed-to
+ * logical rows is not particularly likely. We may still give a weak signal
+ * about posting list tuples' entries (by marking just one of its TIDs/entries
+ * promising), though this is only a possibility in the event of further
+ * duplicate index tuples in final interval that covers posting list tuple (as
+ * in the plain tuple case). A weak signal/hint will be useful to the tableam
+ * when it has no stronger signal to go with for the deletion operation as a
+ * whole.
+ *
+ * The heuristics we use work well in practice because we only need to give
+ * the tableam the right _general_ idea about where to look. Garbage tends to
+ * naturally get concentrated in relatively few table blocks with workloads
+ * that bottom-up deletion targets. The tableam cannot possibly rank all
+ * available table blocks sensibly based on the hints we provide, but that's
+ * okay -- only the extremes matter. The tableam just needs to be able to
+ * predict which few table blocks will have the most tuples that are safe to
+ * delete for each deletion operation, with low variance across related
+ * deletion operations.
+ */
+static void
+_bt_bottomupdel_finish_pending(Page page, BTDedupState state,
+ TM_IndexDeleteOp *delstate)
+{
+ bool dupinterval = (state->nitems > 1);
+
+ Assert(state->nitems > 0);
+ Assert(state->nitems <= state->nhtids);
+ Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
+
+ for (int i = 0; i < state->nitems; i++)
+ {
+ OffsetNumber offnum = state->baseoff + i;
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+ TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
+ TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Simple case: A plain non-pivot tuple */
+ ideltid->tid = itup->t_tid;
+ ideltid->id = delstate->ndeltids;
+ istatus->idxoffnum = offnum;
+ istatus->knowndeletable = false; /* for now */
+ istatus->promising = dupinterval; /* simple rule */
+ istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);
+
+ delstate->ndeltids++;
+ }
+ else
+ {
+ /*
+ * Complicated case: A posting list tuple.
+ *
+ * We make the conservative assumption that there can only be at
+ * most one affected logical row per posting list tuple. There
+ * will be at most one promising entry in deltids to represent
+ * this presumed lone logical row. Note that this isn't even
+ * considered unless the posting list tuple is also in an interval
+ * of duplicates -- this complicated rule is just a variant of the
+ * simple rule used to decide if plain index tuples are promising.
+ */
+ int nitem = BTreeTupleGetNPosting(itup);
+ bool firstpromising = false;
+ bool lastpromising = false;
+
+ Assert(_bt_posting_valid(itup));
+
+ if (dupinterval)
+ {
+ /*
+ * Complicated rule: either the first or last TID in the
+ * posting list gets marked promising (if any at all)
+ */
+ BlockNumber minblocklist,
+ midblocklist,
+ maxblocklist;
+ ItemPointer mintid,
+ midtid,
+ maxtid;
+
+ mintid = BTreeTupleGetHeapTID(itup);
+ midtid = BTreeTupleGetPostingN(itup, nitem / 2);
+ maxtid = BTreeTupleGetMaxHeapTID(itup);
+ minblocklist = ItemPointerGetBlockNumber(mintid);
+ midblocklist = ItemPointerGetBlockNumber(midtid);
+ maxblocklist = ItemPointerGetBlockNumber(maxtid);
+
+ /* Only entry with predominant table block can be promising */
+ firstpromising = (minblocklist == midblocklist);
+ lastpromising = (!firstpromising &&
+ midblocklist == maxblocklist);
+ }
+
+ for (int p = 0; p < nitem; p++)
+ {
+ ItemPointer htid = BTreeTupleGetPostingN(itup, p);
+
+ ideltid->tid = *htid;
+ ideltid->id = delstate->ndeltids;
+ istatus->idxoffnum = offnum;
+ istatus->knowndeletable = false; /* for now */
+ istatus->promising = false;
+ if ((firstpromising && p == 0) ||
+ (lastpromising && p == nitem - 1))
+ istatus->promising = true;
+ istatus->freespace = sizeof(ItemPointerData); /* at worst */
+
+ ideltid++;
+ istatus++;
+ delstate->ndeltids++;
+ }
+ }
+ }
+
+ if (dupinterval)
+ {
+ state->intervals[state->nintervals].nitems = state->nitems;
+ state->nintervals++;
+ }
+
+ /* Reset state for next interval */
+ state->nhtids = 0;
+ state->nitems = 0;
+ state->phystupsize = 0;
+}
+
+/*
+ * Determine if page non-pivot tuples (data items) are all duplicates of the
+ * same value -- if they are, deduplication's "single value" strategy should
+ * be applied. The general goal of this strategy is to ensure that
+ * nbtsplitloc.c (which uses its own single value strategy) will find a useful
+ * split point as further duplicates are inserted, and successive rightmost
+ * page splits occur among pages that store the same duplicate value. When
+ * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
+ * just like it would if deduplication were disabled.
+ *
+ * We expect that affected workloads will require _several_ single value
+ * strategy deduplication passes (over a page that only stores duplicates)
+ * before the page is finally split. The first deduplication pass should only
+ * find regular non-pivot tuples. Later deduplication passes will find
+ * existing maxpostingsize-capped posting list tuples, which must be skipped
+ * over. The penultimate pass is generally the first pass that actually
+ * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
+ * few untouched non-pivot tuples. The final deduplication pass won't free
+ * any space -- it will skip over everything without merging anything (it
+ * retraces the steps of the penultimate pass).
+ *
+ * Fortunately, having several passes isn't too expensive. Each pass (after
+ * the first pass) won't spend many cycles on the large posting list tuples
+ * left by previous passes. Each pass will find a large contiguous group of
+ * smaller duplicate tuples to merge together at the end of the page.
+ */
+static bool
+_bt_do_singleval(Relation rel, Page page, BTDedupState state,
+ OffsetNumber minoff, IndexTuple newitem)
+{
+ int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+ ItemId itemid;
+ IndexTuple itup;
+
+ itemid = PageGetItemId(page, minoff);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+
+ if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
+ {
+ itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
+ itup = (IndexTuple) PageGetItem(page, itemid);
+
+ if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
+ * and final maxpostingsize-capped tuple. The sixth and final posting list
+ * tuple will end up somewhat smaller than the first five. (Note: The first
+ * five tuples could actually just be very large duplicate tuples that
+ * couldn't be merged together at all. Deduplication will simply not modify
+ * the page when that happens.)
+ *
+ * When there are six posting lists on the page (after current deduplication
+ * pass goes on to create/observe a sixth very large tuple), caller should end
+ * its deduplication pass. It isn't useful to try to deduplicate items that
+ * are supposed to end up on the new right sibling page following the
+ * anticipated page split. A future deduplication pass of future right
+ * sibling page might take care of it. (This is why the first single value
+ * strategy deduplication pass for a given leaf page will generally find only
+ * plain non-pivot tuples -- see _bt_do_singleval() comments.)
+ */
+static void
+_bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
+{
+ Size leftfree;
+ int reduction;
+
+ /* This calculation needs to match nbtsplitloc.c */
+ leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
+ MAXALIGN(sizeof(BTPageOpaqueData));
+ /* Subtract size of new high key (includes pivot heap TID space) */
+ leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
+
+ /*
+ * Reduce maxpostingsize by an amount equal to target free space on left
+ * half of page
+ */
+ reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
+ if (state->maxpostingsize > reduction)
+ state->maxpostingsize -= reduction;
+ else
+ state->maxpostingsize = 0;
+}
+
+/*
+ * Build a posting list tuple based on caller's "base" index tuple and list of
+ * heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a
+ * posting list. (Posting list tuples can never have a single heap TID, partly
+ * because that ensures that deduplication always reduces final MAXALIGN()'d
+ * size of entire tuple.)
+ *
+ * Convention is that posting list starts at a MAXALIGN()'d offset (rather
+ * than a SHORTALIGN()'d offset), in line with the approach taken when
+ * appending a heap TID to new pivot tuple/high key during suffix truncation.
+ * This sometimes wastes a little space that was only needed as alignment
+ * padding in the original tuple. Following this convention simplifies the
+ * space accounting used when deduplicating a page (the same convention
+ * simplifies the accounting for choosing a point to split a page at).
+ *
+ * Note: Caller's "htids" array must be unique and already in ascending TID
+ * order. Any existing heap TIDs from "base" won't automatically appear in
+ * returned posting list tuple (they must be included in htids array.)
+ */
+IndexTuple
+_bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
+{
+ uint32 keysize,
+ newsize;
+ IndexTuple itup;
+
+ if (BTreeTupleIsPosting(base))
+ keysize = BTreeTupleGetPostingOffset(base);
+ else
+ keysize = IndexTupleSize(base);
+
+ Assert(!BTreeTupleIsPivot(base));
+ Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
+ Assert(keysize == MAXALIGN(keysize));
+
+ /* Determine final size of new tuple */
+ if (nhtids > 1)
+ newsize = MAXALIGN(keysize +
+ nhtids * sizeof(ItemPointerData));
+ else
+ newsize = keysize;
+
+ Assert(newsize <= INDEX_SIZE_MASK);
+ Assert(newsize == MAXALIGN(newsize));
+
+ /* Allocate memory using palloc0() (matches index_form_tuple()) */
+ itup = palloc0(newsize);
+ memcpy(itup, base, keysize);
+ itup->t_info &= ~INDEX_SIZE_MASK;
+ itup->t_info |= newsize;
+ if (nhtids > 1)
+ {
+ /* Form posting list tuple */
+ BTreeTupleSetPosting(itup, nhtids, keysize);
+ memcpy(BTreeTupleGetPosting(itup), htids,
+ sizeof(ItemPointerData) * nhtids);
+ Assert(_bt_posting_valid(itup));
+ }
+ else
+ {
+ /* Form standard non-pivot tuple */
+ itup->t_info &= ~INDEX_ALT_TID_MASK;
+ ItemPointerCopy(htids, &itup->t_tid);
+ Assert(ItemPointerIsValid(&itup->t_tid));
+ }
+
+ return itup;
+}
+
+/*
+ * Generate a replacement tuple by "updating" a posting list tuple so that it
+ * no longer has TIDs that need to be deleted.
+ *
+ * Used by both VACUUM and index deletion. Caller's vacposting argument
+ * points to the existing posting list tuple to be updated.
+ *
+ * On return, caller's vacposting argument will point to final "updated"
+ * tuple, which will be palloc()'d in caller's memory context.
+ */
+void
+_bt_update_posting(BTVacuumPosting vacposting)
+{
+ IndexTuple origtuple = vacposting->itup;
+ uint32 keysize,
+ newsize;
+ IndexTuple itup;
+ int nhtids;
+ int ui,
+ d;
+ ItemPointer htids;
+
+ nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
+
+ Assert(_bt_posting_valid(origtuple));
+ Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
+
+ /*
+ * Determine final size of new tuple.
+ *
+ * This calculation needs to match the code used within _bt_form_posting()
+ * for new posting list tuples. We avoid calling _bt_form_posting() here
+ * to save ourselves a second memory allocation for a htids workspace.
+ */
+ keysize = BTreeTupleGetPostingOffset(origtuple);
+ if (nhtids > 1)
+ newsize = MAXALIGN(keysize +
+ nhtids * sizeof(ItemPointerData));
+ else
+ newsize = keysize;
+
+ Assert(newsize <= INDEX_SIZE_MASK);
+ Assert(newsize == MAXALIGN(newsize));
+
+ /* Allocate memory using palloc0() (matches index_form_tuple()) */
+ itup = palloc0(newsize);
+ memcpy(itup, origtuple, keysize);
+ itup->t_info &= ~INDEX_SIZE_MASK;
+ itup->t_info |= newsize;
+
+ if (nhtids > 1)
+ {
+ /* Form posting list tuple */
+ BTreeTupleSetPosting(itup, nhtids, keysize);
+ htids = BTreeTupleGetPosting(itup);
+ }
+ else
+ {
+ /* Form standard non-pivot tuple */
+ itup->t_info &= ~INDEX_ALT_TID_MASK;
+ htids = &itup->t_tid;
+ }
+
+ ui = 0;
+ d = 0;
+ for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
+ {
+ if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
+ {
+ d++;
+ continue;
+ }
+ htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
+ }
+ Assert(ui == nhtids);
+ Assert(d == vacposting->ndeletedtids);
+ Assert(nhtids == 1 || _bt_posting_valid(itup));
+ Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
+
+ /* vacposting arg's itup will now point to updated version */
+ vacposting->itup = itup;
+}
+
+/*
+ * Prepare for a posting list split by swapping heap TID in newitem with heap
+ * TID from original posting list (the 'oposting' heap TID located at offset
+ * 'postingoff'). Modifies newitem, so caller should pass their own private
+ * copy that can safely be modified.
+ *
+ * Returns new posting list tuple, which is palloc()'d in caller's context.
+ * This is guaranteed to be the same size as 'oposting'. Modified newitem is
+ * what caller actually inserts. (This happens inside the same critical
+ * section that performs an in-place update of old posting list using new
+ * posting list returned here.)
+ *
+ * While the keys from newitem and oposting must be opclass equal, and must
+ * generate identical output when run through the underlying type's output
+ * function, it doesn't follow that their representations match exactly.
+ * Caller must avoid assuming that there can't be representational differences
+ * that make datums from oposting bigger or smaller than the corresponding
+ * datums from newitem. For example, differences in TOAST input state might
+ * break a faulty assumption about tuple size (the executor is entitled to
+ * apply TOAST compression based on its own criteria). It also seems possible
+ * that further representational variation will be introduced in the future,
+ * in order to support nbtree features like page-level prefix compression.
+ *
+ * See nbtree/README for details on the design of posting list splits.
+ */
+IndexTuple
+_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
+{
+ int nhtids;
+ char *replacepos;
+ char *replaceposright;
+ Size nmovebytes;
+ IndexTuple nposting;
+
+ nhtids = BTreeTupleGetNPosting(oposting);
+ Assert(_bt_posting_valid(oposting));
+
+ /*
+ * The postingoff argument originated as a _bt_binsrch_posting() return
+ * value. It will be 0 in the event of corruption that makes a leaf page
+ * contain a non-pivot tuple that's somehow identical to newitem (no two
+ * non-pivot tuples should ever have the same TID). This has been known
+ * to happen in the field from time to time.
+ *
+ * Perform a basic sanity check to catch this case now.
+ */
+ if (!(postingoff > 0 && postingoff < nhtids))
+ elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
+ nhtids, postingoff);
+
+ /*
+ * Move item pointers in posting list to make a gap for the new item's
+ * heap TID. We shift TIDs one place to the right, losing original
+ * rightmost TID. (nmovebytes must not include TIDs to the left of
+ * postingoff, nor the existing rightmost/max TID that gets overwritten.)
+ */
+ nposting = CopyIndexTuple(oposting);
+ replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
+ replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
+ nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
+ memmove(replaceposright, replacepos, nmovebytes);
+
+ /* Fill the gap at postingoff with TID of new item (original new TID) */
+ Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
+ ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
+
+ /* Now copy oposting's rightmost/max TID into new item (final new TID) */
+ ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
+
+ Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
+ BTreeTupleGetHeapTID(newitem)) < 0);
+ Assert(_bt_posting_valid(nposting));
+
+ return nposting;
+}
+
+/*
+ * Verify posting list invariants for "posting", which must be a posting list
+ * tuple. Used within assertions.
+ */
+#ifdef USE_ASSERT_CHECKING
+static bool
+_bt_posting_valid(IndexTuple posting)
+{
+ ItemPointerData last;
+ ItemPointer htid;
+
+ if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
+ return false;
+
+ /* Remember first heap TID for loop */
+ ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
+ if (!ItemPointerIsValid(&last))
+ return false;
+
+ /* Iterate, starting from second TID */
+ for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
+ {
+ htid = BTreeTupleGetPostingN(posting, i);
+
+ if (!ItemPointerIsValid(htid))
+ return false;
+ if (ItemPointerCompare(htid, &last) <= 0)
+ return false;
+ ItemPointerCopy(htid, &last);
+ }
+
+ return true;
+}
+#endif
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
new file mode 100644
index 0000000..1241c56
--- /dev/null
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -0,0 +1,3009 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtinsert.c
+ * Item insertion in Lehman and Yao btrees for Postgres.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtinsert.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/nbtxlog.h"
+#include "access/transam.h"
+#include "access/xloginsert.h"
+#include "lib/qunique.h"
+#include "miscadmin.h"
+#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "storage/smgr.h"
+
+/* Minimum tree height for application of fastpath optimization */
+#define BTREE_FASTPATH_MIN_LEVEL 2
+
+
+static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate);
+static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
+ Relation heapRel,
+ IndexUniqueCheck checkUnique, bool *is_unique,
+ uint32 *speculativeToken);
+static OffsetNumber _bt_findinsertloc(Relation rel,
+ BTInsertState insertstate,
+ bool checkingunique,
+ bool indexUnchanged,
+ BTStack stack,
+ Relation heapRel);
+static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack);
+static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
+ Buffer buf,
+ Buffer cbuf,
+ BTStack stack,
+ IndexTuple itup,
+ Size itemsz,
+ OffsetNumber newitemoff,
+ int postingoff,
+ bool split_only_page);
+static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
+ Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
+ IndexTuple newitem, IndexTuple orignewitem,
+ IndexTuple nposting, uint16 postingoff);
+static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
+ BTStack stack, bool isroot, bool isonly);
+static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
+static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
+ OffsetNumber itup_off, bool newfirstdataitem);
+static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
+ BTInsertState insertstate,
+ bool simpleonly, bool checkingunique,
+ bool uniquedup, bool indexUnchanged);
+static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
+ OffsetNumber *deletable, int ndeletable,
+ IndexTuple newitem, OffsetNumber minoff,
+ OffsetNumber maxoff);
+static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
+ int ndeletable, IndexTuple newitem,
+ int *nblocks);
+static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
+
+/*
+ * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
+ *
+ * This routine is called by the public interface routine, btinsert.
+ * By here, itup is filled in, including the TID.
+ *
+ * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
+ * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
+ * UNIQUE_CHECK_EXISTING) it will throw an error for a duplicate.
+ * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
+ * don't actually insert.
+ *
+ * The indexUnchanged executor hint indicates whether itup is from an
+ * UPDATE that didn't logically change the indexed value, but
+ * must nevertheless have a new entry to point to a successor
+ * version.
+ *
+ * The result value is only significant for UNIQUE_CHECK_PARTIAL:
+ * it must be true if the entry is known unique, else false.
+ * (In the current implementation we'll also return true after a
+ * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
+ * that's just a coding artifact.)
+ */
+bool
+_bt_doinsert(Relation rel, IndexTuple itup,
+ IndexUniqueCheck checkUnique, bool indexUnchanged,
+ Relation heapRel)
+{
+ bool is_unique = false;
+ BTInsertStateData insertstate;
+ BTScanInsert itup_key;
+ BTStack stack;
+ bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
+
+ /* we need an insertion scan key to do our search, so build one */
+ itup_key = _bt_mkscankey(rel, itup);
+
+ if (checkingunique)
+ {
+ if (!itup_key->anynullkeys)
+ {
+ /* No (heapkeyspace) scantid until uniqueness established */
+ itup_key->scantid = NULL;
+ }
+ else
+ {
+ /*
+ * Scan key for new tuple contains NULL key values. Bypass
+ * checkingunique steps. They are unnecessary because core code
+ * considers NULL unequal to every value, including NULL.
+ *
+ * This optimization avoids O(N^2) behavior within the
+ * _bt_findinsertloc() heapkeyspace path when a unique index has a
+ * large number of "duplicates" with NULL key values.
+ */
+ checkingunique = false;
+ /* Tuple is unique in the sense that core code cares about */
+ Assert(checkUnique != UNIQUE_CHECK_EXISTING);
+ is_unique = true;
+ }
+ }
+
+ /*
+ * Fill in the BTInsertState working area, to track the current page and
+ * position within the page to insert on.
+ *
+ * Note that itemsz is passed down to lower level code that deals with
+ * inserting the item. It must be MAXALIGN()'d. This ensures that space
+ * accounting code consistently considers the alignment overhead that we
+ * expect PageAddItem() will add later. (Actually, index_form_tuple() is
+ * already conservative about alignment, but we don't rely on that from
+ * this distance. Besides, preserving the "true" tuple size in index
+ * tuple headers for the benefit of nbtsplitloc.c might happen someday.
+ * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
+ */
+ insertstate.itup = itup;
+ insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
+ insertstate.itup_key = itup_key;
+ insertstate.bounds_valid = false;
+ insertstate.buf = InvalidBuffer;
+ insertstate.postingoff = 0;
+
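+ /* We jump back here to retry after waiting out a conflicting transaction */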
+search:
+
+ /*
+ * Find and lock the leaf page that the tuple should be added to by
+ * searching from the root page. insertstate.buf will hold a buffer that
+ * is locked in exclusive mode afterwards.
+ */
+ stack = _bt_search_insert(rel, &insertstate);
+
+ /*
+ * checkingunique inserts are not allowed to go ahead when two tuples with
+ * equal key attribute values would be visible to new MVCC snapshots once
+ * the xact commits. Check for conflicts in the locked page/buffer (if
+ * needed) here.
+ *
+ * It might be necessary to check a page to the right in _bt_check_unique,
+ * though that should be very rare. In practice the first page the value
+ * could be on (with scantid omitted) is almost always also the only page
+ * that a matching tuple might be found on. This is due to the behavior
+ * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
+ * only be allowed to cross a page boundary when there is no candidate
+ * leaf page split point that avoids it. Also, _bt_check_unique can use
+ * the leaf page high key to determine that there will be no duplicates on
+ * the right sibling without actually visiting it (it uses the high key in
+ * cases where the new item happens to belong at the far right of the leaf
+ * page).
+ *
+ * NOTE: obviously, _bt_check_unique can only detect keys that are already
+ * in the index; so it cannot defend against concurrent insertions of the
+ * same key. We protect against that by means of holding a write lock on
+ * the first page the value could be on, with omitted/-inf value for the
+ * implicit heap TID tiebreaker attribute. Any other would-be inserter of
+ * the same key must acquire a write lock on the same page, so only one
+ * would-be inserter can be making the check at one time. Furthermore,
+ * once we are past the check we hold write locks continuously until we
+ * have performed our insertion, so no later inserter can fail to see our
+ * insertion. (This requires some care in _bt_findinsertloc.)
+ *
+ * If we must wait for another xact, we release the lock while waiting,
+ * and then must perform a new search.
+ *
+ * For a partial uniqueness check, we don't wait for the other xact. Just
+ * let the tuple in and return false for possibly non-unique, or true for
+ * definitely unique.
+ */
+ if (checkingunique)
+ {
+ TransactionId xwait;
+ uint32 speculativeToken;
+
+ xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
+ &is_unique, &speculativeToken);
+
+ if (unlikely(TransactionIdIsValid(xwait)))
+ {
+ /* Have to wait for the other guy ... */
+ _bt_relbuf(rel, insertstate.buf);
+ insertstate.buf = InvalidBuffer;
+
+ /*
+ * If it's a speculative insertion, wait for it to finish (ie. to
+ * go ahead with the insertion, or kill the tuple). Otherwise
+ * wait for the transaction to finish as usual.
+ */
+ if (speculativeToken)
+ SpeculativeInsertionWait(xwait, speculativeToken);
+ else
+ XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
+
+ /* start over... */
+ if (stack)
+ _bt_freestack(stack);
+ goto search;
+ }
+
+ /* Uniqueness is established -- restore heap tid as scantid */
+ if (itup_key->heapkeyspace)
+ itup_key->scantid = &itup->t_tid;
+ }
+
+ if (checkUnique != UNIQUE_CHECK_EXISTING)
+ {
+ OffsetNumber newitemoff;
+
+ /*
+ * The only conflict predicate locking cares about for indexes is when
+ * an index tuple insert conflicts with an existing lock. We don't
+ * know the actual page we're going to insert on for sure just yet in
+ * checkingunique and !heapkeyspace cases, but it's okay to use the
+ * first page the value could be on (with scantid omitted) instead.
+ */
+ CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
+
+ /*
+ * Do the insertion. Note that insertstate contains cached binary
+ * search bounds established within _bt_check_unique when insertion is
+ * checkingunique.
+ */
+ newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
+ indexUnchanged, stack, heapRel);
+ _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
+ itup, insertstate.itemsz, newitemoff,
+ insertstate.postingoff, false);
+ }
+ else
+ {
+ /* just release the buffer */
+ _bt_relbuf(rel, insertstate.buf);
+ }
+
+ /* be tidy */
+ if (stack)
+ _bt_freestack(stack);
+ pfree(itup_key);
+
+ return is_unique;
+}
+
+/*
+ * _bt_search_insert() -- _bt_search() wrapper for inserts
+ *
+ * Search the tree for a particular scankey, or more precisely for the first
+ * leaf page it could be on. Try to make use of the fastpath optimization's
+ * rightmost leaf page cache before actually searching the tree from the root
+ * page, though.
+ *
+ * Return value is a stack of parent-page pointers (though see notes about
+ * fastpath optimization and page splits below). insertstate->buf is set to
+ * the address of the leaf-page buffer, which is write-locked and pinned in
+ * all cases (if necessary by creating a new empty root page for caller).
+ *
+ * The fastpath optimization avoids most of the work of searching the tree
+ * repeatedly when a single backend inserts successive new tuples on the
+ * rightmost leaf page of an index. A backend cache of the rightmost leaf
+ * page is maintained within _bt_insertonpg(), and used here. The cache is
+ * invalidated here when an insert of a non-pivot tuple must take place on a
+ * non-rightmost leaf page.
+ *
+ * The optimization helps with indexes on an auto-incremented field. It also
+ * helps with indexes on datetime columns, as well as indexes with lots of
+ * NULL values. (NULLs usually get inserted in the rightmost page for single
+ * column indexes, since they usually get treated as coming after everything
+ * else in the key space. Individual NULL tuples will generally be placed on
+ * the rightmost leaf page due to the influence of the heap TID column.)
+ *
+ * Note that we avoid applying the optimization when there is insufficient
+ * space on the rightmost page to fit caller's new item. This is necessary
+ * because we'll need to return a real descent stack when a page split is
+ * expected (actually, caller can cope with a leaf page split that uses a NULL
+ * stack, but that's very slow and so must be avoided). Note also that the
+ * fastpath optimization acquires the lock on the page conditionally as a way
+ * of reducing extra contention when there are concurrent insertions into the
+ * rightmost page (we give up if we'd have to wait for the lock). We assume
+ * that it isn't useful to apply the optimization when there is contention,
+ * since each per-backend cache won't stay valid for long.
+ */
+static BTStack
+_bt_search_insert(Relation rel, BTInsertState insertstate)
+{
+ Assert(insertstate->buf == InvalidBuffer);
+ Assert(!insertstate->bounds_valid);
+ Assert(insertstate->postingoff == 0);
+
+ if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
+ {
+ /* Simulate a _bt_getbuf() call with conditional locking */
+ insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
+ if (_bt_conditionallockbuf(rel, insertstate->buf))
+ {
+ Page page;
+ BTPageOpaque opaque;
+
+ _bt_checkpage(rel, insertstate->buf);
+ page = BufferGetPage(insertstate->buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * Check if the page is still the rightmost leaf page and has
+ * enough free space to accommodate the new tuple. Also check
+ * that the insertion scan key is strictly greater than the first
+ * non-pivot tuple on the page. (Note that we expect itup_key's
+ * scantid to be unset when our caller is a checkingunique
+ * inserter.)
+ */
+ if (P_RIGHTMOST(opaque) &&
+ P_ISLEAF(opaque) &&
+ !P_IGNORE(opaque) &&
+ PageGetFreeSpace(page) > insertstate->itemsz &&
+ PageGetMaxOffsetNumber(page) >= P_HIKEY &&
+ _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
+ {
+ /*
+ * Caller can use the fastpath optimization because cached
+ * block is still rightmost leaf page, which can fit caller's
+ * new tuple without splitting. Keep block in local cache for
+ * next insert, and have caller use NULL stack.
+ *
+ * Note that _bt_insert_parent() has an assertion that catches
+ * leaf page splits that somehow follow from a fastpath insert
+ * (it should only be passed a NULL stack when it must deal
+ * with a concurrent root page split, and never because a NULL
+ * stack was returned here).
+ */
+ return NULL;
+ }
+
+ /* Page unsuitable for caller, drop lock and pin */
+ _bt_relbuf(rel, insertstate->buf);
+ }
+ else
+ {
+ /* Lock unavailable, drop pin */
+ ReleaseBuffer(insertstate->buf);
+ }
+
+ /* Forget block, since cache doesn't appear to be useful */
+ RelationSetTargetBlock(rel, InvalidBlockNumber);
+ }
+
+ /* Cannot use optimization -- descend tree, return proper descent stack */
+ return _bt_search(rel, insertstate->itup_key, &insertstate->buf, BT_WRITE,
+ NULL);
+}
+
+/*
+ * _bt_check_unique() -- Check for violation of unique index constraint
+ *
+ * Returns InvalidTransactionId if there is no conflict, else an xact ID
+ * we must wait for to see if it commits a conflicting tuple. If an actual
+ * conflict is detected, no return --- just ereport(). If an xact ID is
+ * returned, and the conflicting tuple still has a speculative insertion in
+ * progress, *speculativeToken is set to non-zero, and the caller can wait for
+ * the verdict on the insertion using SpeculativeInsertionWait().
+ *
+ * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
+ * InvalidTransactionId because we don't want to wait. In this case we
+ * set *is_unique to false if there is a potential conflict, and the
+ * core code must redo the uniqueness check later.
+ *
+ * As a side-effect, sets state in insertstate that can later be used by
+ * _bt_findinsertloc() to reuse most of the binary search work we do
+ * here.
+ *
+ * Do not call here when there are NULL values in scan key. NULL should be
+ * considered unequal to NULL when checking for duplicates, but we are not
+ * prepared to handle that correctly.
+ */
+static TransactionId
+_bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
+ IndexUniqueCheck checkUnique, bool *is_unique,
+ uint32 *speculativeToken)
+{
+ IndexTuple itup = insertstate->itup;
+ IndexTuple curitup = NULL;
+ ItemId curitemid = NULL;
+ BTScanInsert itup_key = insertstate->itup_key;
+ SnapshotData SnapshotDirty;
+ OffsetNumber offset;
+ OffsetNumber maxoff;
+ Page page;
+ BTPageOpaque opaque;
+ Buffer nbuf = InvalidBuffer;
+ bool found = false;
+ bool inposting = false;
+ bool prevalldead = true;
+ int curposti = 0;
+
+ /* Assume unique until we find a duplicate */
+ *is_unique = true;
+
+ InitDirtySnapshot(SnapshotDirty);
+
+ page = BufferGetPage(insertstate->buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * Find the first tuple with the same key.
+ *
+ * This also saves the binary search bounds in insertstate. We use them
+ * in the fastpath below, but also in the _bt_findinsertloc() call later.
+ */
+ Assert(!insertstate->bounds_valid);
+ offset = _bt_binsrch_insert(rel, insertstate);
+
+ /*
+ * Scan over all equal tuples, looking for live conflicts.
+ */
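+ /* (_bt_doinsert() already ruled out NULL key columns and unset scantid) */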
+ Assert(!insertstate->bounds_valid || insertstate->low == offset);
+ Assert(!itup_key->anynullkeys);
+ Assert(itup_key->scantid == NULL);
+ for (;;)
+ {
+ /*
+ * Each iteration of the loop processes one heap TID, not one index
+ * tuple. Current offset number for page isn't usually advanced on
+ * iterations that process heap TIDs from posting list tuples.
+ *
+ * "inposting" state is set when _inside_ a posting list --- not when
+ * we're at the start (or end) of a posting list. We advance curposti
+ * at the end of the iteration when inside a posting list tuple. In
+ * general, every loop iteration either advances the page offset or
+ * advances curposti --- an iteration that handles the rightmost/max
+ * heap TID in a posting list finally advances the page offset (and
+ * unsets "inposting").
+ *
+ * Make sure the offset points to an actual index tuple before trying
+ * to examine it...
+ */
+ if (offset <= maxoff)
+ {
+ /*
+ * Fastpath: In most cases, we can use cached search bounds to
+ * limit our consideration to items that are definitely
+ * duplicates. This fastpath doesn't apply when the original page
+ * is empty, or when initial offset is past the end of the
+ * original page, which may indicate that we need to examine a
+ * second or subsequent page.
+ *
+ * Note that this optimization allows us to avoid calling
+ * _bt_compare() directly when there are no duplicates, as long as
+ * the offset where the key will go is not at the end of the page.
+ */
+ if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
+ {
+ Assert(insertstate->bounds_valid);
+ Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
+ Assert(insertstate->low <= insertstate->stricthigh);
+ Assert(_bt_compare(rel, itup_key, page, offset) < 0);
+ break;
+ }
+
+ /*
+ * We can skip items that are already marked killed.
+ *
+ * In the presence of heavy update activity an index may contain
+ * many killed items with the same key; running _bt_compare() on
+ * each killed item gets expensive. Just advance over killed
+ * items as quickly as we can. We only apply _bt_compare() when
+ * we get to a non-killed item. We could reuse the bounds to
+ * avoid _bt_compare() calls for known equal tuples, but it
+ * doesn't seem worth it.
+ */
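+ /* (When already inside a posting list, curitemid/curitup are unchanged) */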
+ if (!inposting)
+ curitemid = PageGetItemId(page, offset);
+ if (inposting || !ItemIdIsDead(curitemid))
+ {
+ ItemPointerData htid;
+ bool all_dead = false;
+
+ if (!inposting)
+ {
+ /* Plain tuple, or first TID in posting list tuple */
+ if (_bt_compare(rel, itup_key, page, offset) != 0)
+ break; /* we're past all the equal tuples */
+
+ /* Advance curitup */
+ curitup = (IndexTuple) PageGetItem(page, curitemid);
+ Assert(!BTreeTupleIsPivot(curitup));
+ }
+
+ /* okay, we gotta fetch the heap tuple using htid ... */
+ if (!BTreeTupleIsPosting(curitup))
+ {
+ /* ... htid is from simple non-pivot tuple */
+ Assert(!inposting);
+ htid = curitup->t_tid;
+ }
+ else if (!inposting)
+ {
+ /* ... htid is first TID in new posting list */
+ inposting = true;
+ prevalldead = true;
+ curposti = 0;
+ htid = *BTreeTupleGetPostingN(curitup, 0);
+ }
+ else
+ {
+ /* ... htid is second or subsequent TID in posting list */
+ Assert(curposti > 0);
+ htid = *BTreeTupleGetPostingN(curitup, curposti);
+ }
+
+ /*
+ * If we are doing a recheck, we expect to find the tuple we
+ * are rechecking. It's not a duplicate, but we have to keep
+ * scanning.
+ */
+ if (checkUnique == UNIQUE_CHECK_EXISTING &&
+ ItemPointerCompare(&htid, &itup->t_tid) == 0)
+ {
+ found = true;
+ }
+
+ /*
+ * Check if there are any table tuples for this index entry
+ * satisfying SnapshotDirty. This is necessary because for AMs
+ * with optimizations like heap's HOT, we have just a single
+ * index entry for the entire chain.
+ */
+ else if (table_index_fetch_tuple_check(heapRel, &htid,
+ &SnapshotDirty,
+ &all_dead))
+ {
+ TransactionId xwait;
+
+ /*
+ * It is a duplicate. If we are only doing a partial
+ * check, then don't bother checking if the tuple is being
+ * updated in another transaction. Just return the fact
+ * that it is a potential conflict and leave the full
+ * check till later. Don't invalidate binary search
+ * bounds.
+ */
+ if (checkUnique == UNIQUE_CHECK_PARTIAL)
+ {
+ if (nbuf != InvalidBuffer)
+ _bt_relbuf(rel, nbuf);
+ *is_unique = false;
+ return InvalidTransactionId;
+ }
+
+ /*
+ * If this tuple is being updated by another transaction
+ * then we have to wait for its commit/abort.
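+ * (A valid xmin means the inserting transaction is still in progress;
+ * otherwise a valid xmax identifies an in-progress deleter.)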
+ */
+ xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
+ SnapshotDirty.xmin : SnapshotDirty.xmax;
+
+ if (TransactionIdIsValid(xwait))
+ {
+ if (nbuf != InvalidBuffer)
+ _bt_relbuf(rel, nbuf);
+ /* Tell _bt_doinsert to wait... */
+ *speculativeToken = SnapshotDirty.speculativeToken;
+ /* Caller releases lock on buf immediately */
+ insertstate->bounds_valid = false;
+ return xwait;
+ }
+
+ /*
+ * Otherwise we have a definite conflict. But before
+ * complaining, look to see if the tuple we want to insert
+ * is itself now committed dead --- if so, don't complain.
+ * This is a waste of time in normal scenarios but we must
+ * do it to support CREATE INDEX CONCURRENTLY.
+ *
+ * We must follow HOT-chains here because during
+ * concurrent index build, we insert the root TID though
+ * the actual tuple may be somewhere in the HOT-chain.
+ * While following the chain we might not stop at the
+ * exact tuple which triggered the insert, but that's OK
+ * because if we find a live tuple anywhere in this chain,
+ * we have a unique key conflict. The other live tuple is
+ * not part of this chain because it had a different index
+ * entry.
+ */
+ htid = itup->t_tid;
+ if (table_index_fetch_tuple_check(heapRel, &htid,
+ SnapshotSelf, NULL))
+ {
+ /* Normal case --- it's still live */
+ }
+ else
+ {
+ /*
+ * It's been deleted, so no error, and no need to
+ * continue searching
+ */
+ break;
+ }
+
+ /*
+ * Check for a conflict-in as we would if we were going to
+ * write to this page. We aren't actually going to write,
+ * but we want a chance to report SSI conflicts that would
+ * otherwise be masked by this unique constraint
+ * violation.
+ */
+ CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
+
+ /*
+ * This is a definite conflict. Break the tuple down into
+ * datums and report the error. But first, make sure we
+ * release the buffer locks we're holding ---
+ * BuildIndexValueDescription could make catalog accesses,
+ * which in the worst case might touch this same index and
+ * cause deadlocks.
+ */
+ if (nbuf != InvalidBuffer)
+ _bt_relbuf(rel, nbuf);
+ _bt_relbuf(rel, insertstate->buf);
+ insertstate->buf = InvalidBuffer;
+ insertstate->bounds_valid = false;
+
+ {
+ Datum values[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ char *key_desc;
+
+ index_deform_tuple(itup, RelationGetDescr(rel),
+ values, isnull);
+
+ key_desc = BuildIndexValueDescription(rel, values,
+ isnull);
+
+ ereport(ERROR,
+ (errcode(ERRCODE_UNIQUE_VIOLATION),
+ errmsg("duplicate key value violates unique constraint \"%s\"",
+ RelationGetRelationName(rel)),
+ key_desc ? errdetail("Key %s already exists.",
+ key_desc) : 0,
+ errtableconstraint(heapRel,
+ RelationGetRelationName(rel))));
+ }
+ }
+ else if (all_dead && (!inposting ||
+ (prevalldead &&
+ curposti == BTreeTupleGetNPosting(curitup) - 1)))
+ {
+ /*
+ * The conflicting tuple (or all HOT chains pointed to by
+ * all posting list TIDs) is dead to everyone, so mark the
+ * index entry killed.
+ */
+ ItemIdMarkDead(curitemid);
+ opaque->btpo_flags |= BTP_HAS_GARBAGE;
+
+ /*
+ * Mark buffer with a dirty hint, since state is not
+ * crucial. Be sure to mark the proper buffer dirty.
+ */
+ if (nbuf != InvalidBuffer)
+ MarkBufferDirtyHint(nbuf, true);
+ else
+ MarkBufferDirtyHint(insertstate->buf, true);
+ }
+
+ /*
+ * Remember if posting list tuple has even a single HOT chain
+ * whose members are not all dead
+ */
+ if (!all_dead && inposting)
+ prevalldead = false;
+ }
+ }
+
+ if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
+ {
+ /* Advance to next TID in same posting list */
+ curposti++;
+ continue;
+ }
+ else if (offset < maxoff)
+ {
+ /* Advance to next tuple */
+ curposti = 0;
+ inposting = false;
+ offset = OffsetNumberNext(offset);
+ }
+ else
+ {
+ int highkeycmp;
+
+ /* If scankey == hikey we gotta check the next page too */
+ if (P_RIGHTMOST(opaque))
+ break;
+ highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
+ Assert(highkeycmp <= 0);
+ if (highkeycmp != 0)
+ break;
+ /* Advance to next non-dead page --- there must be one */
+ for (;;)
+ {
+ BlockNumber nblkno = opaque->btpo_next;
+
+ nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
+ page = BufferGetPage(nbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_IGNORE(opaque))
+ break;
+ if (P_RIGHTMOST(opaque))
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+ }
+ /* Will also advance to next tuple */
+ curposti = 0;
+ inposting = false;
+ maxoff = PageGetMaxOffsetNumber(page);
+ offset = P_FIRSTDATAKEY(opaque);
+ /* Don't invalidate binary search bounds */
+ }
+ }
+
+ /*
+ * If we are doing a recheck then we should have found the tuple we are
+ * checking. Otherwise there's something very wrong --- probably, the
+ * index is on a non-immutable expression.
+ */
+ if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to re-find tuple within index \"%s\"",
+ RelationGetRelationName(rel)),
+ errhint("This may be because of a non-immutable index expression."),
+ errtableconstraint(heapRel,
+ RelationGetRelationName(rel))));
+
+ if (nbuf != InvalidBuffer)
+ _bt_relbuf(rel, nbuf);
+
+ return InvalidTransactionId;
+}
+
+
+/*
+ * _bt_findinsertloc() -- Finds an insert location for a tuple
+ *
+ * On entry, insertstate buffer contains the page the new tuple belongs
+ * on. It is exclusive-locked and pinned by the caller.
+ *
+ * If 'checkingunique' is true, the buffer on entry is the first page
+ * that contains duplicates of the new key. If there are duplicates on
+ * multiple pages, the correct insertion position might be some page to
+ * the right, rather than the first page. In that case, this function
+ * moves right to the correct target page.
+ *
+ * (In a !heapkeyspace index, there can be multiple pages with the same
+ * high key on which the new tuple could legitimately be placed. In
+ * that case, the caller passes the first page containing duplicates,
+ * just like when checkingunique=true. If that page doesn't have enough
+ * room for the new tuple, this function moves right, trying to find a
+ * legal page that does.)
+ *
+ * If 'indexUnchanged' is true, this is for an UPDATE that didn't
+ * logically change the indexed value, but must nevertheless have a new
+ * entry to point to a successor version. This hint from the executor
+ * will influence our behavior when the page might have to be split and
+ * we must consider our options. Bottom-up index deletion can avoid
+ * pathological version-driven page splits, but we only want to go to the
+ * trouble of trying it when we already have moderate confidence that
+ * it's appropriate. The hint should not significantly affect our
+ * behavior over time unless practically all inserts onto the leaf page
+ * get the hint.
+ *
+ * On exit, insertstate buffer contains the chosen insertion page, and
+ * the offset within that page is returned. If _bt_findinsertloc needed
+ * to move right, the lock and pin on the original page are released, and
+ * the new buffer is exclusively locked and pinned instead.
+ *
+ * If insertstate contains cached binary search bounds, we will take
+ * advantage of them. This avoids repeating comparisons that we made in
+ * _bt_check_unique() already.
+ *
+ * If there is not enough room on the page for the new tuple, we try to
+ * make room by removing any LP_DEAD tuples.
+ */
+static OffsetNumber
+_bt_findinsertloc(Relation rel,
+ BTInsertState insertstate,
+ bool checkingunique,
+ bool indexUnchanged,
+ BTStack stack,
+ Relation heapRel)
+{
+ BTScanInsert itup_key = insertstate->itup_key;
+ Page page = BufferGetPage(insertstate->buf);
+ BTPageOpaque opaque;
+ OffsetNumber newitemoff;
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* Check 1/3 of a page restriction */
+ if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
+ _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
+ insertstate->itup);
+
+ Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
+ Assert(!insertstate->bounds_valid || checkingunique);
+ Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
+ Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
+ Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
+
+ if (itup_key->heapkeyspace)
+ {
+ /* Keep track of whether checkingunique duplicate seen */
+ bool uniquedup = indexUnchanged;
+
+ /*
+ * If we're inserting into a unique index, we may have to walk right
+ * through leaf pages to find the one leaf page that we must insert
+ * onto.
+ *
+ * This is needed for checkingunique callers because a scantid was not
+ * used when we called _bt_search(). scantid can only be set after
+ * _bt_check_unique() has checked for duplicates. The buffer
+ * initially stored in insertstate->buf has the page where the first
+ * duplicate key might be found, which isn't always the page that new
+ * tuple belongs on. The heap TID attribute for new tuple (scantid)
+ * could force us to insert on a sibling page, though that should be
+ * very rare in practice.
+ */
+ if (checkingunique)
+ {
+ if (insertstate->low < insertstate->stricthigh)
+ {
+ /* Encountered a duplicate in _bt_check_unique() */
+ Assert(insertstate->bounds_valid);
+ uniquedup = true;
+ }
+
+ for (;;)
+ {
+ /*
+ * Does the new tuple belong on this page?
+ *
+ * The earlier _bt_check_unique() call may well have
+ * established a strict upper bound on the offset for the new
+ * item. If it's not the last item of the page (i.e. if there
+ * is at least one tuple on the page that goes after the tuple
+ * we're inserting) then we know that the tuple belongs on
+ * this page. We can skip the high key check.
+ */
+ if (insertstate->bounds_valid &&
+ insertstate->low <= insertstate->stricthigh &&
+ insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
+ break;
+
+ /* Test '<=', not '!=', since scantid is set now */
+ if (P_RIGHTMOST(opaque) ||
+ _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
+ break;
+
+ _bt_stepright(rel, insertstate, stack);
+ /* Update local state after stepping right */
+ page = BufferGetPage(insertstate->buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ /* Assume duplicates (if checkingunique) */
+ uniquedup = true;
+ }
+ }
+
+ /*
+ * If the target page cannot fit newitem, try to avoid splitting the
+ * page on insert by performing deletion or deduplication now
+ */
+ if (PageGetFreeSpace(page) < insertstate->itemsz)
+ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
+ checkingunique, uniquedup,
+ indexUnchanged);
+ }
+ else
+ {
+ /*----------
+ * This is a !heapkeyspace (version 2 or 3) index. The current page
+ * is the first page that we could insert the new tuple to, but there
+ * may be other pages to the right that we could opt to use instead.
+ *
+ * If the new key is equal to one or more existing keys, we can
+ * legitimately place it anywhere in the series of equal keys. In
+ * fact, if the new key is equal to the page's "high key" we can place
+ * it on the next page. If it is equal to the high key, and there's
+ * not room to insert the new tuple on the current page without
+ * splitting, then we move right hoping to find more free space and
+ * avoid a split.
+ *
+ * Keep scanning right until we
+ * (a) find a page with enough free space,
+ * (b) reach the last page where the tuple can legally go, or
+ * (c) get tired of searching.
+ * (c) is not flippant; it is important because if there are many
+ * pages' worth of equal keys, it's better to split one of the early
+ * pages than to scan all the way to the end of the run of equal keys
+ * on every insert. We implement "get tired" as a random choice,
+ * since stopping after scanning a fixed number of pages wouldn't work
+ * well (we'd never reach the right-hand side of previously split
+ * pages). The probability of moving right is set at 0.99, which may
+ * seem too high to change the behavior much, but it does an excellent
+ * job of preventing O(N^2) behavior with many equal keys.
+ *----------
+ */
+ while (PageGetFreeSpace(page) < insertstate->itemsz)
+ {
+ /*
+ * Before considering moving right, see if we can obtain enough
+ * space by erasing LP_DEAD items
+ */
+ if (P_HAS_GARBAGE(opaque))
+ {
+ /* Perform simple deletion */
+ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
+ false, false, false);
+
+ if (PageGetFreeSpace(page) >= insertstate->itemsz)
+ break; /* OK, now we have enough space */
+ }
+
+ /*
+ * Nope, so check conditions (b) and (c) enumerated above
+ *
+ * The earlier _bt_check_unique() call may well have established a
+ * strict upper bound on the offset for the new item. If it's not
+ * the last item of the page (i.e. if there is at least one tuple
+ * on the page that's greater than the tuple we're inserting to)
+ * then we know that the tuple belongs on this page. We can skip
+ * the high key check.
+ */
+ if (insertstate->bounds_valid &&
+ insertstate->low <= insertstate->stricthigh &&
+ insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
+ break;
+
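+ /*
+ * Stop if this is the last page the new tuple can legally go on (b),
+ * or if we "get tired" -- roughly a 1% chance per page visited (c)
+ */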
+ if (P_RIGHTMOST(opaque) ||
+ _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
+ random() <= (MAX_RANDOM_VALUE / 100))
+ break;
+
+ _bt_stepright(rel, insertstate, stack);
+ /* Update local state after stepping right */
+ page = BufferGetPage(insertstate->buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+ }
+
+ /*
+ * We should now be on the correct page. Find the offset within the page
+ * for the new tuple. (Possibly reusing earlier search bounds.)
+ */
+ Assert(P_RIGHTMOST(opaque) ||
+ _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
+
+ newitemoff = _bt_binsrch_insert(rel, insertstate);
+
+ if (insertstate->postingoff == -1)
+ {
+ /*
+ * There is an overlapping posting list tuple with its LP_DEAD bit
+ * set. We don't want to unnecessarily unset its LP_DEAD bit while
+ * performing a posting list split, so perform simple index tuple
+ * deletion early.
+ */
+ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
+ false, false, false);
+
+ /*
+ * Do new binary search. New insert location cannot overlap with any
+ * posting list now.
+ */
+ Assert(!insertstate->bounds_valid);
+ insertstate->postingoff = 0;
+ newitemoff = _bt_binsrch_insert(rel, insertstate);
+ Assert(insertstate->postingoff == 0);
+ }
+
+ return newitemoff;
+}
+
+/*
+ * Step right to next non-dead page, during insertion.
+ *
+ * This is a bit more complicated than moving right in a search. We must
+ * write-lock the target page before releasing write lock on current page;
+ * else someone else's _bt_check_unique scan could fail to see our insertion.
+ * Write locks on intermediate dead pages won't do because we don't know when
+ * they will get de-linked from the tree.
+ *
+ * This is more aggressive than it needs to be for non-unique !heapkeyspace
+ * indexes.
+ */
+static void
+_bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
+{
+ Page page;
+ BTPageOpaque opaque;
+ Buffer rbuf;
+ BlockNumber rblkno;
+
+ page = BufferGetPage(insertstate->buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ rbuf = InvalidBuffer;
+ rblkno = opaque->btpo_next;
+ for (;;)
+ {
+ rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
+ page = BufferGetPage(rbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * If this page was incompletely split, finish the split now. We do
+ * this while holding a lock on the left sibling, which is not good
+ * because finishing the split could be a fairly lengthy operation.
+ * But this should happen very seldom.
+ */
+ if (P_INCOMPLETE_SPLIT(opaque))
+ {
+ _bt_finish_split(rel, rbuf, stack);
+ rbuf = InvalidBuffer;
+ continue;
+ }
+
+ if (!P_IGNORE(opaque))
+ break;
+ if (P_RIGHTMOST(opaque))
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+
+ rblkno = opaque->btpo_next;
+ }
+ /* rbuf locked; unlock buf, update state for caller */
+ _bt_relbuf(rel, insertstate->buf);
+ insertstate->buf = rbuf;
+ insertstate->bounds_valid = false;
+}
+
+/*----------
+ * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
+ *
+ * This recursive procedure does the following things:
+ *
+ * + if postingoff != 0, splits existing posting list tuple
+ * (since it overlaps with new 'itup' tuple).
+ * + if necessary, splits the target page, using 'itup_key' for
+ * suffix truncation on leaf pages (caller passes NULL for
+ * non-leaf pages).
+ * + inserts the new tuple (might be split from posting list).
+ * + if the page was split, pops the parent stack, and finds the
+ * right place to insert the new child pointer (by walking
+ * right using information stored in the parent stack).
+ * + invokes itself with the appropriate tuple for the right
+ * child page on the parent.
+ * + updates the metapage if a true root or fast root is split.
+ *
+ * On entry, we must have the correct buffer in which to do the
+ * insertion, and the buffer must be pinned and write-locked. On return,
+ * we will have dropped both the pin and the lock on the buffer.
+ *
+ * This routine only performs retail tuple insertions. 'itup' should
+ * always be either a non-highkey leaf item, or a downlink (new high
+ * key items are created indirectly, when a page is split). When
+ * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
+ * we're inserting the downlink for. This function will clear the
+ * INCOMPLETE_SPLIT flag on it, and release the buffer.
+ *----------
+ */
+static void
+_bt_insertonpg(Relation rel,
+ BTScanInsert itup_key,
+ Buffer buf,
+ Buffer cbuf,
+ BTStack stack,
+ IndexTuple itup,
+ Size itemsz,
+ OffsetNumber newitemoff,
+ int postingoff,
+ bool split_only_page)
+{
+ Page page;
+ BTPageOpaque opaque;
+ bool isleaf,
+ isroot,
+ isrightmost,
+ isonly;
+ IndexTuple oposting = NULL;
+ IndexTuple origitup = NULL;
+ IndexTuple nposting = NULL;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ isleaf = P_ISLEAF(opaque);
+ isroot = P_ISROOT(opaque);
+ isrightmost = P_RIGHTMOST(opaque);
+ isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
+
+ /* child buffer must be given iff inserting on an internal page */
+ Assert(isleaf == !BufferIsValid(cbuf));
+ /* tuple must have appropriate number of attributes */
+ Assert(!isleaf ||
+ BTreeTupleGetNAtts(itup, rel) ==
+ IndexRelationGetNumberOfAttributes(rel));
+ Assert(isleaf ||
+ BTreeTupleGetNAtts(itup, rel) <=
+ IndexRelationGetNumberOfKeyAttributes(rel));
+ Assert(!BTreeTupleIsPosting(itup));
+ Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
+ /* Caller must always finish incomplete split for us */
+ Assert(!P_INCOMPLETE_SPLIT(opaque));
+
+ /*
+ * Every internal page should have exactly one negative infinity item at
+ * all times. Only _bt_split() and _bt_newroot() should add items that
+ * become negative infinity items through truncation, since they're the
+ * only routines that allocate new internal pages.
+ */
+ Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
+
+ /*
+ * Do we need to split an existing posting list item?
+ */
+ if (postingoff != 0)
+ {
+ ItemId itemid = PageGetItemId(page, newitemoff);
+
+ /*
+ * The new tuple is a duplicate with a heap TID that falls inside the
+ * range of an existing posting list tuple on a leaf page. Prepare to
+ * split an existing posting list. Overwriting the posting list with
+ * its post-split version is treated as an extra step in either the
+ * insert or page split critical section.
+ */
+ Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
+ oposting = (IndexTuple) PageGetItem(page, itemid);
+
+ /*
+ * postingoff value comes from earlier call to _bt_binsrch_posting().
+ * Its binary search might think that a plain tuple must be a posting
+ * list tuple that needs to be split. This can happen with corruption
+ * involving an existing plain tuple that is a duplicate of the new
+ * item, up to and including its table TID. Check for that here in
+ * passing.
+ *
+ * Also verify that our caller has made sure that the existing posting
+ * list tuple does not have its LP_DEAD bit set.
+ */
+ if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
+ ItemPointerGetBlockNumber(&itup->t_tid),
+ ItemPointerGetOffsetNumber(&itup->t_tid),
+ newitemoff, BufferGetBlockNumber(buf),
+ RelationGetRelationName(rel))));
+
+ /* use a mutable copy of itup as our itup from here on */
+ origitup = itup;
+ itup = CopyIndexTuple(origitup);
+ nposting = _bt_swap_posting(itup, oposting, postingoff);
+ /* itup now contains rightmost/max TID from oposting */
+
+ /* Alter offset so that newitem goes after posting list */
+ newitemoff = OffsetNumberNext(newitemoff);
+ }
+
+ /*
+ * Do we need to split the page to fit the item on it?
+ *
+ * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
+ * so this comparison is correct even though we appear to be accounting
+ * only for the item and not for its line pointer.
+ */
+ if (PageGetFreeSpace(page) < itemsz)
+ {
+ Buffer rbuf;
+
+ Assert(!split_only_page);
+
+ /* split the buffer into left and right halves */
+ rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup,
+ origitup, nposting, postingoff);
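+ /* Propagate any predicate locks (SSI) to the new right sibling page */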
+ PredicateLockPageSplit(rel,
+ BufferGetBlockNumber(buf),
+ BufferGetBlockNumber(rbuf));
+
+ /*----------
+ * By here,
+ *
+ * + our target page has been split;
+ * + the original tuple has been inserted;
+ * + we have write locks on both the old (left half)
+ * and new (right half) buffers, after the split; and
+ * + we know the key we want to insert into the parent
+ * (it's the "high key" on the left child page).
+ *
+ * We're ready to do the parent insertion. We need to hold onto the
+ * locks for the child pages until we locate the parent, but we can
+ * at least release the lock on the right child before doing the
+ * actual insertion. The lock on the left child will be released
+ * last of all by parent insertion, where it is the 'cbuf' of parent
+ * page.
+ *----------
+ */
+ _bt_insert_parent(rel, buf, rbuf, stack, isroot, isonly);
+ }
+ else
+ {
+ Buffer metabuf = InvalidBuffer;
+ Page metapg = NULL;
+ BTMetaPageData *metad = NULL;
+ BlockNumber blockcache;
+
+ /*
+ * If we are doing this insert because we split a page that was the
+ * only one on its tree level, but was not the root, it may have been
+ * the "fast root". We need to ensure that the fast root link points
+ * at or above the current page. We can safely acquire a lock on the
+ * metapage here --- see comments for _bt_newroot().
+ */
+ if (unlikely(split_only_page))
+ {
+ Assert(!isleaf);
+ Assert(BufferIsValid(cbuf));
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+
+ if (metad->btm_fastlevel >= opaque->btpo_level)
+ {
+ /* no update wanted */
+ _bt_relbuf(rel, metabuf);
+ metabuf = InvalidBuffer;
+ }
+ }
+
+ /* Do the update. No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
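+ /*
+ * Posting list split: replace the original posting list in place with
+ * its post-split version (same size, so an overwrite works)
+ */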
+ if (postingoff != 0)
+ memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
+
+ if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
+ false) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add new item to block %u in index \"%s\"",
+ BufferGetBlockNumber(buf), RelationGetRelationName(rel));
+
+ MarkBufferDirty(buf);
+
+ if (BufferIsValid(metabuf))
+ {
+ /* upgrade meta-page if needed */
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
+ _bt_upgrademetapage(metapg);
+ metad->btm_fastroot = BufferGetBlockNumber(buf);
+ metad->btm_fastlevel = opaque->btpo_level;
+ MarkBufferDirty(metabuf);
+ }
+
+ /*
+ * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
+ * finishes a split
+ */
+ if (!isleaf)
+ {
+ Page cpage = BufferGetPage(cbuf);
+ BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
+
+ Assert(P_INCOMPLETE_SPLIT(cpageop));
+ cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+ MarkBufferDirty(cbuf);
+ }
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_insert xlrec;
+ xl_btree_metadata xlmeta;
+ uint8 xlinfo;
+ XLogRecPtr recptr;
+ uint16 upostingoff;
+
+ xlrec.offnum = newitemoff;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
+
+ if (isleaf && postingoff == 0)
+ {
+ /* Simple leaf insert */
+ xlinfo = XLOG_BTREE_INSERT_LEAF;
+ }
+ else if (postingoff != 0)
+ {
+ /*
+ * Leaf insert with posting list split. Must include
+ * postingoff field before newitem/orignewitem.
+ */
+ Assert(isleaf);
+ xlinfo = XLOG_BTREE_INSERT_POST;
+ }
+ else
+ {
+ /* Internal page insert, which finishes a split on cbuf */
+ xlinfo = XLOG_BTREE_INSERT_UPPER;
+ XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
+
+ if (BufferIsValid(metabuf))
+ {
+ /* Actually, it's an internal page insert + meta update */
+ xlinfo = XLOG_BTREE_INSERT_META;
+
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ xlmeta.version = metad->btm_version;
+ xlmeta.root = metad->btm_root;
+ xlmeta.level = metad->btm_level;
+ xlmeta.fastroot = metad->btm_fastroot;
+ xlmeta.fastlevel = metad->btm_fastlevel;
+ xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
+ xlmeta.allequalimage = metad->btm_allequalimage;
+
+ XLogRegisterBuffer(2, metabuf,
+ REGBUF_WILL_INIT | REGBUF_STANDARD);
+ XLogRegisterBufData(2, (char *) &xlmeta,
+ sizeof(xl_btree_metadata));
+ }
+ }
+
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ if (postingoff == 0)
+ {
+ /* Just log itup from caller */
+ XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
+ }
+ else
+ {
+ /*
+ * Insert with posting list split (XLOG_BTREE_INSERT_POST
+ * record) case.
+ *
+ * Log postingoff. Also log origitup, not itup. REDO routine
+ * must reconstruct final itup (as well as nposting) using
+ * _bt_swap_posting().
+ */
+ upostingoff = postingoff;
+
+ XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
+ XLogRegisterBufData(0, (char *) origitup,
+ IndexTupleSize(origitup));
+ }
+
+ recptr = XLogInsert(RM_BTREE_ID, xlinfo);
+
+ if (BufferIsValid(metabuf))
+ PageSetLSN(metapg, recptr);
+ if (!isleaf)
+ PageSetLSN(BufferGetPage(cbuf), recptr);
+
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* Release subsidiary buffers */
+ if (BufferIsValid(metabuf))
+ _bt_relbuf(rel, metabuf);
+ if (!isleaf)
+ _bt_relbuf(rel, cbuf);
+
+ /*
+ * Cache the block number if this is the rightmost leaf page. Cache
+ * may be used by a future inserter within _bt_search_insert().
+ */
+ blockcache = InvalidBlockNumber;
+ if (isrightmost && isleaf && !isroot)
+ blockcache = BufferGetBlockNumber(buf);
+
+ /* Release buffer for insertion target block */
+ _bt_relbuf(rel, buf);
+
+ /*
+ * If we decided to cache the insertion target block before releasing
+ * its buffer lock, then cache it now. Check the height of the tree
+ * first, though. We don't go for the optimization with small
+ * indexes. Defer final check to this point to ensure that we don't
+ * call _bt_getrootheight while holding a buffer lock.
+ */
+ if (BlockNumberIsValid(blockcache) &&
+ _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
+ RelationSetTargetBlock(rel, blockcache);
+ }
+
+ /* be tidy */
+ if (postingoff != 0)
+ {
+ /* itup is actually a modified copy of caller's original */
+ pfree(nposting);
+ pfree(itup);
+ }
+}
+
+/*
+ * _bt_split() -- split a page in the btree.
+ *
+ * On entry, buf is the page to split, and is pinned and write-locked.
+ * newitemoff etc. tell us about the new item that must be inserted
+ * along with the data from the original page.
+ *
+ * itup_key is used for suffix truncation on leaf pages (internal
+ * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
+ * is the left-sibling of the page we're inserting the downlink for.
+ * This function will clear the INCOMPLETE_SPLIT flag on it, and
+ * release the buffer.
+ *
+ * orignewitem, nposting, and postingoff are needed when an insert of
+ * orignewitem results in both a posting list split and a page split.
+ * These extra posting list split details are used here in the same
+ * way as they are used in the more common case where a posting list
+ * split does not coincide with a page split. We need to deal with
+ * posting list splits directly in order to ensure that everything
+ * that follows from the insert of orignewitem is handled as a single
+ * atomic operation (though caller's insert of a new pivot/downlink
+ * into parent page will still be a separate operation). See
+ * nbtree/README for details on the design of posting list splits.
+ *
+ * Returns the new right sibling of buf, pinned and write-locked.
+ * The pin and lock on buf are maintained.
+ */
+static Buffer
+_bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
+ OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
+ IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
+{
+ Buffer rbuf;
+ Page origpage;
+ Page leftpage,
+ rightpage;
+ BlockNumber origpagenumber,
+ rightpagenumber;
+ BTPageOpaque ropaque,
+ lopaque,
+ oopaque;
+ Buffer sbuf = InvalidBuffer;
+ Page spage = NULL;
+ BTPageOpaque sopaque = NULL;
+ Size itemsz;
+ ItemId itemid;
+ IndexTuple firstright,
+ lefthighkey;
+ OffsetNumber firstrightoff;
+ OffsetNumber afterleftoff,
+ afterrightoff,
+ minusinfoff;
+ OffsetNumber origpagepostingoff;
+ OffsetNumber maxoff;
+ OffsetNumber i;
+ bool newitemonleft,
+ isleaf,
+ isrightmost;
+
+ /*
+ * origpage is the original page to be split. leftpage is a temporary
+ * buffer that receives the left-sibling data, which will be copied back
+ * into origpage on success. rightpage is the new page that will receive
+ * the right-sibling data.
+ *
+ * leftpage is allocated after choosing a split point. rightpage's new
+ * buffer isn't acquired until after leftpage is initialized and has new
+ * high key, the last point where splitting the page may fail (barring
+ * corruption). Failing before acquiring new buffer won't have lasting
+ * consequences, since origpage won't have been modified and leftpage is
+ * only workspace.
+ */
+ origpage = BufferGetPage(buf);
+ oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
+ isleaf = P_ISLEAF(oopaque);
+ isrightmost = P_RIGHTMOST(oopaque);
+ maxoff = PageGetMaxOffsetNumber(origpage);
+ origpagenumber = BufferGetBlockNumber(buf);
+
+ /*
+ * Choose a point to split origpage at.
+ *
+ * A split point can be thought of as a point _between_ two existing data
+ * items on origpage (the lastleft and firstright tuples), provided you
+ * pretend that the new item that didn't fit is already on origpage.
+ *
+ * Since origpage does not actually contain newitem, the representation of
+ * split points needs to work with two boundary cases: splits where
+ * newitem is lastleft, and splits where newitem is firstright.
+ * newitemonleft resolves the ambiguity that would otherwise exist when
+ * newitemoff == firstrightoff. In all other cases it's clear which side
+ * of the split every tuple goes on from context. newitemonleft is
+ * usually (but not always) redundant information.
+ *
+ * firstrightoff is supposed to be an origpage offset number, but it's
+ * possible that its value will be maxoff+1, which is "past the end" of
+ * origpage. This happens in the rare case where newitem goes after all
+ * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
+ * origpage at the point that leaves newitem alone on new right page. Any
+ * "!newitemonleft && newitemoff == firstrightoff" split point makes
+ * newitem the firstright tuple, though, so this case isn't a special
+ * case.
+ */
+ firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
+ newitem, &newitemonleft);
+
+ /* Allocate temp buffer for leftpage */
+ leftpage = PageGetTempPage(origpage);
+ _bt_pageinit(leftpage, BufferGetPageSize(buf));
+ lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
+
+ /*
+ * leftpage won't be the root when we're done. Also, clear the SPLIT_END
+ * and HAS_GARBAGE flags.
+ */
+ lopaque->btpo_flags = oopaque->btpo_flags;
+ lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
+ /* set flag in leftpage indicating that rightpage has no downlink yet */
+ lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
+ lopaque->btpo_prev = oopaque->btpo_prev;
+ /* handle btpo_next after rightpage buffer acquired */
+ lopaque->btpo_level = oopaque->btpo_level;
+ /* handle btpo_cycleid after rightpage buffer acquired */
+
+ /*
+ * Copy the original page's LSN into leftpage, which will become the
+ * updated version of the page. We need this because XLogInsert will
+ * examine the LSN and possibly dump it in a page image.
+ */
+ PageSetLSN(leftpage, PageGetLSN(origpage));
+
+ /*
+ * Determine page offset number of existing overlapped-with-orignewitem
+ * posting list when it is necessary to perform a posting list split in
+ * passing. Note that newitem was already changed by caller (newitem no
+ * longer has the orignewitem TID).
+ *
+ * This page offset number (origpagepostingoff) will be used to pretend
+ * that the posting split has already taken place, even though the
+ * required modifications to origpage won't occur until we reach the
+ * critical section. The lastleft and firstright tuples of our page split
+ * point should, in effect, come from an imaginary version of origpage
+ * that has the nposting tuple instead of the original posting list tuple.
+ *
+ * Note: _bt_findsplitloc() should have compensated for coinciding posting
+ * list splits in just the same way, at least in theory. It doesn't
+ * bother with that, though. In practice it won't affect its choice of
+ * split point.
+ */
+ origpagepostingoff = InvalidOffsetNumber;
+ if (postingoff != 0)
+ {
+ Assert(isleaf);
+ Assert(ItemPointerCompare(&orignewitem->t_tid,
+ &newitem->t_tid) < 0);
+ Assert(BTreeTupleIsPosting(nposting));
+ origpagepostingoff = OffsetNumberPrev(newitemoff);
+ }
+
+ /*
+ * The high key for the new left page is a possibly-truncated copy of
+ * firstright on the leaf level (it's "firstright itself" on internal
+ * pages; see !isleaf comments below). This may seem to be contrary to
+ * Lehman & Yao's approach of using a copy of lastleft as the new high key
+ * when splitting on the leaf level. It isn't, though.
+ *
+ * Suffix truncation will leave the left page's high key fully equal to
+ * lastleft when lastleft and firstright are equal prior to heap TID (that
+ * is, the tiebreaker TID value comes from lastleft). It isn't actually
+ * necessary for a new leaf high key to be a copy of lastleft for the L&Y
+ * "subtree" invariant to hold. It's sufficient to make sure that the new
+ * leaf high key is strictly less than firstright, and greater than or
+ * equal to (not necessarily equal to) lastleft. In other words, when
+ * suffix truncation isn't possible during a leaf page split, we take
+ * L&Y's exact approach to generating a new high key for the left page.
+ * (Actually, that is slightly inaccurate. We don't just use a copy of
+ * lastleft. A tuple with all the keys from firstright but the max heap
+ * TID from lastleft is used, to avoid introducing a special case.)
+ */
+ if (!newitemonleft && newitemoff == firstrightoff)
+ {
+ /* incoming tuple becomes firstright */
+ itemsz = newitemsz;
+ firstright = newitem;
+ }
+ else
+ {
+ /* existing item at firstrightoff becomes firstright */
+ itemid = PageGetItemId(origpage, firstrightoff);
+ itemsz = ItemIdGetLength(itemid);
+ firstright = (IndexTuple) PageGetItem(origpage, itemid);
+ if (firstrightoff == origpagepostingoff)
+ firstright = nposting;
+ }
+
+ if (isleaf)
+ {
+ IndexTuple lastleft;
+
+ /* Attempt suffix truncation for leaf page splits */
+ if (newitemonleft && newitemoff == firstrightoff)
+ {
+ /* incoming tuple becomes lastleft */
+ lastleft = newitem;
+ }
+ else
+ {
+ OffsetNumber lastleftoff;
+
+ /* existing item before firstrightoff becomes lastleft */
+ lastleftoff = OffsetNumberPrev(firstrightoff);
+ Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
+ itemid = PageGetItemId(origpage, lastleftoff);
+ lastleft = (IndexTuple) PageGetItem(origpage, itemid);
+ if (lastleftoff == origpagepostingoff)
+ lastleft = nposting;
+ }
+
+ lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
+ itemsz = IndexTupleSize(lefthighkey);
+ }
+ else
+ {
+ /*
+ * Don't perform suffix truncation on a copy of firstright to make
+ * left page high key for internal page splits. Must use firstright
+ * as new high key directly.
+ *
+ * Each distinct separator key value originates as a leaf level high
+ * key; all other separator keys/pivot tuples are copied from one
+ * level down. A separator key in a grandparent page must be
+ * identical to high key in rightmost parent page of the subtree to
+ * its left, which must itself be identical to high key in rightmost
+ * child page of that same subtree (this even applies to separator
+ * from grandparent's high key). There must always be an unbroken
+ * "seam" of identical separator keys that guide index scans at every
+ * level, starting from the grandparent. That's why suffix truncation
+ * is unsafe here.
+ *
+ * Internal page splits will truncate firstright into a "negative
+ * infinity" data item when it gets inserted on the new right page
+ * below, though. This happens during the call to _bt_pgaddtup() for
+ * the new first data item for right page. Do not confuse this
+ * mechanism with suffix truncation. It is just a convenient way of
+ * implementing page splits that split the internal page "inside"
+ * firstright. The lefthighkey separator key cannot appear a second
+ * time in the right page (only firstright's downlink goes in right
+ * page).
+ */
+ lefthighkey = firstright;
+ }
+
+ /*
+ * Add new high key to leftpage
+ */
+ afterleftoff = P_HIKEY;
+
+ Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
+ Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
+ IndexRelationGetNumberOfKeyAttributes(rel));
+ Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
+ if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
+ false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add high key to the left sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ afterleftoff = OffsetNumberNext(afterleftoff);
+
+ /*
+ * Acquire a new right page to split into, now that left page has a new
+ * high key. From here on, it's not okay to throw an error without
+ * zeroing rightpage first. This coding rule ensures that we won't
+ * confuse future VACUUM operations, which might otherwise try to re-find
+ * a downlink to a leftover junk page as the page undergoes deletion.
+ *
+ * It would be reasonable to start the critical section just after the new
+ * rightpage buffer is acquired instead; that would allow us to avoid
+ * leftover junk pages without bothering to zero rightpage. We do it this
+ * way because it avoids an unnecessary PANIC when either origpage or its
+	 * existing sibling page is corrupt.
+ */
+ rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ rightpage = BufferGetPage(rbuf);
+ rightpagenumber = BufferGetBlockNumber(rbuf);
+ /* rightpage was initialized by _bt_getbuf */
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
+
+ /*
+ * Finish off remaining leftpage special area fields. They cannot be set
+ * before both origpage (leftpage) and rightpage buffers are acquired and
+ * locked.
+ *
+ * btpo_cycleid is only used with leaf pages, though we set it here in all
+ * cases just to be consistent.
+ */
+ lopaque->btpo_next = rightpagenumber;
+ lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
+
+ /*
+ * rightpage won't be the root when we're done. Also, clear the SPLIT_END
+ * and HAS_GARBAGE flags.
+ */
+ ropaque->btpo_flags = oopaque->btpo_flags;
+ ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
+ ropaque->btpo_prev = origpagenumber;
+ ropaque->btpo_next = oopaque->btpo_next;
+ ropaque->btpo_level = oopaque->btpo_level;
+ ropaque->btpo_cycleid = lopaque->btpo_cycleid;
+
+ /*
+ * Add new high key to rightpage where necessary.
+ *
+ * If the page we're splitting is not the rightmost page at its level in
+ * the tree, then the first entry on the page is the high key from
+ * origpage.
+ */
+ afterrightoff = P_HIKEY;
+
+ if (!isrightmost)
+ {
+ IndexTuple righthighkey;
+
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ itemsz = ItemIdGetLength(itemid);
+ righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
+ Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
+ Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
+ IndexRelationGetNumberOfKeyAttributes(rel));
+ if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
+ false, false) == InvalidOffsetNumber)
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add high key to the right sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterrightoff = OffsetNumberNext(afterrightoff);
+ }
+
+ /*
+	 * Internal page splits truncate the first data item on the right page
+	 * -- it becomes the "minus infinity" item for the page. Set this up
+	 * here.
+ */
+ minusinfoff = InvalidOffsetNumber;
+ if (!isleaf)
+ minusinfoff = afterrightoff;
+
+ /*
+ * Now transfer all the data items (non-pivot tuples in isleaf case, or
+ * additional pivot tuples in !isleaf case) to the appropriate page.
+ *
+ * Note: we *must* insert at least the right page's items in item-number
+ * order, for the benefit of _bt_restore_page().
+ */
+ for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
+ {
+ IndexTuple dataitem;
+
+ itemid = PageGetItemId(origpage, i);
+ itemsz = ItemIdGetLength(itemid);
+ dataitem = (IndexTuple) PageGetItem(origpage, itemid);
+
+ /* replace original item with nposting due to posting split? */
+ if (i == origpagepostingoff)
+ {
+ Assert(BTreeTupleIsPosting(dataitem));
+ Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
+ dataitem = nposting;
+ }
+
+ /* does new item belong before this one? */
+ else if (i == newitemoff)
+ {
+ if (newitemonleft)
+ {
+ Assert(newitemoff <= firstrightoff);
+ if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
+ false))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the left sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterleftoff = OffsetNumberNext(afterleftoff);
+ }
+ else
+ {
+ Assert(newitemoff >= firstrightoff);
+ if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
+ afterrightoff == minusinfoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the right sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterrightoff = OffsetNumberNext(afterrightoff);
+ }
+ }
+
+ /* decide which page to put it on */
+ if (i < firstrightoff)
+ {
+ if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add old item to the left sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterleftoff = OffsetNumberNext(afterleftoff);
+ }
+ else
+ {
+ if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
+ afterrightoff == minusinfoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add old item to the right sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterrightoff = OffsetNumberNext(afterrightoff);
+ }
+ }
+
+ /* Handle case where newitem goes at the end of rightpage */
+ if (i <= newitemoff)
+ {
+ /*
+ * Can't have newitemonleft here; that would imply we were told to put
+ * *everything* on the left page, which cannot fit (if it could, we'd
+ * not be splitting the page).
+ */
+ Assert(!newitemonleft && newitemoff == maxoff + 1);
+ if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
+ afterrightoff == minusinfoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the right sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterrightoff = OffsetNumberNext(afterrightoff);
+ }
+
+ /*
+ * We have to grab the original right sibling (if any) and update its prev
+ * link. We are guaranteed that this is deadlock-free, since we couple
+ * the locks in the standard order: left to right.
+ */
+ if (!isrightmost)
+ {
+ sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
+ spage = BufferGetPage(sbuf);
+ sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
+ if (sopaque->btpo_prev != origpagenumber)
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("right sibling's left-link doesn't match: "
+ "block %u links to %u instead of expected %u in index \"%s\"",
+ oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
+ RelationGetRelationName(rel))));
+ }
+
+ /*
+ * Check to see if we can set the SPLIT_END flag in the right-hand
+ * split page; this can save some I/O for vacuum since it need not
+ * proceed to the right sibling. We can set the flag if the right
+ * sibling has a different cycleid: that means it could not be part of
+ * a group of pages that were all split off from the same ancestor
+ * page. If you're confused, imagine that page A splits to A B and
+ * then again, yielding A C B, while vacuum is in progress. Tuples
+ * originally in A could now be in either B or C, hence vacuum must
+ * examine both pages. But if D, our right sibling, has a different
+ * cycleid then it could not contain any tuples that were in A when
+ * the vacuum started.
+ */
+ if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
+ ropaque->btpo_flags |= BTP_SPLIT_END;
+ }
+
+ /*
+ * Right sibling is locked, new siblings are prepared, but original page
+ * is not updated yet.
+ *
+ * NO EREPORT(ERROR) till right sibling is updated. We can get away with
+ * not starting the critical section till here because we haven't been
+ * scribbling on the original page yet; see comments above.
+ */
+ START_CRIT_SECTION();
+
+ /*
+ * By here, the original data page has been split into two new halves, and
+ * these are correct. The algorithm requires that the left page never
+ * move during a split, so we copy the new left page back on top of the
+ * original. We need to do this before writing the WAL record, so that
+ * XLogInsert can WAL log an image of the page if necessary.
+ */
+ PageRestoreTempPage(leftpage, origpage);
+ /* leftpage, lopaque must not be used below here */
+
+ MarkBufferDirty(buf);
+ MarkBufferDirty(rbuf);
+
+ if (!isrightmost)
+ {
+ sopaque->btpo_prev = rightpagenumber;
+ MarkBufferDirty(sbuf);
+ }
+
+ /*
+ * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
+ * a split
+ */
+ if (!isleaf)
+ {
+ Page cpage = BufferGetPage(cbuf);
+ BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
+
+ cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+ MarkBufferDirty(cbuf);
+ }
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_split xlrec;
+ uint8 xlinfo;
+ XLogRecPtr recptr;
+
+ xlrec.level = ropaque->btpo_level;
+ /* See comments below on newitem, orignewitem, and posting lists */
+ xlrec.firstrightoff = firstrightoff;
+ xlrec.newitemoff = newitemoff;
+ xlrec.postingoff = 0;
+ if (postingoff != 0 && origpagepostingoff < firstrightoff)
+ xlrec.postingoff = postingoff;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
+
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
+ /* Log original right sibling, since we've changed its prev-pointer */
+ if (!isrightmost)
+ XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
+ if (!isleaf)
+ XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
+
+ /*
+ * Log the new item, if it was inserted on the left page. (If it was
+ * put on the right page, we don't need to explicitly WAL log it
+ * because it's included with all the other items on the right page.)
+ * Show the new item as belonging to the left page buffer, so that it
+ * is not stored if XLogInsert decides it needs a full-page image of
+ * the left page. We always store newitemoff in the record, though.
+ *
+ * The details are sometimes slightly different for page splits that
+ * coincide with a posting list split. If both the replacement
+ * posting list and newitem go on the right page, then we don't need
+ * to log anything extra, just like the simple !newitemonleft
+ * no-posting-split case (postingoff is set to zero in the WAL record,
+ * so recovery doesn't need to process a posting list split at all).
+ * Otherwise, we set postingoff and log orignewitem instead of
+ * newitem, despite having actually inserted newitem. REDO routine
+ * must reconstruct nposting and newitem using _bt_swap_posting().
+ *
+ * Note: It's possible that our page split point is the point that
+ * makes the posting list lastleft and newitem firstright. This is
+ * the only case where we log orignewitem/newitem despite newitem
+ * going on the right page. If XLogInsert decides that it can omit
+ * orignewitem due to logging a full-page image of the left page,
+ * everything still works out, since recovery only needs to log
+ * orignewitem for items on the left page (just like the regular
+ * newitem-logged case).
+ */
+ if (newitemonleft && xlrec.postingoff == 0)
+ XLogRegisterBufData(0, (char *) newitem, newitemsz);
+ else if (xlrec.postingoff != 0)
+ {
+ Assert(isleaf);
+ Assert(newitemonleft || firstrightoff == newitemoff);
+ Assert(newitemsz == IndexTupleSize(orignewitem));
+ XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
+ }
+
+ /* Log the left page's new high key */
+ if (!isleaf)
+ {
+ /* lefthighkey isn't local copy, get current pointer */
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
+ }
+ XLogRegisterBufData(0, (char *) lefthighkey,
+ MAXALIGN(IndexTupleSize(lefthighkey)));
+
+ /*
+ * Log the contents of the right page in the format understood by
+ * _bt_restore_page(). The whole right page will be recreated.
+ *
+		 * Direct access to the page is not ideal, but it's faster; ideally
+		 * a new page API function would handle this. Note we only store the
+		 * tuples
+ * themselves, knowing that they were inserted in item-number order
+ * and so the line pointers can be reconstructed. See comments for
+ * _bt_restore_page().
+ */
+ XLogRegisterBufData(1,
+ (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
+ ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
+
+ xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
+ recptr = XLogInsert(RM_BTREE_ID, xlinfo);
+
+ PageSetLSN(origpage, recptr);
+ PageSetLSN(rightpage, recptr);
+ if (!isrightmost)
+ PageSetLSN(spage, recptr);
+ if (!isleaf)
+ PageSetLSN(BufferGetPage(cbuf), recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* release the old right sibling */
+ if (!isrightmost)
+ _bt_relbuf(rel, sbuf);
+
+ /* release the child */
+ if (!isleaf)
+ _bt_relbuf(rel, cbuf);
+
+ /* be tidy */
+ if (isleaf)
+ pfree(lefthighkey);
+
+ /* split's done */
+ return rbuf;
+}
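
A minimal sketch, not part of the patch itself, restating the placement rules that _bt_split() applies above. The helper names are hypothetical; only the relationships they encode are taken from the function:

    /*
     * Sketch only: placement rules from _bt_split()'s distribution loop.
     * 'firstrightoff' is the split point chosen by _bt_findsplitloc().
     */
    static inline bool
    existing_item_goes_left(OffsetNumber off, OffsetNumber firstrightoff)
    {
        /* existing tuples strictly before the split point stay on the left */
        return off < firstrightoff;
    }

    static inline bool
    newitem_becomes_firstright(bool newitemonleft, OffsetNumber newitemoff,
                               OffsetNumber firstrightoff)
    {
        /*
         * The incoming tuple becomes firstright when it lands exactly at
         * the split point and was assigned to the right half.
         */
        return !newitemonleft && newitemoff == firstrightoff;
    }
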
+
+/*
+ * _bt_insert_parent() -- Insert downlink into parent, completing split.
+ *
+ * On entry, buf and rbuf are the left and right split pages, which we
+ * still hold write locks on. Both locks will be released here. We
+ * release the rbuf lock once we have a write lock on the page that we
+ * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
+ * The lock on buf is released at the same point as the lock on the parent
+ * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
+ * atomic operation that completes the split by inserting a new downlink.
+ *
+ * stack - stack showing how we got here. Will be NULL when splitting the
+ *			true root, or during a concurrent root split, where we can be inefficient
+ * isroot - we split the true root
+ * isonly - we split a page alone on its level (might have been fast root)
+ */
+static void
+_bt_insert_parent(Relation rel,
+ Buffer buf,
+ Buffer rbuf,
+ BTStack stack,
+ bool isroot,
+ bool isonly)
+{
+ /*
+ * Here we have to do something Lehman and Yao don't talk about: deal with
+ * a root split and construction of a new root. If our stack is empty
+ * then we have just split a node on what had been the root level when we
+ * descended the tree. If it was still the root then we perform a
+ * new-root construction. If it *wasn't* the root anymore, search to find
+ * the next higher level that someone constructed meanwhile, and find the
+ * right place to insert as for the normal case.
+ *
+ * If we have to search for the parent level, we do so by re-descending
+ * from the root. This is not super-efficient, but it's rare enough not
+ * to matter.
+ */
+ if (isroot)
+ {
+ Buffer rootbuf;
+
+ Assert(stack == NULL);
+ Assert(isonly);
+ /* create a new root node and update the metapage */
+ rootbuf = _bt_newroot(rel, buf, rbuf);
+ /* release the split buffers */
+ _bt_relbuf(rel, rootbuf);
+ _bt_relbuf(rel, rbuf);
+ _bt_relbuf(rel, buf);
+ }
+ else
+ {
+ BlockNumber bknum = BufferGetBlockNumber(buf);
+ BlockNumber rbknum = BufferGetBlockNumber(rbuf);
+ Page page = BufferGetPage(buf);
+ IndexTuple new_item;
+ BTStackData fakestack;
+ IndexTuple ritem;
+ Buffer pbuf;
+
+ if (stack == NULL)
+ {
+ BTPageOpaque opaque;
+
+ elog(DEBUG2, "concurrent ROOT page split");
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * We should never reach here when a leaf page split takes place
+ * despite the insert of newitem being able to apply the fastpath
+ * optimization. Make sure of that with an assertion.
+ *
+ * This is more of a performance issue than a correctness issue.
+ * The fastpath won't have a descent stack. Using a phony stack
+ * here works, but never rely on that. The fastpath should be
+ * rejected within _bt_search_insert() when the rightmost leaf
+ * page will split, since it's faster to go through _bt_search()
+ * and get a stack in the usual way.
+ */
+ Assert(!(P_ISLEAF(opaque) &&
+ BlockNumberIsValid(RelationGetTargetBlock(rel))));
+
+ /* Find the leftmost page at the next level up */
+ pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL);
+ /* Set up a phony stack entry pointing there */
+ stack = &fakestack;
+ stack->bts_blkno = BufferGetBlockNumber(pbuf);
+ stack->bts_offset = InvalidOffsetNumber;
+ stack->bts_parent = NULL;
+ _bt_relbuf(rel, pbuf);
+ }
+
+ /* get high key from left, a strict lower bound for new right page */
+ ritem = (IndexTuple) PageGetItem(page,
+ PageGetItemId(page, P_HIKEY));
+
+ /* form an index tuple that points at the new right page */
+ new_item = CopyIndexTuple(ritem);
+ BTreeTupleSetDownLink(new_item, rbknum);
+
+ /*
+ * Re-find and write lock the parent of buf.
+ *
+ * It's possible that the location of buf's downlink has changed since
+ * our initial _bt_search() descent. _bt_getstackbuf() will detect
+ * and recover from this, updating the stack, which ensures that the
+ * new downlink will be inserted at the correct offset. Even buf's
+ * parent may have changed.
+ */
+ pbuf = _bt_getstackbuf(rel, stack, bknum);
+
+ /*
+ * Unlock the right child. The left child will be unlocked in
+ * _bt_insertonpg().
+ *
+ * Unlocking the right child must be delayed until here to ensure that
+ * no concurrent VACUUM operation can become confused. Page deletion
+ * cannot be allowed to fail to re-find a downlink for the rbuf page.
+ * (Actually, this is just a vestige of how things used to work. The
+ * page deletion code is expected to check for the INCOMPLETE_SPLIT
+ * flag on the left child. It won't attempt deletion of the right
+ * child until the split is complete. Despite all this, we opt to
+ * conservatively delay unlocking the right child until here.)
+ */
+ _bt_relbuf(rel, rbuf);
+
+ if (pbuf == InvalidBuffer)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
+ RelationGetRelationName(rel), bknum, rbknum)));
+
+ /* Recursively insert into the parent */
+ _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
+ new_item, MAXALIGN(IndexTupleSize(new_item)),
+ stack->bts_offset + 1, 0, isonly);
+
+ /* be tidy */
+ pfree(new_item);
+ }
+}
+
+/*
+ * _bt_finish_split() -- Finish an incomplete split
+ *
+ * A crash or other failure can leave a split incomplete. The insertion
+ * routines won't allow insertion on a page that is incompletely split.
+ * Before inserting on such a page, call _bt_finish_split().
+ *
+ * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
+ * and unpinned.
+ */
+void
+_bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
+{
+ Page lpage = BufferGetPage(lbuf);
+ BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
+ Buffer rbuf;
+ Page rpage;
+ BTPageOpaque rpageop;
+ bool wasroot;
+ bool wasonly;
+
+ Assert(P_INCOMPLETE_SPLIT(lpageop));
+
+ /* Lock right sibling, the one missing the downlink */
+ rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
+ rpage = BufferGetPage(rbuf);
+ rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /* Could this be a root split? */
+ if (!stack)
+ {
+ Buffer metabuf;
+ Page metapg;
+ BTMetaPageData *metad;
+
+ /* acquire lock on the metapage */
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+
+ wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
+
+ _bt_relbuf(rel, metabuf);
+ }
+ else
+ wasroot = false;
+
+ /* Was this the only page on the level before split? */
+ wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
+
+ elog(DEBUG1, "finishing incomplete split of %u/%u",
+ BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
+
+ _bt_insert_parent(rel, lbuf, rbuf, stack, wasroot, wasonly);
+}
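
A minimal caller-side sketch, not from the patch, of the pattern described above; it mirrors the loop in _bt_getstackbuf() below, where an incomplete split is finished and the same block is simply revisited:

    /* Sketch only: finish a stray incomplete split before using the page */
    buf = _bt_getbuf(rel, blkno, BT_WRITE);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    if (P_INCOMPLETE_SPLIT(opaque))
    {
        /* releases the lock and pin on buf; retry the block afterwards */
        _bt_finish_split(rel, buf, stack);
        buf = _bt_getbuf(rel, blkno, BT_WRITE);
    }
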
+
+/*
+ * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
+ * tuple whose downlink points to child page.
+ *
+ * Caller passes child's block number, which is used to identify
+ * associated pivot tuple in parent page using a linear search that
+ * matches on pivot's downlink/block number. The expected location of
+ * the pivot tuple is taken from the stack one level above the child
+ * page. This is used as a starting point. Insertions into the
+ * parent level could cause the pivot tuple to move right; deletions
+ * could cause it to move left, but not left of the page we previously
+ * found it on.
+ *
+ * Caller can use its stack to relocate the pivot tuple/downlink for
+ * any same-level page to the right of the page found by its initial
+ * descent. This is necessary because of the possibility that caller
+ * moved right to recover from a concurrent page split. It's also
+ * convenient for certain callers to be able to step right when there
+ * wasn't a concurrent page split, while still using their original
+ * stack. For example, the checkingunique _bt_doinsert() case may
+ * have to step right when there are many physical duplicates, and its
+ * scantid forces an insertion to the right of the "first page the
+ * value could be on". (This is also relied on by all of our callers
+ * when dealing with !heapkeyspace indexes.)
+ *
+ * Returns write-locked parent page buffer, or InvalidBuffer if pivot
+ * tuple not found (should not happen). Adjusts bts_blkno &
+ * bts_offset if changed. Page split caller should insert its new
+ * pivot tuple for its new right sibling page on parent page, at the
+ * offset number bts_offset + 1.
+ */
+Buffer
+_bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
+{
+ BlockNumber blkno;
+ OffsetNumber start;
+
+ blkno = stack->bts_blkno;
+ start = stack->bts_offset;
+
+ for (;;)
+ {
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+
+ buf = _bt_getbuf(rel, blkno, BT_WRITE);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (P_INCOMPLETE_SPLIT(opaque))
+ {
+ _bt_finish_split(rel, buf, stack->bts_parent);
+ continue;
+ }
+
+ if (!P_IGNORE(opaque))
+ {
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ ItemId itemid;
+ IndexTuple item;
+
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * start = InvalidOffsetNumber means "search the whole page". We
+ * need this test anyway due to possibility that page has a high
+ * key now when it didn't before.
+ */
+ if (start < minoff)
+ start = minoff;
+
+ /*
+ * Need this check too, to guard against possibility that page
+ * split since we visited it originally.
+ */
+ if (start > maxoff)
+ start = OffsetNumberNext(maxoff);
+
+ /*
+ * These loops will check every item on the page --- but in an
+ * order that's attuned to the probability of where it actually
+ * is. Scan to the right first, then to the left.
+ */
+ for (offnum = start;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ itemid = PageGetItemId(page, offnum);
+ item = (IndexTuple) PageGetItem(page, itemid);
+
+ if (BTreeTupleGetDownLink(item) == child)
+ {
+ /* Return accurate pointer to where link is now */
+ stack->bts_blkno = blkno;
+ stack->bts_offset = offnum;
+ return buf;
+ }
+ }
+
+ for (offnum = OffsetNumberPrev(start);
+ offnum >= minoff;
+ offnum = OffsetNumberPrev(offnum))
+ {
+ itemid = PageGetItemId(page, offnum);
+ item = (IndexTuple) PageGetItem(page, itemid);
+
+ if (BTreeTupleGetDownLink(item) == child)
+ {
+ /* Return accurate pointer to where link is now */
+ stack->bts_blkno = blkno;
+ stack->bts_offset = offnum;
+ return buf;
+ }
+ }
+ }
+
+ /*
+ * The item we're looking for moved right at least one page.
+ *
+ * Lehman and Yao couple/chain locks when moving right here, which we
+ * can avoid. See nbtree/README.
+ */
+ if (P_RIGHTMOST(opaque))
+ {
+ _bt_relbuf(rel, buf);
+ return InvalidBuffer;
+ }
+ blkno = opaque->btpo_next;
+ start = InvalidOffsetNumber;
+ _bt_relbuf(rel, buf);
+ }
+}
+
+/*
+ * _bt_newroot() -- Create a new root page for the index.
+ *
+ * We've just split the old root page and need to create a new one.
+ * In order to do this, we add a new root page to the file, then lock
+ * the metadata page and update it. This is guaranteed to be deadlock-
+ * free, because all readers release their locks on the metadata page
+ * before trying to lock the root, and all writers lock the root before
+ * trying to lock the metadata page. We have a write lock on the old
+ * root page, so we have not introduced any cycles into the waits-for
+ * graph.
+ *
+ * On entry, lbuf (the old root) and rbuf (its new peer) are write-
+ * locked. On exit, a new root page exists with entries for the
+ * two new children, metapage is updated and unlocked/unpinned.
+ * The new root buffer is returned to caller which has to unlock/unpin
+ * lbuf, rbuf & rootbuf.
+ */
+static Buffer
+_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
+{
+ Buffer rootbuf;
+ Page lpage,
+ rootpage;
+ BlockNumber lbkno,
+ rbkno;
+ BlockNumber rootblknum;
+ BTPageOpaque rootopaque;
+ BTPageOpaque lopaque;
+ ItemId itemid;
+ IndexTuple item;
+ IndexTuple left_item;
+ Size left_item_sz;
+ IndexTuple right_item;
+ Size right_item_sz;
+ Buffer metabuf;
+ Page metapg;
+ BTMetaPageData *metad;
+
+ lbkno = BufferGetBlockNumber(lbuf);
+ rbkno = BufferGetBlockNumber(rbuf);
+ lpage = BufferGetPage(lbuf);
+ lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
+
+ /* get a new root page */
+ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ rootpage = BufferGetPage(rootbuf);
+ rootblknum = BufferGetBlockNumber(rootbuf);
+
+ /* acquire lock on the metapage */
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+
+ /*
+ * Create downlink item for left page (old root). The key value used is
+ * "minus infinity", a sentinel value that's reliably less than any real
+ * key value that could appear in the left page.
+ */
+ left_item_sz = sizeof(IndexTupleData);
+ left_item = (IndexTuple) palloc(left_item_sz);
+ left_item->t_info = left_item_sz;
+ BTreeTupleSetDownLink(left_item, lbkno);
+ BTreeTupleSetNAtts(left_item, 0, false);
+
+ /*
+ * Create downlink item for right page. The key for it is obtained from
+ * the "high key" position in the left page.
+ */
+ itemid = PageGetItemId(lpage, P_HIKEY);
+ right_item_sz = ItemIdGetLength(itemid);
+ item = (IndexTuple) PageGetItem(lpage, itemid);
+ right_item = CopyIndexTuple(item);
+ BTreeTupleSetDownLink(right_item, rbkno);
+
+ /* NO EREPORT(ERROR) from here till newroot op is logged */
+ START_CRIT_SECTION();
+
+ /* upgrade metapage if needed */
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
+ _bt_upgrademetapage(metapg);
+
+ /* set btree special data */
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+ rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
+ rootopaque->btpo_flags = BTP_ROOT;
+ rootopaque->btpo_level =
+ ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_level + 1;
+ rootopaque->btpo_cycleid = 0;
+
+ /* update metapage data */
+ metad->btm_root = rootblknum;
+ metad->btm_level = rootopaque->btpo_level;
+ metad->btm_fastroot = rootblknum;
+ metad->btm_fastlevel = rootopaque->btpo_level;
+
+ /*
+ * Insert the left page pointer into the new root page. The root page is
+ * the rightmost page on its level so there is no "high key" in it; the
+ * two items will go into positions P_HIKEY and P_FIRSTKEY.
+ *
+ * Note: we *must* insert the two items in item-number order, for the
+ * benefit of _bt_restore_page().
+ */
+ Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
+ if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add leftkey to new root page"
+ " while splitting block %u of index \"%s\"",
+ BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
+
+ /*
+ * insert the right page pointer into the new root page.
+ */
+ Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
+ Assert(BTreeTupleGetNAtts(right_item, rel) <=
+ IndexRelationGetNumberOfKeyAttributes(rel));
+ if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add rightkey to new root page"
+ " while splitting block %u of index \"%s\"",
+ BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
+
+ /* Clear the incomplete-split flag in the left child */
+ Assert(P_INCOMPLETE_SPLIT(lopaque));
+ lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+ MarkBufferDirty(lbuf);
+
+ MarkBufferDirty(rootbuf);
+ MarkBufferDirty(metabuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_newroot xlrec;
+ XLogRecPtr recptr;
+ xl_btree_metadata md;
+
+ xlrec.rootblk = rootblknum;
+ xlrec.level = metad->btm_level;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
+
+ XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
+ XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
+ XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ md.version = metad->btm_version;
+ md.root = rootblknum;
+ md.level = metad->btm_level;
+ md.fastroot = rootblknum;
+ md.fastlevel = metad->btm_level;
+ md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
+ md.allequalimage = metad->btm_allequalimage;
+
+ XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
+
+ /*
+		 * Direct access to the page is not ideal, but it's faster; ideally
+		 * a new page API function would handle this.
+ */
+ XLogRegisterBufData(0,
+ (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
+ ((PageHeader) rootpage)->pd_special -
+ ((PageHeader) rootpage)->pd_upper);
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
+
+ PageSetLSN(lpage, recptr);
+ PageSetLSN(rootpage, recptr);
+ PageSetLSN(metapg, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* done with metapage */
+ _bt_relbuf(rel, metabuf);
+
+ pfree(left_item);
+ pfree(right_item);
+
+ return rootbuf;
+}
+
+/*
+ * _bt_pgaddtup() -- add a data item to a particular page during split.
+ *
+ * The difference between this routine and a bare PageAddItem call is
+ * that this code can deal with the first data item on an internal btree
+ * page in passing. This data item (which is called "firstright" within
+ * _bt_split()) has a key that must be treated as minus infinity after
+ * the split. Therefore, we truncate away all attributes when caller
+ * specifies it's the first data item on page (downlink is not changed,
+ * though). This extra step is only needed for the right page of an
+ * internal page split. There is no need to do this for the first data
+ * item on the existing/left page, since that will already have been
+ * truncated during an earlier page split.
+ *
+ * See _bt_split() for a high level explanation of why we truncate here.
+ * Note that this routine has nothing to do with suffix truncation,
+ * despite using some of the same infrastructure.
+ */
+static inline bool
+_bt_pgaddtup(Page page,
+ Size itemsize,
+ IndexTuple itup,
+ OffsetNumber itup_off,
+ bool newfirstdataitem)
+{
+ IndexTupleData trunctuple;
+
+ if (newfirstdataitem)
+ {
+ trunctuple = *itup;
+ trunctuple.t_info = sizeof(IndexTupleData);
+ BTreeTupleSetNAtts(&trunctuple, 0, false);
+ itup = &trunctuple;
+ itemsize = sizeof(IndexTupleData);
+ }
+
+ if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
+ false) == InvalidOffsetNumber))
+ return false;
+
+ return true;
+}
+
+/*
+ * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
+ *
+ * There are three operations performed here: simple index deletion, bottom-up
+ * index deletion, and deduplication. If all three operations fail to free
+ * enough space for the incoming item then caller will go on to split the
+ * page. We always consider simple deletion first. If that doesn't work out
+ * we consider alternatives. Callers that only want us to consider simple
+ * deletion (without any fallback) ask for that using the 'simpleonly'
+ * argument.
+ *
+ * We usually pick only one alternative "complex" operation when simple
+ * deletion alone won't prevent a page split. The 'checkingunique',
+ * 'uniquedup', and 'indexUnchanged' arguments are used for that.
+ *
+ * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
+ * level flag was found set. The flag was useful back when there wasn't
+ * necessarily one single page for a duplicate tuple to go on (before heap TID
+ * became a part of the key space in version 4 indexes). But we don't
+ * actually look at the flag anymore (it's not a gating condition for our
+ * caller). That would cause us to miss tuples that are safe to delete,
+ * without getting any benefit in return. We know that the alternative is to
+ * split the page; scanning the line pointer array in passing won't have
+ * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
+ * all this because !heapkeyspace indexes must still do a "getting tired"
+ * linear search, and so are likely to get some benefit from using it as a
+ * gating condition.)
+ */
+static void
+_bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
+ BTInsertState insertstate,
+ bool simpleonly, bool checkingunique,
+ bool uniquedup, bool indexUnchanged)
+{
+ OffsetNumber deletable[MaxIndexTuplesPerPage];
+ int ndeletable = 0;
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ Buffer buffer = insertstate->buf;
+ BTScanInsert itup_key = insertstate->itup_key;
+ Page page = BufferGetPage(buffer);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ Assert(P_ISLEAF(opaque));
+ Assert(simpleonly || itup_key->heapkeyspace);
+ Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
+
+ /*
+ * Scan over all items to see which ones need to be deleted according to
+ * LP_DEAD flags. We'll usually manage to delete a few extra items that
+ * are not marked LP_DEAD in passing. Often the extra items that actually
+ * end up getting deleted are items that would have had their LP_DEAD bit
+ * set before long anyway (if we opted not to include them as extras).
+ */
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemId = PageGetItemId(page, offnum);
+
+ if (ItemIdIsDead(itemId))
+ deletable[ndeletable++] = offnum;
+ }
+
+ if (ndeletable > 0)
+ {
+ _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
+ insertstate->itup, minoff, maxoff);
+ insertstate->bounds_valid = false;
+
+ /* Return when a page split has already been avoided */
+ if (PageGetFreeSpace(page) >= insertstate->itemsz)
+ return;
+
+ /* Might as well assume duplicates (if checkingunique) */
+ uniquedup = true;
+ }
+
+ /*
+ * We're done with simple deletion. Return early with callers that only
+ * call here so that simple deletion can be considered. This includes
+ * callers that explicitly ask for this and checkingunique callers that
+ * probably don't have any version churn duplicates on the page.
+ *
+ * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
+	 * return at this point (or when we go on to try either or both of our
+ * other strategies and they also fail). We do not bother expending a
+ * separate write to clear it, however. Caller will definitely clear it
+ * when it goes on to split the page (note also that the deduplication
+ * process will clear the flag in passing, just to keep things tidy).
+ */
+ if (simpleonly || (checkingunique && !uniquedup))
+ {
+ Assert(!indexUnchanged);
+ return;
+ }
+
+ /* Assume bounds about to be invalidated (this is almost certain now) */
+ insertstate->bounds_valid = false;
+
+ /*
+ * Perform bottom-up index deletion pass when executor hint indicated that
+ * incoming item is logically unchanged, or for a unique index that is
+ * known to have physical duplicates for some other reason. (There is a
+ * large overlap between these two cases for a unique index. It's worth
+ * having both triggering conditions in order to apply the optimization in
+ * the event of successive related INSERT and DELETE statements.)
+ *
+ * We'll go on to do a deduplication pass when a bottom-up pass fails to
+ * delete an acceptable amount of free space (a significant fraction of
+ * the page, or space for the new item, whichever is greater).
+ *
+ * Note: Bottom-up index deletion uses the same equality/equivalence
+ * routines as deduplication internally. However, it does not merge
+ * together index tuples, so the same correctness considerations do not
+ * apply. We deliberately omit an index-is-allequalimage test here.
+ */
+ if ((indexUnchanged || uniquedup) &&
+ _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
+ return;
+
+ /* Perform deduplication pass (when enabled and index-is-allequalimage) */
+ if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
+ _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
+ insertstate->itemsz, (indexUnchanged || uniquedup));
+}
+
+/*
+ * _bt_simpledel_pass - Simple index tuple deletion pass.
+ *
+ * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
+ * of all such tuples are determined by caller (caller passes these to us as
+ * its 'deletable' argument).
+ *
+ * We might also delete extra index tuples that turn out to be safe to delete
+ * in passing (though they must be cheap to check in passing to begin with).
+ * There is no certainty that any extra tuples will be deleted, though. The
+ * high level goal of the approach we take is to get the most out of each call
+ * here (without noticeably increasing the per-call overhead compared to what
+ * we need to do just to be able to delete the page's LP_DEAD-marked index
+ * tuples).
+ *
+ * The number of extra index tuples that turn out to be deletable might
+ * greatly exceed the number of LP_DEAD-marked index tuples due to various
+ * locality related effects. For example, it's possible that the total number
+ * of table blocks (pointed to by all TIDs on the leaf page) is naturally
+ * quite low, in which case we might end up checking if it's possible to
+ * delete _most_ index tuples on the page (without the tableam needing to
+ * access additional table blocks). The tableam will sometimes stumble upon
+ * _many_ extra deletable index tuples in indexes where this pattern is
+ * common.
+ *
+ * See nbtree/README for further details on simple index tuple deletion.
+ */
+static void
+_bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
+ OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
+ OffsetNumber minoff, OffsetNumber maxoff)
+{
+ Page page = BufferGetPage(buffer);
+ BlockNumber *deadblocks;
+ int ndeadblocks;
+ TM_IndexDeleteOp delstate;
+ OffsetNumber offnum;
+
+ /* Get array of table blocks pointed to by LP_DEAD-set tuples */
+ deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
+ &ndeadblocks);
+
+ /* Initialize tableam state that describes index deletion operation */
+ delstate.bottomup = false;
+ delstate.bottomupfreespace = 0;
+ delstate.ndeltids = 0;
+ delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
+ delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
+
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+ TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
+ TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
+ BlockNumber tidblock;
+ void *match;
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
+ match = bsearch(&tidblock, deadblocks, ndeadblocks,
+ sizeof(BlockNumber), _bt_blk_cmp);
+
+ if (!match)
+ {
+ Assert(!ItemIdIsDead(itemid));
+ continue;
+ }
+
+ /*
+ * TID's table block is among those pointed to by the TIDs from
+ * LP_DEAD-bit set tuples on page -- add TID to deltids
+ */
+ odeltid->tid = itup->t_tid;
+ odeltid->id = delstate.ndeltids;
+ ostatus->idxoffnum = offnum;
+ ostatus->knowndeletable = ItemIdIsDead(itemid);
+ ostatus->promising = false; /* unused */
+ ostatus->freespace = 0; /* unused */
+
+ delstate.ndeltids++;
+ }
+ else
+ {
+ int nitem = BTreeTupleGetNPosting(itup);
+
+ for (int p = 0; p < nitem; p++)
+ {
+ ItemPointer tid = BTreeTupleGetPostingN(itup, p);
+
+ tidblock = ItemPointerGetBlockNumber(tid);
+ match = bsearch(&tidblock, deadblocks, ndeadblocks,
+ sizeof(BlockNumber), _bt_blk_cmp);
+
+ if (!match)
+ {
+ Assert(!ItemIdIsDead(itemid));
+ continue;
+ }
+
+ /*
+ * TID's table block is among those pointed to by the TIDs
+ * from LP_DEAD-bit set tuples on page -- add TID to deltids
+ */
+ odeltid->tid = *tid;
+ odeltid->id = delstate.ndeltids;
+ ostatus->idxoffnum = offnum;
+ ostatus->knowndeletable = ItemIdIsDead(itemid);
+ ostatus->promising = false; /* unused */
+ ostatus->freespace = 0; /* unused */
+
+ odeltid++;
+ ostatus++;
+ delstate.ndeltids++;
+ }
+ }
+ }
+
+ pfree(deadblocks);
+
+ Assert(delstate.ndeltids >= ndeletable);
+
+ /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
+ _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
+
+ pfree(delstate.deltids);
+ pfree(delstate.status);
+}
+
+/*
+ * _bt_deadblocks() -- Get LP_DEAD related table blocks.
+ *
+ * Builds sorted and unique-ified array of table block numbers from index
+ * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
+ * block from incoming newitem just in case it isn't among the LP_DEAD-related
+ * table blocks.
+ *
+ * Always counting the newitem's table block as an LP_DEAD related block makes
+ * sense because the cost is consistently low; it is practically certain that
+ * the table block will not incur a buffer miss in tableam. On the other hand
+ * the benefit is often quite high. There is a decent chance that there will
+ * be some deletable items from this block, since in general most garbage
+ * tuples became garbage in the recent past (in many cases this won't be the
+ * first logical row that core code added to/modified in table block
+ * recently).
+ *
+ * Returns final array, and sets *nblocks to its final size for caller.
+ */
+static BlockNumber *
+_bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
+ IndexTuple newitem, int *nblocks)
+{
+ int spacentids,
+ ntids;
+ BlockNumber *tidblocks;
+
+ /*
+ * Accumulate each TID's block in array whose initial size has space for
+ * one table block per LP_DEAD-set tuple (plus space for the newitem table
+ * block). Array will only need to grow when there are LP_DEAD-marked
+ * posting list tuples (which is not that common).
+ */
+ spacentids = ndeletable + 1;
+ ntids = 0;
+ tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
+
+ /*
+ * First add the table block for the incoming newitem. This is the one
+ * case where simple deletion can visit a table block that doesn't have
+ * any known deletable items.
+ */
+ Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
+ tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
+
+ for (int i = 0; i < ndeletable; i++)
+ {
+ ItemId itemid = PageGetItemId(page, deletable[i]);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+
+ Assert(ItemIdIsDead(itemid));
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ if (ntids + 1 > spacentids)
+ {
+ spacentids *= 2;
+ tidblocks = (BlockNumber *)
+ repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
+ }
+
+ tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
+ }
+ else
+ {
+ int nposting = BTreeTupleGetNPosting(itup);
+
+ if (ntids + nposting > spacentids)
+ {
+ spacentids = Max(spacentids * 2, ntids + nposting);
+ tidblocks = (BlockNumber *)
+ repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
+ }
+
+ for (int j = 0; j < nposting; j++)
+ {
+ ItemPointer tid = BTreeTupleGetPostingN(itup, j);
+
+ tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
+ }
+ }
+ }
+
+ qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
+ *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
+
+ return tidblocks;
+}
+
+/*
+ * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
+ */
+static inline int
+_bt_blk_cmp(const void *arg1, const void *arg2)
+{
+ BlockNumber b1 = *((BlockNumber *) arg1);
+ BlockNumber b2 = *((BlockNumber *) arg2);
+
+ if (b1 < b2)
+ return -1;
+ else if (b1 > b2)
+ return 1;
+
+ return 0;
+}
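
For illustration only (not part of the patch), this is how _bt_blk_cmp() is used by _bt_deadblocks() to sort and unique-ify its block array; the literal block numbers are invented:

    /* Sketch only: sort then de-duplicate a small block-number array */
    BlockNumber blocks[] = {7, 3, 7, 12, 3};
    int         nblocks = lengthof(blocks);

    qsort(blocks, nblocks, sizeof(BlockNumber), _bt_blk_cmp);
    /* blocks is now {3, 3, 7, 7, 12} */
    nblocks = qunique(blocks, nblocks, sizeof(BlockNumber), _bt_blk_cmp);
    /* the first nblocks == 3 entries are now {3, 7, 12} */
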
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
new file mode 100644
index 0000000..ebec8fa
--- /dev/null
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -0,0 +1,3073 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtpage.c
+ * BTree-specific page management code for the Postgres btree access
+ * method.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtpage.c
+ *
+ * NOTES
+ * Postgres btree pages look like ordinary relation pages. The opaque
+ * data at high addresses includes pointers to left and right siblings
+ * and flag data describing page state. The first page in a btree, page
+ * zero, is special -- it stores meta-information describing the tree.
+ * Pages one and higher store the actual tree data.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/nbtxlog.h"
+#include "access/tableam.h"
+#include "access/transam.h"
+#include "access/xlog.h"
+#include "access/xloginsert.h"
+#include "miscadmin.h"
+#include "storage/indexfsm.h"
+#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "storage/procarray.h"
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "utils/snapmgr.h"
+
+static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
+static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
+ FullTransactionId safexid);
+static void _bt_delitems_delete(Relation rel, Buffer buf,
+ TransactionId latestRemovedXid,
+ OffsetNumber *deletable, int ndeletable,
+ BTVacuumPosting *updatable, int nupdatable);
+static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
+ OffsetNumber *updatedoffsets,
+ Size *updatedbuflen, bool needswal);
+static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
+ BTStack stack);
+static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
+ BlockNumber scanblkno,
+ bool *rightsib_empty,
+ BTVacState *vstate);
+static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
+ BTStack stack,
+ Buffer *subtreeparent,
+ OffsetNumber *poffset,
+ BlockNumber *topparent,
+ BlockNumber *topparentrightsib);
+static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target,
+ FullTransactionId safexid);
+
+/*
+ * _bt_initmetapage() -- Fill a page buffer with a correct metapage image
+ */
+void
+_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
+ bool allequalimage)
+{
+ BTMetaPageData *metad;
+ BTPageOpaque metaopaque;
+
+ _bt_pageinit(page, BLCKSZ);
+
+ metad = BTPageGetMeta(page);
+ metad->btm_magic = BTREE_MAGIC;
+ metad->btm_version = BTREE_VERSION;
+ metad->btm_root = rootbknum;
+ metad->btm_level = level;
+ metad->btm_fastroot = rootbknum;
+ metad->btm_fastlevel = level;
+ metad->btm_last_cleanup_num_delpages = 0;
+ metad->btm_last_cleanup_num_heap_tuples = -1.0;
+ metad->btm_allequalimage = allequalimage;
+
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ metaopaque->btpo_flags = BTP_META;
+
+ /*
+ * Set pd_lower just past the end of the metadata. This is essential,
+ * because without doing so, metadata will be lost if xlog.c compresses
+ * the page.
+ */
+ ((PageHeader) page)->pd_lower =
+ ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
+}
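
A hedged usage sketch, not from this file: an empty index is created by initializing only the metapage, roughly what btbuildempty() in nbtree.c does. The local variable names here are assumptions of the sketch:

    /* Sketch only: an empty btree is just an initialized metapage */
    Page    metapage = (Page) palloc(BLCKSZ);

    /* P_NONE root and level 0 mean "no root yet"; _bt_getroot() builds it */
    _bt_initmetapage(metapage, P_NONE, 0, allequalimage);
    /* caller then writes out and WAL-logs the page, as btbuildempty() does */
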
+
+/*
+ * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
+ * 3, the last version that can be updated without broadly affecting
+ * on-disk compatibility. (A REINDEX is required to upgrade to v4.)
+ *
+ * This routine does purely in-memory image upgrade. Caller is
+ * responsible for locking, WAL-logging etc.
+ */
+void
+_bt_upgrademetapage(Page page)
+{
+ BTMetaPageData *metad;
+ BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
+
+ metad = BTPageGetMeta(page);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* It must be really a meta page of upgradable version */
+ Assert(metaopaque->btpo_flags & BTP_META);
+ Assert(metad->btm_version < BTREE_NOVAC_VERSION);
+ Assert(metad->btm_version >= BTREE_MIN_VERSION);
+
+ /* Set version number and fill extra fields added into version 3 */
+ metad->btm_version = BTREE_NOVAC_VERSION;
+ metad->btm_last_cleanup_num_delpages = 0;
+ metad->btm_last_cleanup_num_heap_tuples = -1.0;
+ /* Only a REINDEX can set this field */
+ Assert(!metad->btm_allequalimage);
+ metad->btm_allequalimage = false;
+
+ /* Adjust pd_lower (see _bt_initmetapage() for details) */
+ ((PageHeader) page)->pd_lower =
+ ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
+}
+
+/*
+ * Get metadata from share-locked buffer containing metapage, while performing
+ * standard sanity checks.
+ *
+ * Callers that cache data returned here in local cache should note that an
+ * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
+ * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
+ */
+static BTMetaPageData *
+_bt_getmeta(Relation rel, Buffer metabuf)
+{
+ Page metapg;
+ BTPageOpaque metaopaque;
+ BTMetaPageData *metad;
+
+ metapg = BufferGetPage(metabuf);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
+ metad = BTPageGetMeta(metapg);
+
+ /* sanity-check the metapage */
+ if (!P_ISMETA(metaopaque) ||
+ metad->btm_magic != BTREE_MAGIC)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" is not a btree",
+ RelationGetRelationName(rel))));
+
+ if (metad->btm_version < BTREE_MIN_VERSION ||
+ metad->btm_version > BTREE_VERSION)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("version mismatch in index \"%s\": file version %d, "
+ "current version %d, minimal supported version %d",
+ RelationGetRelationName(rel),
+ metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
+
+ return metad;
+}
+
+/*
+ * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
+ *
+ * Called by btvacuumcleanup when btbulkdelete was never called because no
+ * index tuples needed to be deleted.
+ */
+bool
+_bt_vacuum_needs_cleanup(Relation rel)
+{
+ Buffer metabuf;
+ Page metapg;
+ BTMetaPageData *metad;
+ uint32 btm_version;
+ BlockNumber prev_num_delpages;
+
+ /*
+ * Copy details from metapage to local variables quickly.
+ *
+ * Note that we deliberately avoid using cached version of metapage here.
+ */
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+ btm_version = metad->btm_version;
+
+ if (btm_version < BTREE_NOVAC_VERSION)
+ {
+ /*
+ * Metapage needs to be dynamically upgraded to store fields that are
+ * only present when btm_version >= BTREE_NOVAC_VERSION
+ */
+ _bt_relbuf(rel, metabuf);
+ return true;
+ }
+
+ prev_num_delpages = metad->btm_last_cleanup_num_delpages;
+ _bt_relbuf(rel, metabuf);
+
+ /*
+ * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
+ * total size of the index. We can reasonably expect (though are not
+ * guaranteed) to be able to recycle this many pages if we decide to do a
+ * btvacuumscan call during the ongoing btvacuumcleanup. For further
+ * details see the nbtree/README section on placing deleted pages in the
+ * FSM.
+ */
+ if (prev_num_delpages > 0 &&
+ prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
+ return true;
+
+ return false;
+}
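
Reading the 5% threshold above with concrete (invented) numbers may help; this is only an illustration:

    /* Sketch only: the 5% cleanup threshold in concrete numbers */
    BlockNumber total_blocks = 10000;          /* hypothetical index size */
    BlockNumber threshold = total_blocks / 20; /* 500 pages, i.e. 5% */

    /* cleanup is requested once prev_num_delpages exceeds 'threshold' */
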
+
+/*
+ * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup.
+ *
+ * Called at the end of btvacuumcleanup, when num_delpages value has been
+ * finalized.
+ */
+void
+_bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
+{
+ Buffer metabuf;
+ Page metapg;
+ BTMetaPageData *metad;
+
+ /*
+ * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
+ * field started out as a TransactionId field called btm_oldest_btpo_xact.
+ * Both "versions" are just uint32 fields. It was convenient to repurpose
+ * the field when we began to use 64-bit XIDs in deleted pages.
+ *
+ * It's possible that a pg_upgrade'd database will contain an XID value in
+ * what is now recognized as the metapage's btm_last_cleanup_num_delpages
+ * field. _bt_vacuum_needs_cleanup() may even believe that this value
+ * indicates that there are lots of pages that it needs to recycle, when
+ * in reality there are only one or two. The worst that can happen is
+ * that there will be a call to btvacuumscan a little earlier, which will
+ * set btm_last_cleanup_num_delpages to a sane value when we're called.
+ *
+ * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
+ * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just
+ * to be consistent.
+ */
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+
+	/* Don't miss the chance to upgrade an index still at BTREE_MIN_VERSION */
+ if (metad->btm_version >= BTREE_NOVAC_VERSION &&
+ metad->btm_last_cleanup_num_delpages == num_delpages)
+ {
+ /* Usually means index continues to have num_delpages of 0 */
+ _bt_relbuf(rel, metabuf);
+ return;
+ }
+
+ /* trade in our read lock for a write lock */
+ _bt_unlockbuf(rel, metabuf);
+ _bt_lockbuf(rel, metabuf, BT_WRITE);
+
+ START_CRIT_SECTION();
+
+ /* upgrade meta-page if needed */
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
+ _bt_upgrademetapage(metapg);
+
+ /* update cleanup-related information */
+ metad->btm_last_cleanup_num_delpages = num_delpages;
+ metad->btm_last_cleanup_num_heap_tuples = -1.0;
+ MarkBufferDirty(metabuf);
+
+ /* write wal record if needed */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_metadata md;
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ md.version = metad->btm_version;
+ md.root = metad->btm_root;
+ md.level = metad->btm_level;
+ md.fastroot = metad->btm_fastroot;
+ md.fastlevel = metad->btm_fastlevel;
+ md.last_cleanup_num_delpages = num_delpages;
+ md.allequalimage = metad->btm_allequalimage;
+
+ XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
+
+ PageSetLSN(metapg, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ _bt_relbuf(rel, metabuf);
+}
+
+/*
+ * _bt_getroot() -- Get the root page of the btree.
+ *
+ * Since the root page can move around the btree file, we have to read
+ * its location from the metadata page, and then read the root page
+ * itself. If no root page exists yet, we have to create one.
+ *
+ * The access type parameter (BT_READ or BT_WRITE) controls whether
+ * a new root page will be created or not. If access = BT_READ,
+ * and no root page exists, we just return InvalidBuffer. For
+ * BT_WRITE, we try to create the root page if it doesn't exist.
+ * NOTE that the returned root page will have only a read lock set
+ * on it even if access = BT_WRITE!
+ *
+ * The returned page is not necessarily the true root --- it could be
+ * a "fast root" (a page that is alone in its level due to deletions).
+ * Also, if the root page is split while we are "in flight" to it,
+ * what we will return is the old root, which is now just the leftmost
+ * page on a probably-not-very-wide level. For most purposes this is
+ * as good as or better than the true root, so we do not bother to
+ * insist on finding the true root. We do, however, guarantee to
+ * return a live (not deleted or half-dead) page.
+ *
+ * On successful return, the root page is pinned and read-locked.
+ * The metadata page is not locked or pinned on exit.
+ */
+Buffer
+_bt_getroot(Relation rel, int access)
+{
+ Buffer metabuf;
+ Buffer rootbuf;
+ Page rootpage;
+ BTPageOpaque rootopaque;
+ BlockNumber rootblkno;
+ uint32 rootlevel;
+ BTMetaPageData *metad;
+
+ /*
+ * Try to use previously-cached metapage data to find the root. This
+ * normally saves one buffer access per index search, which is a very
+ * helpful savings in bufmgr traffic and hence contention.
+ */
+ if (rel->rd_amcache != NULL)
+ {
+ metad = (BTMetaPageData *) rel->rd_amcache;
+ /* We shouldn't have cached it if any of these fail */
+ Assert(metad->btm_magic == BTREE_MAGIC);
+ Assert(metad->btm_version >= BTREE_MIN_VERSION);
+ Assert(metad->btm_version <= BTREE_VERSION);
+ Assert(!metad->btm_allequalimage ||
+ metad->btm_version > BTREE_NOVAC_VERSION);
+ Assert(metad->btm_root != P_NONE);
+
+ rootblkno = metad->btm_fastroot;
+ Assert(rootblkno != P_NONE);
+ rootlevel = metad->btm_fastlevel;
+
+ rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+ rootpage = BufferGetPage(rootbuf);
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+
+ /*
+ * Since the cache might be stale, we check the page more carefully
+ * here than normal. We *must* check that it's not deleted. If it's
+ * not alone on its level, then we reject too --- this may be overly
+ * paranoid but better safe than sorry. Note we don't check P_ISROOT,
+ * because that's not set in a "fast root".
+ */
+ if (!P_IGNORE(rootopaque) &&
+ rootopaque->btpo_level == rootlevel &&
+ P_LEFTMOST(rootopaque) &&
+ P_RIGHTMOST(rootopaque))
+ {
+ /* OK, accept cached page as the root */
+ return rootbuf;
+ }
+ _bt_relbuf(rel, rootbuf);
+ /* Cache is stale, throw it away */
+ if (rel->rd_amcache)
+ pfree(rel->rd_amcache);
+ rel->rd_amcache = NULL;
+ }
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metad = _bt_getmeta(rel, metabuf);
+
+ /* if no root page initialized yet, do it */
+ if (metad->btm_root == P_NONE)
+ {
+ Page metapg;
+
+ /* If access = BT_READ, caller doesn't want us to create root yet */
+ if (access == BT_READ)
+ {
+ _bt_relbuf(rel, metabuf);
+ return InvalidBuffer;
+ }
+
+ /* trade in our read lock for a write lock */
+ _bt_unlockbuf(rel, metabuf);
+ _bt_lockbuf(rel, metabuf, BT_WRITE);
+
+ /*
+ * Race condition: if someone else initialized the metadata between
+ * the time we released the read lock and acquired the write lock, we
+ * must avoid doing it again.
+ */
+ if (metad->btm_root != P_NONE)
+ {
+ /*
+ * Metadata initialized by someone else. In order to guarantee no
+ * deadlocks, we have to release the metadata page and start all
+ * over again. (Is that really true? But it's hardly worth trying
+ * to optimize this case.)
+ */
+ _bt_relbuf(rel, metabuf);
+ return _bt_getroot(rel, access);
+ }
+
+ /*
+ * Get, initialize, write, and leave a lock of the appropriate type on
+ * the new root page. Since this is the first page in the tree, it's
+ * a leaf as well as the root.
+ */
+ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ rootblkno = BufferGetBlockNumber(rootbuf);
+ rootpage = BufferGetPage(rootbuf);
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+ rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
+ rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
+ rootopaque->btpo_level = 0;
+ rootopaque->btpo_cycleid = 0;
+ /* Get raw page pointer for metapage */
+ metapg = BufferGetPage(metabuf);
+
+ /* NO ELOG(ERROR) till meta is updated */
+ START_CRIT_SECTION();
+
+ /* upgrade metapage if needed */
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
+ _bt_upgrademetapage(metapg);
+
+ metad->btm_root = rootblkno;
+ metad->btm_level = 0;
+ metad->btm_fastroot = rootblkno;
+ metad->btm_fastlevel = 0;
+ metad->btm_last_cleanup_num_delpages = 0;
+ metad->btm_last_cleanup_num_heap_tuples = -1.0;
+
+ MarkBufferDirty(rootbuf);
+ MarkBufferDirty(metabuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_newroot xlrec;
+ XLogRecPtr recptr;
+ xl_btree_metadata md;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
+ XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ md.version = metad->btm_version;
+ md.root = rootblkno;
+ md.level = 0;
+ md.fastroot = rootblkno;
+ md.fastlevel = 0;
+ md.last_cleanup_num_delpages = 0;
+ md.allequalimage = metad->btm_allequalimage;
+
+ XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
+
+ xlrec.rootblk = rootblkno;
+ xlrec.level = 0;
+
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
+
+ PageSetLSN(rootpage, recptr);
+ PageSetLSN(metapg, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /*
+ * swap root write lock for read lock. There is no danger of anyone
+ * else accessing the new root page while it's unlocked, since no one
+ * else knows where it is yet.
+ */
+ _bt_unlockbuf(rel, rootbuf);
+ _bt_lockbuf(rel, rootbuf, BT_READ);
+
+ /* okay, metadata is correct, release lock on it without caching */
+ _bt_relbuf(rel, metabuf);
+ }
+ else
+ {
+ rootblkno = metad->btm_fastroot;
+ Assert(rootblkno != P_NONE);
+ rootlevel = metad->btm_fastlevel;
+
+ /*
+ * Cache the metapage data for next time
+ */
+ rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+ sizeof(BTMetaPageData));
+ memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+
+ /*
+ * We are done with the metapage; arrange to release it via first
+ * _bt_relandgetbuf call
+ */
+ rootbuf = metabuf;
+
+ for (;;)
+ {
+ rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
+ rootpage = BufferGetPage(rootbuf);
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+
+ if (!P_IGNORE(rootopaque))
+ break;
+
+ /* it's dead, Jim. step right one page */
+ if (P_RIGHTMOST(rootopaque))
+ elog(ERROR, "no live root page found in index \"%s\"",
+ RelationGetRelationName(rel));
+ rootblkno = rootopaque->btpo_next;
+ }
+
+ if (rootopaque->btpo_level != rootlevel)
+ elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
+ rootblkno, RelationGetRelationName(rel),
+ rootopaque->btpo_level, rootlevel);
+ }
+
+ /*
+ * By here, we have a pin and read lock on the root page, and no lock set
+ * on the metadata page. Return the root page's buffer.
+ */
+ return rootbuf;
+}
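+
+/*
+ * Sketch of how a read-only caller typically consumes _bt_getroot().  This is
+ * purely illustrative (the caller shown here is hypothetical); it only spells
+ * out the InvalidBuffer contract for the BT_READ case described above:
+ *
+ *		rootbuf = _bt_getroot(rel, BT_READ);
+ *		if (!BufferIsValid(rootbuf))
+ *			return;				(empty index: no root page exists yet)
+ *		rootpage = BufferGetPage(rootbuf);
+ *		... descend from the (fast) root, then _bt_relbuf(rel, rootbuf) ...
+ */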
+
+/*
+ * _bt_gettrueroot() -- Get the true root page of the btree.
+ *
+ * This is the same as the BT_READ case of _bt_getroot(), except
+ * we follow the true-root link not the fast-root link.
+ *
+ * By the time we acquire lock on the root page, it might have been split and
+ * not be the true root anymore. This is okay for the present uses of this
+ * routine; we only really need to be able to move up at least one tree level
+ * from whatever non-root page we were at. If we ever do need to lock the
+ * one true root page, we could loop here, re-reading the metapage on each
+ * failure. (Note that it wouldn't do to hold the lock on the metapage while
+ * moving to the root --- that'd deadlock against any concurrent root split.)
+ */
+Buffer
+_bt_gettrueroot(Relation rel)
+{
+ Buffer metabuf;
+ Page metapg;
+ BTPageOpaque metaopaque;
+ Buffer rootbuf;
+ Page rootpage;
+ BTPageOpaque rootopaque;
+ BlockNumber rootblkno;
+ uint32 rootlevel;
+ BTMetaPageData *metad;
+
+ /*
+ * We don't try to use cached metapage data here, since (a) this path is
+ * not performance-critical, and (b) if we are here it suggests our cache
+ * is out-of-date anyway. In light of point (b), it's probably safest to
+ * actively flush any cached metapage info.
+ */
+ if (rel->rd_amcache)
+ pfree(rel->rd_amcache);
+ rel->rd_amcache = NULL;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metapg = BufferGetPage(metabuf);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
+ metad = BTPageGetMeta(metapg);
+
+ if (!P_ISMETA(metaopaque) ||
+ metad->btm_magic != BTREE_MAGIC)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" is not a btree",
+ RelationGetRelationName(rel))));
+
+ if (metad->btm_version < BTREE_MIN_VERSION ||
+ metad->btm_version > BTREE_VERSION)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("version mismatch in index \"%s\": file version %d, "
+ "current version %d, minimal supported version %d",
+ RelationGetRelationName(rel),
+ metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
+
+ /* if no root page initialized yet, fail */
+ if (metad->btm_root == P_NONE)
+ {
+ _bt_relbuf(rel, metabuf);
+ return InvalidBuffer;
+ }
+
+ rootblkno = metad->btm_root;
+ rootlevel = metad->btm_level;
+
+ /*
+ * We are done with the metapage; arrange to release it via first
+ * _bt_relandgetbuf call
+ */
+ rootbuf = metabuf;
+
+ for (;;)
+ {
+ rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
+ rootpage = BufferGetPage(rootbuf);
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+
+ if (!P_IGNORE(rootopaque))
+ break;
+
+ /* it's dead, Jim. step right one page */
+ if (P_RIGHTMOST(rootopaque))
+ elog(ERROR, "no live root page found in index \"%s\"",
+ RelationGetRelationName(rel));
+ rootblkno = rootopaque->btpo_next;
+ }
+
+ if (rootopaque->btpo_level != rootlevel)
+ elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
+ rootblkno, RelationGetRelationName(rel),
+ rootopaque->btpo_level, rootlevel);
+
+ return rootbuf;
+}
+
+/*
+ * _bt_getrootheight() -- Get the height of the btree search tree.
+ *
+ * We return the level (counting from zero) of the current fast root.
+ * This represents the number of tree levels we'd have to descend through
+ * to start any btree index search.
+ *
+ * This is used by the planner for cost-estimation purposes. Since it's
+ * only an estimate, slightly-stale data is fine, hence we don't worry
+ * about updating previously cached data.
+ */
+int
+_bt_getrootheight(Relation rel)
+{
+ BTMetaPageData *metad;
+
+ if (rel->rd_amcache == NULL)
+ {
+ Buffer metabuf;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metad = _bt_getmeta(rel, metabuf);
+
+ /*
+ * If there's no root page yet, _bt_getroot() doesn't expect a cache
+ * to be made, so just stop here and report the index height is zero.
+ * (XXX perhaps _bt_getroot() should be changed to allow this case.)
+ */
+ if (metad->btm_root == P_NONE)
+ {
+ _bt_relbuf(rel, metabuf);
+ return 0;
+ }
+
+ /*
+ * Cache the metapage data for next time
+ */
+ rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+ sizeof(BTMetaPageData));
+ memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+ _bt_relbuf(rel, metabuf);
+ }
+
+ /* Get cached page */
+ metad = (BTMetaPageData *) rel->rd_amcache;
+ /* We shouldn't have cached it if any of these fail */
+ Assert(metad->btm_magic == BTREE_MAGIC);
+ Assert(metad->btm_version >= BTREE_MIN_VERSION);
+ Assert(metad->btm_version <= BTREE_VERSION);
+ Assert(!metad->btm_allequalimage ||
+ metad->btm_version > BTREE_NOVAC_VERSION);
+ Assert(metad->btm_fastroot != P_NONE);
+
+ return metad->btm_fastlevel;
+}
+
+/*
+ * _bt_metaversion() -- Get version/status info from metapage.
+ *
+ * Sets caller's *heapkeyspace and *allequalimage arguments using data
+ * from the B-Tree metapage (could be locally-cached version). This
+ * information needs to be stashed in insertion scankey, so we provide a
+ * single function that fetches both at once.
+ *
+ * This is used to determine the rules that must be used to descend a
+ * btree. Version 4 indexes treat heap TID as a tiebreaker attribute.
+ * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
+ * performance when inserting a new BTScanInsert-wise duplicate tuple
+ * among many leaf pages already full of such duplicates.
+ *
+ * Also sets allequalimage field, which indicates whether or not it is
+ * safe to apply deduplication. We rely on the assumption that
+ * btm_allequalimage will be zero'ed on heapkeyspace indexes that were
+ * pg_upgrade'd from Postgres 12.
+ */
+void
+_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
+{
+ BTMetaPageData *metad;
+
+ if (rel->rd_amcache == NULL)
+ {
+ Buffer metabuf;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metad = _bt_getmeta(rel, metabuf);
+
+ /*
+ * If there's no root page yet, _bt_getroot() doesn't expect a cache
+ * to be made, so just stop here. (XXX perhaps _bt_getroot() should
+ * be changed to allow this case.)
+ */
+ if (metad->btm_root == P_NONE)
+ {
+ *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
+ *allequalimage = metad->btm_allequalimage;
+
+ _bt_relbuf(rel, metabuf);
+ return;
+ }
+
+ /*
+ * Cache the metapage data for next time
+ *
+ * An on-the-fly version upgrade performed by _bt_upgrademetapage()
+ * can change the nbtree version for an index without invalidating any
+ * local cache. This is okay because it can only happen when moving
+ * from version 2 to version 3, both of which are !heapkeyspace
+ * versions.
+ */
+ rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+ sizeof(BTMetaPageData));
+ memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+ _bt_relbuf(rel, metabuf);
+ }
+
+ /* Get cached page */
+ metad = (BTMetaPageData *) rel->rd_amcache;
+ /* We shouldn't have cached it if any of these fail */
+ Assert(metad->btm_magic == BTREE_MAGIC);
+ Assert(metad->btm_version >= BTREE_MIN_VERSION);
+ Assert(metad->btm_version <= BTREE_VERSION);
+ Assert(!metad->btm_allequalimage ||
+ metad->btm_version > BTREE_NOVAC_VERSION);
+ Assert(metad->btm_fastroot != P_NONE);
+
+ *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
+ *allequalimage = metad->btm_allequalimage;
+}
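+
+/*
+ * Sketch of the intended calling pattern for _bt_metaversion(), illustrative
+ * only (no specific caller is implied):
+ *
+ *		bool	heapkeyspace,
+ *				allequalimage;
+ *
+ *		_bt_metaversion(rel, &heapkeyspace, &allequalimage);
+ *		... heapkeyspace selects the version 4 descent rules (heap TID as
+ *		tiebreaker), while allequalimage gates deduplication; callers stash
+ *		both in their insertion scankey, as noted above ...
+ */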
+
+/*
+ * _bt_checkpage() -- Verify that a freshly-read page looks sane.
+ */
+void
+_bt_checkpage(Relation rel, Buffer buf)
+{
+ Page page = BufferGetPage(buf);
+
+ /*
+ * ReadBuffer verifies that every newly-read page passes
+ * PageHeaderIsValid, which means it either contains a reasonably sane
+ * page header or is all-zero. We have to defend against the all-zero
+ * case, however.
+ */
+ if (PageIsNew(page))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" contains unexpected zero page at block %u",
+ RelationGetRelationName(rel),
+ BufferGetBlockNumber(buf)),
+ errhint("Please REINDEX it.")));
+
+ /*
+ * Additionally check that the special area looks sane.
+ */
+ if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" contains corrupted page at block %u",
+ RelationGetRelationName(rel),
+ BufferGetBlockNumber(buf)),
+ errhint("Please REINDEX it.")));
+}
+
+/*
+ * Log the reuse of a page from the FSM.
+ */
+static void
+_bt_log_reuse_page(Relation rel, BlockNumber blkno, FullTransactionId safexid)
+{
+ xl_btree_reuse_page xlrec_reuse;
+
+ /*
+ * Note that we don't register the buffer with the record, because this
+ * operation doesn't modify the page. This record only exists to provide a
+ * conflict point for Hot Standby.
+ */
+
+ /* XLOG stuff */
+ xlrec_reuse.node = rel->rd_node;
+ xlrec_reuse.block = blkno;
+ xlrec_reuse.latestRemovedFullXid = safexid;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
+
+ XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
+}
+
+/*
+ * _bt_getbuf() -- Get a buffer by block number for read or write.
+ *
+ * blkno == P_NEW means to get an unallocated index page. The page
+ * will be initialized before returning it.
+ *
+ * The general rule in nbtree is that it's never okay to access a
+ * page without holding both a buffer pin and a buffer lock on
+ * the page's buffer.
+ *
+ * When this routine returns, the appropriate lock is set on the
+ * requested buffer and its reference count has been incremented
+ * (ie, the buffer is "locked and pinned"). Also, we apply
+ * _bt_checkpage to sanity-check the page (except in P_NEW case),
+ * and perform Valgrind client requests that help Valgrind detect
+ * unsafe page accesses.
+ *
+ * Note: raw LockBuffer() calls are disallowed in nbtree; all
+ * buffer lock requests need to go through wrapper functions such
+ * as _bt_lockbuf().
+ */
+Buffer
+_bt_getbuf(Relation rel, BlockNumber blkno, int access)
+{
+ Buffer buf;
+
+ if (blkno != P_NEW)
+ {
+ /* Read an existing block of the relation */
+ buf = ReadBuffer(rel, blkno);
+ _bt_lockbuf(rel, buf, access);
+ _bt_checkpage(rel, buf);
+ }
+ else
+ {
+ bool needLock;
+ Page page;
+
+ Assert(access == BT_WRITE);
+
+ /*
+ * First see if the FSM knows of any free pages.
+ *
+ * We can't trust the FSM's report unreservedly; we have to check that
+ * the page is still free. (For example, an already-free page could
+ * have been re-used between the time the last VACUUM scanned it and
+ * the time the VACUUM made its FSM updates.)
+ *
+ * In fact, it's worse than that: we can't even assume that it's safe
+ * to take a lock on the reported page. If somebody else has a lock
+ * on it, or even worse our own caller does, we could deadlock. (The
+ * own-caller scenario is actually not improbable. Consider an index
+ * on a serial or timestamp column. Nearly all splits will be at the
+ * rightmost page, so it's entirely likely that _bt_split will call us
+ * while holding a lock on the page most recently acquired from FSM. A
+ * VACUUM running concurrently with the previous split could well have
+ * placed that page back in FSM.)
+ *
+ * To get around that, we ask for only a conditional lock on the
+ * reported page. If we fail, then someone else is using the page,
+ * and we may reasonably assume it's not free. (If we happen to be
+ * wrong, the worst consequence is the page will be lost to use till
+ * the next VACUUM, which is no big problem.)
+ */
+ for (;;)
+ {
+ blkno = GetFreeIndexPage(rel);
+ if (blkno == InvalidBlockNumber)
+ break;
+ buf = ReadBuffer(rel, blkno);
+ if (_bt_conditionallockbuf(rel, buf))
+ {
+ page = BufferGetPage(buf);
+
+ /*
+ * It's possible to find an all-zeroes page in an index. For
+ * example, a backend might successfully extend the relation
+ * one page and then crash before it is able to make a WAL
+ * entry for adding the page. If we find a zeroed page then
+ * reclaim it immediately.
+ */
+ if (PageIsNew(page))
+ {
+ /* Okay to use page. Initialize and return it. */
+ _bt_pageinit(page, BufferGetPageSize(buf));
+ return buf;
+ }
+
+ if (BTPageIsRecyclable(page))
+ {
+ /*
+ * If we are generating WAL for Hot Standby then create a
+ * WAL record that will allow us to conflict with queries
+ * running on standby, in case they have snapshots older
+ * than safexid value
+ */
+ if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
+ _bt_log_reuse_page(rel, blkno,
+ BTPageGetDeleteXid(page));
+
+ /* Okay to use page. Re-initialize and return it. */
+ _bt_pageinit(page, BufferGetPageSize(buf));
+ return buf;
+ }
+ elog(DEBUG2, "FSM returned nonrecyclable page");
+ _bt_relbuf(rel, buf);
+ }
+ else
+ {
+ elog(DEBUG2, "FSM returned nonlockable page");
+ /* couldn't get lock, so just drop pin */
+ ReleaseBuffer(buf);
+ }
+ }
+
+ /*
+ * Extend the relation by one page.
+ *
+ * We have to use a lock to ensure no one else is extending the rel at
+ * the same time, else we will both try to initialize the same new
+ * page. We can skip locking for new or temp relations, however,
+ * since no one else could be accessing them.
+ */
+ needLock = !RELATION_IS_LOCAL(rel);
+
+ if (needLock)
+ LockRelationForExtension(rel, ExclusiveLock);
+
+ buf = ReadBuffer(rel, P_NEW);
+
+ /* Acquire buffer lock on new page */
+ _bt_lockbuf(rel, buf, BT_WRITE);
+
+ /*
+ * Release the file-extension lock; it's now OK for someone else to
+ * extend the relation some more. Note that we cannot release this
+ * lock before we have buffer lock on the new page, or we risk a race
+ * condition against btvacuumscan --- see comments therein.
+ */
+ if (needLock)
+ UnlockRelationForExtension(rel, ExclusiveLock);
+
+ /* Initialize the new page before returning it */
+ page = BufferGetPage(buf);
+ Assert(PageIsNew(page));
+ _bt_pageinit(page, BufferGetPageSize(buf));
+ }
+
+ /* ref count and lock type are correct */
+ return buf;
+}
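+
+/*
+ * The canonical access pattern built on top of _bt_getbuf() and _bt_relbuf(),
+ * shown only as an illustrative sketch (blkno is a placeholder):
+ *
+ *		buf = _bt_getbuf(rel, blkno, BT_READ);
+ *		page = BufferGetPage(buf);
+ *		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ *		... read the page while holding both the pin and the lock ...
+ *		_bt_relbuf(rel, buf);
+ */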
+
+/*
+ * _bt_relandgetbuf() -- release a locked buffer and get another one.
+ *
+ * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
+ * exception that blkno may not be P_NEW. Also, if obuf is InvalidBuffer
+ * then it reduces to just _bt_getbuf; allowing this case simplifies some
+ * callers.
+ *
+ * The original motivation for using this was to avoid two entries to the
+ * bufmgr when one would do. However, now it's mainly just a notational
+ * convenience. The only case where it saves work over _bt_relbuf/_bt_getbuf
+ * is when the target page is the same one already in the buffer.
+ */
+Buffer
+_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
+{
+ Buffer buf;
+
+ Assert(blkno != P_NEW);
+ if (BufferIsValid(obuf))
+ _bt_unlockbuf(rel, obuf);
+ buf = ReleaseAndReadBuffer(obuf, rel, blkno);
+ _bt_lockbuf(rel, buf, access);
+
+ _bt_checkpage(rel, buf);
+ return buf;
+}
+
+/*
+ * _bt_relbuf() -- release a locked buffer.
+ *
+ * Lock and pin (refcount) are both dropped.
+ */
+void
+_bt_relbuf(Relation rel, Buffer buf)
+{
+ _bt_unlockbuf(rel, buf);
+ ReleaseBuffer(buf);
+}
+
+/*
+ * _bt_lockbuf() -- lock a pinned buffer.
+ *
+ * Lock is acquired without acquiring another pin. This is like a raw
+ * LockBuffer() call, but performs extra steps needed by Valgrind.
+ *
+ * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
+ * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
+ */
+void
+_bt_lockbuf(Relation rel, Buffer buf, int access)
+{
+ /* LockBuffer() asserts that pin is held by this backend */
+ LockBuffer(buf, access);
+
+ /*
+ * It doesn't matter that _bt_unlockbuf() won't get called in the event of
+ * an nbtree error (e.g. a unique violation error). That won't cause
+ * Valgrind false positives.
+ *
+ * The nbtree client requests are superimposed on top of the bufmgr.c
+ * buffer pin client requests. In the event of an nbtree error the buffer
+ * will certainly get marked as defined when the backend once again
+ * acquires its first pin on the buffer. (Of course, if the backend never
+ * touches the buffer again then it doesn't matter that it remains
+ * non-accessible to Valgrind.)
+ *
+ * Note: When an IndexTuple C pointer gets computed using an ItemId read
+ * from a page while a lock was held, the C pointer becomes unsafe to
+ * dereference forever as soon as the lock is released. Valgrind can only
+ * detect cases where the pointer gets dereferenced with no _current_
+ * lock/pin held, though.
+ */
+ if (!RelationUsesLocalBuffers(rel))
+ VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+}
+
+/*
+ * _bt_unlockbuf() -- unlock a pinned buffer.
+ */
+void
+_bt_unlockbuf(Relation rel, Buffer buf)
+{
+ /*
+ * Buffer is pinned and locked, which means that it is expected to be
+ * defined and addressable. Check that proactively.
+ */
+ VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+ /* LockBuffer() asserts that pin is held by this backend */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+ if (!RelationUsesLocalBuffers(rel))
+ VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
+}
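+
+/*
+ * Sketch of the read-to-write "lock trade" idiom built from _bt_unlockbuf()
+ * and _bt_lockbuf(), as used in _bt_getroot() above (illustrative only):
+ *
+ *		_bt_unlockbuf(rel, buf);
+ *		_bt_lockbuf(rel, buf, BT_WRITE);
+ *		... the page may have changed while it was unlocked, so any condition
+ *		checked under the read lock must be rechecked here ...
+ */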
+
+/*
+ * _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
+ * buffer.
+ *
+ * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
+ * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
+ */
+bool
+_bt_conditionallockbuf(Relation rel, Buffer buf)
+{
+ /* ConditionalLockBuffer() asserts that pin is held by this backend */
+ if (!ConditionalLockBuffer(buf))
+ return false;
+
+ if (!RelationUsesLocalBuffers(rel))
+ VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+ return true;
+}
+
+/*
+ * _bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup
+ * lock.
+ */
+void
+_bt_upgradelockbufcleanup(Relation rel, Buffer buf)
+{
+ /*
+ * Buffer is pinned and locked, which means that it is expected to be
+ * defined and addressable. Check that proactively.
+ */
+ VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+ /* LockBuffer() asserts that pin is held by this backend */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ LockBufferForCleanup(buf);
+}
+
+/*
+ * _bt_pageinit() -- Initialize a new page.
+ *
+ * On return, the page header is initialized; data space is empty;
+ * special space is zeroed out.
+ */
+void
+_bt_pageinit(Page page, Size size)
+{
+ PageInit(page, size, sizeof(BTPageOpaqueData));
+}
+
+/*
+ * Delete item(s) from a btree leaf page during VACUUM.
+ *
+ * This routine assumes that the caller has a super-exclusive write lock on
+ * the buffer. Also, the given deletable and updatable arrays *must* be
+ * sorted in ascending order.
+ *
+ * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
+ * in an existing posting list item are to be removed. This works by
+ * updating/overwriting an existing item with caller's new version of the item
+ * (a version that lacks the TIDs that are to be deleted).
+ *
+ * We record VACUUMs and b-tree deletes differently in WAL. Deletes must
+ * generate their own latestRemovedXid by accessing the table directly,
+ * whereas VACUUMs rely on the initial VACUUM table scan performing
+ * WAL-logging that takes care of the issue for the table's indexes
+ * indirectly. Also, we remove the VACUUM cycle ID from pages, which b-tree
+ * deletes don't do.
+ */
+void
+_bt_delitems_vacuum(Relation rel, Buffer buf,
+ OffsetNumber *deletable, int ndeletable,
+ BTVacuumPosting *updatable, int nupdatable)
+{
+ Page page = BufferGetPage(buf);
+ BTPageOpaque opaque;
+ bool needswal = RelationNeedsWAL(rel);
+ char *updatedbuf = NULL;
+ Size updatedbuflen = 0;
+ OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
+
+ /* Shouldn't be called unless there's something to do */
+ Assert(ndeletable > 0 || nupdatable > 0);
+
+ /* Generate new version of posting lists without deleted TIDs */
+ if (nupdatable > 0)
+ updatedbuf = _bt_delitems_update(updatable, nupdatable,
+ updatedoffsets, &updatedbuflen,
+ needswal);
+
+ /* No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ /*
+ * Handle posting tuple updates.
+ *
+ * Deliberately do this before handling simple deletes. If we did it the
+ * other way around (i.e. WAL record order -- simple deletes before
+ * updates) then we'd have to make compensating changes to the 'updatable'
+ * array of offset numbers.
+ *
+ * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
+ * happens to already be set. It's important that we not interfere with
+ * _bt_delitems_delete().
+ */
+ for (int i = 0; i < nupdatable; i++)
+ {
+ OffsetNumber updatedoffset = updatedoffsets[i];
+ IndexTuple itup;
+ Size itemsz;
+
+ itup = updatable[i]->itup;
+ itemsz = MAXALIGN(IndexTupleSize(itup));
+ if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
+ itemsz))
+ elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
+ BufferGetBlockNumber(buf), RelationGetRelationName(rel));
+ }
+
+ /* Now handle simple deletes of entire tuples */
+ if (ndeletable > 0)
+ PageIndexMultiDelete(page, deletable, ndeletable);
+
+ /*
+ * We can clear the vacuum cycle ID since this page has certainly been
+ * processed by the current vacuum scan.
+ */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ opaque->btpo_cycleid = 0;
+
+ /*
+ * Clear the BTP_HAS_GARBAGE page flag.
+ *
+ * This flag indicates the presence of LP_DEAD items on the page (though
+ * not reliably). Note that we only rely on it with pg_upgrade'd
+ * !heapkeyspace indexes. That's why clearing it here won't usually
+ * interfere with _bt_delitems_delete().
+ */
+ opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (needswal)
+ {
+ XLogRecPtr recptr;
+ xl_btree_vacuum xlrec_vacuum;
+
+ xlrec_vacuum.ndeleted = ndeletable;
+ xlrec_vacuum.nupdated = nupdatable;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
+
+ if (ndeletable > 0)
+ XLogRegisterBufData(0, (char *) deletable,
+ ndeletable * sizeof(OffsetNumber));
+
+ if (nupdatable > 0)
+ {
+ XLogRegisterBufData(0, (char *) updatedoffsets,
+ nupdatable * sizeof(OffsetNumber));
+ XLogRegisterBufData(0, updatedbuf, updatedbuflen);
+ }
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
+
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* can't leak memory here */
+ if (updatedbuf != NULL)
+ pfree(updatedbuf);
+ /* free tuples allocated within _bt_delitems_update() */
+ for (int i = 0; i < nupdatable; i++)
+ pfree(updatable[i]->itup);
+}
+
+/*
+ * Delete item(s) from a btree leaf page during single-page cleanup.
+ *
+ * This routine assumes that the caller has pinned and write locked the
+ * buffer. Also, the given deletable and updatable arrays *must* be sorted in
+ * ascending order.
+ *
+ * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
+ * in an existing posting list item are to be removed. This works by
+ * updating/overwriting an existing item with caller's new version of the item
+ * (a version that lacks the TIDs that are to be deleted).
+ *
+ * This is nearly the same as _bt_delitems_vacuum as far as what it does to
+ * the page, but it needs its own latestRemovedXid from caller (caller gets
+ * this from tableam). This is used by the REDO routine to generate recovery
+ * conflicts. The other difference is that only _bt_delitems_vacuum will
+ * clear page's VACUUM cycle ID.
+ */
+static void
+_bt_delitems_delete(Relation rel, Buffer buf, TransactionId latestRemovedXid,
+ OffsetNumber *deletable, int ndeletable,
+ BTVacuumPosting *updatable, int nupdatable)
+{
+ Page page = BufferGetPage(buf);
+ BTPageOpaque opaque;
+ bool needswal = RelationNeedsWAL(rel);
+ char *updatedbuf = NULL;
+ Size updatedbuflen = 0;
+ OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
+
+ /* Shouldn't be called unless there's something to do */
+ Assert(ndeletable > 0 || nupdatable > 0);
+
+ /* Generate new versions of posting lists without deleted TIDs */
+ if (nupdatable > 0)
+ updatedbuf = _bt_delitems_update(updatable, nupdatable,
+ updatedoffsets, &updatedbuflen,
+ needswal);
+
+ /* No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ /* Handle updates and deletes just like _bt_delitems_vacuum */
+ for (int i = 0; i < nupdatable; i++)
+ {
+ OffsetNumber updatedoffset = updatedoffsets[i];
+ IndexTuple itup;
+ Size itemsz;
+
+ itup = updatable[i]->itup;
+ itemsz = MAXALIGN(IndexTupleSize(itup));
+ if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
+ itemsz))
+ elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
+ BufferGetBlockNumber(buf), RelationGetRelationName(rel));
+ }
+
+ if (ndeletable > 0)
+ PageIndexMultiDelete(page, deletable, ndeletable);
+
+ /*
+ * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at
+ * this point. The VACUUM command alone controls vacuum cycle IDs.
+ */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * Clear the BTP_HAS_GARBAGE page flag.
+ *
+ * This flag indicates the presence of LP_DEAD items on the page (though
+ * not reliably). Note that we only rely on it with pg_upgrade'd
+ * !heapkeyspace indexes.
+ */
+ opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (needswal)
+ {
+ XLogRecPtr recptr;
+ xl_btree_delete xlrec_delete;
+
+ xlrec_delete.latestRemovedXid = latestRemovedXid;
+ xlrec_delete.ndeleted = ndeletable;
+ xlrec_delete.nupdated = nupdatable;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
+
+ if (ndeletable > 0)
+ XLogRegisterBufData(0, (char *) deletable,
+ ndeletable * sizeof(OffsetNumber));
+
+ if (nupdatable > 0)
+ {
+ XLogRegisterBufData(0, (char *) updatedoffsets,
+ nupdatable * sizeof(OffsetNumber));
+ XLogRegisterBufData(0, updatedbuf, updatedbuflen);
+ }
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
+
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* can't leak memory here */
+ if (updatedbuf != NULL)
+ pfree(updatedbuf);
+ /* free tuples allocated within _bt_delitems_update() */
+ for (int i = 0; i < nupdatable; i++)
+ pfree(updatable[i]->itup);
+}
+
+/*
+ * Set up state needed to delete TIDs from posting list tuples via "updating"
+ * the tuple. Performs steps common to both _bt_delitems_vacuum and
+ * _bt_delitems_delete. These steps must take place before each function's
+ * critical section begins.
+ *
+ * updatable and nupdatable are inputs, though note that we will use
+ * _bt_update_posting() to replace the original itup with a pointer to a final
+ * version in palloc()'d memory. Caller should free the tuples when it's done.
+ *
+ * The first nupdatable entries from updatedoffsets are set to the page offset
+ * number for posting list tuples that caller updates. This is mostly useful
+ * because caller may need to WAL-log the page offsets (though we always do
+ * this for caller out of convenience).
+ *
+ * Returns a buffer consisting of an array of xl_btree_update structs that
+ * describe the steps we perform here for caller (though only when needswal is
+ * true). Also sets *updatedbuflen to the final size of the buffer. This
+ * buffer is used by caller when WAL logging is required.
+ */
+static char *
+_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
+ OffsetNumber *updatedoffsets, Size *updatedbuflen,
+ bool needswal)
+{
+ char *updatedbuf = NULL;
+ Size buflen = 0;
+
+ /* Shouldn't be called unless there's something to do */
+ Assert(nupdatable > 0);
+
+ for (int i = 0; i < nupdatable; i++)
+ {
+ BTVacuumPosting vacposting = updatable[i];
+ Size itemsz;
+
+ /* Replace work area IndexTuple with updated version */
+ _bt_update_posting(vacposting);
+
+ /* Keep track of size of xl_btree_update for updatedbuf in passing */
+ itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
+ buflen += itemsz;
+
+ /* Build updatedoffsets buffer in passing */
+ updatedoffsets[i] = vacposting->updatedoffset;
+ }
+
+ /* XLOG stuff */
+ if (needswal)
+ {
+ Size offset = 0;
+
+ /* Allocate, set final size for caller */
+ updatedbuf = palloc(buflen);
+ *updatedbuflen = buflen;
+ for (int i = 0; i < nupdatable; i++)
+ {
+ BTVacuumPosting vacposting = updatable[i];
+ Size itemsz;
+ xl_btree_update update;
+
+ update.ndeletedtids = vacposting->ndeletedtids;
+ memcpy(updatedbuf + offset, &update.ndeletedtids,
+ SizeOfBtreeUpdate);
+ offset += SizeOfBtreeUpdate;
+
+ itemsz = update.ndeletedtids * sizeof(uint16);
+ memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
+ offset += itemsz;
+ }
+ }
+
+ return updatedbuf;
+}
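+
+/*
+ * For reference, the buffer returned by _bt_delitems_update() (when needswal
+ * is true) is a simple concatenation of variable-length entries, one per
+ * updated posting list tuple, in updatedoffsets order:
+ *
+ *		xl_btree_update (ndeletedtids)			SizeOfBtreeUpdate bytes
+ *		uint16 deletetids[ndeletedtids]			ndeletedtids * sizeof(uint16)
+ *		... repeated nupdatable times ...
+ */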
+
+/*
+ * Comparator used by _bt_delitems_delete_check() to restore deltids array
+ * back to its original leaf-page-wise sort order
+ */
+static int
+_bt_delitems_cmp(const void *a, const void *b)
+{
+ TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a;
+ TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b;
+
+ if (indexdelete1->id > indexdelete2->id)
+ return 1;
+ if (indexdelete1->id < indexdelete2->id)
+ return -1;
+
+ Assert(false);
+
+ return 0;
+}
+
+/*
+ * Try to delete item(s) from a btree leaf page during single-page cleanup.
+ *
+ * nbtree interface to table_index_delete_tuples(). Deletes a subset of index
+ * tuples from caller's deltids array: those whose TIDs are found safe to
+ * delete by the tableam (or already marked LP_DEAD in index, and so already
+ * known to be deletable by our simple index deletion caller). We physically
+ * delete index tuples from buf leaf page last of all (for index tuples where
+ * that is known to be safe following our table_index_delete_tuples() call).
+ *
+ * Simple index deletion caller only includes TIDs from index tuples marked
+ * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be
+ * included without increasing the total number of distinct table blocks for
+ * the deletion operation as a whole. This approach often allows us to delete
+ * some extra index tuples that were practically free for tableam to check in
+ * passing (when they actually turn out to be safe to delete). It probably
+ * only makes sense for the tableam to go ahead with these extra checks when
+ * it is block-oriented (otherwise the checks probably won't be practically
+ * free, which we rely on). The tableam interface requires the tableam side
+ * to handle the problem, though, so this is okay (we as an index AM are free
+ * to make the simplifying assumption that all tableams must be block-based).
+ *
+ * Bottom-up index deletion caller provides all the TIDs from the leaf page,
+ * without expecting that tableam will check most of them. The tableam has
+ * considerable discretion around which entries/blocks it checks. Our role in
+ * costing the bottom-up deletion operation is strictly advisory.
+ *
+ * Note: Caller must have added deltids entries (i.e. entries that go in
+ * delstate's main array) in leaf-page-wise order: page offset number order,
+ * TID order among entries taken from the same posting list tuple (tiebreak on
+ * TID). This order is convenient to work with here.
+ *
+ * Note: We also rely on the id field of each deltids element "capturing" this
+ * original leaf-page-wise order. That is, we expect to be able to get back
+ * to the original leaf-page-wise order just by sorting deltids on the id
+ * field (tableam will sort deltids for its own reasons, so we'll need to put
+ * it back in leaf-page-wise order afterwards).
+ */
+void
+_bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel,
+ TM_IndexDeleteOp *delstate)
+{
+ Page page = BufferGetPage(buf);
+ TransactionId latestRemovedXid;
+ OffsetNumber postingidxoffnum = InvalidOffsetNumber;
+ int ndeletable = 0,
+ nupdatable = 0;
+ OffsetNumber deletable[MaxIndexTuplesPerPage];
+ BTVacuumPosting updatable[MaxIndexTuplesPerPage];
+
+ /* Use tableam interface to determine which tuples to delete first */
+ latestRemovedXid = table_index_delete_tuples(heapRel, delstate);
+
+ /* Should not WAL-log latestRemovedXid unless it's required */
+ if (!XLogStandbyInfoActive() || !RelationNeedsWAL(rel))
+ latestRemovedXid = InvalidTransactionId;
+
+ /*
+ * Construct a leaf-page-wise description of what _bt_delitems_delete()
+ * needs to do to physically delete index tuples from the page.
+ *
+ * Must sort deltids array to restore leaf-page-wise order (original order
+ * before call to tableam). This is the order that the loop expects.
+ *
+ * Note that deltids array might be a lot smaller now. It might even have
+ * no entries at all (with bottom-up deletion caller), in which case there
+ * is nothing left to do.
+ */
+ qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
+ _bt_delitems_cmp);
+ if (delstate->ndeltids == 0)
+ {
+ Assert(delstate->bottomup);
+ return;
+ }
+
+ /* We definitely have to delete at least one index tuple (or one TID) */
+ for (int i = 0; i < delstate->ndeltids; i++)
+ {
+ TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
+ OffsetNumber idxoffnum = dstatus->idxoffnum;
+ ItemId itemid = PageGetItemId(page, idxoffnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+ int nestedi,
+ nitem;
+ BTVacuumPosting vacposting;
+
+ Assert(OffsetNumberIsValid(idxoffnum));
+
+ if (idxoffnum == postingidxoffnum)
+ {
+ /*
+ * This deltid entry is a TID from a posting list tuple that has
+ * already been completely processed
+ */
+ Assert(BTreeTupleIsPosting(itup));
+ Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
+ &delstate->deltids[i].tid) < 0);
+ Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
+ &delstate->deltids[i].tid) >= 0);
+ continue;
+ }
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Plain non-pivot tuple */
+ Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
+ if (dstatus->knowndeletable)
+ deletable[ndeletable++] = idxoffnum;
+ continue;
+ }
+
+ /*
+ * itup is a posting list tuple whose lowest deltids entry (which may
+ * or may not be for the first TID from itup) is considered here now.
+ * We should process all of the deltids entries for the posting list
+ * together now, though (not just the lowest). Remember to skip over
+ * later itup-related entries during later iterations of outermost
+ * loop.
+ */
+ postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
+ nestedi = i; /* Initialize for first itup deltids entry */
+ vacposting = NULL; /* Describes final action for itup */
+ nitem = BTreeTupleGetNPosting(itup);
+ for (int p = 0; p < nitem; p++)
+ {
+ ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
+ int ptidcmp = -1;
+
+ /*
+ * This nested loop reuses work across ptid TIDs taken from itup.
+ * We take advantage of the fact that both itup's TIDs and deltids
+ * entries (within a single itup/posting list grouping) must both
+ * be in ascending TID order.
+ */
+ for (; nestedi < delstate->ndeltids; nestedi++)
+ {
+ TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
+ TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
+
+ /* Stop once we get past all itup related deltids entries */
+ Assert(tdstatus->idxoffnum >= idxoffnum);
+ if (tdstatus->idxoffnum != idxoffnum)
+ break;
+
+ /* Skip past non-deletable itup related entries up front */
+ if (!tdstatus->knowndeletable)
+ continue;
+
+ /* Entry is first partial ptid match (or an exact match)? */
+ ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
+ if (ptidcmp >= 0)
+ {
+ /* Greater than or equal (partial or exact) match... */
+ break;
+ }
+ }
+
+ /* ...exact ptid match to a deletable deltids entry? */
+ if (ptidcmp != 0)
+ continue;
+
+ /* Exact match for deletable deltids entry -- ptid gets deleted */
+ if (vacposting == NULL)
+ {
+ vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
+ nitem * sizeof(uint16));
+ vacposting->itup = itup;
+ vacposting->updatedoffset = idxoffnum;
+ vacposting->ndeletedtids = 0;
+ }
+ vacposting->deletetids[vacposting->ndeletedtids++] = p;
+ }
+
+ /* Final decision on itup, a posting list tuple */
+
+ if (vacposting == NULL)
+ {
+ /* No TIDs to delete from itup -- do nothing */
+ }
+ else if (vacposting->ndeletedtids == nitem)
+ {
+ /* Straight delete of itup (to delete all TIDs) */
+ deletable[ndeletable++] = idxoffnum;
+ /* Turns out we won't need granular information */
+ pfree(vacposting);
+ }
+ else
+ {
+ /* Delete some (but not all) TIDs from itup */
+ Assert(vacposting->ndeletedtids > 0 &&
+ vacposting->ndeletedtids < nitem);
+ updatable[nupdatable++] = vacposting;
+ }
+ }
+
+ /* Physically delete tuples (or TIDs) using deletable (or updatable) */
+ _bt_delitems_delete(rel, buf, latestRemovedXid, deletable, ndeletable,
+ updatable, nupdatable);
+
+ /* be tidy */
+ for (int i = 0; i < nupdatable; i++)
+ pfree(updatable[i]);
+}
+
+/*
+ * Check that leftsib page (the btpo_prev of target page) is not marked with
+ * INCOMPLETE_SPLIT flag. Used during page deletion.
+ *
+ * Returning true indicates that page flag is set in leftsib (which is
+ * definitely still the left sibling of target). When that happens, the
+ * target doesn't have a downlink in parent, and the page deletion algorithm
+ * isn't prepared to handle that. Deletion of the target page (or the whole
+ * subtree that contains the target page) cannot take place.
+ *
+ * Caller should not have a lock on the target page itself, since pages on the
+ * same level must always be locked left to right to avoid deadlocks.
+ */
+static bool
+_bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
+{
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ bool result;
+
+ /* Easy case: No left sibling */
+ if (leftsib == P_NONE)
+ return false;
+
+ buf = _bt_getbuf(rel, leftsib, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * If the left sibling was concurrently split, so that its next-pointer
+ * doesn't point to the current page anymore, the split that created
+ * target must be completed. Caller can reasonably expect that there will
+ * be a downlink to the target page that it can relocate using its stack.
+ * (We don't allow splitting an incompletely split page again until the
+ * previous split has been completed.)
+ */
+ result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
+ _bt_relbuf(rel, buf);
+
+ return result;
+}
+
+/*
+ * Check that leafrightsib page (the btpo_next of target leaf page) is not
+ * marked with ISHALFDEAD flag. Used during page deletion.
+ *
+ * Returning true indicates that the flag is set in leafrightsib, so page
+ * deletion cannot go ahead. Our caller is not prepared to deal with the case
+ * where the parent page does not have a pivot tuple whose downlink points to
+ * leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't
+ * seem worth going to the trouble of teaching our caller to deal with it.
+ * The situation will be resolved after VACUUM finishes the deletion of the
+ * half-dead page (when a future VACUUM operation reaches the target page
+ * again).
+ *
+ * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
+ * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is
+ * okay because of the restriction on deleting pages that are the rightmost
+ * page of their parent (i.e. that such deletions can only take place when the
+ * entire subtree must be deleted). The leaf level check made here will apply
+ * to a right "cousin" leaf page rather than a simple right sibling leaf page
+ * in cases where caller actually goes on to attempt deleting pages that are
+ * above the leaf page. The right cousin leaf page is representative of the
+ * left edge of the subtree to the right of the to-be-deleted subtree as a
+ * whole, which is exactly the condition that our caller cares about.
+ * (Besides, internal pages are never marked half-dead, so it isn't even
+ * possible to _directly_ assess if an internal page is part of some other
+ * to-be-deleted subtree.)
+ */
+static bool
+_bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
+{
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ bool result;
+
+ Assert(leafrightsib != P_NONE);
+
+ buf = _bt_getbuf(rel, leafrightsib, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
+ result = P_ISHALFDEAD(opaque);
+ _bt_relbuf(rel, buf);
+
+ return result;
+}
+
+/*
+ * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
+ *
+ * This action unlinks the leaf page from the b-tree structure, removing all
+ * pointers leading to it --- but not touching its own left and right links.
+ * The page cannot be physically reclaimed right away, since other processes
+ * may currently be trying to follow links leading to the page; they have to
+ * be allowed to use its right-link to recover. See nbtree/README.
+ *
+ * On entry, the target buffer must be pinned and locked (either read or write
+ * lock is OK). The page must be an empty leaf page, which may be half-dead
+ * already (a half-dead page should only be passed to us when an earlier
+ * VACUUM operation was interrupted, though). Note in particular that caller
+ * should never pass a buffer containing an existing deleted page here. The
+ * lock and pin on caller's buffer will be dropped before we return.
+ *
+ * Maintains bulk delete stats for caller, which are taken from vstate. We
+ * need to cooperate closely with caller here so that whole VACUUM operation
+ * reliably avoids any double counting of subsidiary-to-leafbuf pages that we
+ * delete in passing. If such pages happen to be from a block number that is
+ * ahead of the current scanblkno position, then caller is expected to count
+ * them directly later on. It's simpler for us to understand caller's
+ * requirements than it would be for caller to understand when or how a
+ * deleted page became deleted after the fact.
+ *
+ * NOTE: this leaks memory. Rather than trying to clean up everything
+ * carefully, it's better to run it in a temp context that can be reset
+ * frequently.
+ */
+void
+_bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
+{
+ BlockNumber rightsib;
+ bool rightsib_empty;
+ Page page;
+ BTPageOpaque opaque;
+
+ /*
+ * Save original leafbuf block number from caller. Only deleted blocks
+ * that are <= scanblkno are added to bulk delete stat's pages_deleted
+ * count.
+ */
+ BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
+
+ /*
+ * "stack" is a search stack leading (approximately) to the target page.
+ * It is initially NULL, but when iterating, we keep it to avoid
+ * duplicated search effort.
+ *
+ * Also, when "stack" is not NULL, we have already checked that the
+ * current page is not the right half of an incomplete split, i.e. the
+ * left sibling does not have its INCOMPLETE_SPLIT flag set, including
+ * when the current target page is to the right of caller's initial page
+ * (the scanblkno page).
+ */
+ BTStack stack = NULL;
+
+ for (;;)
+ {
+ page = BufferGetPage(leafbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * Internal pages are never deleted directly, only as part of deleting
+ * the whole subtree all the way down to leaf level.
+ *
+ * Also check for deleted pages here. Caller never passes us a fully
+ * deleted page. Only VACUUM can delete pages, so there can't have
+ * been a concurrent deletion. Assume that we reached any deleted
+ * page encountered here by following a sibling link, and that the
+ * index is corrupt.
+ */
+ Assert(!P_ISDELETED(opaque));
+ if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
+ {
+ /*
+ * Pre-9.4 page deletion only marked internal pages as half-dead,
+ * but now we only use that flag on leaf pages. The old algorithm
+			 * was never supposed to leave half-dead pages in the tree; that was
+			 * just a transient state, but it was nevertheless possible in
+			 * error scenarios. We don't know how to deal with them here. They
+			 * are harmless as far as searches are concerned, but inserts
+			 * into the deleted keyspace could add out-of-order downlinks in
+			 * the upper levels. Log a notice; hopefully the admin will notice
+ * and reindex.
+ */
+ if (P_ISHALFDEAD(opaque))
+ ereport(LOG,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" contains a half-dead internal page",
+ RelationGetRelationName(rel)),
+ errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
+
+ if (P_ISDELETED(opaque))
+ ereport(LOG,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
+ BufferGetBlockNumber(leafbuf),
+ scanblkno,
+ RelationGetRelationName(rel))));
+
+ _bt_relbuf(rel, leafbuf);
+ return;
+ }
+
+ /*
+ * We can never delete rightmost pages nor root pages. While at it,
+ * check that page is empty, since it's possible that the leafbuf page
+ * was empty a moment ago, but has since had some inserts.
+ *
+ * To keep the algorithm simple, we also never delete an incompletely
+ * split page (they should be rare enough that this doesn't make any
+ * meaningful difference to disk usage):
+ *
+ * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
+ * left half of an incomplete split, but ensuring that it's not the
+ * right half is more complicated. For that, we have to check that
+ * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
+ * _bt_leftsib_splitflag(). On the first iteration, we temporarily
+ * release the lock on scanblkno/leafbuf, check the left sibling, and
+ * construct a search stack to scanblkno. On subsequent iterations,
+ * we know we stepped right from a page that passed these tests, so
+ * it's OK.
+ */
+ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
+ P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
+ P_INCOMPLETE_SPLIT(opaque))
+ {
+ /* Should never fail to delete a half-dead page */
+ Assert(!P_ISHALFDEAD(opaque));
+
+ _bt_relbuf(rel, leafbuf);
+ return;
+ }
+
+ /*
+ * First, remove downlink pointing to the page (or a parent of the
+ * page, if we are going to delete a taller subtree), and mark the
+ * leafbuf page half-dead
+ */
+ if (!P_ISHALFDEAD(opaque))
+ {
+ /*
+ * We need an approximate pointer to the page's parent page. We
+ * use a variant of the standard search mechanism to search for
+ * the page's high key; this will give us a link to either the
+ * current parent or someplace to its left (if there are multiple
+ * equal high keys, which is possible with !heapkeyspace indexes).
+ *
+ * Also check if this is the right-half of an incomplete split
+ * (see comment above).
+ */
+ if (!stack)
+ {
+ BTScanInsert itup_key;
+ ItemId itemid;
+ IndexTuple targetkey;
+ BlockNumber leftsib,
+ leafblkno;
+ Buffer sleafbuf;
+
+ itemid = PageGetItemId(page, P_HIKEY);
+ targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
+
+ leftsib = opaque->btpo_prev;
+ leafblkno = BufferGetBlockNumber(leafbuf);
+
+ /*
+ * To avoid deadlocks, we'd better drop the leaf page lock
+ * before going further.
+ */
+ _bt_unlockbuf(rel, leafbuf);
+
+ /*
+ * Check that the left sibling of leafbuf (if any) is not
+ * marked with INCOMPLETE_SPLIT flag before proceeding
+ */
+ Assert(leafblkno == scanblkno);
+ if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
+ {
+ ReleaseBuffer(leafbuf);
+ return;
+ }
+
+ /* we need an insertion scan key for the search, so build one */
+ itup_key = _bt_mkscankey(rel, targetkey);
+ /* find the leftmost leaf page with matching pivot/high key */
+ itup_key->pivotsearch = true;
+ stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL);
+ /* won't need a second lock or pin on leafbuf */
+ _bt_relbuf(rel, sleafbuf);
+
+ /*
+ * Re-lock the leaf page, and start over to use our stack
+ * within _bt_mark_page_halfdead. We must do it that way
+ * because it's possible that leafbuf can no longer be
+ * deleted. We need to recheck.
+ *
+ * Note: We can't simply hold on to the sleafbuf lock instead,
+ * because it's barely possible that sleafbuf is not the same
+ * page as leafbuf. This happens when leafbuf split after our
+ * original lock was dropped, but before _bt_search finished
+ * its descent. We rely on the assumption that we'll find
+ * leafbuf isn't safe to delete anymore in this scenario.
+ * (Page deletion can cope with the stack being to the left of
+ * leafbuf, but not to the right of leafbuf.)
+ */
+ _bt_lockbuf(rel, leafbuf, BT_WRITE);
+ continue;
+ }
+
+ /*
+ * See if it's safe to delete the leaf page, and determine how
+ * many parent/internal pages above the leaf level will be
+ * deleted. If it's safe then _bt_mark_page_halfdead will also
+ * perform the first phase of deletion, which includes marking the
+ * leafbuf page half-dead.
+ */
+ Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
+ if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
+ {
+ _bt_relbuf(rel, leafbuf);
+ return;
+ }
+ }
+
+ /*
+ * Then unlink it from its siblings. Each call to
+ * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
+ * making it shallower. Iterate until the leafbuf page is deleted.
+ */
+ rightsib_empty = false;
+ Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
+ while (P_ISHALFDEAD(opaque))
+ {
+ /* Check for interrupts in _bt_unlink_halfdead_page */
+ if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
+ &rightsib_empty, vstate))
+ {
+ /*
+ * _bt_unlink_halfdead_page should never fail, since we
+ * established that deletion is generally safe in
+ * _bt_mark_page_halfdead -- index must be corrupt.
+ *
+ * Note that _bt_unlink_halfdead_page already released the
+ * lock and pin on leafbuf for us.
+ */
+ Assert(false);
+ return;
+ }
+ }
+
+ Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
+
+ rightsib = opaque->btpo_next;
+
+ _bt_relbuf(rel, leafbuf);
+
+ /*
+ * Check here, as calling loops will have locks held, preventing
+ * interrupts from being processed.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * The page has now been deleted. If its right sibling is completely
+ * empty, it's possible that the reason we haven't deleted it earlier
+ * is that it was the rightmost child of the parent. Now that we
+ * removed the downlink for this page, the right sibling might now be
+ * the only child of the parent, and could be removed. It would be
+ * picked up by the next vacuum anyway, but might as well try to
+ * remove it now, so loop back to process the right sibling.
+ *
+ * Note: This relies on the assumption that _bt_getstackbuf() will be
+ * able to reuse our original descent stack with a different child
+ * block (provided that the child block is to the right of the
+ * original leaf page reached by _bt_search()). It will even update
+ * the descent stack each time we loop around, avoiding repeated work.
+ */
+ if (!rightsib_empty)
+ break;
+
+ leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
+ }
+}
+
+/*
+ * First stage of page deletion.
+ *
+ * Establish the height of the to-be-deleted subtree with leafbuf at its
+ * lowest level, remove the downlink to the subtree, and mark leafbuf
+ * half-dead. The final to-be-deleted subtree is usually just leafbuf itself,
+ * but may include additional internal pages (at most one per level of the
+ * tree below the root).
+ *
+ * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
+ * the rightmost child of its parent (and parent has more than one downlink).
+ * Returns 'true' when the first stage of page deletion completed
+ * successfully.
+ */
+static bool
+_bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
+{
+ BlockNumber leafblkno;
+ BlockNumber leafrightsib;
+ BlockNumber topparent;
+ BlockNumber topparentrightsib;
+ ItemId itemid;
+ Page page;
+ BTPageOpaque opaque;
+ Buffer subtreeparent;
+ OffsetNumber poffset;
+ OffsetNumber nextoffset;
+ IndexTuple itup;
+ IndexTupleData trunctuple;
+
+ page = BufferGetPage(leafbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
+ P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
+ P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
+
+ /*
+ * Save info about the leaf page.
+ */
+ leafblkno = BufferGetBlockNumber(leafbuf);
+ leafrightsib = opaque->btpo_next;
+
+ /*
+ * Before attempting to lock the parent page, check that the right sibling
+ * is not in half-dead state. A half-dead right sibling would have no
+ * downlink in the parent, which would be highly confusing later when we
+ * delete the downlink. It would fail the "right sibling of target page
+ * is also the next child in parent page" cross-check below.
+ */
+ if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
+ {
+ elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
+ leafblkno, leafrightsib);
+ return false;
+ }
+
+ /*
+ * We cannot delete a page that is the rightmost child of its immediate
+ * parent, unless it is the only child --- in which case the parent has to
+ * be deleted too, and the same condition applies recursively to it. We
+ * have to check this condition all the way up before trying to delete,
+ * and lock the parent of the root of the to-be-deleted subtree (the
+ * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent
+ * for us. We remove the downlink to the "top parent" page (subtree root
+ * page) from the subtree parent page below.
+ *
+ * Initialize topparent to be leafbuf page now. The final to-be-deleted
+ * subtree is often a degenerate one page subtree consisting only of the
+ * leafbuf page. When that happens, the leafbuf page is the final subtree
+ * root page/top parent page.
+ */
+ topparent = leafblkno;
+ topparentrightsib = leafrightsib;
+ if (!_bt_lock_subtree_parent(rel, leafblkno, stack,
+ &subtreeparent, &poffset,
+ &topparent, &topparentrightsib))
+ return false;
+
+ /*
+ * Check that the parent-page index items we're about to delete/overwrite
+ * in subtree parent page contain what we expect. This can fail if the
+ * index has become corrupt for some reason. We want to throw any error
+ * before entering the critical section --- otherwise it'd be a PANIC.
+ */
+ page = BufferGetPage(subtreeparent);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+#ifdef USE_ASSERT_CHECKING
+
+ /*
+ * This is just an assertion because _bt_lock_subtree_parent should have
+ * guaranteed that the tuple has the expected contents
+ */
+ itemid = PageGetItemId(page, poffset);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ Assert(BTreeTupleGetDownLink(itup) == topparent);
+#endif
+
+ nextoffset = OffsetNumberNext(poffset);
+ itemid = PageGetItemId(page, nextoffset);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ if (BTreeTupleGetDownLink(itup) != topparentrightsib)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
+ topparentrightsib, topparent,
+ BTreeTupleGetDownLink(itup),
+ BufferGetBlockNumber(subtreeparent),
+ RelationGetRelationName(rel))));
+
+ /*
+ * Any insert which would have gone on the leaf block will now go to its
+ * right sibling. In other words, the key space moves right.
+ */
+ PredicateLockPageCombine(rel, leafblkno, leafrightsib);
+
+ /* No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ /*
+ * Update parent of subtree. We want to delete the downlink to the top
+ * parent page/root of the subtree, and the *following* key. Easiest way
+ * is to copy the right sibling's downlink over the downlink that points
+ * to top parent page, and then delete the right sibling's original pivot
+ * tuple.
+ *
+ * Lanin and Shasha make the key space move left when deleting a page,
+ * whereas the key space moves right here. That's why we cannot simply
+ * delete the pivot tuple with the downlink to the top parent page. See
+ * nbtree/README.
+ */
+ page = BufferGetPage(subtreeparent);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ itemid = PageGetItemId(page, poffset);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ BTreeTupleSetDownLink(itup, topparentrightsib);
+
+ nextoffset = OffsetNumberNext(poffset);
+ PageIndexTupleDelete(page, nextoffset);
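+
+ /*
+ * Schematically: if the subtree parent page held pivot tuples
+ * [..., K1 -> topparent, K2 -> topparentrightsib, ...], it now holds
+ * [..., K1 -> topparentrightsib, ...].  The separator key K1 survives;
+ * only its downlink changed, which is how the key space that used to
+ * belong to topparent moves right.
+ */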
+
+ /*
+ * Mark the leaf page as half-dead, and stamp it with a link to the top
+ * parent page. When the leaf page is also the top parent page, the link
+ * is set to InvalidBlockNumber.
+ */
+ page = BufferGetPage(leafbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ opaque->btpo_flags |= BTP_HALF_DEAD;
+
+ Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
+ MemSet(&trunctuple, 0, sizeof(IndexTupleData));
+ trunctuple.t_info = sizeof(IndexTupleData);
+ if (topparent != leafblkno)
+ BTreeTupleSetTopParent(&trunctuple, topparent);
+ else
+ BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
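+
+ /*
+ * trunctuple is a minimal pivot tuple with no key columns; its block
+ * number field (which would normally hold a downlink) is what
+ * BTreeTupleSetTopParent() uses to record the top parent link consulted
+ * during the second stage of deletion.
+ */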
+
+ if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
+ IndexTupleSize(&trunctuple)))
+ elog(ERROR, "could not overwrite high key in half-dead page");
+
+ /* Must mark buffers dirty before XLogInsert */
+ MarkBufferDirty(subtreeparent);
+ MarkBufferDirty(leafbuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_mark_page_halfdead xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.poffset = poffset;
+ xlrec.leafblk = leafblkno;
+ if (topparent != leafblkno)
+ xlrec.topparent = topparent;
+ else
+ xlrec.topparent = InvalidBlockNumber;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
+ XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
+
+ page = BufferGetPage(leafbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ xlrec.leftblk = opaque->btpo_prev;
+ xlrec.rightblk = opaque->btpo_next;
+
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
+
+ page = BufferGetPage(subtreeparent);
+ PageSetLSN(page, recptr);
+ page = BufferGetPage(leafbuf);
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ _bt_relbuf(rel, subtreeparent);
+ return true;
+}
+
+/*
+ * Second stage of page deletion.
+ *
+ * Unlinks a single page (in the subtree undergoing deletion) from its
+ * siblings. Also marks the page deleted.
+ *
+ * To get rid of the whole subtree, including the leaf page itself, call here
+ * until the leaf page is deleted. The original "top parent" established in
+ * the first stage of deletion is deleted in the first call here, while the
+ * leaf page is deleted in the last call here. Note that the leaf page itself
+ * is often the initial top parent page.
+ *
+ * Returns 'false' if the page could not be unlinked (shouldn't happen). If
+ * the right sibling of the current target page is empty, *rightsib_empty is
+ * set to true, allowing caller to delete the target's right sibling page in
+ * passing. Note that *rightsib_empty is only actually used by caller when
+ * target page is leafbuf, following last call here for leafbuf/the subtree
+ * containing leafbuf. (We always set *rightsib_empty for caller, just to be
+ * consistent.)
+ *
+ * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
+ * On success exit, we'll be holding pin and write lock. On failure exit,
+ * we'll release both pin and lock before returning (we define it that way
+ * to avoid having to reacquire a lock we already released).
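+ *
+ * Summarizing the locking order used below: leafbuf first (when it is not
+ * itself the target), then the target's left sibling, then the target
+ * itself, then the right sibling, and finally the metapage when the fast
+ * root may need to be updated.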
+ */
+static bool
+_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
+ bool *rightsib_empty, BTVacState *vstate)
+{
+ BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
+ IndexBulkDeleteResult *stats = vstate->stats;
+ BlockNumber leafleftsib;
+ BlockNumber leafrightsib;
+ BlockNumber target;
+ BlockNumber leftsib;
+ BlockNumber rightsib;
+ Buffer lbuf = InvalidBuffer;
+ Buffer buf;
+ Buffer rbuf;
+ Buffer metabuf = InvalidBuffer;
+ Page metapg = NULL;
+ BTMetaPageData *metad = NULL;
+ ItemId itemid;
+ Page page;
+ BTPageOpaque opaque;
+ FullTransactionId safexid;
+ bool rightsib_is_rightmost;
+ uint32 targetlevel;
+ IndexTuple leafhikey;
+ BlockNumber leaftopparent;
+
+ page = BufferGetPage(leafbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
+
+ /*
+ * Remember some information about the leaf page.
+ */
+ itemid = PageGetItemId(page, P_HIKEY);
+ leafhikey = (IndexTuple) PageGetItem(page, itemid);
+ target = BTreeTupleGetTopParent(leafhikey);
+ leafleftsib = opaque->btpo_prev;
+ leafrightsib = opaque->btpo_next;
+
+ _bt_unlockbuf(rel, leafbuf);
+
+ /*
+ * Check here, as calling loops will have locks held, preventing
+ * interrupts from being processed.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ /* Unlink the current top parent of the subtree */
+ if (!BlockNumberIsValid(target))
+ {
+ /* Target is leaf page (or leaf page is top parent, if you prefer) */
+ target = leafblkno;
+
+ buf = leafbuf;
+ leftsib = leafleftsib;
+ targetlevel = 0;
+ }
+ else
+ {
+ /* Target is the internal page taken from leaf's top parent link */
+ Assert(target != leafblkno);
+
+ /* Fetch the block number of the target's left sibling */
+ buf = _bt_getbuf(rel, target, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ leftsib = opaque->btpo_prev;
+ targetlevel = opaque->btpo_level;
+ Assert(targetlevel > 0);
+
+ /*
+ * To avoid deadlocks, we'd better drop the target page lock before
+ * going further.
+ */
+ _bt_unlockbuf(rel, buf);
+ }
+
+ /*
+ * We have to lock the pages we need to modify in the standard order:
+ * moving right, then up. Else we will deadlock against other writers.
+ *
+ * So, first lock the leaf page, if it's not the target. Then find and
+ * write-lock the current left sibling of the target page. The sibling
+ * that was current a moment ago could have split, so we may have to move
+ * right.
+ */
+ if (target != leafblkno)
+ _bt_lockbuf(rel, leafbuf, BT_WRITE);
+ if (leftsib != P_NONE)
+ {
+ lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
+ page = BufferGetPage(lbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ while (P_ISDELETED(opaque) || opaque->btpo_next != target)
+ {
+ bool leftsibvalid = true;
+
+ /*
+ * Before we follow the link from the page that was the left
+ * sibling mere moments ago, validate its right link. This
+ * reduces the opportunities for the loop to fail to ever make any
+ * progress in the presence of index corruption.
+ *
+ * Note: we rely on the assumption that there can only be one
+ * vacuum process running at a time (against the same index).
+ */
+ if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
+ leftsib == opaque->btpo_next)
+ leftsibvalid = false;
+
+ leftsib = opaque->btpo_next;
+ _bt_relbuf(rel, lbuf);
+
+ if (!leftsibvalid)
+ {
+ if (target != leafblkno)
+ {
+ /* we have only a pin on target, but pin+lock on leafbuf */
+ ReleaseBuffer(buf);
+ _bt_relbuf(rel, leafbuf);
+ }
+ else
+ {
+ /* we have only a pin on leafbuf */
+ ReleaseBuffer(leafbuf);
+ }
+
+ ereport(LOG,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("valid left sibling for deletion target could not be located: "
+ "left sibling %u of target %u with leafblkno %u and scanblkno %u in index \"%s\"",
+ leftsib, target, leafblkno, scanblkno,
+ RelationGetRelationName(rel))));
+
+ return false;
+ }
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* step right one page */
+ lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
+ page = BufferGetPage(lbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+ }
+ else
+ lbuf = InvalidBuffer;
+
+ /* Next write-lock the target page itself */
+ _bt_lockbuf(rel, buf, BT_WRITE);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * Check page is still empty etc, else abandon deletion. This is just for
+ * paranoia's sake; a half-dead page cannot resurrect because there can be
+ * only one vacuum process running at a time.
+ */
+ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
+ elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",
+ target, RelationGetRelationName(rel));
+
+ if (opaque->btpo_prev != leftsib)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",
+ leftsib, opaque->btpo_prev, target,
+ RelationGetRelationName(rel))));
+
+ if (target == leafblkno)
+ {
+ if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
+ !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
+ elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
+ target, RelationGetRelationName(rel));
+
+ /* Leaf page is also target page: don't set leaftopparent */
+ leaftopparent = InvalidBlockNumber;
+ }
+ else
+ {
+ IndexTuple finaldataitem;
+
+ if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
+ P_ISLEAF(opaque))
+ elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",
+ targetlevel, target, RelationGetRelationName(rel));
+
+ /* Target is internal: set leaftopparent for next call here... */
+ itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
+ finaldataitem = (IndexTuple) PageGetItem(page, itemid);
+ leaftopparent = BTreeTupleGetDownLink(finaldataitem);
+ /* ...except when it would be a redundant pointer-to-self */
+ if (leaftopparent == leafblkno)
+ leaftopparent = InvalidBlockNumber;
+ }
+
+ /* No leaftopparent for level 0 (leaf page) or level 1 target */
+ Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1);
+
+ /*
+ * And next write-lock the (current) right sibling.
+ */
+ rightsib = opaque->btpo_next;
+ rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
+ page = BufferGetPage(rbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (opaque->btpo_prev != target)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("right sibling's left-link doesn't match: "
+ "block %u links to %u instead of expected %u in index \"%s\"",
+ rightsib, opaque->btpo_prev, target,
+ RelationGetRelationName(rel))));
+ rightsib_is_rightmost = P_RIGHTMOST(opaque);
+ *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
+
+ /*
+ * If we are deleting the next-to-last page on the target's level, then
+ * the rightsib is a candidate to become the new fast root. (In theory, it
+ * might be possible to push the fast root even further down, but the odds
+ * of doing so are slim, and the locking considerations daunting.)
+ *
+ * We can safely acquire a lock on the metapage here --- see comments for
+ * _bt_newroot().
+ */
+ if (leftsib == P_NONE && rightsib_is_rightmost)
+ {
+ page = BufferGetPage(rbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_RIGHTMOST(opaque))
+ {
+ /* rightsib will be the only one left on the level */
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+
+ /*
+ * The expected case here is btm_fastlevel == targetlevel+1; if
+ * the fastlevel is <= targetlevel, something is wrong, and we
+ * choose to overwrite it to fix it.
+ */
+ if (metad->btm_fastlevel > targetlevel + 1)
+ {
+ /* no update wanted */
+ _bt_relbuf(rel, metabuf);
+ metabuf = InvalidBuffer;
+ }
+ }
+ }
+
+ /*
+ * Here we begin doing the deletion.
+ */
+
+ /* No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ /*
+ * Update siblings' side-links. Note the target page's side-links will
+ * continue to point to the siblings. Asserts here are just rechecking
+ * things we already verified above.
+ */
+ if (BufferIsValid(lbuf))
+ {
+ page = BufferGetPage(lbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->btpo_next == target);
+ opaque->btpo_next = rightsib;
+ }
+ page = BufferGetPage(rbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->btpo_prev == target);
+ opaque->btpo_prev = leftsib;
+
+ /*
+ * If we deleted a parent of the targeted leaf page, instead of the leaf
+ * itself, update the leaf to point to the next remaining child in the
+ * subtree.
+ *
+ * Note: We rely on the fact that a buffer pin on the leaf page has been
+ * held since leafhikey was initialized. This is safe, though only
+ * because the page was already half-dead at that point. The leaf page
+ * cannot have been modified by any other backend during the period when
+ * no lock was held.
+ */
+ if (target != leafblkno)
+ BTreeTupleSetTopParent(leafhikey, leaftopparent);
+
+ /*
+ * Mark the page itself deleted. It can be recycled when all current
+ * transactions are gone. Storing GetTopTransactionId() would work, but
+ * we're in VACUUM and would not otherwise have an XID. Having already
+ * updated links to the target, ReadNextFullTransactionId() suffices as an
+ * upper bound. Any scan having retained a now-stale link is advertising
+ * in its PGPROC an xmin less than or equal to the value we read here. It
+ * will continue to do so, holding back the xmin horizon, for the duration
+ * of that scan.
+ */
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
+
+ /*
+ * Store upper bound XID that's used to determine when deleted page is no
+ * longer needed as a tombstone
+ */
+ safexid = ReadNextFullTransactionId();
+ BTPageSetDeleted(page, safexid);
+ opaque->btpo_cycleid = 0;
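+
+ /*
+ * (safexid is later compared against the backend's visibility horizon in
+ * _bt_pendingfsm_finalize() and BTPageIsRecyclable() before the deleted
+ * page is finally placed in the FSM for reuse.)
+ */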
+
+ /* And update the metapage, if needed */
+ if (BufferIsValid(metabuf))
+ {
+ /* upgrade metapage if needed */
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
+ _bt_upgrademetapage(metapg);
+ metad->btm_fastroot = rightsib;
+ metad->btm_fastlevel = targetlevel;
+ MarkBufferDirty(metabuf);
+ }
+
+ /* Must mark buffers dirty before XLogInsert */
+ MarkBufferDirty(rbuf);
+ MarkBufferDirty(buf);
+ if (BufferIsValid(lbuf))
+ MarkBufferDirty(lbuf);
+ if (target != leafblkno)
+ MarkBufferDirty(leafbuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_unlink_page xlrec;
+ xl_btree_metadata xlmeta;
+ uint8 xlinfo;
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+
+ XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
+ if (BufferIsValid(lbuf))
+ XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
+ XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
+ if (target != leafblkno)
+ XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
+
+ /* information stored on the target/to-be-unlinked block */
+ xlrec.leftsib = leftsib;
+ xlrec.rightsib = rightsib;
+ xlrec.level = targetlevel;
+ xlrec.safexid = safexid;
+
+ /* information needed to recreate the leaf block (if not the target) */
+ xlrec.leafleftsib = leafleftsib;
+ xlrec.leafrightsib = leafrightsib;
+ xlrec.leaftopparent = leaftopparent;
+
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
+
+ if (BufferIsValid(metabuf))
+ {
+ XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ xlmeta.version = metad->btm_version;
+ xlmeta.root = metad->btm_root;
+ xlmeta.level = metad->btm_level;
+ xlmeta.fastroot = metad->btm_fastroot;
+ xlmeta.fastlevel = metad->btm_fastlevel;
+ xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
+ xlmeta.allequalimage = metad->btm_allequalimage;
+
+ XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
+ xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
+ }
+ else
+ xlinfo = XLOG_BTREE_UNLINK_PAGE;
+
+ recptr = XLogInsert(RM_BTREE_ID, xlinfo);
+
+ if (BufferIsValid(metabuf))
+ {
+ PageSetLSN(metapg, recptr);
+ }
+ page = BufferGetPage(rbuf);
+ PageSetLSN(page, recptr);
+ page = BufferGetPage(buf);
+ PageSetLSN(page, recptr);
+ if (BufferIsValid(lbuf))
+ {
+ page = BufferGetPage(lbuf);
+ PageSetLSN(page, recptr);
+ }
+ if (target != leafblkno)
+ {
+ page = BufferGetPage(leafbuf);
+ PageSetLSN(page, recptr);
+ }
+ }
+
+ END_CRIT_SECTION();
+
+ /* release metapage */
+ if (BufferIsValid(metabuf))
+ _bt_relbuf(rel, metabuf);
+
+ /* release siblings */
+ if (BufferIsValid(lbuf))
+ _bt_relbuf(rel, lbuf);
+ _bt_relbuf(rel, rbuf);
+
+ /* If the target is not leafbuf, we're done with it now -- release it */
+ if (target != leafblkno)
+ _bt_relbuf(rel, buf);
+
+ /*
+ * Maintain pages_newly_deleted, which is simply the number of pages
+ * deleted by the ongoing VACUUM operation.
+ *
+ * Maintain pages_deleted in a way that takes into account how
+ * btvacuumpage() will count deleted pages that have yet to become
+ * scanblkno -- only count the page here when it's not going to get that
+ * treatment later on.
+ */
+ stats->pages_newly_deleted++;
+ if (target <= scanblkno)
+ stats->pages_deleted++;
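+
+ /*
+ * (A target block beyond scanblkno will be visited -- and counted -- by
+ * btvacuumpage() later in the scan, which is why it isn't counted here.)
+ */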
+
+ /*
+ * Remember information about the target page (now a newly deleted page)
+ * in dedicated vstate space for later. The page will be considered as a
+ * candidate to place in the FSM at the end of the current btvacuumscan()
+ * call.
+ */
+ _bt_pendingfsm_add(vstate, target, safexid);
+
+ return true;
+}
+
+/*
+ * Establish how tall the to-be-deleted subtree will be during the first stage
+ * of page deletion.
+ *
+ * Caller's child argument is the block number of the page caller wants to
+ * delete (this is leafbuf's block number, except when we're called
+ * recursively). stack is a search stack leading to it. Note that we will
+ * update the stack entry(s) to reflect current downlink positions --- this is
+ * similar to the corresponding point in page split handling.
+ *
+ * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
+ * false. Returns true on success, in which case caller can use certain
+ * details established here to perform the first stage of deletion. This
+ * function is the last point at which page deletion may be deemed unsafe
+ * (barring index corruption, or unexpected concurrent page deletions).
+ *
+ * We write lock the parent of the root of the to-be-deleted subtree for
+ * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
+ * caller). Caller will have to remove a downlink from *subtreeparent. We
+ * also set a *subtreeparent offset number in *poffset, to indicate the
+ * location of the pivot tuple that contains the relevant downlink.
+ *
+ * The root of the to-be-deleted subtree is called the "top parent". Note
+ * that the leafbuf page is often the final "top parent" page (you can think
+ * of the leafbuf page as a degenerate single page subtree when that happens).
+ * Caller should initialize *topparent to the target leafbuf page block number
+ * (while *topparentrightsib should be set to leafbuf's right sibling block
+ * number). We will update *topparent (and *topparentrightsib) for caller
+ * here, though only when it turns out that caller will delete at least one
+ * internal page (i.e. only when caller needs to store a valid link to the top
+ * parent block in the leafbuf page using BTreeTupleSetTopParent()).
+ */
+static bool
+_bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack,
+ Buffer *subtreeparent, OffsetNumber *poffset,
+ BlockNumber *topparent, BlockNumber *topparentrightsib)
+{
+ BlockNumber parent,
+ leftsibparent;
+ OffsetNumber parentoffset,
+ maxoff;
+ Buffer pbuf;
+ Page page;
+ BTPageOpaque opaque;
+
+ /*
+ * Locate the pivot tuple whose downlink points to "child". Write lock
+ * the parent page itself.
+ */
+ pbuf = _bt_getstackbuf(rel, stack, child);
+ if (pbuf == InvalidBuffer)
+ {
+ /*
+ * Failed to "re-find" a pivot tuple whose downlink matched our child
+ * block number on the parent level -- the index must be corrupt.
+ * Don't even try to delete the leafbuf subtree. Just report the
+ * issue and press on with vacuuming the index.
+ *
+ * Note: _bt_getstackbuf() recovers from concurrent page splits that
+ * take place on the parent level. Its approach is a near-exhaustive
+ * linear search. This also gives it a surprisingly good chance of
+ * recovering in the event of a buggy or inconsistent opclass. But we
+ * don't rely on that here.
+ */
+ ereport(LOG,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
+ RelationGetRelationName(rel), child)));
+ return false;
+ }
+
+ parent = stack->bts_blkno;
+ parentoffset = stack->bts_offset;
+
+ page = BufferGetPage(pbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+ leftsibparent = opaque->btpo_prev;
+
+ /*
+ * _bt_getstackbuf() completes page splits on returned parent buffer when
+ * required.
+ *
+ * In general it's a bad idea for VACUUM to use up more disk space, which
+ * is why page deletion does not finish incomplete page splits most of the
+ * time. We allow this limited exception because the risk is much lower,
+ * and the potential downside of not proceeding is much higher: A single
+ * internal page with the INCOMPLETE_SPLIT flag set might otherwise
+ * prevent us from deleting hundreds of empty leaf pages from one level
+ * down.
+ */
+ Assert(!P_INCOMPLETE_SPLIT(opaque));
+
+ if (parentoffset < maxoff)
+ {
+ /*
+ * Child is not the rightmost child in parent, so it's safe to delete
+ * the subtree whose root/topparent is child page
+ */
+ *subtreeparent = pbuf;
+ *poffset = parentoffset;
+ return true;
+ }
+
+ /*
+ * Child is the rightmost child of parent.
+ *
+ * Since it's the rightmost child of parent, deleting the child (or
+ * deleting the subtree whose root/topparent is the child page) is only
+ * safe when it's also possible to delete the parent.
+ */
+ Assert(parentoffset == maxoff);
+ if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
+ {
+ /*
+ * Child isn't parent's only child, or parent is rightmost on its
+ * entire level. Definitely cannot delete any pages.
+ */
+ _bt_relbuf(rel, pbuf);
+ return false;
+ }
+
+ /*
+ * Now make sure that the parent deletion is itself safe by examining the
+ * child's grandparent page. Recurse, passing the parent page as the
+ * child page (child's grandparent is the parent on the next level up). If
+ * parent deletion is unsafe, then child deletion must also be unsafe (in
+ * which case caller cannot delete any pages at all).
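+ *
+ * For example: if leafbuf is the only child of its parent P, and P is not
+ * the rightmost child of its own parent G, the recursion stops at G.  G
+ * becomes *subtreeparent, P becomes the top parent, and the to-be-deleted
+ * subtree consists of P plus leafbuf.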
+ */
+ *topparent = parent;
+ *topparentrightsib = opaque->btpo_next;
+
+ /*
+ * Release lock on parent before recursing.
+ *
+ * It's OK to release page locks on parent before recursive call locks
+ * grandparent. An internal page can only acquire an entry if the child
+ * is split, but that cannot happen as long as we still hold a lock on the
+ * leafbuf page.
+ */
+ _bt_relbuf(rel, pbuf);
+
+ /*
+ * Before recursing, check that the left sibling of parent (if any) is not
+ * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
+ * parent lock).
+ *
+ * Note: We deliberately avoid completing incomplete splits here.
+ */
+ if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
+ return false;
+
+ /* Recurse to examine child page's grandparent page */
+ return _bt_lock_subtree_parent(rel, parent, stack->bts_parent,
+ subtreeparent, poffset,
+ topparent, topparentrightsib);
+}
+
+/*
+ * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize
+ * optimization.
+ *
+ * Called at the start of a btvacuumscan(). Caller's cleanuponly argument
+ * indicates whether the ongoing VACUUM has not (and will not) call
+ * btbulkdelete().
+ *
+ * We expect to allocate memory inside VACUUM's top-level memory context here.
+ * The working buffer is subject to a limit based on work_mem. Our strategy
+ * when the array can no longer grow within the bounds of that limit is to
+ * stop saving additional newly deleted pages, while proceeding as usual with
+ * the pages that we can fit.
+ */
+void
+_bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
+{
+ int64 maxbufsize;
+
+ /*
+ * Don't bother with optimization in cleanup-only case -- we don't expect
+ * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
+ * can only take place because this optimization didn't work out during
+ * the last VACUUM.
+ */
+ if (cleanuponly)
+ return;
+
+ /*
+ * Cap maximum size of array so that we always respect work_mem. Avoid
+ * int overflow here.
+ */
+ vstate->bufsize = 256;
+ maxbufsize = (work_mem * 1024L) / sizeof(BTPendingFSM);
+ maxbufsize = Min(maxbufsize, INT_MAX);
+ maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
+ /* Stay sane with small work_mem */
+ maxbufsize = Max(maxbufsize, vstate->bufsize);
+ vstate->maxbufsize = maxbufsize;
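+
+ /*
+ * For example, with the default work_mem of 4MB, and assuming
+ * sizeof(BTPendingFSM) is 16 bytes, maxbufsize works out to 262,144
+ * pending pages.
+ */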
+
+ /* Allocate buffer, indicate that there are currently 0 pending pages */
+ vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize);
+ vstate->npendingpages = 0;
+}
+
+/*
+ * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during
+ * the ongoing VACUUM operation) into the free space map -- though only when
+ * it is actually safe to do so by now.
+ *
+ * Called at the end of a btvacuumscan(), just before free space map vacuuming
+ * takes place.
+ *
+ * Frees memory allocated by _bt_pendingfsm_init(), if any.
+ */
+void
+_bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
+{
+ IndexBulkDeleteResult *stats = vstate->stats;
+
+ Assert(stats->pages_newly_deleted >= vstate->npendingpages);
+
+ if (vstate->npendingpages == 0)
+ {
+ /* Just free memory when nothing to do */
+ if (vstate->pendingpages)
+ pfree(vstate->pendingpages);
+
+ return;
+ }
+
+#ifdef DEBUG_BTREE_PENDING_FSM
+
+ /*
+ * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
+ * placing pending pages in the FSM. Note that the optimization will
+ * never be effective without some other backend concurrently consuming an
+ * XID.
+ */
+ pg_usleep(5000000L);
+#endif
+
+ /*
+ * Recompute VACUUM XID boundaries.
+ *
+ * We don't actually care about the oldest non-removable XID. Computing
+ * the oldest such XID has a useful side-effect that we rely on: it
+ * forcibly updates the XID horizon state for this backend. This step is
+ * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
+ * that it is now safe to recycle newly deleted pages without this step.
+ */
+ GetOldestNonRemovableTransactionId(NULL);
+
+ for (int i = 0; i < vstate->npendingpages; i++)
+ {
+ BlockNumber target = vstate->pendingpages[i].target;
+ FullTransactionId safexid = vstate->pendingpages[i].safexid;
+
+ /*
+ * Do the equivalent of checking BTPageIsRecyclable(), but without
+ * accessing the page again a second time.
+ *
+ * Stop as soon as we see a non-recyclable page -- all later pages
+ * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
+ * to the array in safexid order.
+ */
+ if (!GlobalVisCheckRemovableFullXid(NULL, safexid))
+ break;
+
+ RecordFreeIndexPage(rel, target);
+ stats->pages_free++;
+ }
+
+ pfree(vstate->pendingpages);
+}
+
+/*
+ * Maintain array of pages that were deleted during current btvacuumscan()
+ * call, for use in _bt_pendingfsm_finalize()
+ */
+static void
+_bt_pendingfsm_add(BTVacState *vstate,
+ BlockNumber target,
+ FullTransactionId safexid)
+{
+ Assert(vstate->npendingpages <= vstate->bufsize);
+ Assert(vstate->bufsize <= vstate->maxbufsize);
+
+#ifdef USE_ASSERT_CHECKING
+
+ /*
+ * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
+ * array will always be in safexid order (since that is the order that we
+ * save them in here)
+ */
+ if (vstate->npendingpages > 0)
+ {
+ FullTransactionId lastsafexid =
+ vstate->pendingpages[vstate->npendingpages - 1].safexid;
+
+ Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
+ }
+#endif
+
+ /*
+ * If temp buffer reaches maxbufsize/work_mem capacity then we discard
+ * information about this page.
+ *
+ * Note that this also covers the case where we opted to not use the
+ * optimization in _bt_pendingfsm_init().
+ */
+ if (vstate->npendingpages == vstate->maxbufsize)
+ return;
+
+ /* Consider enlarging buffer */
+ if (vstate->npendingpages == vstate->bufsize)
+ {
+ int newbufsize = vstate->bufsize * 2;
+
+ /* Respect work_mem */
+ if (newbufsize > vstate->maxbufsize)
+ newbufsize = vstate->maxbufsize;
+
+ vstate->bufsize = newbufsize;
+ vstate->pendingpages =
+ repalloc(vstate->pendingpages,
+ sizeof(BTPendingFSM) * vstate->bufsize);
+ }
+
+ /* Save metadata for newly deleted page */
+ vstate->pendingpages[vstate->npendingpages].target = target;
+ vstate->pendingpages[vstate->npendingpages].safexid = safexid;
+ vstate->npendingpages++;
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
new file mode 100644
index 0000000..1360ab8
--- /dev/null
+++ b/src/backend/access/nbtree/nbtree.c
@@ -0,0 +1,1446 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtree.c
+ * Implementation of Lehman and Yao's btree management algorithm for
+ * Postgres.
+ *
+ * NOTES
+ * This file contains only the public interface routines.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtree.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/nbtxlog.h"
+#include "access/relscan.h"
+#include "access/xlog.h"
+#include "commands/progress.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "nodes/execnodes.h"
+#include "pgstat.h"
+#include "postmaster/autovacuum.h"
+#include "storage/condition_variable.h"
+#include "storage/indexfsm.h"
+#include "storage/ipc.h"
+#include "storage/lmgr.h"
+#include "storage/smgr.h"
+#include "utils/builtins.h"
+#include "utils/index_selfuncs.h"
+#include "utils/memutils.h"
+
+
+/*
+ * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
+ *
+ * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
+ * a new page; others must wait.
+ *
+ * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
+ * to a new page; some process can start doing that.
+ *
+ * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
+ * We reach this state once for every distinct combination of array keys.
+ */
+typedef enum
+{
+ BTPARALLEL_NOT_INITIALIZED,
+ BTPARALLEL_ADVANCING,
+ BTPARALLEL_IDLE,
+ BTPARALLEL_DONE
+} BTPS_State;
+
+/*
+ * BTParallelScanDescData contains btree specific shared information required
+ * for parallel scan.
+ */
+typedef struct BTParallelScanDescData
+{
+ BlockNumber btps_scanPage; /* latest or next page to be scanned */
+ BTPS_State btps_pageStatus; /* indicates whether next page is
+ * available for scan. see above for
+ * possible states of parallel scan. */
+ int btps_arrayKeyCount; /* count indicating number of array scan
+ * keys processed by parallel scan */
+ slock_t btps_mutex; /* protects above variables */
+ ConditionVariable btps_cv; /* used to synchronize parallel scan */
+} BTParallelScanDescData;
+
+typedef struct BTParallelScanDescData *BTParallelScanDesc;
+
+
+static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state,
+ BTCycleId cycleid);
+static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
+static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
+ IndexTuple posting,
+ OffsetNumber updatedoffset,
+ int *nremaining);
+
+
+/*
+ * Btree handler function: return IndexAmRoutine with access method parameters
+ * and callbacks.
+ */
+Datum
+bthandler(PG_FUNCTION_ARGS)
+{
+ IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
+
+ amroutine->amstrategies = BTMaxStrategyNumber;
+ amroutine->amsupport = BTNProcs;
+ amroutine->amoptsprocnum = BTOPTIONS_PROC;
+ amroutine->amcanorder = true;
+ amroutine->amcanorderbyop = false;
+ amroutine->amcanbackward = true;
+ amroutine->amcanunique = true;
+ amroutine->amcanmulticol = true;
+ amroutine->amoptionalkey = true;
+ amroutine->amsearcharray = true;
+ amroutine->amsearchnulls = true;
+ amroutine->amstorage = false;
+ amroutine->amclusterable = true;
+ amroutine->ampredlocks = true;
+ amroutine->amcanparallel = true;
+ amroutine->amcaninclude = true;
+ amroutine->amusemaintenanceworkmem = false;
+ amroutine->amparallelvacuumoptions =
+ VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
+ amroutine->amkeytype = InvalidOid;
+
+ amroutine->ambuild = btbuild;
+ amroutine->ambuildempty = btbuildempty;
+ amroutine->aminsert = btinsert;
+ amroutine->ambulkdelete = btbulkdelete;
+ amroutine->amvacuumcleanup = btvacuumcleanup;
+ amroutine->amcanreturn = btcanreturn;
+ amroutine->amcostestimate = btcostestimate;
+ amroutine->amoptions = btoptions;
+ amroutine->amproperty = btproperty;
+ amroutine->ambuildphasename = btbuildphasename;
+ amroutine->amvalidate = btvalidate;
+ amroutine->amadjustmembers = btadjustmembers;
+ amroutine->ambeginscan = btbeginscan;
+ amroutine->amrescan = btrescan;
+ amroutine->amgettuple = btgettuple;
+ amroutine->amgetbitmap = btgetbitmap;
+ amroutine->amendscan = btendscan;
+ amroutine->ammarkpos = btmarkpos;
+ amroutine->amrestrpos = btrestrpos;
+ amroutine->amestimateparallelscan = btestimateparallelscan;
+ amroutine->aminitparallelscan = btinitparallelscan;
+ amroutine->amparallelrescan = btparallelrescan;
+
+ PG_RETURN_POINTER(amroutine);
+}
+
+/*
+ * btbuildempty() -- build an empty btree index in the initialization fork
+ */
+void
+btbuildempty(Relation index)
+{
+ Page metapage;
+
+ /* Construct metapage. */
+ metapage = (Page) palloc(BLCKSZ);
+ _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
+
+ /*
+ * Write the page and log it. It might seem that an immediate sync would
+ * be sufficient to guarantee that the file exists on disk, but recovery
+ * itself might remove it while replaying, for example, an
+ * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need
+ * this even when wal_level=minimal.
+ */
+ PageSetChecksumInplace(metapage, BTREE_METAPAGE);
+ smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
+ (char *) metapage, true);
+ log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
+ BTREE_METAPAGE, metapage, true);
+
+ /*
+ * An immediate sync is required even if we xlog'd the page, because the
+ * write did not go through shared_buffers and therefore a concurrent
+ * checkpoint may have moved the redo pointer past our xlog record.
+ */
+ smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+}
+
+/*
+ * btinsert() -- insert an index tuple into a btree.
+ *
+ * Descend the tree recursively, find the appropriate location for our
+ * new tuple, and put it there.
+ */
+bool
+btinsert(Relation rel, Datum *values, bool *isnull,
+ ItemPointer ht_ctid, Relation heapRel,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ IndexInfo *indexInfo)
+{
+ bool result;
+ IndexTuple itup;
+
+ /* generate an index tuple */
+ itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
+ itup->t_tid = *ht_ctid;
+
+ result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
+
+ pfree(itup);
+
+ return result;
+}
+
+/*
+ * btgettuple() -- Get the next tuple in the scan.
+ */
+bool
+btgettuple(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ bool res;
+
+ /* btree indexes are never lossy */
+ scan->xs_recheck = false;
+
+ /*
+ * If we have any array keys, initialize them during first call for a
+ * scan. We can't do this in btrescan because we don't know the scan
+ * direction at that time.
+ */
+ if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
+ {
+ /* punt if we have any unsatisfiable array keys */
+ if (so->numArrayKeys < 0)
+ return false;
+
+ _bt_start_array_keys(scan, dir);
+ }
+
+ /* This loop handles advancing to the next array elements, if any */
+ do
+ {
+ /*
+ * If we've already initialized this scan, we can just advance it in
+ * the appropriate direction. If we haven't done so yet, we call
+ * _bt_first() to get the first item in the scan.
+ */
+ if (!BTScanPosIsValid(so->currPos))
+ res = _bt_first(scan, dir);
+ else
+ {
+ /*
+ * Check to see if we should kill the previously-fetched tuple.
+ */
+ if (scan->kill_prior_tuple)
+ {
+ /*
+ * Yes, remember it for later. (We'll deal with all such
+ * tuples at once right before leaving the index page.) The
+ * test for numKilled overrun is not just paranoia: if the
+ * caller reverses direction in the indexscan then the same
+ * item might get entered multiple times. It's not worth
+ * trying to optimize that, so we don't detect it, but instead
+ * just forget any excess entries.
+ */
+ if (so->killedItems == NULL)
+ so->killedItems = (int *)
+ palloc(MaxTIDsPerBTreePage * sizeof(int));
+ if (so->numKilled < MaxTIDsPerBTreePage)
+ so->killedItems[so->numKilled++] = so->currPos.itemIndex;
+ }
+
+ /*
+ * Now continue the scan.
+ */
+ res = _bt_next(scan, dir);
+ }
+
+ /* If we have a tuple, return it ... */
+ if (res)
+ break;
+ /* ... otherwise see if we have more array keys to deal with */
+ } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
+
+ return res;
+}
+
+/*
+ * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
+ */
+int64
+btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ int64 ntids = 0;
+ ItemPointer heapTid;
+
+ /*
+ * If we have any array keys, initialize them.
+ */
+ if (so->numArrayKeys)
+ {
+ /* punt if we have any unsatisfiable array keys */
+ if (so->numArrayKeys < 0)
+ return ntids;
+
+ _bt_start_array_keys(scan, ForwardScanDirection);
+ }
+
+ /* This loop handles advancing to the next array elements, if any */
+ do
+ {
+ /* Fetch the first page & tuple */
+ if (_bt_first(scan, ForwardScanDirection))
+ {
+ /* Save tuple ID, and continue scanning */
+ heapTid = &scan->xs_heaptid;
+ tbm_add_tuples(tbm, heapTid, 1, false);
+ ntids++;
+
+ for (;;)
+ {
+ /*
+ * Advance to next tuple within page. This is the same as the
+ * easy case in _bt_next().
+ */
+ if (++so->currPos.itemIndex > so->currPos.lastItem)
+ {
+ /* let _bt_next do the heavy lifting */
+ if (!_bt_next(scan, ForwardScanDirection))
+ break;
+ }
+
+ /* Save tuple ID, and continue scanning */
+ heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
+ tbm_add_tuples(tbm, heapTid, 1, false);
+ ntids++;
+ }
+ }
+ /* Now see if we have more array keys to deal with */
+ } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
+
+ return ntids;
+}
+
+/*
+ * btbeginscan() -- start a scan on a btree index
+ */
+IndexScanDesc
+btbeginscan(Relation rel, int nkeys, int norderbys)
+{
+ IndexScanDesc scan;
+ BTScanOpaque so;
+
+ /* no order by operators allowed */
+ Assert(norderbys == 0);
+
+ /* get the scan */
+ scan = RelationGetIndexScan(rel, nkeys, norderbys);
+
+ /* allocate private workspace */
+ so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
+ BTScanPosInvalidate(so->currPos);
+ BTScanPosInvalidate(so->markPos);
+ if (scan->numberOfKeys > 0)
+ so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
+ else
+ so->keyData = NULL;
+
+ so->arrayKeyData = NULL; /* assume no array keys for now */
+ so->numArrayKeys = 0;
+ so->arrayKeys = NULL;
+ so->arrayContext = NULL;
+
+ so->killedItems = NULL; /* until needed */
+ so->numKilled = 0;
+
+ /*
+ * We don't know yet whether the scan will be index-only, so we do not
+ * allocate the tuple workspace arrays until btrescan. However, we set up
+ * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
+ */
+ so->currTuples = so->markTuples = NULL;
+
+ scan->xs_itupdesc = RelationGetDescr(rel);
+
+ scan->opaque = so;
+
+ return scan;
+}
+
+/*
+ * btrescan() -- rescan an index relation
+ */
+void
+btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
+ ScanKey orderbys, int norderbys)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+ /* we aren't holding any read locks, but gotta drop the pins */
+ if (BTScanPosIsValid(so->currPos))
+ {
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _bt_killitems(scan);
+ BTScanPosUnpinIfPinned(so->currPos);
+ BTScanPosInvalidate(so->currPos);
+ }
+
+ so->markItemIndex = -1;
+ so->arrayKeyCount = 0;
+ BTScanPosUnpinIfPinned(so->markPos);
+ BTScanPosInvalidate(so->markPos);
+
+ /*
+ * Allocate tuple workspace arrays, if needed for an index-only scan and
+ * not already done in a previous rescan call. To save on palloc
+ * overhead, both workspaces are allocated as one palloc block; only this
+ * function and btendscan know that.
+ *
+ * NOTE: this data structure also makes it safe to return data from a
+ * "name" column, even though btree name_ops uses an underlying storage
+ * datatype of cstring. The risk there is that "name" is supposed to be
+ * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
+ * However, since we only return data out of tuples sitting in the
+ * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
+ * data out of the markTuples array --- running off the end of memory for
+ * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
+ * adding special-case treatment for name_ops elsewhere.
+ */
+ if (scan->xs_want_itup && so->currTuples == NULL)
+ {
+ so->currTuples = (char *) palloc(BLCKSZ * 2);
+ so->markTuples = so->currTuples + BLCKSZ;
+ }
+
+ /*
+ * Reset the scan keys
+ */
+ if (scankey && scan->numberOfKeys > 0)
+ memmove(scan->keyData,
+ scankey,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+ so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
+
+ /* If any keys are SK_SEARCHARRAY type, set up array-key info */
+ _bt_preprocess_array_keys(scan);
+}
+
+/*
+ * btendscan() -- close down a scan
+ */
+void
+btendscan(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+ /* we aren't holding any read locks, but gotta drop the pins */
+ if (BTScanPosIsValid(so->currPos))
+ {
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _bt_killitems(scan);
+ BTScanPosUnpinIfPinned(so->currPos);
+ }
+
+ so->markItemIndex = -1;
+ BTScanPosUnpinIfPinned(so->markPos);
+
+ /* No need to invalidate positions, the RAM is about to be freed. */
+
+ /* Release storage */
+ if (so->keyData != NULL)
+ pfree(so->keyData);
+ /* so->arrayKeyData and so->arrayKeys are in arrayContext */
+ if (so->arrayContext != NULL)
+ MemoryContextDelete(so->arrayContext);
+ if (so->killedItems != NULL)
+ pfree(so->killedItems);
+ if (so->currTuples != NULL)
+ pfree(so->currTuples);
+ /* so->markTuples should not be pfree'd, see btrescan */
+ pfree(so);
+}
+
+/*
+ * btmarkpos() -- save current scan position
+ */
+void
+btmarkpos(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+ /* There may be an old mark with a pin (but no lock). */
+ BTScanPosUnpinIfPinned(so->markPos);
+
+ /*
+ * Just record the current itemIndex. If we later step to next page
+ * before releasing the marked position, _bt_steppage makes a full copy of
+ * the currPos struct in markPos. If (as often happens) the mark is moved
+ * before we leave the page, we don't have to do that work.
+ */
+ if (BTScanPosIsValid(so->currPos))
+ so->markItemIndex = so->currPos.itemIndex;
+ else
+ {
+ BTScanPosInvalidate(so->markPos);
+ so->markItemIndex = -1;
+ }
+
+ /* Also record the current positions of any array keys */
+ if (so->numArrayKeys)
+ _bt_mark_array_keys(scan);
+}
+
+/*
+ * btrestrpos() -- restore scan to last saved position
+ */
+void
+btrestrpos(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+ /* Restore the marked positions of any array keys */
+ if (so->numArrayKeys)
+ _bt_restore_array_keys(scan);
+
+ if (so->markItemIndex >= 0)
+ {
+ /*
+ * The scan has never moved to a new page since the last mark. Just
+ * restore the itemIndex.
+ *
+ * NB: In this case we can't count on anything in so->markPos to be
+ * accurate.
+ */
+ so->currPos.itemIndex = so->markItemIndex;
+ }
+ else
+ {
+ /*
+ * The scan moved to a new page after last mark or restore, and we are
+ * now restoring to the marked page. We aren't holding any read
+ * locks, but if we're still holding the pin for the current position,
+ * we must drop it.
+ */
+ if (BTScanPosIsValid(so->currPos))
+ {
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _bt_killitems(scan);
+ BTScanPosUnpinIfPinned(so->currPos);
+ }
+
+ if (BTScanPosIsValid(so->markPos))
+ {
+ /* bump pin on mark buffer for assignment to current buffer */
+ if (BTScanPosIsPinned(so->markPos))
+ IncrBufferRefCount(so->markPos.buf);
+ memcpy(&so->currPos, &so->markPos,
+ offsetof(BTScanPosData, items[1]) +
+ so->markPos.lastItem * sizeof(BTScanPosItem));
+ if (so->currTuples)
+ memcpy(so->currTuples, so->markTuples,
+ so->markPos.nextTupleOffset);
+ }
+ else
+ BTScanPosInvalidate(so->currPos);
+ }
+}
+
+/*
+ * btestimateparallelscan -- estimate storage for BTParallelScanDescData
+ */
+Size
+btestimateparallelscan(void)
+{
+ return sizeof(BTParallelScanDescData);
+}
+
+/*
+ * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
+ */
+void
+btinitparallelscan(void *target)
+{
+ BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
+
+ SpinLockInit(&bt_target->btps_mutex);
+ bt_target->btps_scanPage = InvalidBlockNumber;
+ bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
+ bt_target->btps_arrayKeyCount = 0;
+ ConditionVariableInit(&bt_target->btps_cv);
+}
+
+/*
+ * btparallelrescan() -- reset parallel scan
+ */
+void
+btparallelrescan(IndexScanDesc scan)
+{
+ BTParallelScanDesc btscan;
+ ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+
+ Assert(parallel_scan);
+
+ btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+ parallel_scan->ps_offset);
+
+ /*
+ * In theory, we don't need to acquire the spinlock here, because there
+ * shouldn't be any other workers running at this point, but we do so for
+ * consistency.
+ */
+ SpinLockAcquire(&btscan->btps_mutex);
+ btscan->btps_scanPage = InvalidBlockNumber;
+ btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
+ btscan->btps_arrayKeyCount = 0;
+ SpinLockRelease(&btscan->btps_mutex);
+}
+
+/*
+ * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
+ * page. Other scans must wait until we call _bt_parallel_release()
+ * or _bt_parallel_done().
+ *
+ * The return value is true if we successfully seized the scan and false
+ * if we did not. The latter case occurs if no pages remain for the current
+ * set of scankeys.
+ *
+ * If the return value is true, *pageno returns the next or current page
+ * of the scan (depending on the scan direction). An invalid block number
+ * means the scan hasn't yet started, and P_NONE means we've reached the end.
+ * The first time a participating process reaches the last page, it will return
+ * true and set *pageno to P_NONE; after that, further attempts to seize the
+ * scan will return false.
+ *
+ * Callers should ignore the value of pageno if the return value is false.
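+ *
+ * To summarize the state transitions driven from here and the related
+ * functions below: a successful seize moves btps_pageStatus from
+ * NOT_INITIALIZED or IDLE to ADVANCING; _bt_parallel_release() moves it
+ * back to IDLE once the next page is known; _bt_parallel_done() moves it
+ * to DONE when no pages remain for the current set of array keys.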
+ */
+bool
+_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTPS_State pageStatus;
+ bool exit_loop = false;
+ bool status = true;
+ ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+ BTParallelScanDesc btscan;
+
+ *pageno = P_NONE;
+
+ btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+ parallel_scan->ps_offset);
+
+ while (1)
+ {
+ SpinLockAcquire(&btscan->btps_mutex);
+ pageStatus = btscan->btps_pageStatus;
+
+ if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
+ {
+ /* Parallel scan has already advanced to a new set of scankeys. */
+ status = false;
+ }
+ else if (pageStatus == BTPARALLEL_DONE)
+ {
+ /*
+ * We're done with this set of scankeys. This may be the end, or
+ * there could be more sets to try.
+ */
+ status = false;
+ }
+ else if (pageStatus != BTPARALLEL_ADVANCING)
+ {
+ /*
+ * We have successfully seized control of the scan for the purpose
+ * of advancing it to a new page!
+ */
+ btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
+ *pageno = btscan->btps_scanPage;
+ exit_loop = true;
+ }
+ SpinLockRelease(&btscan->btps_mutex);
+ if (exit_loop || !status)
+ break;
+ ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
+ }
+ ConditionVariableCancelSleep();
+
+ return status;
+}
+
+/*
+ * _bt_parallel_release() -- Complete the process of advancing the scan to a
+ * new page. We now have the new value btps_scanPage; some other backend
+ * can now begin advancing the scan.
+ */
+void
+_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
+{
+ ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+ BTParallelScanDesc btscan;
+
+ btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+ parallel_scan->ps_offset);
+
+ SpinLockAcquire(&btscan->btps_mutex);
+ btscan->btps_scanPage = scan_page;
+ btscan->btps_pageStatus = BTPARALLEL_IDLE;
+ SpinLockRelease(&btscan->btps_mutex);
+ ConditionVariableSignal(&btscan->btps_cv);
+}
+
+/*
+ * _bt_parallel_done() -- Mark the parallel scan as complete.
+ *
+ * When there are no pages left to scan, this function should be called to
+ * notify other workers. Otherwise, they might wait forever for the scan to
+ * advance to the next page.
+ */
+void
+_bt_parallel_done(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+ BTParallelScanDesc btscan;
+ bool status_changed = false;
+
+ /* Do nothing, for non-parallel scans */
+ if (parallel_scan == NULL)
+ return;
+
+ btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+ parallel_scan->ps_offset);
+
+ /*
+ * Mark the parallel scan as done for this combination of scan keys,
+ * unless some other process already did so. See also
+ * _bt_advance_array_keys.
+ */
+ SpinLockAcquire(&btscan->btps_mutex);
+ if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
+ btscan->btps_pageStatus != BTPARALLEL_DONE)
+ {
+ btscan->btps_pageStatus = BTPARALLEL_DONE;
+ status_changed = true;
+ }
+ SpinLockRelease(&btscan->btps_mutex);
+
+ /* wake up all the workers associated with this parallel scan */
+ if (status_changed)
+ ConditionVariableBroadcast(&btscan->btps_cv);
+}
+
+/*
+ * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
+ * keys.
+ *
+ * Updates the count of array keys processed for both local and parallel
+ * scans.
+ */
+void
+_bt_parallel_advance_array_keys(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+ BTParallelScanDesc btscan;
+
+ btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+ parallel_scan->ps_offset);
+
+ so->arrayKeyCount++;
+ SpinLockAcquire(&btscan->btps_mutex);
+ if (btscan->btps_pageStatus == BTPARALLEL_DONE)
+ {
+ btscan->btps_scanPage = InvalidBlockNumber;
+ btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
+ btscan->btps_arrayKeyCount++;
+ }
+ SpinLockRelease(&btscan->btps_mutex);
+}
+
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state)
+{
+ Relation rel = info->index;
+ BTCycleId cycleid;
+
+ /* allocate stats if first time through, else re-use existing struct */
+ if (stats == NULL)
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+
+ /* Establish the vacuum cycle ID to use for this scan */
+ /* The ENSURE stuff ensures we clean up shared memory on failure */
+ PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
+ {
+ cycleid = _bt_start_vacuum(rel);
+
+ btvacuumscan(info, stats, callback, callback_state, cycleid);
+ }
+ PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
+ _bt_end_vacuum(rel);
+
+ return stats;
+}
+
+/*
+ * Post-VACUUM cleanup.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
+{
+ BlockNumber num_delpages;
+
+ /* No-op in ANALYZE ONLY mode */
+ if (info->analyze_only)
+ return stats;
+
+ /*
+ * If btbulkdelete was called, we need not do anything (we just maintain
+ * the information used within _bt_vacuum_needs_cleanup() by calling
+ * _bt_set_cleanup_info() below).
+ *
+ * If btbulkdelete was _not_ called, then we have a choice to make: we
+ * must decide whether or not a btvacuumscan() call is needed now (i.e.
+ * whether the ongoing VACUUM operation can entirely avoid a physical scan
+ * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
+ * now.
+ */
+ if (stats == NULL)
+ {
+ /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
+ if (!_bt_vacuum_needs_cleanup(info->index))
+ return NULL;
+
+ /*
+ * Since we aren't going to actually delete any leaf items, there's no
+ * need to go through all the vacuum-cycle-ID pushups here.
+ *
+ * Posting list tuples are a source of inaccuracy for cleanup-only
+ * scans. btvacuumscan() will assume that the number of index tuples
+ * from each page can be used as num_index_tuples, even though
+ * num_index_tuples is supposed to represent the number of TIDs in the
+ * index. This naive approach can underestimate the number of tuples
+ * in the index significantly.
+ *
+ * We handle the problem by making num_index_tuples an estimate in
+ * the cleanup-only case.
+ */
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+ btvacuumscan(info, stats, NULL, NULL, 0);
+ stats->estimated_count = true;
+ }
+
+ /*
+ * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
+ *
+ * num_delpages is the number of deleted pages now in the index that were
+ * not safe to place in the FSM to be recycled just yet. num_delpages is
+ * greater than 0 only when _bt_pagedel() actually deleted pages during
+ * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
+ * have failed to place any newly deleted pages in the FSM just moments
+ * ago. (Actually, there are edge cases where recycling of the current
+ * VACUUM's newly deleted pages does not even become safe by the time the
+ * next VACUUM comes around. See nbtree/README.)
+ */
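+ /*
+ * As a hypothetical example: if pages_deleted is 10 and pages_free is 7
+ * at this point, num_delpages is recorded as 3, i.e. three deleted pages
+ * that could not be recycled yet, which the next VACUUM's
+ * _bt_vacuum_needs_cleanup() call will take into account.
+ */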
+ Assert(stats->pages_deleted >= stats->pages_free);
+ num_delpages = stats->pages_deleted - stats->pages_free;
+ _bt_set_cleanup_info(info->index, num_delpages);
+
+ /*
+ * It's quite possible for us to be fooled by concurrent page splits into
+ * double-counting some index tuples, so disbelieve any total that exceeds
+ * the underlying heap's count ... if we know that accurately. Otherwise
+ * this might just make matters worse.
+ */
+ if (!info->estimated_count)
+ {
+ if (stats->num_index_tuples > info->num_heap_tuples)
+ stats->num_index_tuples = info->num_heap_tuples;
+ }
+
+ return stats;
+}
+
+/*
+ * btvacuumscan --- scan the index for VACUUMing purposes
+ *
+ * This combines the functions of looking for leaf tuples that are deletable
+ * according to the vacuum callback, looking for empty pages that can be
+ * deleted, and looking for old deleted pages that can be recycled. Both
+ * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
+ * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
+ *
+ * The caller is responsible for initially allocating/zeroing a stats struct
+ * and for obtaining a vacuum cycle ID if necessary.
+ */
+static void
+btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state,
+ BTCycleId cycleid)
+{
+ Relation rel = info->index;
+ BTVacState vstate;
+ BlockNumber num_pages;
+ BlockNumber scanblkno;
+ bool needLock;
+
+ /*
+ * Reset fields that track information about the entire index now. This
+ * avoids double-counting in the case where a single VACUUM command
+ * requires multiple scans of the index.
+ *
+ * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
+ * since they track information about the VACUUM command, and so must last
+ * across each call to btvacuumscan().
+ *
+ * (Note that pages_free is treated as state about the whole index, not
+ * the current VACUUM. This is appropriate because RecordFreeIndexPage()
+ * calls are idempotent, and get repeated for the same deleted pages in
+ * some scenarios. The point for us is to track the number of recyclable
+ * pages in the index at the end of the VACUUM command.)
+ */
+ stats->num_pages = 0;
+ stats->num_index_tuples = 0;
+ stats->pages_deleted = 0;
+ stats->pages_free = 0;
+
+ /* Set up info to pass down to btvacuumpage */
+ vstate.info = info;
+ vstate.stats = stats;
+ vstate.callback = callback;
+ vstate.callback_state = callback_state;
+ vstate.cycleid = cycleid;
+
+ /* Create a temporary memory context to run _bt_pagedel in */
+ vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
+ "_bt_pagedel",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /* Initialize vstate fields used by _bt_pendingfsm_finalize */
+ vstate.bufsize = 0;
+ vstate.maxbufsize = 0;
+ vstate.pendingpages = NULL;
+ vstate.npendingpages = 0;
+ /* Consider applying _bt_pendingfsm_finalize optimization */
+ _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
+
+ /*
+ * The outer loop iterates over all index pages except the metapage, in
+ * physical order (we hope the kernel will cooperate in providing
+ * read-ahead for speed). It is critical that we visit all leaf pages,
+ * including ones added after we start the scan, else we might fail to
+ * delete some deletable tuples. Hence, we must repeatedly check the
+ * relation length. We must acquire the relation-extension lock while
+ * doing so to avoid a race condition: if someone else is extending the
+ * relation, there is a window where bufmgr/smgr have created a new
+ * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
+ * we manage to scan such a page here, we'll improperly assume it can be
+ * recycled. Taking the lock synchronizes things enough to prevent a
+ * problem: either num_pages won't include the new page, or _bt_getbuf
+ * already has write lock on the buffer and it will be fully initialized
+ * before we can examine it. (See also vacuumlazy.c, which has the same
+ * issue.) Also, we need not worry if a page is added immediately after
+ * we look; the page splitting code already has write-lock on the left
+ * page before it adds a right page, so we must already have processed any
+ * tuples due to be moved into such a page.
+ *
+ * We can skip locking for new or temp relations, however, since no one
+ * else could be accessing them.
+ */
+ needLock = !RELATION_IS_LOCAL(rel);
+
+ scanblkno = BTREE_METAPAGE + 1;
+ for (;;)
+ {
+ /* Get the current relation length */
+ if (needLock)
+ LockRelationForExtension(rel, ExclusiveLock);
+ num_pages = RelationGetNumberOfBlocks(rel);
+ if (needLock)
+ UnlockRelationForExtension(rel, ExclusiveLock);
+
+ if (info->report_progress)
+ pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
+ num_pages);
+
+ /* Quit if we've scanned the whole relation */
+ if (scanblkno >= num_pages)
+ break;
+ /* Iterate over pages, then loop back to recheck length */
+ for (; scanblkno < num_pages; scanblkno++)
+ {
+ btvacuumpage(&vstate, scanblkno);
+ if (info->report_progress)
+ pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
+ scanblkno);
+ }
+ }
+
+ /* Set statistics num_pages field to final size of index */
+ stats->num_pages = num_pages;
+
+ MemoryContextDelete(vstate.pagedelcontext);
+
+ /*
+ * If there were any calls to _bt_pagedel() during scan of the index then
+ * see if any of the resulting pages can be placed in the FSM now. When
+ * it's not safe we'll have to leave it up to a future VACUUM operation.
+ *
+ * Finally, if we placed any pages in the FSM (either just now or during
+ * the scan), forcibly update the upper-level FSM pages to ensure that
+ * searchers can find them.
+ */
+ _bt_pendingfsm_finalize(rel, &vstate);
+ if (stats->pages_free > 0)
+ IndexFreeSpaceMapVacuum(rel);
+}
+
+/*
+ * btvacuumpage --- VACUUM one page
+ *
+ * This processes a single page for btvacuumscan(). In some cases we must
+ * backtrack to re-examine and VACUUM pages that were the scanblkno during
+ * a previous call here. This is how we handle page splits (that happened
+ * after our cycleid was acquired) whose right half page happened to reuse
+ * a block that we might have processed at some point before it was
+ * recycled (i.e. before the page split).
+ */
+static void
+btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
+{
+ IndexVacuumInfo *info = vstate->info;
+ IndexBulkDeleteResult *stats = vstate->stats;
+ IndexBulkDeleteCallback callback = vstate->callback;
+ void *callback_state = vstate->callback_state;
+ Relation rel = info->index;
+ bool attempt_pagedel;
+ BlockNumber blkno,
+ backtrack_to;
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+
+ blkno = scanblkno;
+
+backtrack:
+
+ attempt_pagedel = false;
+ backtrack_to = P_NONE;
+
+ /* call vacuum_delay_point while not holding any buffer lock */
+ vacuum_delay_point();
+
+ /*
+ * We can't use _bt_getbuf() here because it always applies
+ * _bt_checkpage(), which will barf on an all-zero page. We want to
+ * recycle all-zero pages, not fail. Also, we want to use a nondefault
+ * buffer access strategy.
+ */
+ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+ info->strategy);
+ _bt_lockbuf(rel, buf, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = NULL;
+ if (!PageIsNew(page))
+ {
+ _bt_checkpage(rel, buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+
+ Assert(blkno <= scanblkno);
+ if (blkno != scanblkno)
+ {
+ /*
+ * We're backtracking.
+ *
+ * We followed a right link to a sibling leaf page (a page that
+ * happens to be from a block located before scanblkno). The only
+ * case we want to do anything with is a live leaf page having the
+ * current vacuum cycle ID.
+ *
+ * The page had better be in a state that's consistent with what we
+ * expect. Check for conditions that imply corruption in passing. It
+ * can't be half-dead because only an interrupted VACUUM process can
+ * leave pages in that state, so we'd definitely have dealt with it
+ * back when the page was the scanblkno page (half-dead pages are
+ * always marked fully deleted by _bt_pagedel()). This assumes that
+ * there can be only one vacuum process running at a time.
+ */
+ if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
+ {
+ Assert(false);
+ ereport(LOG,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
+ blkno, scanblkno, RelationGetRelationName(rel))));
+ _bt_relbuf(rel, buf);
+ return;
+ }
+
+ /*
+ * We may have already processed the page in an earlier call, when the
+ * page was scanblkno. This happens when the leaf page split occurred
+ * after the scan began, but before the right sibling page became the
+ * scanblkno.
+ *
+ * Page may also have been deleted by current btvacuumpage() call,
+ * since _bt_pagedel() sometimes deletes the right sibling page of
+ * scanblkno in passing (it does so after we decided where to
+ * backtrack to). We don't need to process this page as a deleted
+ * page a second time now (in fact, it would be wrong to count it as a
+ * deleted page in the bulk delete statistics a second time).
+ */
+ if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
+ {
+ /* Done with current scanblkno (and all lower split pages) */
+ _bt_relbuf(rel, buf);
+ return;
+ }
+ }
+
+ if (!opaque || BTPageIsRecyclable(page))
+ {
+ /* Okay to recycle this page (which could be leaf or internal) */
+ RecordFreeIndexPage(rel, blkno);
+ stats->pages_deleted++;
+ stats->pages_free++;
+ }
+ else if (P_ISDELETED(opaque))
+ {
+ /*
+ * Already deleted page (which could be leaf or internal). Can't
+ * recycle yet.
+ */
+ stats->pages_deleted++;
+ }
+ else if (P_ISHALFDEAD(opaque))
+ {
+ /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
+ attempt_pagedel = true;
+
+ /*
+ * _bt_pagedel() will increment both pages_newly_deleted and
+ * pages_deleted stats in all cases (barring corruption)
+ */
+ }
+ else if (P_ISLEAF(opaque))
+ {
+ OffsetNumber deletable[MaxIndexTuplesPerPage];
+ int ndeletable;
+ BTVacuumPosting updatable[MaxIndexTuplesPerPage];
+ int nupdatable;
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ int nhtidsdead,
+ nhtidslive;
+
+ /*
+ * Trade in the initial read lock for a super-exclusive write lock on
+ * this page. We must get such a lock on every leaf page over the
+ * course of the vacuum scan, whether or not it actually contains any
+ * deletable tuples --- see nbtree/README.
+ */
+ _bt_upgradelockbufcleanup(rel, buf);
+
+ /*
+ * Check whether we need to backtrack to earlier pages. What we are
+ * concerned about is a page split that happened since we started the
+ * vacuum scan. If the split moved tuples on the right half of the
+ * split (i.e. the tuples that sort high) to a block that we already
+ * passed over, then we might have missed the tuples. We need to
+ * backtrack now. (Must do this before possibly clearing btpo_cycleid
+ * or deleting scanblkno page below!)
+ */
+ if (vstate->cycleid != 0 &&
+ opaque->btpo_cycleid == vstate->cycleid &&
+ !(opaque->btpo_flags & BTP_SPLIT_END) &&
+ !P_RIGHTMOST(opaque) &&
+ opaque->btpo_next < scanblkno)
+ backtrack_to = opaque->btpo_next;
+
+ /*
+ * When each VACUUM begins, it determines an OldestXmin cutoff value.
+ * Tuples before the cutoff are removed by VACUUM. Scan over all
+ * items to see which ones need to be deleted according to cutoff
+ * point using callback.
+ */
+ ndeletable = 0;
+ nupdatable = 0;
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ nhtidsdead = 0;
+ nhtidslive = 0;
+ if (callback)
+ {
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ IndexTuple itup;
+
+ itup = (IndexTuple) PageGetItem(page,
+ PageGetItemId(page, offnum));
+
+ /*
+ * Hot Standby assumes that it's okay that XLOG_BTREE_VACUUM
+ * records do not produce their own conflicts. This is safe
+ * as long as the callback function only considers whether the
+ * index tuple refers to pre-cutoff heap tuples that were
+ * certainly already pruned away during VACUUM's initial heap
+ * scan by the time we get here. (heapam's XLOG_HEAP2_PRUNE
+ * records produce conflicts using a latestRemovedXid value
+ * for the pointed-to heap tuples, so there is no need to
+ * produce our own conflict now.)
+ *
+ * Backends with snapshots acquired after a VACUUM starts but
+ * before it finishes could have a visibility cutoff with a
+ * later xid than VACUUM's OldestXmin cutoff. These backends
+ * might happen to opportunistically mark some index tuples
+ * LP_DEAD before we reach them, even though they may be after
+ * our cutoff. We don't try to kill these "extra" index
+ * tuples in _bt_delitems_vacuum(). This keeps things simple,
+ * and allows us to always avoid generating our own conflicts.
+ */
+ Assert(!BTreeTupleIsPivot(itup));
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Regular tuple, standard table TID representation */
+ if (callback(&itup->t_tid, callback_state))
+ {
+ deletable[ndeletable++] = offnum;
+ nhtidsdead++;
+ }
+ else
+ nhtidslive++;
+ }
+ else
+ {
+ BTVacuumPosting vacposting;
+ int nremaining;
+
+ /* Posting list tuple */
+ vacposting = btreevacuumposting(vstate, itup, offnum,
+ &nremaining);
+ if (vacposting == NULL)
+ {
+ /*
+ * All table TIDs from the posting tuple remain, so no
+ * delete or update required
+ */
+ Assert(nremaining == BTreeTupleGetNPosting(itup));
+ }
+ else if (nremaining > 0)
+ {
+
+ /*
+ * Store metadata about posting list tuple in
+ * updatable array for entire page. Existing tuple
+ * will be updated during the later call to
+ * _bt_delitems_vacuum().
+ */
+ Assert(nremaining < BTreeTupleGetNPosting(itup));
+ updatable[nupdatable++] = vacposting;
+ nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
+ }
+ else
+ {
+ /*
+ * All table TIDs from the posting list must be
+ * deleted. We'll delete the index tuple completely
+ * (no update required).
+ */
+ Assert(nremaining == 0);
+ deletable[ndeletable++] = offnum;
+ nhtidsdead += BTreeTupleGetNPosting(itup);
+ pfree(vacposting);
+ }
+
+ nhtidslive += nremaining;
+ }
+ }
+ }
+
+ /*
+ * Apply any needed deletes or updates. We issue just one
+ * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
+ */
+ if (ndeletable > 0 || nupdatable > 0)
+ {
+ Assert(nhtidsdead >= ndeletable + nupdatable);
+ _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
+ nupdatable);
+
+ stats->tuples_removed += nhtidsdead;
+ /* must recompute maxoff */
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /* can't leak memory here */
+ for (int i = 0; i < nupdatable; i++)
+ pfree(updatable[i]);
+ }
+ else
+ {
+ /*
+ * If the leaf page has been split during this vacuum cycle, it
+ * seems worth expending a write to clear btpo_cycleid even if we
+ * don't have any deletions to do. (If we do, _bt_delitems_vacuum
+ * takes care of this.) This ensures we won't process the page
+ * again.
+ *
+ * We treat this like a hint-bit update because there's no need to
+ * WAL-log it.
+ */
+ Assert(nhtidsdead == 0);
+ if (vstate->cycleid != 0 &&
+ opaque->btpo_cycleid == vstate->cycleid)
+ {
+ opaque->btpo_cycleid = 0;
+ MarkBufferDirtyHint(buf, true);
+ }
+ }
+
+ /*
+ * If the leaf page is now empty, try to delete it; else count the
+ * live tuples (live table TIDs in posting lists are counted as
+ * separate live tuples). We don't delete when backtracking, though,
+ * since that would require teaching _bt_pagedel() about backtracking
+ * (doesn't seem worth adding more complexity to deal with that).
+ *
+ * We don't count the number of live TIDs during cleanup-only calls to
+ * btvacuumscan (i.e. when callback is not set). We count the number
+ * of index tuples directly instead. This avoids the expense of
+ * directly examining all of the tuples on each page. VACUUM will
+ * treat num_index_tuples as an estimate in the cleanup-only case, so it
+ * doesn't matter that this underestimates num_index_tuples
+ * significantly in some cases.
+ */
+ if (minoff > maxoff)
+ attempt_pagedel = (blkno == scanblkno);
+ else if (callback)
+ stats->num_index_tuples += nhtidslive;
+ else
+ stats->num_index_tuples += maxoff - minoff + 1;
+
+ Assert(!attempt_pagedel || nhtidslive == 0);
+ }
+
+ if (attempt_pagedel)
+ {
+ MemoryContext oldcontext;
+
+ /* Run pagedel in a temp context to avoid memory leakage */
+ MemoryContextReset(vstate->pagedelcontext);
+ oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
+
+ /*
+ * _bt_pagedel maintains the bulk delete stats on our behalf;
+ * pages_newly_deleted and pages_deleted are likely to be incremented
+ * during call
+ */
+ Assert(blkno == scanblkno);
+ _bt_pagedel(rel, buf, vstate);
+
+ MemoryContextSwitchTo(oldcontext);
+ /* pagedel released buffer, so we shouldn't */
+ }
+ else
+ _bt_relbuf(rel, buf);
+
+ if (backtrack_to != P_NONE)
+ {
+ blkno = backtrack_to;
+ goto backtrack;
+ }
+}
+
+/*
+ * btreevacuumposting --- determine TIDs still needed in posting list
+ *
+ * Returns metadata describing how to build replacement tuple without the TIDs
+ * that VACUUM needs to delete. Returned value is NULL in the common case
+ * where no changes are needed to caller's posting list tuple (we avoid
+ * allocating memory here as an optimization).
+ *
+ * The number of TIDs that should remain in the posting list tuple is set for
+ * caller in *nremaining.
+ */
+static BTVacuumPosting
+btreevacuumposting(BTVacState *vstate, IndexTuple posting,
+ OffsetNumber updatedoffset, int *nremaining)
+{
+ int live = 0;
+ int nitem = BTreeTupleGetNPosting(posting);
+ ItemPointer items = BTreeTupleGetPosting(posting);
+ BTVacuumPosting vacposting = NULL;
+
+ for (int i = 0; i < nitem; i++)
+ {
+ if (!vstate->callback(items + i, vstate->callback_state))
+ {
+ /* Live table TID */
+ live++;
+ }
+ else if (vacposting == NULL)
+ {
+ /*
+ * First dead table TID encountered.
+ *
+ * It's now clear that we need to delete one or more dead table
+ * TIDs, so start maintaining metadata describing how to update
+ * existing posting list tuple.
+ */
+ vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
+ nitem * sizeof(uint16));
+
+ vacposting->itup = posting;
+ vacposting->updatedoffset = updatedoffset;
+ vacposting->ndeletedtids = 0;
+ vacposting->deletetids[vacposting->ndeletedtids++] = i;
+ }
+ else
+ {
+ /* Second or subsequent dead table TID */
+ vacposting->deletetids[vacposting->ndeletedtids++] = i;
+ }
+ }
+
+ *nremaining = live;
+ return vacposting;
+}
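+
+/*
+ * A worked example for btreevacuumposting(), with made-up TIDs: given a
+ * posting list tuple containing {(0,1), (0,2), (0,3), (0,4)} where the
+ * callback reports (0,2) and (0,4) as dead, the returned BTVacuumPosting
+ * has ndeletedtids = 2 and deletetids = {1, 3} (0-based positions within
+ * the posting list), and *nremaining is set to 2.  When every TID is still
+ * live, NULL is returned and no memory is allocated.
+ */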
+
+/*
+ * btcanreturn() -- Check whether btree indexes support index-only scans.
+ *
+ * btrees always do, so this is trivial.
+ */
+bool
+btcanreturn(Relation index, int attno)
+{
+ return true;
+}
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
new file mode 100644
index 0000000..fdf0e56
--- /dev/null
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -0,0 +1,2501 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtsearch.c
+ * Search code for postgres btrees.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtsearch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/relscan.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/predicate.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+
+
+static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
+static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
+static int _bt_binsrch_posting(BTScanInsert key, Page page,
+ OffsetNumber offnum);
+static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
+ OffsetNumber offnum);
+static void _bt_saveitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, IndexTuple itup);
+static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, ItemPointer heapTid,
+ IndexTuple itup);
+static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum,
+ ItemPointer heapTid, int tupleOffset);
+static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
+static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
+static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
+ ScanDirection dir);
+static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
+static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
+static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
+
+
+/*
+ * _bt_drop_lock_and_maybe_pin()
+ *
+ * Unlock the buffer; and if it is safe to release the pin, do that, too. It
+ * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
+ * This will prevent vacuum from stalling in a blocked state trying to read a
+ * page when a cursor is sitting on it -- at least in many important cases.
+ *
+ * Set the buffer to invalid if the pin is released, since the buffer may be
+ * re-used. If we need to go back to this block (for example, to apply
+ * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it
+ * will remain in shared memory for as long as it takes to scan the index
+ * buffer page.
+ */
+static void
+_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
+{
+ _bt_unlockbuf(scan->indexRelation, sp->buf);
+
+ if (IsMVCCSnapshot(scan->xs_snapshot) &&
+ RelationNeedsWAL(scan->indexRelation) &&
+ !scan->xs_want_itup)
+ {
+ ReleaseBuffer(sp->buf);
+ sp->buf = InvalidBuffer;
+ }
+}
+
+/*
+ * _bt_search() -- Search the tree for a particular scankey,
+ * or more precisely for the first leaf page it could be on.
+ *
+ * The passed scankey is an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
+ * Return value is a stack of parent-page pointers (i.e. there is no entry for
+ * the leaf level/page). *bufP is set to the address of the leaf-page buffer,
+ * which is locked and pinned. No locks are held on the parent pages,
+ * however!
+ *
+ * If the snapshot parameter is not NULL, "old snapshot" checking will take
+ * place during the descent through the tree. This is not needed when
+ * positioning for an insert or delete, so NULL is used for those cases.
+ *
+ * The returned buffer is locked according to access parameter. Additionally,
+ * access = BT_WRITE will allow an empty root page to be created and returned.
+ * When access = BT_READ, an empty index will result in *bufP being set to
+ * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
+ * during the search will be finished.
+ */
+BTStack
+_bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
+ Snapshot snapshot)
+{
+ BTStack stack_in = NULL;
+ int page_access = BT_READ;
+
+ /* Get the root page to start with */
+ *bufP = _bt_getroot(rel, access);
+
+ /* If index is empty and access = BT_READ, no root page is created. */
+ if (!BufferIsValid(*bufP))
+ return (BTStack) NULL;
+
+ /* Loop iterates once per level descended in the tree */
+ for (;;)
+ {
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber offnum;
+ ItemId itemid;
+ IndexTuple itup;
+ BlockNumber child;
+ BTStack new_stack;
+
+ /*
+ * Race -- the page we just grabbed may have split since we read its
+ * downlink in its parent page (or the metapage). If it has, we may
+ * need to move right to its new sibling. Do that.
+ *
+ * In write-mode, allow _bt_moveright to finish any incomplete splits
+ * along the way. Strictly speaking, we'd only need to finish an
+ * incomplete split on the leaf page we're about to insert to, not on
+ * any of the upper levels (internal pages with incomplete splits are
+ * also taken care of in _bt_getstackbuf). But this is a good
+ * opportunity to finish splits of internal pages too.
+ */
+ *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
+ page_access, snapshot);
+
+ /* if this is a leaf page, we're done */
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_ISLEAF(opaque))
+ break;
+
+ /*
+ * Find the appropriate pivot tuple on this page. Its downlink points
+ * to the child page that we're about to descend to.
+ */
+ offnum = _bt_binsrch(rel, key, *bufP);
+ itemid = PageGetItemId(page, offnum);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
+ child = BTreeTupleGetDownLink(itup);
+
+ /*
+ * We need to save the location of the pivot tuple we chose in a new
+ * stack entry for this page/level. If caller ends up splitting a
+ * page one level down, it usually ends up inserting a new pivot
+ * tuple/downlink immediately after the location recorded here.
+ */
+ new_stack = (BTStack) palloc(sizeof(BTStackData));
+ new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
+ new_stack->bts_offset = offnum;
+ new_stack->bts_parent = stack_in;
+
+ /*
+ * Page level 1 is the lowest non-leaf level, just above the leaves.
+ * So, if we're at level 1 and were asked to lock the leaf page in write
+ * mode, then lock the next page down in write mode, because it must be
+ * a leaf.
+ */
+ if (opaque->btpo_level == 1 && access == BT_WRITE)
+ page_access = BT_WRITE;
+
+ /* drop the read lock on the page, then acquire one on its child */
+ *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
+
+ /* okay, all set to move down a level */
+ stack_in = new_stack;
+ }
+
+ /*
+ * If we're asked to lock leaf in write mode, but didn't manage to, then
+ * relock. This should only happen when the root page is a leaf page (and
+ * the only page in the index other than the metapage).
+ */
+ if (access == BT_WRITE && page_access == BT_READ)
+ {
+ /* trade in our read lock for a write lock */
+ _bt_unlockbuf(rel, *bufP);
+ _bt_lockbuf(rel, *bufP, BT_WRITE);
+
+ /*
+ * Race -- the leaf page may have split after we dropped the read lock
+ * but before we acquired a write lock. If it has, we may need to
+ * move right to its new sibling. Do that.
+ */
+ *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
+ snapshot);
+ }
+
+ return stack_in;
+}
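+
+/*
+ * For illustration: in a three-level index (root at level 2, one internal
+ * level at level 1, leaves at level 0), _bt_search() pushes one BTStackData
+ * entry for the root and one for the level-1 page it descends through.  The
+ * returned stack's first entry describes the leaf page's immediate parent
+ * (the level-1 page), and its bts_parent link leads to the root's entry;
+ * there is no entry for the leaf page itself.
+ */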
+
+/*
+ * _bt_moveright() -- move right in the btree if necessary.
+ *
+ * When we follow a pointer to reach a page, it is possible that
+ * the page has changed in the meanwhile. If this happens, we're
+ * guaranteed that the page has "split right" -- that is, that any
+ * data that appeared on the page originally is either on the page
+ * or strictly to the right of it.
+ *
+ * This routine decides whether or not we need to move right in the
+ * tree by examining the high key entry on the page. If that entry is
+ * strictly less than the scankey, or <= the scankey in the
+ * key.nextkey=true case, then we followed the wrong link and we need
+ * to move right.
+ *
+ * The passed insertion-type scankey can omit the rightmost column(s) of the
+ * index. (see nbtree/README)
+ *
+ * When key.nextkey is false (the usual case), we are looking for the first
+ * item >= key. When key.nextkey is true, we are looking for the first item
+ * strictly greater than key.
+ *
+ * If forupdate is true, we will attempt to finish any incomplete splits
+ * that we encounter. This is required when locking a target page for an
+ * insertion, because we don't allow inserting on a page before the split
+ * is completed. 'stack' is only used if forupdate is true.
+ *
+ * On entry, we have the buffer pinned and a lock of the type specified by
+ * 'access'. If we move right, we release the buffer and lock and acquire
+ * the same on the right sibling. Return value is the buffer we stop at.
+ *
+ * If the snapshot parameter is not NULL, "old snapshot" checking will take
+ * place during the descent through the tree. This is not needed when
+ * positioning for an insert or delete, so NULL is used for those cases.
+ */
+Buffer
+_bt_moveright(Relation rel,
+ BTScanInsert key,
+ Buffer buf,
+ bool forupdate,
+ BTStack stack,
+ int access,
+ Snapshot snapshot)
+{
+ Page page;
+ BTPageOpaque opaque;
+ int32 cmpval;
+
+ /*
+ * When nextkey = false (normal case): if the scan key that brought us to
+ * this page is > the high key stored on the page, then the page has split
+ * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
+ * have some duplicates to the right as well as the left, but that's
+ * something that's only ever dealt with on the leaf level, after
+ * _bt_search has found an initial leaf page.)
+ *
+ * When nextkey = true: move right if the scan key is >= page's high key.
+ * (Note that key.scantid cannot be set in this case.)
+ *
+ * The page could even have split more than once, so scan as far as
+ * needed.
+ *
+ * We also have to move right if we followed a link that brought us to a
+ * dead page.
+ */
+ cmpval = key->nextkey ? 0 : 1;
+
+ for (;;)
+ {
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (P_RIGHTMOST(opaque))
+ break;
+
+ /*
+ * Finish any incomplete splits we encounter along the way.
+ */
+ if (forupdate && P_INCOMPLETE_SPLIT(opaque))
+ {
+ BlockNumber blkno = BufferGetBlockNumber(buf);
+
+ /* upgrade our lock if necessary */
+ if (access == BT_READ)
+ {
+ _bt_unlockbuf(rel, buf);
+ _bt_lockbuf(rel, buf, BT_WRITE);
+ }
+
+ if (P_INCOMPLETE_SPLIT(opaque))
+ _bt_finish_split(rel, buf, stack);
+ else
+ _bt_relbuf(rel, buf);
+
+ /* re-acquire the lock in the right mode, and re-check */
+ buf = _bt_getbuf(rel, blkno, access);
+ continue;
+ }
+
+ if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
+ {
+ /* step right one page */
+ buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
+ continue;
+ }
+ else
+ break;
+ }
+
+ if (P_IGNORE(opaque))
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+
+ return buf;
+}
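+
+/*
+ * Example of the cmpval convention used above (hypothetical keys): with
+ * nextkey = false, cmpval is 1, so we step right only when the scankey
+ * compares strictly greater than the page's high key; with nextkey = true,
+ * cmpval is 0, so we also step right on equality.  If a search for 'm'
+ * lands on a page whose high key is 'k' because of a concurrent split, the
+ * comparison result is >= cmpval and we follow btpo_next.
+ */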
+
+/*
+ * _bt_binsrch() -- Do a binary search for a key on a particular page.
+ *
+ * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
+ * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
+ * particular, this means it is possible to return a value 1 greater than the
+ * number of keys on the page, if the scankey is > all keys on the page.)
+ *
+ * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
+ * of the last key < given scankey, or last key <= given scankey if nextkey
+ * is true. (Since _bt_compare treats the first data key of such a page as
+ * minus infinity, there will be at least one key < scankey, so the result
+ * always points at one of the keys on the page.) This key indicates the
+ * right place to descend to be sure we find all leaf keys >= given scankey
+ * (or leaf keys > given scankey when nextkey is true).
+ *
+ * This procedure is not responsible for walking right, it just examines
+ * the given page. _bt_binsrch() has no lock or refcount side effects
+ * on the buffer.
+ */
+static OffsetNumber
+_bt_binsrch(Relation rel,
+ BTScanInsert key,
+ Buffer buf)
+{
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber low,
+ high;
+ int32 result,
+ cmpval;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* Requesting nextkey semantics while using scantid seems nonsensical */
+ Assert(!key->nextkey || key->scantid == NULL);
+ /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
+ Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
+
+ low = P_FIRSTDATAKEY(opaque);
+ high = PageGetMaxOffsetNumber(page);
+
+ /*
+ * If there are no keys on the page, return the first available slot. Note
+ * this covers two cases: the page is really empty (no keys), or it
+ * contains only a high key. The latter case is possible after vacuuming.
+ * This can never happen on an internal page, however, since they are
+ * never empty (an internal page must have children).
+ */
+ if (unlikely(high < low))
+ return low;
+
+ /*
+ * Binary search to find the first key on the page >= scan key, or first
+ * key > scankey when nextkey is true.
+ *
+ * For nextkey=false (cmpval=1), the loop invariant is: all slots before
+ * 'low' are < scan key, all slots at or after 'high' are >= scan key.
+ *
+ * For nextkey=true (cmpval=0), the loop invariant is: all slots before
+ * 'low' are <= scan key, all slots at or after 'high' are > scan key.
+ *
+ * We can fall out when high == low.
+ */
+ high++; /* establish the loop invariant for high */
+
+ cmpval = key->nextkey ? 0 : 1; /* select comparison value */
+
+ while (high > low)
+ {
+ OffsetNumber mid = low + ((high - low) / 2);
+
+ /* We have low <= mid < high, so mid points at a real slot */
+
+ result = _bt_compare(rel, key, page, mid);
+
+ if (result >= cmpval)
+ low = mid + 1;
+ else
+ high = mid;
+ }
+
+ /*
+ * At this point we have high == low, but be careful: they could point
+ * past the last slot on the page.
+ *
+ * On a leaf page, we always return the first key >= scan key (resp. >
+ * scan key), which could be the last slot + 1.
+ */
+ if (P_ISLEAF(opaque))
+ return low;
+
+ /*
+ * On a non-leaf page, return the last key < scan key (resp. <= scan key).
+ * There must be one if _bt_compare() is playing by the rules.
+ */
+ Assert(low > P_FIRSTDATAKEY(opaque));
+
+ return OffsetNumberPrev(low);
+}
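+
+/*
+ * Worked example of the result conventions above (made-up keys): on a leaf
+ * page holding keys {10, 20, 30}, a scankey of 20 returns the offset of 20
+ * when nextkey is false and the offset of 30 when nextkey is true, while a
+ * scankey of 40 returns one past the last offset.  On an internal page with
+ * the same keys, a scankey of 25 returns the offset of 20 (the last key
+ * less than the scankey), which is the downlink to follow.
+ */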
+
+/*
+ * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
+ *
+ * Like _bt_binsrch(), but with support for caching the binary search
+ * bounds. Only used during insertion, and only on the leaf page that it
+ * looks like caller will insert tuple on. Exclusive-locked and pinned
+ * leaf page is contained within insertstate.
+ *
+ * Caches the bounds fields in insertstate so that a subsequent call can
+ * reuse the low and strict high bounds of original binary search. Callers
+ * that use these fields directly must be prepared for the case where low
+ * and/or stricthigh are not on the same page (one or both exceed maxoff
+ * for the page). The case where there are no items on the page (high <
+ * low) makes bounds invalid.
+ *
+ * Caller is responsible for invalidating bounds when it modifies the page
+ * before calling here a second time, and for dealing with posting list
+ * tuple matches (callers can use insertstate's postingoff field to
+ * determine which existing heap TID will need to be replaced by a posting
+ * list split).
+ */
+OffsetNumber
+_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
+{
+ BTScanInsert key = insertstate->itup_key;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber low,
+ high,
+ stricthigh;
+ int32 result,
+ cmpval;
+
+ page = BufferGetPage(insertstate->buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ Assert(P_ISLEAF(opaque));
+ Assert(!key->nextkey);
+ Assert(insertstate->postingoff == 0);
+
+ if (!insertstate->bounds_valid)
+ {
+ /* Start new binary search */
+ low = P_FIRSTDATAKEY(opaque);
+ high = PageGetMaxOffsetNumber(page);
+ }
+ else
+ {
+ /* Restore result of previous binary search against same page */
+ low = insertstate->low;
+ high = insertstate->stricthigh;
+ }
+
+ /* If there are no keys on the page, return the first available slot */
+ if (unlikely(high < low))
+ {
+ /* Caller can't reuse bounds */
+ insertstate->low = InvalidOffsetNumber;
+ insertstate->stricthigh = InvalidOffsetNumber;
+ insertstate->bounds_valid = false;
+ return low;
+ }
+
+ /*
+ * Binary search to find the first key on the page >= scan key. (nextkey
+ * is always false when inserting).
+ *
+ * The loop invariant is: all slots before 'low' are < scan key, all slots
+ * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
+ * maintained to save additional search effort for caller.
+ *
+ * We can fall out when high == low.
+ */
+ if (!insertstate->bounds_valid)
+ high++; /* establish the loop invariant for high */
+ stricthigh = high; /* high initially strictly higher */
+
+ cmpval = 1; /* !nextkey comparison value */
+
+ while (high > low)
+ {
+ OffsetNumber mid = low + ((high - low) / 2);
+
+ /* We have low <= mid < high, so mid points at a real slot */
+
+ result = _bt_compare(rel, key, page, mid);
+
+ if (result >= cmpval)
+ low = mid + 1;
+ else
+ {
+ high = mid;
+ if (result != 0)
+ stricthigh = high;
+ }
+
+ /*
+ * If tuple at offset located by binary search is a posting list whose
+ * TID range overlaps with caller's scantid, perform posting list
+ * binary search to set postingoff for caller. Caller must split the
+ * posting list when postingoff is set. This should happen
+ * infrequently.
+ */
+ if (unlikely(result == 0 && key->scantid != NULL))
+ {
+ /*
+ * postingoff should never be set more than once per leaf page
+ * binary search. That would mean that there are duplicate table
+ * TIDs in the index, which is never okay. Check for that here.
+ */
+ if (insertstate->postingoff != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
+ ItemPointerGetBlockNumber(key->scantid),
+ ItemPointerGetOffsetNumber(key->scantid),
+ low, stricthigh,
+ BufferGetBlockNumber(insertstate->buf),
+ RelationGetRelationName(rel))));
+
+ insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
+ }
+ }
+
+ /*
+ * On a leaf page, a binary search always returns the first key >= scan
+ * key (at least in !nextkey case), which could be the last slot + 1. This
+ * is also the lower bound of cached search.
+ *
+ * stricthigh may also be the last slot + 1, which prevents caller from
+ * using bounds directly, but is still useful to us if we're called a
+ * second time with cached bounds (cached low will be < stricthigh when
+ * that happens).
+ */
+ insertstate->low = low;
+ insertstate->stricthigh = stricthigh;
+ insertstate->bounds_valid = true;
+
+ return low;
+}
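+
+/*
+ * Illustration of the cached-bounds optimization (hypothetical offsets): if
+ * an initial call on a leaf page computes low = 5 and stricthigh = 8, then a
+ * later call against the same, unmodified page (with bounds_valid still set)
+ * restarts its binary search over the narrowed range [5, 8) rather than over
+ * the whole page.
+ */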
+
+/*----------
+ * _bt_binsrch_posting() -- posting list binary search.
+ *
+ * Helper routine for _bt_binsrch_insert().
+ *
+ * Returns offset into posting list where caller's scantid belongs.
+ *----------
+ */
+static int
+_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
+{
+ IndexTuple itup;
+ ItemId itemid;
+ int low,
+ high,
+ mid,
+ res;
+
+ /*
+ * If this isn't a posting tuple, then the index must be corrupt (if it is
+ * an ordinary non-pivot tuple then there must be an existing tuple with a
+ * heap TID that equals inserter's new heap TID/scantid). Defensively
+ * check that tuple is a posting list tuple whose posting list range
+ * includes caller's scantid.
+ *
+ * (This is also needed because contrib/amcheck's rootdescend option needs
+ * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
+ */
+ itemid = PageGetItemId(page, offnum);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ if (!BTreeTupleIsPosting(itup))
+ return 0;
+
+ Assert(key->heapkeyspace && key->allequalimage);
+
+ /*
+ * In the event that posting list tuple has LP_DEAD bit set, indicate this
+ * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
+ * second call to _bt_binsrch_insert() can take place when its caller has
+ * removed the dead item.
+ */
+ if (ItemIdIsDead(itemid))
+ return -1;
+
+ /* "high" is past end of posting list for loop invariant */
+ low = 0;
+ high = BTreeTupleGetNPosting(itup);
+ Assert(high >= 2);
+
+ while (high > low)
+ {
+ mid = low + ((high - low) / 2);
+ res = ItemPointerCompare(key->scantid,
+ BTreeTupleGetPostingN(itup, mid));
+
+ if (res > 0)
+ low = mid + 1;
+ else if (res < 0)
+ high = mid;
+ else
+ return mid;
+ }
+
+ /* Exact match not found */
+ return low;
+}
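+
+/*
+ * For example (made-up TIDs): given a posting list {(3,1), (5,2), (9,7)},
+ * a scantid of (5,2) returns 1 (an exact match at that position), while a
+ * scantid of (6,1) returns 2, the position where it would have to be
+ * inserted.  A return value of -1 is reserved for the LP_DEAD case above.
+ */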
+
+/*----------
+ * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
+ *
+ * page/offnum: location of btree item to be compared to.
+ *
+ * This routine returns:
+ * <0 if scankey < tuple at offnum;
+ * 0 if scankey == tuple at offnum;
+ * >0 if scankey > tuple at offnum.
+ *
+ * NULLs in the keys are treated as sortable values. Therefore
+ * "equality" does not necessarily mean that the item should be returned
+ * to the caller as a matching key. Similarly, an insertion scankey
+ * with its scantid set is treated as equal to a posting tuple whose TID
+ * range overlaps with that scantid. There generally won't be a
+ * matching TID in the posting tuple, which the caller must handle
+ * itself (e.g., by splitting the posting list tuple).
+ *
+ * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
+ * "minus infinity": this routine will always claim it is less than the
+ * scankey. The actual key value stored is explicitly truncated to 0
+ * attributes (explicitly minus infinity) with version 3+ indexes, but
+ * that isn't relied upon. This allows us to implement the Lehman and
+ * Yao convention that the first down-link pointer is before the first
+ * key. See backend/access/nbtree/README for details.
+ *----------
+ */
+int32
+_bt_compare(Relation rel,
+ BTScanInsert key,
+ Page page,
+ OffsetNumber offnum)
+{
+ TupleDesc itupdesc = RelationGetDescr(rel);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ IndexTuple itup;
+ ItemPointer heapTid;
+ ScanKey scankey;
+ int ncmpkey;
+ int ntupatts;
+ int32 result;
+
+ Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
+ Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
+ Assert(key->heapkeyspace || key->scantid == NULL);
+
+ /*
+ * Force result ">" if target item is first data item on an internal page
+ * --- see NOTE above.
+ */
+ if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
+ return 1;
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ ntupatts = BTreeTupleGetNAtts(itup, rel);
+
+ /*
+ * The scan key is set up with the attribute number associated with each
+ * term in the key. It is important that, if the index is multi-key, the
+ * scan contain the first k key attributes, and that they be in order. If
+ * you think about how multi-key ordering works, you'll understand why
+ * this is.
+ *
+ * We don't test for violation of this condition here, however. The
+ * initial setup for the index scan had better have gotten it right (see
+ * _bt_first).
+ */
+
+ ncmpkey = Min(ntupatts, key->keysz);
+ Assert(key->heapkeyspace || ncmpkey == key->keysz);
+ Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
+ scankey = key->scankeys;
+ for (int i = 1; i <= ncmpkey; i++)
+ {
+ Datum datum;
+ bool isNull;
+
+ datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
+
+ if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
+ {
+ if (isNull)
+ result = 0; /* NULL "=" NULL */
+ else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
+ result = -1; /* NULL "<" NOT_NULL */
+ else
+ result = 1; /* NULL ">" NOT_NULL */
+ }
+ else if (isNull) /* key is NOT_NULL and item is NULL */
+ {
+ if (scankey->sk_flags & SK_BT_NULLS_FIRST)
+ result = 1; /* NOT_NULL ">" NULL */
+ else
+ result = -1; /* NOT_NULL "<" NULL */
+ }
+ else
+ {
+ /*
+ * The sk_func needs to be passed the index value as left arg and
+ * the sk_argument as right arg (they might be of different
+ * types). Since it is convenient for callers to think of
+ * _bt_compare as comparing the scankey to the index item, we have
+ * to flip the sign of the comparison result. (Unless it's a DESC
+ * column, in which case we *don't* flip the sign.)
+ */
+ result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
+ scankey->sk_collation,
+ datum,
+ scankey->sk_argument));
+
+ if (!(scankey->sk_flags & SK_BT_DESC))
+ INVERT_COMPARE_RESULT(result);
+ }
+
+ /* if the keys are unequal, return the difference */
+ if (result != 0)
+ return result;
+
+ scankey++;
+ }
+
+ /*
+ * All non-truncated attributes (other than heap TID) were found to be
+ * equal. Treat truncated attributes as minus infinity when scankey has a
+ * key attribute value that would otherwise be compared directly.
+ *
+ * Note: it doesn't matter if ntupatts includes non-key attributes;
+ * scankey won't, so explicitly excluding non-key attributes isn't
+ * necessary.
+ */
+ if (key->keysz > ntupatts)
+ return 1;
+
+ /*
+ * Use the heap TID attribute and scantid to try to break the tie. The
+ * rules are the same as any other key attribute -- only the
+ * representation differs.
+ */
+ heapTid = BTreeTupleGetHeapTID(itup);
+ if (key->scantid == NULL)
+ {
+ /*
+ * Most searches have a scankey that is considered greater than a
+ * truncated pivot tuple if and when the scankey has equal values for
+ * attributes up to and including the least significant untruncated
+ * attribute in tuple.
+ *
+ * For example, if an index has the minimum two attributes (single
+ * user key attribute, plus heap TID attribute), and a page's high key
+ * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
+ * will not descend to the page to the left. The search will descend
+ * right instead. The truncated attribute in pivot tuple means that
+ * all non-pivot tuples on the page to the left are strictly < 'foo',
+ * so it isn't necessary to descend left. In other words, search
+ * doesn't have to descend left because it isn't interested in a match
+ * that has a heap TID value of -inf.
+ *
+ * However, some searches (pivotsearch searches) actually require that
+ * we descend left when this happens. -inf is treated as a possible
+ * match for omitted scankey attribute(s). This is needed by page
+ * deletion, which must re-find leaf pages that are targets for
+ * deletion using their high keys.
+ *
+ * Note: the heap TID part of the test ensures that scankey is being
+ * compared to a pivot tuple with one or more truncated key
+ * attributes.
+ *
+ * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
+ * left here, since they have no heap TID attribute (and cannot have
+ * any -inf key values in any case, since truncation can only remove
+ * non-key attributes). !heapkeyspace searches must always be
+ * prepared to deal with matches on both sides of the pivot once the
+ * leaf level is reached.
+ */
+ if (key->heapkeyspace && !key->pivotsearch &&
+ key->keysz == ntupatts && heapTid == NULL)
+ return 1;
+
+ /* All provided scankey arguments found to be equal */
+ return 0;
+ }
+
+ /*
+ * Treat truncated heap TID as minus infinity, since scankey has a key
+ * attribute value (scantid) that would otherwise be compared directly
+ */
+ Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
+ if (heapTid == NULL)
+ return 1;
+
+ /*
+ * Scankey must be treated as equal to a posting list tuple if its scantid
+ * value falls within the range of the posting list. In all other cases
+ * there can only be a single heap TID value, which is compared directly
+ * with scantid.
+ */
+ Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
+ result = ItemPointerCompare(key->scantid, heapTid);
+ if (result <= 0 || !BTreeTupleIsPosting(itup))
+ return result;
+ else
+ {
+ result = ItemPointerCompare(key->scantid,
+ BTreeTupleGetMaxHeapTID(itup));
+ if (result > 0)
+ return 1;
+ }
+
+ return 0;
+}
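+
+/*
+ * Sign convention example (hypothetical values): comparing a scankey value
+ * of 5 against an index attribute value of 7, the support function is
+ * called as cmp(7, 5) and returns a positive number; because this is an
+ * ASC column the result is inverted, so _bt_compare() reports a negative
+ * value, meaning "scankey < tuple".  For a DESC column the raw result is
+ * kept as is.
+ */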
+
+/*
+ * _bt_first() -- Find the first item in a scan.
+ *
+ * We need to be clever about the direction of scan, the search
+ * conditions, and the tree ordering. We find the first item (or,
+ * if backwards scan, the last item) in the tree that satisfies the
+ * qualifications in the scan key. On success exit, the page containing
+ * the current index tuple is pinned but not locked, and data about
+ * the matching tuple(s) on the page has been loaded into so->currPos.
+ * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
+ * and if requested, scan->xs_itup points to a copy of the index tuple.
+ *
+ * If there are no matching items in the index, we return false, with no
+ * pins or locks held.
+ *
+ * Note that scan->keyData[], and the so->keyData[] scankey built from it,
+ * are both search-type scankeys (see nbtree/README for more about this).
+ * Within this routine, we build a temporary insertion-type scankey to use
+ * in locating the scan start position.
+ */
+bool
+_bt_first(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel = scan->indexRelation;
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Buffer buf;
+ BTStack stack;
+ OffsetNumber offnum;
+ StrategyNumber strat;
+ bool nextkey;
+ bool goback;
+ BTScanInsertData inskey;
+ ScanKey startKeys[INDEX_MAX_KEYS];
+ ScanKeyData notnullkeys[INDEX_MAX_KEYS];
+ int keysCount = 0;
+ int i;
+ bool status;
+ StrategyNumber strat_total;
+ BTScanPosItem *currItem;
+ BlockNumber blkno;
+
+ Assert(!BTScanPosIsValid(so->currPos));
+
+ pgstat_count_index_scan(rel);
+
+ /*
+ * Examine the scan keys and eliminate any redundant keys; also mark the
+ * keys that must be matched to continue the scan.
+ */
+ _bt_preprocess_keys(scan);
+
+ /*
+ * Quit now if _bt_preprocess_keys() discovered that the scan keys can
+ * never be satisfied (eg, x == 1 AND x > 2).
+ */
+ if (!so->qual_ok)
+ {
+ /* Notify any other workers that we're done with this scan key. */
+ _bt_parallel_done(scan);
+ return false;
+ }
+
+ /*
+ * For parallel scans, get the starting page from shared state. If the
+ * scan has not started, proceed to find out first leaf page in the usual
+ * way while keeping other participating processes waiting. If the scan
+ * has already begun, use the page number from the shared structure.
+ */
+ if (scan->parallel_scan != NULL)
+ {
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ return false;
+ else if (blkno == P_NONE)
+ {
+ _bt_parallel_done(scan);
+ return false;
+ }
+ else if (blkno != InvalidBlockNumber)
+ {
+ if (!_bt_parallel_readpage(scan, blkno, dir))
+ return false;
+ goto readcomplete;
+ }
+ }
+
+ /*----------
+ * Examine the scan keys to discover where we need to start the scan.
+ *
+ * We want to identify the keys that can be used as starting boundaries;
+ * these are =, >, or >= keys for a forward scan or =, <, <= keys for
+ * a backwards scan. We can use keys for multiple attributes so long as
+ * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
+ * a > or < boundary or find an attribute with no boundary (which can be
+ * thought of as the same as "> -infinity"), we can't use keys for any
+ * attributes to its right, because it would break our simplistic notion
+ * of what initial positioning strategy to use.
+ *
+ * When the scan keys include cross-type operators, _bt_preprocess_keys
+ * may not be able to eliminate redundant keys; in such cases we will
+ * arbitrarily pick a usable one for each attribute. This is correct
+ * but possibly not optimal behavior. (For example, with keys like
+ * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
+ * x=5 would be more efficient.) Since the situation only arises given
+ * a poorly-worded query plus an incomplete opfamily, live with it.
+ *
+ * When both equality and inequality keys appear for a single attribute
+ * (again, only possible when cross-type operators appear), we *must*
+ * select one of the equality keys for the starting point, because
+ * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
+ * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
+ * start at x=4, we will fail and stop before reaching x=10. If multiple
+ * equality quals survive preprocessing, however, it doesn't matter which
+ * one we use --- by definition, they are either redundant or
+ * contradictory.
+ *
+ * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
+ * If the index stores nulls at the end of the index we'll be starting
+ * from, and we have no boundary key for the column (which means the key
+ * we deduced NOT NULL from is an inequality key that constrains the other
+ * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
+ * use as a boundary key. If we didn't do this, we might find ourselves
+ * traversing a lot of null entries at the start of the scan.
+ *
+ * In this loop, row-comparison keys are treated the same as keys on their
+ * first (leftmost) columns. We'll add on lower-order columns of the row
+ * comparison below, if possible.
+ *
+ * The selected scan keys (at most one per index column) are remembered by
+ * storing their addresses into the local startKeys[] array.
+ *----------
+ */
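+ /*
+ * As an illustration of the rules above (hypothetical quals): in a
+ * forward scan with "a = 5 AND b > 10 AND c = 20", startKeys[] ends up
+ * holding the keys on a and b and strat_total becomes the > strategy;
+ * the key on c cannot help with initial positioning because b's
+ * boundary is a strict inequality.  With "a >= 5 AND b = 10" both keys
+ * are used and strat_total is the >= strategy chosen for a.
+ */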
+ strat_total = BTEqualStrategyNumber;
+ if (so->numberOfKeys > 0)
+ {
+ AttrNumber curattr;
+ ScanKey chosen;
+ ScanKey impliesNN;
+ ScanKey cur;
+
+ /*
+ * chosen is the so-far-chosen key for the current attribute, if any.
+ * We don't cast the decision in stone until we reach keys for the
+ * next attribute.
+ */
+ curattr = 1;
+ chosen = NULL;
+ /* Also remember any scankey that implies a NOT NULL constraint */
+ impliesNN = NULL;
+
+ /*
+ * Loop iterates from 0 to numberOfKeys inclusive; we use the last
+ * pass to handle after-last-key processing. Actual exit from the
+ * loop is at one of the "break" statements below.
+ */
+ for (cur = so->keyData, i = 0;; cur++, i++)
+ {
+ if (i >= so->numberOfKeys || cur->sk_attno != curattr)
+ {
+ /*
+ * Done looking at keys for curattr. If we didn't find a
+ * usable boundary key, see if we can deduce a NOT NULL key.
+ */
+ if (chosen == NULL && impliesNN != NULL &&
+ ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
+ ScanDirectionIsForward(dir) :
+ ScanDirectionIsBackward(dir)))
+ {
+ /* Yes, so build the key in notnullkeys[keysCount] */
+ chosen = &notnullkeys[keysCount];
+ ScanKeyEntryInitialize(chosen,
+ (SK_SEARCHNOTNULL | SK_ISNULL |
+ (impliesNN->sk_flags &
+ (SK_BT_DESC | SK_BT_NULLS_FIRST))),
+ curattr,
+ ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
+ BTGreaterStrategyNumber :
+ BTLessStrategyNumber),
+ InvalidOid,
+ InvalidOid,
+ InvalidOid,
+ (Datum) 0);
+ }
+
+ /*
+ * If we still didn't find a usable boundary key, quit; else
+ * save the boundary key pointer in startKeys.
+ */
+ if (chosen == NULL)
+ break;
+ startKeys[keysCount++] = chosen;
+
+ /*
+ * Adjust strat_total, and quit if we have stored a > or <
+ * key.
+ */
+ strat = chosen->sk_strategy;
+ if (strat != BTEqualStrategyNumber)
+ {
+ strat_total = strat;
+ if (strat == BTGreaterStrategyNumber ||
+ strat == BTLessStrategyNumber)
+ break;
+ }
+
+ /*
+ * Done if that was the last attribute, or if next key is not
+ * in sequence (implying no boundary key is available for the
+ * next attribute).
+ */
+ if (i >= so->numberOfKeys ||
+ cur->sk_attno != curattr + 1)
+ break;
+
+ /*
+ * Reset for next attr.
+ */
+ curattr = cur->sk_attno;
+ chosen = NULL;
+ impliesNN = NULL;
+ }
+
+ /*
+ * Can we use this key as a starting boundary for this attr?
+ *
+ * If not, does it imply a NOT NULL constraint? (Because
+ * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
+ * *any* inequality key works for that; we need not test.)
+ */
+ switch (cur->sk_strategy)
+ {
+ case BTLessStrategyNumber:
+ case BTLessEqualStrategyNumber:
+ if (chosen == NULL)
+ {
+ if (ScanDirectionIsBackward(dir))
+ chosen = cur;
+ else
+ impliesNN = cur;
+ }
+ break;
+ case BTEqualStrategyNumber:
+ /* override any non-equality choice */
+ chosen = cur;
+ break;
+ case BTGreaterEqualStrategyNumber:
+ case BTGreaterStrategyNumber:
+ if (chosen == NULL)
+ {
+ if (ScanDirectionIsForward(dir))
+ chosen = cur;
+ else
+ impliesNN = cur;
+ }
+ break;
+ }
+ }
+ }
+
+ /*
+ * If we found no usable boundary keys, we have to start from one end of
+ * the tree. Walk down that edge to the first or last key, and scan from
+ * there.
+ */
+ if (keysCount == 0)
+ {
+ bool match;
+
+ match = _bt_endpoint(scan, dir);
+
+ if (!match)
+ {
+ /* No match, so mark (parallel) scan finished */
+ _bt_parallel_done(scan);
+ }
+
+ return match;
+ }
+
+ /*
+ * We want to start the scan somewhere within the index. Set up an
+ * insertion scankey we can use to search for the boundary point we
+ * identified above. The insertion scankey is built using the keys
+ * identified by startKeys[]. (Remaining insertion scankey fields are
+ * initialized after initial-positioning strategy is finalized.)
+ */
+ Assert(keysCount <= INDEX_MAX_KEYS);
+ for (i = 0; i < keysCount; i++)
+ {
+ ScanKey cur = startKeys[i];
+
+ Assert(cur->sk_attno == i + 1);
+
+ if (cur->sk_flags & SK_ROW_HEADER)
+ {
+ /*
+ * Row comparison header: look to the first row member instead.
+ *
+ * The member scankeys are already in insertion format (ie, they
+ * have sk_func = 3-way-comparison function), but we have to watch
+ * out for nulls, which _bt_preprocess_keys didn't check. A null
+ * in the first row member makes the condition unmatchable, just
+ * like qual_ok = false.
+ */
+ ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
+
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ if (subkey->sk_flags & SK_ISNULL)
+ {
+ _bt_parallel_done(scan);
+ return false;
+ }
+ memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
+
+ /*
+ * If the row comparison is the last positioning key we accepted,
+ * try to add additional keys from the lower-order row members.
+ * (If we accepted independent conditions on additional index
+ * columns, we use those instead --- doesn't seem worth trying to
+ * determine which is more restrictive.) Note that this is OK
+ * even if the row comparison is of ">" or "<" type, because the
+ * condition applied to all but the last row member is effectively
+ * ">=" or "<=", and so the extra keys don't break the positioning
+ * scheme. But, by the same token, if we aren't able to use all
+ * the row members, then the part of the row comparison that we
+ * did use has to be treated as just a ">=" or "<=" condition, and
+ * so we'd better adjust strat_total accordingly.
+ */
+ if (i == keysCount - 1)
+ {
+ bool used_all_subkeys = false;
+
+ Assert(!(subkey->sk_flags & SK_ROW_END));
+ for (;;)
+ {
+ subkey++;
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ if (subkey->sk_attno != keysCount + 1)
+ break; /* out-of-sequence, can't use it */
+ if (subkey->sk_strategy != cur->sk_strategy)
+ break; /* wrong direction, can't use it */
+ if (subkey->sk_flags & SK_ISNULL)
+ break; /* can't use null keys */
+ Assert(keysCount < INDEX_MAX_KEYS);
+ memcpy(inskey.scankeys + keysCount, subkey,
+ sizeof(ScanKeyData));
+ keysCount++;
+ if (subkey->sk_flags & SK_ROW_END)
+ {
+ used_all_subkeys = true;
+ break;
+ }
+ }
+ if (!used_all_subkeys)
+ {
+ switch (strat_total)
+ {
+ case BTLessStrategyNumber:
+ strat_total = BTLessEqualStrategyNumber;
+ break;
+ case BTGreaterStrategyNumber:
+ strat_total = BTGreaterEqualStrategyNumber;
+ break;
+ }
+ }
+ break; /* done with outer loop */
+ }
+ }
+ else
+ {
+ /*
+ * Ordinary comparison key. Transform the search-style scan key
+ * to an insertion scan key by replacing the sk_func with the
+ * appropriate btree comparison function.
+ *
+ * If scankey operator is not a cross-type comparison, we can use
+ * the cached comparison function; otherwise gotta look it up in
+ * the catalogs. (That can't lead to infinite recursion, since no
+ * indexscan initiated by syscache lookup will use cross-data-type
+ * operators.)
+ *
+ * We support the convention that sk_subtype == InvalidOid means
+ * the opclass input type; this is a hack to simplify life for
+ * ScanKeyInit().
+ */
+ if (cur->sk_subtype == rel->rd_opcintype[i] ||
+ cur->sk_subtype == InvalidOid)
+ {
+ FmgrInfo *procinfo;
+
+ procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
+ ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
+ cur->sk_flags,
+ cur->sk_attno,
+ InvalidStrategy,
+ cur->sk_subtype,
+ cur->sk_collation,
+ procinfo,
+ cur->sk_argument);
+ }
+ else
+ {
+ RegProcedure cmp_proc;
+
+ cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
+ rel->rd_opcintype[i],
+ cur->sk_subtype,
+ BTORDER_PROC);
+ if (!RegProcedureIsValid(cmp_proc))
+ elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
+ BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
+ cur->sk_attno, RelationGetRelationName(rel));
+ ScanKeyEntryInitialize(inskey.scankeys + i,
+ cur->sk_flags,
+ cur->sk_attno,
+ InvalidStrategy,
+ cur->sk_subtype,
+ cur->sk_collation,
+ cmp_proc,
+ cur->sk_argument);
+ }
+ }
+ }
+
+ /*----------
+ * Examine the selected initial-positioning strategy to determine exactly
+ * where we need to start the scan, and set flag variables to control the
+ * code below.
+ *
+ * If nextkey = false, _bt_search and _bt_binsrch will locate the first
+ * item >= scan key. If nextkey = true, they will locate the first
+ * item > scan key.
+ *
+ * If goback = true, we will then step back one item, while if
+ * goback = false, we will start the scan on the located item.
+ *----------
+ */
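+	/*
+	 * For example (hypothetical quals): a forward scan on "x >= 10" takes
+	 * the BTGreaterEqualStrategyNumber case below (nextkey = false,
+	 * goback = false), so we start on the first item >= 10.  A backward
+	 * scan on "x < 10" takes the BTLessStrategyNumber case (nextkey = false,
+	 * goback = true): we locate the first item >= 10 and then step back one
+	 * to land on the last item < 10.
+	 */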
+ switch (strat_total)
+ {
+ case BTLessStrategyNumber:
+
+ /*
+ * Find first item >= scankey, then back up one to arrive at last
+ * item < scankey. (Note: this positioning strategy is only used
+ * for a backward scan, so that is always the correct starting
+ * position.)
+ */
+ nextkey = false;
+ goback = true;
+ break;
+
+ case BTLessEqualStrategyNumber:
+
+ /*
+ * Find first item > scankey, then back up one to arrive at last
+ * item <= scankey. (Note: this positioning strategy is only used
+ * for a backward scan, so that is always the correct starting
+ * position.)
+ */
+ nextkey = true;
+ goback = true;
+ break;
+
+ case BTEqualStrategyNumber:
+
+ /*
+ * If a backward scan was specified, need to start with last equal
+ * item not first one.
+ */
+ if (ScanDirectionIsBackward(dir))
+ {
+ /*
+ * This is the same as the <= strategy. We will check at the
+ * end whether the found item is actually =.
+ */
+ nextkey = true;
+ goback = true;
+ }
+ else
+ {
+ /*
+ * This is the same as the >= strategy. We will check at the
+ * end whether the found item is actually =.
+ */
+ nextkey = false;
+ goback = false;
+ }
+ break;
+
+ case BTGreaterEqualStrategyNumber:
+
+ /*
+ * Find first item >= scankey. (This is only used for forward
+ * scans.)
+ */
+ nextkey = false;
+ goback = false;
+ break;
+
+ case BTGreaterStrategyNumber:
+
+ /*
+ * Find first item > scankey. (This is only used for forward
+ * scans.)
+ */
+ nextkey = true;
+ goback = false;
+ break;
+
+ default:
+ /* can't get here, but keep compiler quiet */
+ elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
+ return false;
+ }
+
+ /* Initialize remaining insertion scan key fields */
+ _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
+ inskey.anynullkeys = false; /* unused */
+ inskey.nextkey = nextkey;
+ inskey.pivotsearch = false;
+ inskey.scantid = NULL;
+ inskey.keysz = keysCount;
+
+ /*
+ * Use the manufactured insertion scan key to descend the tree and
+ * position ourselves on the target leaf page.
+ */
+ stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
+
+ /* don't need to keep the stack around... */
+ _bt_freestack(stack);
+
+ if (!BufferIsValid(buf))
+ {
+ /*
+ * We only get here if the index is completely empty. Lock relation
+ * because nothing finer to lock exists.
+ */
+ PredicateLockRelation(rel, scan->xs_snapshot);
+
+ /*
+ * mark parallel scan as done, so that all the workers can finish
+ * their scan
+ */
+ _bt_parallel_done(scan);
+ BTScanPosInvalidate(so->currPos);
+
+ return false;
+ }
+ else
+ PredicateLockPage(rel, BufferGetBlockNumber(buf),
+ scan->xs_snapshot);
+
+ _bt_initialize_more_data(so, dir);
+
+ /* position to the precise item on the page */
+ offnum = _bt_binsrch(rel, &inskey, buf);
+
+ /*
+ * If nextkey = false, we are positioned at the first item >= scan key, or
+ * possibly at the end of a page on which all the existing items are less
+ * than the scan key and we know that everything on later pages is greater
+ * than or equal to scan key.
+ *
+ * If nextkey = true, we are positioned at the first item > scan key, or
+ * possibly at the end of a page on which all the existing items are less
+ * than or equal to the scan key and we know that everything on later
+ * pages is greater than scan key.
+ *
+ * The actually desired starting point is either this item or the prior
+ * one, or in the end-of-page case it's the first item on the next page or
+ * the last item on this page. Adjust the starting offset if needed. (If
+ * this results in an offset before the first item or after the last one,
+ * _bt_readpage will report no items found, and then we'll step to the
+ * next page as needed.)
+ */
+ if (goback)
+ offnum = OffsetNumberPrev(offnum);
+
+ /* remember which buffer we have pinned, if any */
+ Assert(!BTScanPosIsValid(so->currPos));
+ so->currPos.buf = buf;
+
+ /*
+ * Now load data from the first page of the scan.
+ */
+ if (!_bt_readpage(scan, dir, offnum))
+ {
+ /*
+ * There's no actually-matching data on this page. Try to advance to
+ * the next page. Return false if there's no matching data at all.
+ */
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+ if (!_bt_steppage(scan, dir))
+ return false;
+ }
+ else
+ {
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+ }
+
+readcomplete:
+ /* OK, itemIndex says what to return */
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_heaptid = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+ return true;
+}
+
+/*
+ * _bt_next() -- Get the next item in a scan.
+ *
+ * On entry, so->currPos describes the current page, which may be pinned
+ * but is not locked, and so->currPos.itemIndex identifies which item was
+ * previously returned.
+ *
+ * On successful exit, scan->xs_heaptid is set to the TID of the next
+ * heap tuple, and if requested, scan->xs_itup points to a copy of
+ * the index tuple. so->currPos is updated as needed.
+ *
+ * On failure exit (no more tuples), we release pin and set
+ * so->currPos.buf to InvalidBuffer.
+ */
+bool
+_bt_next(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTScanPosItem *currItem;
+
+ /*
+ * Advance to next tuple on current page; or if there's no more, try to
+ * step to the next page with data.
+ */
+ if (ScanDirectionIsForward(dir))
+ {
+ if (++so->currPos.itemIndex > so->currPos.lastItem)
+ {
+ if (!_bt_steppage(scan, dir))
+ return false;
+ }
+ }
+ else
+ {
+ if (--so->currPos.itemIndex < so->currPos.firstItem)
+ {
+ if (!_bt_steppage(scan, dir))
+ return false;
+ }
+ }
+
+ /* OK, itemIndex says what to return */
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_heaptid = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+ return true;
+}
+
+/*
+ * _bt_readpage() -- Load data from current index page into so->currPos
+ *
+ * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
+ * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
+ * they are updated as appropriate. All other fields of so->currPos are
+ * initialized from scratch here.
+ *
+ * We scan the current page starting at offnum and moving in the indicated
+ * direction. All items matching the scan keys are loaded into currPos.items.
+ * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
+ * that there can be no more matching tuples in the current scan direction.
+ *
+ * In the case of a parallel scan, caller must have called _bt_parallel_seize
+ * prior to calling this function; this function will invoke
+ * _bt_parallel_release before returning.
+ *
+ * Returns true if any matching items were found on the page, false if none.
+ */
+static bool
+_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber minoff;
+ OffsetNumber maxoff;
+ int itemIndex;
+ bool continuescan;
+ int indnatts;
+
+ /*
+	 * We must have the buffer pinned and locked, but the usual
+	 * BTScanPosIsPinned macro can't be used here, because this function is
+	 * what makes so->currPos consistent in the first place.
+ */
+ Assert(BufferIsValid(so->currPos.buf));
+
+ page = BufferGetPage(so->currPos.buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	/* allow the next page to be processed by a parallel worker */
+ if (scan->parallel_scan)
+ {
+ if (ScanDirectionIsForward(dir))
+ _bt_parallel_release(scan, opaque->btpo_next);
+ else
+ _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
+ }
+
+ continuescan = true; /* default assumption */
+ indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * We note the buffer's block number so that we can release the pin later.
+ * This allows us to re-read the buffer if it is needed again for hinting.
+ */
+ so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
+
+ /*
+	 * We save the LSN of the page as we read it, so that we know whether it
+	 * is safe to apply LP_DEAD hints to the page later.  This allows us to
+	 * drop the pin for MVCC scans, which allows vacuum to avoid blocking.
+ */
+ so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
+
+ /*
+ * we must save the page's right-link while scanning it; this tells us
+ * where to step right to after we're done with these items. There is no
+ * corresponding need for the left-link, since splits always go right.
+ */
+ so->currPos.nextPage = opaque->btpo_next;
+
+ /* initialize tuple workspace to empty */
+ so->currPos.nextTupleOffset = 0;
+
+ /*
+ * Now that the current page has been made consistent, the macro should be
+ * good.
+ */
+ Assert(BTScanPosIsPinned(so->currPos));
+
+ if (ScanDirectionIsForward(dir))
+ {
+ /* load items[] in ascending order */
+ itemIndex = 0;
+
+ offnum = Max(offnum, minoff);
+
+ while (offnum <= maxoff)
+ {
+ ItemId iid = PageGetItemId(page, offnum);
+ IndexTuple itup;
+
+ /*
+ * If the scan specifies not to return killed tuples, then we
+ * treat a killed tuple as not passing the qual
+ */
+ if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+ {
+ offnum = OffsetNumberNext(offnum);
+ continue;
+ }
+
+ itup = (IndexTuple) PageGetItem(page, iid);
+
+ if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
+ {
+ /* tuple passes all scan key conditions */
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Remember it */
+ _bt_saveitem(so, itemIndex, offnum, itup);
+ itemIndex++;
+ }
+ else
+ {
+ int tupleOffset;
+
+ /*
+ * Set up state to return posting list, and remember first
+ * TID
+ */
+ tupleOffset =
+ _bt_setuppostingitems(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, 0),
+ itup);
+ itemIndex++;
+ /* Remember additional TIDs */
+ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ _bt_savepostingitem(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, i),
+ tupleOffset);
+ itemIndex++;
+ }
+ }
+ }
+ /* When !continuescan, there can't be any more matches, so stop */
+ if (!continuescan)
+ break;
+
+ offnum = OffsetNumberNext(offnum);
+ }
+
+ /*
+		 * We don't need to visit the page to the right when the high key
+ * indicates that no more matches will be found there.
+ *
+ * Checking the high key like this works out more often than you might
+ * think. Leaf page splits pick a split point between the two most
+ * dissimilar tuples (this is weighed against the need to evenly share
+ * free space). Leaf pages with high key attribute values that can
+ * only appear on non-pivot tuples on the right sibling page are
+ * common.
+ */
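+		/*
+		 * For example (hypothetical values): with a qual of "x <= 30" and a
+		 * high key of x = 40, _bt_checkkeys clears continuescan and we skip
+		 * the right sibling entirely; with a high key of x = 25 the qual
+		 * still passes and we must continue to the right.
+		 */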
+ if (continuescan && !P_RIGHTMOST(opaque))
+ {
+ ItemId iid = PageGetItemId(page, P_HIKEY);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
+ int truncatt;
+
+ truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
+ _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
+ }
+
+ if (!continuescan)
+ so->currPos.moreRight = false;
+
+ Assert(itemIndex <= MaxTIDsPerBTreePage);
+ so->currPos.firstItem = 0;
+ so->currPos.lastItem = itemIndex - 1;
+ so->currPos.itemIndex = 0;
+ }
+ else
+ {
+ /* load items[] in descending order */
+ itemIndex = MaxTIDsPerBTreePage;
+
+ offnum = Min(offnum, maxoff);
+
+ while (offnum >= minoff)
+ {
+ ItemId iid = PageGetItemId(page, offnum);
+ IndexTuple itup;
+ bool tuple_alive;
+ bool passes_quals;
+
+ /*
+ * If the scan specifies not to return killed tuples, then we
+ * treat a killed tuple as not passing the qual. Most of the
+ * time, it's a win to not bother examining the tuple's index
+ * keys, but just skip to the next tuple (previous, actually,
+ * since we're scanning backwards). However, if this is the first
+ * tuple on the page, we do check the index keys, to prevent
+ * uselessly advancing to the page to the left. This is similar
+ * to the high key optimization used by forward scans.
+ */
+ if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+ {
+ Assert(offnum >= P_FIRSTDATAKEY(opaque));
+ if (offnum > P_FIRSTDATAKEY(opaque))
+ {
+ offnum = OffsetNumberPrev(offnum);
+ continue;
+ }
+
+ tuple_alive = false;
+ }
+ else
+ tuple_alive = true;
+
+ itup = (IndexTuple) PageGetItem(page, iid);
+
+ passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
+ &continuescan);
+ if (passes_quals && tuple_alive)
+ {
+ /* tuple passes all scan key conditions */
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Remember it */
+ itemIndex--;
+ _bt_saveitem(so, itemIndex, offnum, itup);
+ }
+ else
+ {
+ int tupleOffset;
+
+ /*
+ * Set up state to return posting list, and remember first
+ * TID.
+ *
+ * Note that we deliberately save/return items from
+ * posting lists in ascending heap TID order for backwards
+ * scans. This allows _bt_killitems() to make a
+ * consistent assumption about the order of items
+ * associated with the same posting list tuple.
+ */
+ itemIndex--;
+ tupleOffset =
+ _bt_setuppostingitems(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, 0),
+ itup);
+ /* Remember additional TIDs */
+ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ itemIndex--;
+ _bt_savepostingitem(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, i),
+ tupleOffset);
+ }
+ }
+ }
+ if (!continuescan)
+ {
+ /* there can't be any more matches, so stop */
+ so->currPos.moreLeft = false;
+ break;
+ }
+
+ offnum = OffsetNumberPrev(offnum);
+ }
+
+ Assert(itemIndex >= 0);
+ so->currPos.firstItem = itemIndex;
+ so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
+ so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
+ }
+
+ return (so->currPos.firstItem <= so->currPos.lastItem);
+}
+
+/* Save an index item into so->currPos.items[itemIndex] */
+static void
+_bt_saveitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, IndexTuple itup)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
+
+ currItem->heapTid = itup->t_tid;
+ currItem->indexOffset = offnum;
+ if (so->currTuples)
+ {
+ Size itupsz = IndexTupleSize(itup);
+
+ currItem->tupleOffset = so->currPos.nextTupleOffset;
+ memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
+ so->currPos.nextTupleOffset += MAXALIGN(itupsz);
+ }
+}
+
+/*
+ * Set up state to save TIDs/items from a single posting list tuple.
+ *
+ * Saves an index item into so->currPos.items[itemIndex] for the TID that is
+ * returned to the scan first.  Second and subsequent TIDs from the posting
+ * list should be saved by calling _bt_savepostingitem().
+ *
+ * Returns the offset into tuple storage space at which the base tuple is
+ * stored, when tuple storage is in use.
+ */
+static int
+_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
+ ItemPointer heapTid, IndexTuple itup)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ Assert(BTreeTupleIsPosting(itup));
+
+ currItem->heapTid = *heapTid;
+ currItem->indexOffset = offnum;
+ if (so->currTuples)
+ {
+ /* Save base IndexTuple (truncate posting list) */
+ IndexTuple base;
+ Size itupsz = BTreeTupleGetPostingOffset(itup);
+
+ itupsz = MAXALIGN(itupsz);
+ currItem->tupleOffset = so->currPos.nextTupleOffset;
+ base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
+ memcpy(base, itup, itupsz);
+ /* Defensively reduce work area index tuple header size */
+ base->t_info &= ~INDEX_SIZE_MASK;
+ base->t_info |= itupsz;
+ so->currPos.nextTupleOffset += itupsz;
+
+ return currItem->tupleOffset;
+ }
+
+ return 0;
+}
+
+/*
+ * Save an index item into so->currPos.items[itemIndex] for the current
+ * posting list tuple.
+ *
+ * Assumes that _bt_setuppostingitems() has already been called for the
+ * current posting list tuple.  Caller passes its return value as tupleOffset.
+ */
+static inline void
+_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
+ ItemPointer heapTid, int tupleOffset)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ currItem->heapTid = *heapTid;
+ currItem->indexOffset = offnum;
+
+ /*
+ * Have index-only scans return the same base IndexTuple for every TID
+ * that originates from the same posting list
+ */
+ if (so->currTuples)
+ currItem->tupleOffset = tupleOffset;
+}
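+
+/*
+ * Sketch of how the two helpers above are used together (this mirrors the
+ * posting list handling in _bt_readpage; offnum/itup stand for whatever
+ * posting list tuple is currently being examined):
+ *
+ *		tupleOffset = _bt_setuppostingitems(so, itemIndex++, offnum,
+ *											BTreeTupleGetPostingN(itup, 0),
+ *											itup);
+ *		for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ *			_bt_savepostingitem(so, itemIndex++, offnum,
+ *								BTreeTupleGetPostingN(itup, i), tupleOffset);
+ */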
+
+/*
+ * _bt_steppage() -- Step to next page containing valid data for scan
+ *
+ * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
+ * If it is pinned, we'll drop the pin before moving to the next page.
+ *
+ * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
+ * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
+ * to InvalidBuffer. We return true to indicate success.
+ */
+static bool
+_bt_steppage(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BlockNumber blkno = InvalidBlockNumber;
+ bool status;
+
+ Assert(BTScanPosIsValid(so->currPos));
+
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _bt_killitems(scan);
+
+ /*
+ * Before we modify currPos, make a copy of the page data if there was a
+ * mark position that needs it.
+ */
+ if (so->markItemIndex >= 0)
+ {
+ /* bump pin on current buffer for assignment to mark buffer */
+ if (BTScanPosIsPinned(so->currPos))
+ IncrBufferRefCount(so->currPos.buf);
+ memcpy(&so->markPos, &so->currPos,
+ offsetof(BTScanPosData, items[1]) +
+ so->currPos.lastItem * sizeof(BTScanPosItem));
+ if (so->markTuples)
+ memcpy(so->markTuples, so->currTuples,
+ so->currPos.nextTupleOffset);
+ so->markPos.itemIndex = so->markItemIndex;
+ so->markItemIndex = -1;
+ }
+
+ if (ScanDirectionIsForward(dir))
+ {
+ /* Walk right to the next page with data */
+ if (scan->parallel_scan != NULL)
+ {
+ /*
+ * Seize the scan to get the next block number; if the scan has
+ * ended already, bail out.
+ */
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ {
+ /* release the previous buffer, if pinned */
+ BTScanPosUnpinIfPinned(so->currPos);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ }
+ else
+ {
+ /* Not parallel, so use the previously-saved nextPage link. */
+ blkno = so->currPos.nextPage;
+ }
+
+ /* Remember we left a page with data */
+ so->currPos.moreLeft = true;
+
+ /* release the previous buffer, if pinned */
+ BTScanPosUnpinIfPinned(so->currPos);
+ }
+ else
+ {
+ /* Remember we left a page with data */
+ so->currPos.moreRight = true;
+
+ if (scan->parallel_scan != NULL)
+ {
+ /*
+ * Seize the scan to get the current block number; if the scan has
+ * ended already, bail out.
+ */
+ status = _bt_parallel_seize(scan, &blkno);
+ BTScanPosUnpinIfPinned(so->currPos);
+ if (!status)
+ {
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ }
+ else
+ {
+ /* Not parallel, so just use our own notion of the current page */
+ blkno = so->currPos.currPage;
+ }
+ }
+
+ if (!_bt_readnextpage(scan, blkno, dir))
+ return false;
+
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+
+ return true;
+}
+
+/*
+ * _bt_readnextpage() -- Read next page containing valid data for scan
+ *
+ * On success exit, so->currPos is updated to contain data from the next
+ * interesting page.  On success, the caller is responsible for releasing the
+ * lock and pin on the buffer.  We return true to indicate success.
+ *
+ * If there are no more matching records in the given direction, we drop all
+ * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
+ */
+static bool
+_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Relation rel;
+ Page page;
+ BTPageOpaque opaque;
+ bool status;
+
+ rel = scan->indexRelation;
+
+ if (ScanDirectionIsForward(dir))
+ {
+ for (;;)
+ {
+ /*
+ * if we're at end of scan, give up and mark parallel scan as
+ * done, so that all the workers can finish their scan
+ */
+ if (blkno == P_NONE || !so->currPos.moreRight)
+ {
+ _bt_parallel_done(scan);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ /* check for interrupts while we're not holding any buffer lock */
+ CHECK_FOR_INTERRUPTS();
+ /* step right one page */
+ so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(so->currPos.buf);
+ TestForOldSnapshot(scan->xs_snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ /* check for deleted page */
+ if (!P_IGNORE(opaque))
+ {
+ PredicateLockPage(rel, blkno, scan->xs_snapshot);
+ /* see if there are any matches on this page */
+ /* note that this will clear moreRight if we can stop */
+ if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
+ break;
+ }
+ else if (scan->parallel_scan != NULL)
+ {
+				/* allow the next page to be processed by a parallel worker */
+ _bt_parallel_release(scan, opaque->btpo_next);
+ }
+
+ /* nope, keep going */
+ if (scan->parallel_scan != NULL)
+ {
+ _bt_relbuf(rel, so->currPos.buf);
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ {
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ }
+ else
+ {
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, so->currPos.buf);
+ }
+ }
+ }
+ else
+ {
+ /*
+ * Should only happen in parallel cases, when some other backend
+ * advanced the scan.
+ */
+ if (so->currPos.currPage != blkno)
+ {
+ BTScanPosUnpinIfPinned(so->currPos);
+ so->currPos.currPage = blkno;
+ }
+
+ /*
+ * Walk left to the next page with data. This is much more complex
+ * than the walk-right case because of the possibility that the page
+ * to our left splits while we are in flight to it, plus the
+ * possibility that the page we were on gets deleted after we leave
+ * it. See nbtree/README for details.
+ *
+ * It might be possible to rearrange this code to have less overhead
+ * in pinning and locking, but that would require capturing the left
+ * pointer when the page is initially read, and using it here, along
+ * with big changes to _bt_walk_left() and the code below. It is not
+ * clear whether this would be a win, since if the page immediately to
+ * the left splits after we read this page and before we step left, we
+ * would need to visit more pages than with the current code.
+ *
+ * Note that if we change the code so that we drop the pin for a scan
+ * which uses a non-MVCC snapshot, we will need to modify the code for
+ * walking left, to allow for the possibility that a referenced page
+ * has been deleted. As long as the buffer is pinned or the snapshot
+ * is MVCC the page cannot move past the half-dead state to fully
+ * deleted.
+ */
+ if (BTScanPosIsPinned(so->currPos))
+ _bt_lockbuf(rel, so->currPos.buf, BT_READ);
+ else
+ so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
+
+ for (;;)
+ {
+ /* Done if we know there are no matching keys to the left */
+ if (!so->currPos.moreLeft)
+ {
+ _bt_relbuf(rel, so->currPos.buf);
+ _bt_parallel_done(scan);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+
+ /* Step to next physical page */
+ so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
+ scan->xs_snapshot);
+
+ /* if we're physically at end of index, return failure */
+ if (so->currPos.buf == InvalidBuffer)
+ {
+ _bt_parallel_done(scan);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+
+ /*
+ * Okay, we managed to move left to a non-deleted page. Done if
+ * it's not half-dead and contains matching tuples. Else loop back
+ * and do it all again.
+ */
+ page = BufferGetPage(so->currPos.buf);
+ TestForOldSnapshot(scan->xs_snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_IGNORE(opaque))
+ {
+ PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
+ /* see if there are any matches on this page */
+ /* note that this will clear moreLeft if we can stop */
+ if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
+ break;
+ }
+ else if (scan->parallel_scan != NULL)
+ {
+				/* allow the next page to be processed by a parallel worker */
+ _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
+ }
+
+ /*
+ * For parallel scans, get the last page scanned as it is quite
+ * possible that by the time we try to seize the scan, some other
+ * worker has already advanced the scan to a different page. We
+ * must continue based on the latest page scanned by any worker.
+ */
+ if (scan->parallel_scan != NULL)
+ {
+ _bt_relbuf(rel, so->currPos.buf);
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ {
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+ }
+ }
+ }
+
+ return true;
+}
+
+/*
+ * _bt_parallel_readpage() -- Read current page containing valid data for scan
+ *
+ * On success, we release the lock, and possibly the pin, on the buffer, and
+ * return true to indicate success.
+ */
+static bool
+_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+ _bt_initialize_more_data(so, dir);
+
+ if (!_bt_readnextpage(scan, blkno, dir))
+ return false;
+
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+
+ return true;
+}
+
+/*
+ * _bt_walk_left() -- step left one page, if possible
+ *
+ * The given buffer must be pinned and read-locked. This will be dropped
+ * before stepping left. On return, we have pin and read lock on the
+ * returned page, instead.
+ *
+ * Returns InvalidBuffer if there is no page to the left (no lock is held
+ * in that case).
+ *
+ * When working on a non-leaf level, it is possible for the returned page
+ * to be half-dead; the caller should check that condition and step left
+ * again if it's important.
+ */
+static Buffer
+_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
+{
+ Page page;
+ BTPageOpaque opaque;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ for (;;)
+ {
+ BlockNumber obknum;
+ BlockNumber lblkno;
+ BlockNumber blkno;
+ int tries;
+
+ /* if we're at end of tree, release buf and return failure */
+ if (P_LEFTMOST(opaque))
+ {
+ _bt_relbuf(rel, buf);
+ break;
+ }
+ /* remember original page we are stepping left from */
+ obknum = BufferGetBlockNumber(buf);
+ /* step left */
+ blkno = lblkno = opaque->btpo_prev;
+ _bt_relbuf(rel, buf);
+ /* check for interrupts while we're not holding any buffer lock */
+ CHECK_FOR_INTERRUPTS();
+ buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * If this isn't the page we want, walk right till we find what we
+ * want --- but go no more than four hops (an arbitrary limit). If we
+ * don't find the correct page by then, the most likely bet is that
+ * the original page got deleted and isn't in the sibling chain at all
+ * anymore, not that its left sibling got split more than four times.
+ *
+ * Note that it is correct to test P_ISDELETED not P_IGNORE here,
+ * because half-dead pages are still in the sibling chain. Caller
+ * must reject half-dead pages if wanted.
+ */
+ tries = 0;
+ for (;;)
+ {
+ if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
+ {
+ /* Found desired page, return it */
+ return buf;
+ }
+ if (P_RIGHTMOST(opaque) || ++tries > 4)
+ break;
+ blkno = opaque->btpo_next;
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+
+ /* Return to the original page to see what's up */
+ buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_ISDELETED(opaque))
+ {
+ /*
+ * It was deleted. Move right to first nondeleted page (there
+ * must be one); that is the page that has acquired the deleted
+ * one's keyspace, so stepping left from it will take us where we
+ * want to be.
+ */
+ for (;;)
+ {
+ if (P_RIGHTMOST(opaque))
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+ blkno = opaque->btpo_next;
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_ISDELETED(opaque))
+ break;
+ }
+
+ /*
+ * Now return to top of loop, resetting obknum to point to this
+ * nondeleted page, and try again.
+ */
+ }
+ else
+ {
+ /*
+ * It wasn't deleted; the explanation had better be that the page
+ * to the left got split or deleted. Without this check, we'd go
+ * into an infinite loop if there's anything wrong.
+ */
+ if (opaque->btpo_prev == lblkno)
+ elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
+ obknum, RelationGetRelationName(rel));
+ /* Okay to try again with new lblkno value */
+ }
+ }
+
+ return InvalidBuffer;
+}
+
+/*
+ * _bt_get_endpoint() -- Find the first or last page on a given tree level
+ *
+ * If the index is empty, we will return InvalidBuffer; any other failure
+ * condition causes ereport(). We will not return a dead page.
+ *
+ * The returned buffer is pinned and read-locked.
+ */
+Buffer
+_bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
+ Snapshot snapshot)
+{
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber offnum;
+ BlockNumber blkno;
+ IndexTuple itup;
+
+ /*
+ * If we are looking for a leaf page, okay to descend from fast root;
+ * otherwise better descend from true root. (There is no point in being
+ * smarter about intermediate levels.)
+ */
+ if (level == 0)
+ buf = _bt_getroot(rel, BT_READ);
+ else
+ buf = _bt_gettrueroot(rel);
+
+ if (!BufferIsValid(buf))
+ return InvalidBuffer;
+
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ for (;;)
+ {
+ /*
+ * If we landed on a deleted page, step right to find a live page
+ * (there must be one). Also, if we want the rightmost page, step
+ * right if needed to get to it (this could happen if the page split
+ * since we obtained a pointer to it).
+ */
+ while (P_IGNORE(opaque) ||
+ (rightmost && !P_RIGHTMOST(opaque)))
+ {
+ blkno = opaque->btpo_next;
+ if (blkno == P_NONE)
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+
+ /* Done? */
+ if (opaque->btpo_level == level)
+ break;
+ if (opaque->btpo_level < level)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("btree level %u not found in index \"%s\"",
+ level, RelationGetRelationName(rel))));
+
+ /* Descend to leftmost or rightmost child page */
+ if (rightmost)
+ offnum = PageGetMaxOffsetNumber(page);
+ else
+ offnum = P_FIRSTDATAKEY(opaque);
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ blkno = BTreeTupleGetDownLink(itup);
+
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+
+ return buf;
+}
+
+/*
+ * _bt_endpoint() -- Find the first or last page in the index, and scan
+ * from there to the first key satisfying all the quals.
+ *
+ * This is used by _bt_first() to set up a scan when we've determined
+ * that the scan must start at the beginning or end of the index (for
+ * a forward or backward scan respectively). Exit conditions are the
+ * same as for _bt_first().
+ */
+static bool
+_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel = scan->indexRelation;
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber start;
+ BTScanPosItem *currItem;
+
+ /*
+ * Scan down to the leftmost or rightmost leaf page. This is a simplified
+ * version of _bt_search(). We don't maintain a stack since we know we
+ * won't need it.
+ */
+ buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
+
+ if (!BufferIsValid(buf))
+ {
+ /*
+ * Empty index. Lock the whole relation, as nothing finer to lock
+ * exists.
+ */
+ PredicateLockRelation(rel, scan->xs_snapshot);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+
+ PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(P_ISLEAF(opaque));
+
+ if (ScanDirectionIsForward(dir))
+ {
+ /* There could be dead pages to the left, so not this: */
+ /* Assert(P_LEFTMOST(opaque)); */
+
+ start = P_FIRSTDATAKEY(opaque);
+ }
+ else if (ScanDirectionIsBackward(dir))
+ {
+ Assert(P_RIGHTMOST(opaque));
+
+ start = PageGetMaxOffsetNumber(page);
+ }
+ else
+ {
+ elog(ERROR, "invalid scan direction: %d", (int) dir);
+ start = 0; /* keep compiler quiet */
+ }
+
+ /* remember which buffer we have pinned */
+ so->currPos.buf = buf;
+
+ _bt_initialize_more_data(so, dir);
+
+ /*
+ * Now load data from the first page of the scan.
+ */
+ if (!_bt_readpage(scan, dir, start))
+ {
+ /*
+ * There's no actually-matching data on this page. Try to advance to
+ * the next page. Return false if there's no matching data at all.
+ */
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+ if (!_bt_steppage(scan, dir))
+ return false;
+ }
+ else
+ {
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+ }
+
+ /* OK, itemIndex says what to return */
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_heaptid = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+ return true;
+}
+
+/*
+ * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
+ * for scan direction
+ */
+static inline void
+_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
+{
+ /* initialize moreLeft/moreRight appropriately for scan direction */
+ if (ScanDirectionIsForward(dir))
+ {
+ so->currPos.moreLeft = false;
+ so->currPos.moreRight = true;
+ }
+ else
+ {
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = false;
+ }
+ so->numKilled = 0; /* just paranoia */
+ so->markItemIndex = -1; /* ditto */
+}
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
new file mode 100644
index 0000000..78f78e7
--- /dev/null
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -0,0 +1,2016 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtsort.c
+ * Build a btree from sorted input by loading leaf pages sequentially.
+ *
+ * NOTES
+ *
+ * We use tuplesort.c to sort the given index tuples into order.
+ * Then we scan the index tuples in order and build the btree pages
+ * for each level. We load source tuples into leaf-level pages.
+ * Whenever we fill a page at one level, we add a link to it to its
+ * parent level (starting a new parent level if necessary). When
+ * done, we write out each final page on each level, adding it to
+ * its parent level. When we have only one page on a level, it must be
+ * the root -- it can be attached to the btree metapage and we are done.
+ *
+ * It is not wise to pack the pages entirely full, since then *any*
+ * insertion would cause a split (and not only of the leaf page; the need
+ * for a split would cascade right up the tree). The steady-state load
+ * factor for btrees is usually estimated at 70%. We choose to pack leaf
+ * pages to the user-controllable fill factor (default 90%) while upper pages
+ * are always packed to 70%. This gives us reasonable density (there aren't
+ * many upper pages if the keys are reasonable-size) without risking a lot of
+ * cascading splits during early insertions.
+ *
+ * Formerly the index pages being built were kept in shared buffers, but
+ * that is of no value (since other backends have no interest in them yet)
+ * and it created locking problems for CHECKPOINT, because the upper-level
+ * pages were held exclusive-locked for long periods. Now we just build
+ * the pages in local memory and smgrwrite or smgrextend them as we finish
+ * them. They will need to be re-read into shared buffers on first use after
+ * the build finishes.
+ *
+ * This code isn't concerned about the FSM at all. The caller is responsible
+ * for initializing that.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtsort.c
+ *
+ *-------------------------------------------------------------------------
+ */
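+
+/*
+ * Rough illustration of the level-by-level load (hypothetical round numbers,
+ * not measurements): loading 1,000,000 sorted tuples at roughly 200 tuples
+ * per leaf page yields about 5,000 leaf pages; at roughly 200 downlinks per
+ * internal page those need about 25 level-1 pages, which in turn fit on a
+ * single level-2 page that becomes the root.
+ */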
+
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/parallel.h"
+#include "access/relscan.h"
+#include "access/table.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "access/xloginsert.h"
+#include "catalog/index.h"
+#include "commands/progress.h"
+#include "executor/instrument.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/smgr.h"
+#include "tcop/tcopprot.h" /* pgrminclude ignore */
+#include "utils/rel.h"
+#include "utils/sortsupport.h"
+#include "utils/tuplesort.h"
+
+
+/* Magic numbers for parallel state sharing */
+#define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
+#define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
+#define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
+#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
+#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005)
+#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006)
+
+/*
+ * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
+ * parallel index builds. This may be useful as a debugging aid.
+#undef DISABLE_LEADER_PARTICIPATION
+ */
+
+/*
+ * Status record for spooling/sorting phase. (Note we may have two of
+ * these due to the special requirements for uniqueness-checking with
+ * dead tuples.)
+ */
+typedef struct BTSpool
+{
+ Tuplesortstate *sortstate; /* state data for tuplesort.c */
+ Relation heap;
+ Relation index;
+ bool isunique;
+} BTSpool;
+
+/*
+ * Status for index builds performed in parallel. This is allocated in a
+ * dynamic shared memory segment. Note that there is a separate tuplesort TOC
+ * entry, private to tuplesort.c but allocated by this module on its behalf.
+ */
+typedef struct BTShared
+{
+ /*
+ * These fields are not modified during the sort. They primarily exist
+ * for the benefit of worker processes that need to create BTSpool state
+ * corresponding to that used by the leader.
+ */
+ Oid heaprelid;
+ Oid indexrelid;
+ bool isunique;
+ bool isconcurrent;
+ int scantuplesortstates;
+
+ /*
+ * workersdonecv is used to monitor the progress of workers. All parallel
+ * participants must indicate that they are done before leader can use
+ * mutable state that workers maintain during scan (and before leader can
+ * proceed to tuplesort_performsort()).
+ */
+ ConditionVariable workersdonecv;
+
+ /*
+	 * mutex protects the mutable state that follows (the fields that workers
+	 * update and that the leader reads back at the end of the parallel scan).
+ *
+ * These fields contain status information of interest to B-Tree index
+ * builds that must work just the same when an index is built in parallel.
+ */
+ slock_t mutex;
+
+ /*
+ * Mutable state that is maintained by workers, and reported back to
+ * leader at end of parallel scan.
+ *
+ * nparticipantsdone is number of worker processes finished.
+ *
+ * reltuples is the total number of input heap tuples.
+ *
+ * havedead indicates if RECENTLY_DEAD tuples were encountered during
+ * build.
+ *
+ * indtuples is the total number of tuples that made it into the index.
+ *
+ * brokenhotchain indicates if any worker detected a broken HOT chain
+ * during build.
+ */
+ int nparticipantsdone;
+ double reltuples;
+ bool havedead;
+ double indtuples;
+ bool brokenhotchain;
+
+ /*
+ * ParallelTableScanDescData data follows. Can't directly embed here, as
+ * implementations of the parallel table scan desc interface might need
+ * stronger alignment.
+ */
+} BTShared;
+
+/*
+ * Return pointer to a BTShared's parallel table scan.
+ *
+ * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
+ * MAXALIGN.
+ */
+#define ParallelTableScanFromBTShared(shared) \
+ (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
+
+/*
+ * Status for leader in parallel index build.
+ */
+typedef struct BTLeader
+{
+ /* parallel context itself */
+ ParallelContext *pcxt;
+
+ /*
+ * nparticipanttuplesorts is the exact number of worker processes
+ * successfully launched, plus one leader process if it participates as a
+ * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
+ * participating as a worker).
+ */
+ int nparticipanttuplesorts;
+
+ /*
+ * Leader process convenience pointers to shared state (leader avoids TOC
+ * lookups).
+ *
+ * btshared is the shared state for entire build. sharedsort is the
+ * shared, tuplesort-managed state passed to each process tuplesort.
+ * sharedsort2 is the corresponding btspool2 shared state, used only when
+ * building unique indexes. snapshot is the snapshot used by the scan iff
+ * an MVCC snapshot is required.
+ */
+ BTShared *btshared;
+ Sharedsort *sharedsort;
+ Sharedsort *sharedsort2;
+ Snapshot snapshot;
+ WalUsage *walusage;
+ BufferUsage *bufferusage;
+} BTLeader;
+
+/*
+ * Working state for btbuild and its callback.
+ *
+ * When parallel CREATE INDEX is used, there is a BTBuildState for each
+ * participant.
+ */
+typedef struct BTBuildState
+{
+ bool isunique;
+ bool havedead;
+ Relation heap;
+ BTSpool *spool;
+
+ /*
+ * spool2 is needed only when the index is a unique index. Dead tuples are
+	 * put into spool2 instead of spool in order to avoid the uniqueness check.
+ */
+ BTSpool *spool2;
+ double indtuples;
+
+ /*
+ * btleader is only present when a parallel index build is performed, and
+ * only in the leader process. (Actually, only the leader has a
+ * BTBuildState. Workers have their own spool and spool2, though.)
+ */
+ BTLeader *btleader;
+} BTBuildState;
+
+/*
+ * Status record for a btree page being built. We have one of these
+ * for each active tree level.
+ */
+typedef struct BTPageState
+{
+ Page btps_page; /* workspace for page building */
+ BlockNumber btps_blkno; /* block # to write this page at */
+ IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
+ OffsetNumber btps_lastoff; /* last item offset loaded */
+ Size btps_lastextra; /* last item's extra posting list space */
+ uint32 btps_level; /* tree level (0 = leaf) */
+ Size btps_full; /* "full" if less than this much free space */
+ struct BTPageState *btps_next; /* link to parent level, if any */
+} BTPageState;
+
+/*
+ * Overall status record for index writing phase.
+ */
+typedef struct BTWriteState
+{
+ Relation heap;
+ Relation index;
+ BTScanInsert inskey; /* generic insertion scankey */
+ bool btws_use_wal; /* dump pages to WAL? */
+ BlockNumber btws_pages_alloced; /* # pages allocated */
+ BlockNumber btws_pages_written; /* # pages written out */
+ Page btws_zeropage; /* workspace for filling zeroes */
+} BTWriteState;
+
+
+static double _bt_spools_heapscan(Relation heap, Relation index,
+ BTBuildState *buildstate, IndexInfo *indexInfo);
+static void _bt_spooldestroy(BTSpool *btspool);
+static void _bt_spool(BTSpool *btspool, ItemPointer self,
+ Datum *values, bool *isnull);
+static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
+static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
+ bool *isnull, bool tupleIsAlive, void *state);
+static Page _bt_blnewpage(uint32 level);
+static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
+static void _bt_slideleft(Page rightmostpage);
+static void _bt_sortaddtup(Page page, Size itemsize,
+ IndexTuple itup, OffsetNumber itup_off,
+ bool newfirstdataitem);
+static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
+ IndexTuple itup, Size truncextra);
+static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
+ BTPageState *state,
+ BTDedupState dstate);
+static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
+static void _bt_load(BTWriteState *wstate,
+ BTSpool *btspool, BTSpool *btspool2);
+static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
+ int request);
+static void _bt_end_parallel(BTLeader *btleader);
+static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
+static double _bt_parallel_heapscan(BTBuildState *buildstate,
+ bool *brokenhotchain);
+static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
+static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
+ BTShared *btshared, Sharedsort *sharedsort,
+ Sharedsort *sharedsort2, int sortmem,
+ bool progress);
+
+
+/*
+ * btbuild() -- build a new btree index.
+ */
+IndexBuildResult *
+btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
+{
+ IndexBuildResult *result;
+ BTBuildState buildstate;
+ double reltuples;
+
+#ifdef BTREE_BUILD_STATS
+ if (log_btree_build_stats)
+ ResetUsage();
+#endif /* BTREE_BUILD_STATS */
+
+ buildstate.isunique = indexInfo->ii_Unique;
+ buildstate.havedead = false;
+ buildstate.heap = heap;
+ buildstate.spool = NULL;
+ buildstate.spool2 = NULL;
+ buildstate.indtuples = 0;
+ buildstate.btleader = NULL;
+
+ /*
+ * We expect to be called exactly once for any index relation. If that's
+ * not the case, big trouble's what we have.
+ */
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "index \"%s\" already contains data",
+ RelationGetRelationName(index));
+
+ reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
+
+ /*
+ * Finish the build by (1) completing the sort of the spool file, (2)
+ * inserting the sorted tuples into btree pages and (3) building the upper
+ * levels. Finally, it may also be necessary to end use of parallelism.
+ */
+ _bt_leafbuild(buildstate.spool, buildstate.spool2);
+ _bt_spooldestroy(buildstate.spool);
+ if (buildstate.spool2)
+ _bt_spooldestroy(buildstate.spool2);
+ if (buildstate.btleader)
+ _bt_end_parallel(buildstate.btleader);
+
+ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+
+ result->heap_tuples = reltuples;
+ result->index_tuples = buildstate.indtuples;
+
+#ifdef BTREE_BUILD_STATS
+ if (log_btree_build_stats)
+ {
+ ShowUsage("BTREE BUILD STATS");
+ ResetUsage();
+ }
+#endif /* BTREE_BUILD_STATS */
+
+ return result;
+}
+
+/*
+ * Create and initialize one or two spool structures, and save them in caller's
+ * buildstate argument.  May also fill in fields within indexInfo used by index
+ * builds.
+ *
+ * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
+ * routine encapsulates all aspects of managing parallelism. Caller need only
+ * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
+ *
+ * Returns the total number of heap tuples scanned.
+ */
+static double
+_bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
+ IndexInfo *indexInfo)
+{
+ BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
+ SortCoordinate coordinate = NULL;
+ double reltuples = 0;
+
+ /*
+ * We size the sort area as maintenance_work_mem rather than work_mem to
+ * speed index creation. This should be OK since a single backend can't
+ * run multiple index creations in parallel (see also: notes on
+ * parallelism and maintenance_work_mem below).
+ */
+ btspool->heap = heap;
+ btspool->index = index;
+ btspool->isunique = indexInfo->ii_Unique;
+
+ /* Save as primary spool */
+ buildstate->spool = btspool;
+
+ /* Report table scan phase started */
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
+ PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
+
+ /* Attempt to launch parallel worker scan when required */
+ if (indexInfo->ii_ParallelWorkers > 0)
+ _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
+ indexInfo->ii_ParallelWorkers);
+
+ /*
+ * If parallel build requested and at least one worker process was
+ * successfully launched, set up coordination state
+ */
+ if (buildstate->btleader)
+ {
+ coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
+ coordinate->isWorker = false;
+ coordinate->nParticipants =
+ buildstate->btleader->nparticipanttuplesorts;
+ coordinate->sharedsort = buildstate->btleader->sharedsort;
+ }
+
+ /*
+ * Begin serial/leader tuplesort.
+ *
+ * In cases where parallelism is involved, the leader receives the same
+ * share of maintenance_work_mem as a serial sort (it is generally treated
+ * in the same way as a serial sort once we return). Parallel worker
+ * Tuplesortstates will have received only a fraction of
+ * maintenance_work_mem, though.
+ *
+ * We rely on the lifetime of the Leader Tuplesortstate almost not
+ * overlapping with any worker Tuplesortstate's lifetime. There may be
+ * some small overlap, but that's okay because we rely on leader
+ * Tuplesortstate only allocating a small, fixed amount of memory here.
+ * When its tuplesort_performsort() is called (by our caller), and
+ * significant amounts of memory are likely to be used, all workers must
+ * have already freed almost all memory held by their Tuplesortstates
+ * (they are about to go away completely, too). The overall effect is
+ * that maintenance_work_mem always represents an absolute high watermark
+ * on the amount of memory used by a CREATE INDEX operation, regardless of
+ * the use of parallelism or any other factor.
+ */
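+	/*
+	 * For example (hypothetical settings): with maintenance_work_mem set to
+	 * 256MB and two launched workers plus a participating leader, each of
+	 * the three participant Tuplesortstates sorts with roughly 85MB during
+	 * the scan phase; the leader Tuplesortstate begun here may later use up
+	 * to the full 256MB, but by then the workers have released almost all of
+	 * their memory.
+	 */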
+ buildstate->spool->sortstate =
+ tuplesort_begin_index_btree(heap, index, buildstate->isunique,
+ maintenance_work_mem, coordinate,
+ false);
+
+ /*
+ * If building a unique index, put dead tuples in a second spool to keep
+ * them out of the uniqueness check. We expect that the second spool (for
+ * dead tuples) won't get very full, so we give it only work_mem.
+ */
+ if (indexInfo->ii_Unique)
+ {
+ BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
+ SortCoordinate coordinate2 = NULL;
+
+ /* Initialize secondary spool */
+ btspool2->heap = heap;
+ btspool2->index = index;
+ btspool2->isunique = false;
+ /* Save as secondary spool */
+ buildstate->spool2 = btspool2;
+
+ if (buildstate->btleader)
+ {
+ /*
+ * Set up non-private state that is passed to
+ * tuplesort_begin_index_btree() about the basic high level
+ * coordination of a parallel sort.
+ */
+ coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
+ coordinate2->isWorker = false;
+ coordinate2->nParticipants =
+ buildstate->btleader->nparticipanttuplesorts;
+ coordinate2->sharedsort = buildstate->btleader->sharedsort2;
+ }
+
+ /*
+ * We expect that the second one (for dead tuples) won't get very
+ * full, so we give it only work_mem
+ */
+ buildstate->spool2->sortstate =
+ tuplesort_begin_index_btree(heap, index, false, work_mem,
+ coordinate2, false);
+ }
+
+ /* Fill spool using either serial or parallel heap scan */
+ if (!buildstate->btleader)
+ reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
+ _bt_build_callback, (void *) buildstate,
+ NULL);
+ else
+ reltuples = _bt_parallel_heapscan(buildstate,
+ &indexInfo->ii_BrokenHotChain);
+
+ /*
+ * Set the progress target for the next phase. Reset the block number
+ * values set by table_index_build_scan
+ */
+ {
+ const int progress_index[] = {
+ PROGRESS_CREATEIDX_TUPLES_TOTAL,
+ PROGRESS_SCAN_BLOCKS_TOTAL,
+ PROGRESS_SCAN_BLOCKS_DONE
+ };
+ const int64 progress_vals[] = {
+ buildstate->indtuples,
+ 0, 0
+ };
+
+ pgstat_progress_update_multi_param(3, progress_index, progress_vals);
+ }
+
+ /* okay, all heap tuples are spooled */
+ if (buildstate->spool2 && !buildstate->havedead)
+ {
+ /* spool2 turns out to be unnecessary */
+ _bt_spooldestroy(buildstate->spool2);
+ buildstate->spool2 = NULL;
+ }
+
+ return reltuples;
+}
+
+/*
+ * clean up a spool structure and its substructures.
+ */
+static void
+_bt_spooldestroy(BTSpool *btspool)
+{
+ tuplesort_end(btspool->sortstate);
+ pfree(btspool);
+}
+
+/*
+ * spool an index entry into the sort file.
+ */
+static void
+_bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
+{
+ tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
+ self, values, isnull);
+}
+
+/*
+ * given a spool loaded by successive calls to _bt_spool,
+ * create an entire btree.
+ */
+static void
+_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
+{
+ BTWriteState wstate;
+
+#ifdef BTREE_BUILD_STATS
+ if (log_btree_build_stats)
+ {
+ ShowUsage("BTREE BUILD (Spool) STATISTICS");
+ ResetUsage();
+ }
+#endif /* BTREE_BUILD_STATS */
+
+ /* Execute the sort */
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
+ PROGRESS_BTREE_PHASE_PERFORMSORT_1);
+ tuplesort_performsort(btspool->sortstate);
+ if (btspool2)
+ {
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
+ PROGRESS_BTREE_PHASE_PERFORMSORT_2);
+ tuplesort_performsort(btspool2->sortstate);
+ }
+
+ wstate.heap = btspool->heap;
+ wstate.index = btspool->index;
+ wstate.inskey = _bt_mkscankey(wstate.index, NULL);
+ /* _bt_mkscankey() won't set allequalimage without metapage */
+ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
+ wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
+
+ /* reserve the metapage */
+ wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
+ wstate.btws_pages_written = 0;
+ wstate.btws_zeropage = NULL; /* until needed */
+
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
+ PROGRESS_BTREE_PHASE_LEAF_LOAD);
+ _bt_load(&wstate, btspool, btspool2);
+}
+
+/*
+ * Per-tuple callback for table_index_build_scan
+ */
+static void
+_bt_build_callback(Relation index,
+ ItemPointer tid,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state)
+{
+ BTBuildState *buildstate = (BTBuildState *) state;
+
+ /*
+ * insert the index tuple into the appropriate spool file for subsequent
+ * processing
+ */
+ if (tupleIsAlive || buildstate->spool2 == NULL)
+ _bt_spool(buildstate->spool, tid, values, isnull);
+ else
+ {
+ /* dead tuples are put into spool2 */
+ buildstate->havedead = true;
+ _bt_spool(buildstate->spool2, tid, values, isnull);
+ }
+
+ buildstate->indtuples += 1;
+}
+
+/*
+ * allocate workspace for a new, clean btree page, not linked to any siblings.
+ */
+static Page
+_bt_blnewpage(uint32 level)
+{
+ Page page;
+ BTPageOpaque opaque;
+
+ page = (Page) palloc(BLCKSZ);
+
+ /* Zero the page and set up standard page header info */
+ _bt_pageinit(page, BLCKSZ);
+
+ /* Initialize BT opaque state */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ opaque->btpo_prev = opaque->btpo_next = P_NONE;
+ opaque->btpo_level = level;
+ opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
+ opaque->btpo_cycleid = 0;
+
+ /* Make the P_HIKEY line pointer appear allocated */
+ ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
+
+ return page;
+}
+
+/*
+ * emit a completed btree page, and release the working storage.
+ */
+static void
+_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
+{
+ /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
+ RelationOpenSmgr(wstate->index);
+
+ /* XLOG stuff */
+ if (wstate->btws_use_wal)
+ {
+ /* We use the XLOG_FPI record type for this */
+ log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
+ }
+
+ /*
+ * If we have to write pages nonsequentially, fill in the space with
+ * zeroes until we come back and overwrite. This is not logically
+ * necessary on standard Unix filesystems (unwritten space will read as
+ * zeroes anyway), but it should help to avoid fragmentation. The dummy
+ * pages aren't WAL-logged though.
+ */
+ while (blkno > wstate->btws_pages_written)
+ {
+ if (!wstate->btws_zeropage)
+ wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
+ /* don't set checksum for all-zero page */
+ smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
+ wstate->btws_pages_written++,
+ (char *) wstate->btws_zeropage,
+ true);
+ }
+
+ PageSetChecksumInplace(page, blkno);
+
+ /*
+ * Now write the page. There's no need for smgr to schedule an fsync for
+ * this write; we'll do it ourselves before ending the build.
+ */
+ if (blkno == wstate->btws_pages_written)
+ {
+ /* extending the file... */
+ smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
+ (char *) page, true);
+ wstate->btws_pages_written++;
+ }
+ else
+ {
+ /* overwriting a block we zero-filled before */
+ smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
+ (char *) page, true);
+ }
+
+ pfree(page);
+}
+
+/*
+ * allocate and initialize a new BTPageState. the returned structure
+ * is suitable for immediate use by _bt_buildadd.
+ */
+static BTPageState *
+_bt_pagestate(BTWriteState *wstate, uint32 level)
+{
+ BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
+
+ /* create initial page for level */
+ state->btps_page = _bt_blnewpage(level);
+
+ /* and assign it a page position */
+ state->btps_blkno = wstate->btws_pages_alloced++;
+
+ state->btps_lowkey = NULL;
+ /* initialize lastoff so first item goes into P_FIRSTKEY */
+ state->btps_lastoff = P_HIKEY;
+ state->btps_lastextra = 0;
+ state->btps_level = level;
+ /* set "full" threshold based on level. See notes at head of file. */
+ if (level > 0)
+ state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
+ else
+ state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
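+
+ /*
+ * For example, assuming 8KB pages and the default BTREE_NONLEAF_FILLFACTOR
+ * of 70, internal pages are treated as "full" once less than about 2457
+ * bytes (30% of the page) would remain free.
+ */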
+
+ /* no parent level, yet */
+ state->btps_next = NULL;
+
+ return state;
+}
+
+/*
+ * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
+ * P_HIKEY, overwriting P_HIKEY).
+ *
+ * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
+ * rightmost page on its level is not supposed to get a high key. Now that
+ * it's clear that this page is a rightmost page, remove the unneeded empty
+ * P_HIKEY line pointer space.
+ */
+static void
+_bt_slideleft(Page rightmostpage)
+{
+ OffsetNumber off;
+ OffsetNumber maxoff;
+ ItemId previi;
+
+ maxoff = PageGetMaxOffsetNumber(rightmostpage);
+ Assert(maxoff >= P_FIRSTKEY);
+ previi = PageGetItemId(rightmostpage, P_HIKEY);
+ for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
+ {
+ ItemId thisii = PageGetItemId(rightmostpage, off);
+
+ *previi = *thisii;
+ previi = thisii;
+ }
+ ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
+}
+
+/*
+ * Add an item to a page being built.
+ *
+ * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
+ * raises an error directly.
+ *
+ * Note that our nbtsort.c caller does not know yet if the page will be
+ * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
+ * caller. A page that turns out to be the rightmost on its level is fixed
+ * up by calling _bt_slideleft().
+ */
+static void
+_bt_sortaddtup(Page page,
+ Size itemsize,
+ IndexTuple itup,
+ OffsetNumber itup_off,
+ bool newfirstdataitem)
+{
+ IndexTupleData trunctuple;
+
+ if (newfirstdataitem)
+ {
+ trunctuple = *itup;
+ trunctuple.t_info = sizeof(IndexTupleData);
+ BTreeTupleSetNAtts(&trunctuple, 0, false);
+ itup = &trunctuple;
+ itemsize = sizeof(IndexTupleData);
+ }
+
+ if (PageAddItem(page, (Item) itup, itemsize, itup_off,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to the index page");
+}
+
+/*----------
+ * Add an item to a disk page from the sort output (or add a posting list
+ * item formed from the sort output).
+ *
+ * We must be careful to observe the page layout conventions of nbtsearch.c:
+ * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
+ * - on non-leaf pages, the key portion of the first item need not be
+ * stored, we should store only the link.
+ *
+ * A leaf page being built looks like:
+ *
+ * +----------------+---------------------------------+
+ * | PageHeaderData | linp0 linp1 linp2 ... |
+ * +-----------+----+---------------------------------+
+ * | ... linpN | |
+ * +-----------+--------------------------------------+
+ * | ^ last |
+ * | |
+ * +-------------+------------------------------------+
+ * | | itemN ... |
+ * +-------------+------------------+-----------------+
+ * | ... item3 item2 item1 | "special space" |
+ * +--------------------------------+-----------------+
+ *
+ * Contrast this with the diagram in bufpage.h; note the mismatch
+ * between linps and items. This is because we reserve linp0 as a
+ * placeholder for the pointer to the "high key" item; when we have
+ * filled up the page, we will set linp0 to point to itemN and clear
+ * linpN. On the other hand, if we find this is the last (rightmost)
+ * page, we leave the items alone and slide the linp array over. If
+ * the high key is to be truncated, offset 1 is deleted, and we insert
+ * the truncated high key at offset 1.
+ *
+ * 'last' pointer indicates the last offset added to the page.
+ *
+ * 'truncextra' is the size of the posting list in itup, if any. This
+ * information is stashed for the next call here, when we may benefit
+ * from considering the impact of truncating away the posting list on
+ * the page before deciding to finish the page off. Posting lists are
+ * often relatively large, so it is worth going to the trouble of
+ * accounting for the saving from truncating away the posting list of
+ * the tuple that becomes the high key (that may be the only way to
+ * get close to target free space on the page). Note that this is
+ * only used for the soft fillfactor-wise limit, not the critical hard
+ * limit.
+ *----------
+ */
+static void
+_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
+ Size truncextra)
+{
+ Page npage;
+ BlockNumber nblkno;
+ OffsetNumber last_off;
+ Size last_truncextra;
+ Size pgspc;
+ Size itupsz;
+ bool isleaf;
+
+ /*
+ * This is a handy place to check for cancel interrupts during the btree
+ * load phase of index creation.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ npage = state->btps_page;
+ nblkno = state->btps_blkno;
+ last_off = state->btps_lastoff;
+ last_truncextra = state->btps_lastextra;
+ state->btps_lastextra = truncextra;
+
+ pgspc = PageGetFreeSpace(npage);
+ itupsz = IndexTupleSize(itup);
+ itupsz = MAXALIGN(itupsz);
+ /* Leaf case has slightly different rules due to suffix truncation */
+ isleaf = (state->btps_level == 0);
+
+ /*
+ * Check whether the new item can fit on a btree page on current level at
+ * all.
+ *
+ * Every newly built index will treat heap TID as part of the keyspace,
+ * which imposes the requirement that new high keys must occasionally have
+ * a heap TID appended within _bt_truncate(). That may leave a new pivot
+ * tuple one or two MAXALIGN() quantums larger than the original
+ * firstright tuple it's derived from. v4 deals with the problem by
+ * decreasing the limit on the size of tuples inserted on the leaf level
+ * by the same small amount. Enforce the new v4+ limit on the leaf level,
+ * and the old limit on internal levels, since pivot tuples may need to
+ * make use of the reserved space. This should never fail on internal
+ * pages.
+ */
+ if (unlikely(itupsz > BTMaxItemSize(npage)))
+ _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
+ itup);
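+
+ /*
+ * (With the default 8KB block size and 8-byte MAXALIGN, these limits work
+ * out to 2704 bytes for leaf tuples versus 2712 bytes on internal levels;
+ * the 8 byte difference is the MAXALIGN() quantum reserved for appending
+ * a heap TID to a new pivot tuple.)
+ */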
+
+ /*
+ * Check to see if current page will fit new item, with space left over to
+ * append a heap TID during suffix truncation when page is a leaf page.
+ *
+ * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
+ * high key with heap TID when finishing off a leaf page, since we rely on
+ * _bt_check_third_page() rejecting oversized non-pivot tuples. On
+ * internal pages we can always fit 3 pivot tuples with larger internal
+ * page tuple limit (includes page high key).
+ *
+ * Most of the time, a page is only "full" in the sense that the soft
+ * fillfactor-wise limit has been exceeded. However, we must always leave
+ * at least two items plus a high key on each page before starting a new
+ * page. Disregard fillfactor and insert on "full" current page if we
+ * don't have the minimum number of items yet. (Note that we deliberately
+ * assume that suffix truncation neither enlarges nor shrinks new high key
+ * when applying soft limit, except when last tuple has a posting list.)
+ */
+ Assert(last_truncextra == 0 || isleaf);
+ if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
+ (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
+ {
+ /*
+ * Finish off the page and write it out.
+ */
+ Page opage = npage;
+ BlockNumber oblkno = nblkno;
+ ItemId ii;
+ ItemId hii;
+ IndexTuple oitup;
+
+ /* Create new page of same level */
+ npage = _bt_blnewpage(state->btps_level);
+
+ /* and assign it a page position */
+ nblkno = wstate->btws_pages_alloced++;
+
+ /*
+ * We copy the last item on the page into the new page, and then
+ * rearrange the old page so that the 'last item' becomes its high key
+ * rather than a true data item. There had better be at least two
+ * items on the page already, else the page would be empty of useful
+ * data.
+ */
+ Assert(last_off > P_FIRSTKEY);
+ ii = PageGetItemId(opage, last_off);
+ oitup = (IndexTuple) PageGetItem(opage, ii);
+ _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
+ !isleaf);
+
+ /*
+ * Move 'last' into the high key position on opage. _bt_blnewpage()
+ * allocated empty space for a line pointer when opage was first
+ * created, so this is a matter of rearranging already-allocated space
+ * on page, and initializing high key line pointer. (Actually, leaf
+ * pages must also swap oitup with a truncated version of oitup, which
+ * is sometimes larger than oitup, though never by more than the space
+ * needed to append a heap TID.)
+ */
+ hii = PageGetItemId(opage, P_HIKEY);
+ *hii = *ii;
+ ItemIdSetUnused(ii); /* redundant */
+ ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
+
+ if (isleaf)
+ {
+ IndexTuple lastleft;
+ IndexTuple truncated;
+
+ /*
+ * Truncate away any unneeded attributes from high key on leaf
+ * level. This is only done at the leaf level because downlinks
+ * in internal pages are either negative infinity items, or get
+ * their contents from copying from one level down. See also:
+ * _bt_split().
+ *
+ * We don't try to bias our choice of split point to make it more
+ * likely that _bt_truncate() can truncate away more attributes,
+ * whereas the split point used within _bt_split() is chosen much
+ * more delicately. Even still, the lastleft and firstright
+ * tuples passed to _bt_truncate() here are at least not fully
+ * equal to each other when deduplication is used, unless there is
+ * a large group of duplicates (also, unique index builds usually
+ * have few or no spool2 duplicates). When the split point is
+ * between two unequal tuples, _bt_truncate() will avoid including
+ * a heap TID in the new high key, which is the most important
+ * benefit of suffix truncation.
+ *
+ * Overwrite the old item with new truncated high key directly.
+ * oitup is already located at the physical beginning of tuple
+ * space, so this should directly reuse the existing tuple space.
+ */
+ ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
+ lastleft = (IndexTuple) PageGetItem(opage, ii);
+
+ Assert(IndexTupleSize(oitup) > last_truncextra);
+ truncated = _bt_truncate(wstate->index, lastleft, oitup,
+ wstate->inskey);
+ if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
+ IndexTupleSize(truncated)))
+ elog(ERROR, "failed to add high key to the index page");
+ pfree(truncated);
+
+ /* oitup should continue to point to the page's high key */
+ hii = PageGetItemId(opage, P_HIKEY);
+ oitup = (IndexTuple) PageGetItem(opage, hii);
+ }
+
+ /*
+ * Link the old page into its parent, using its low key. If we don't
+ * have a parent, we have to create one; this adds a new btree level.
+ */
+ if (state->btps_next == NULL)
+ state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
+
+ Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
+ IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
+ BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
+ P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
+ Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
+ !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
+ BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
+ _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
+ pfree(state->btps_lowkey);
+
+ /*
+ * Save a copy of the high key from the old page. It is also the low
+ * key for the new page.
+ */
+ state->btps_lowkey = CopyIndexTuple(oitup);
+
+ /*
+ * Set the sibling links for both pages.
+ */
+ {
+ BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
+ BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
+
+ oopaque->btpo_next = nblkno;
+ nopaque->btpo_prev = oblkno;
+ nopaque->btpo_next = P_NONE; /* redundant */
+ }
+
+ /*
+ * Write out the old page. We never need to touch it again, so we can
+ * free the opage workspace too.
+ */
+ _bt_blwritepage(wstate, opage, oblkno);
+
+ /*
+ * Reset last_off to point to new page
+ */
+ last_off = P_FIRSTKEY;
+ }
+
+ /*
+ * By here, either original page is still the current page, or a new page
+ * was created that became the current page. Either way, the current page
+ * definitely has space for new item.
+ *
+ * If the new item is the first for its page, it must also be the first
+ * item on its entire level. On later same-level pages, a low key for a
+ * page will be copied from the prior page in the code above. Generate a
+ * minus infinity low key here instead.
+ */
+ if (last_off == P_HIKEY)
+ {
+ Assert(state->btps_lowkey == NULL);
+ state->btps_lowkey = palloc0(sizeof(IndexTupleData));
+ state->btps_lowkey->t_info = sizeof(IndexTupleData);
+ BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
+ }
+
+ /*
+ * Add the new item into the current page.
+ */
+ last_off = OffsetNumberNext(last_off);
+ _bt_sortaddtup(npage, itupsz, itup, last_off,
+ !isleaf && last_off == P_FIRSTKEY);
+
+ state->btps_page = npage;
+ state->btps_blkno = nblkno;
+ state->btps_lastoff = last_off;
+}
+
+/*
+ * Finalize pending posting list tuple, and add it to the index. Final tuple
+ * is based on saved base tuple, and saved list of heap TIDs.
+ *
+ * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
+ * using _bt_buildadd().
+ */
+static void
+_bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
+ BTDedupState dstate)
+{
+ Assert(dstate->nitems > 0);
+
+ if (dstate->nitems == 1)
+ _bt_buildadd(wstate, state, dstate->base, 0);
+ else
+ {
+ IndexTuple postingtuple;
+ Size truncextra;
+
+ /* form a tuple with a posting list */
+ postingtuple = _bt_form_posting(dstate->base,
+ dstate->htids,
+ dstate->nhtids);
+ /* Calculate posting list overhead */
+ truncextra = IndexTupleSize(postingtuple) -
+ BTreeTupleGetPostingOffset(postingtuple);
+
+ _bt_buildadd(wstate, state, postingtuple, truncextra);
+ pfree(postingtuple);
+ }
+
+ dstate->nmaxitems = 0;
+ dstate->nhtids = 0;
+ dstate->nitems = 0;
+ dstate->phystupsize = 0;
+}
+
+/*
+ * Finish writing out the completed btree.
+ */
+static void
+_bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
+{
+ BTPageState *s;
+ BlockNumber rootblkno = P_NONE;
+ uint32 rootlevel = 0;
+ Page metapage;
+
+ /*
+ * Each iteration of this loop completes one more level of the tree.
+ */
+ for (s = state; s != NULL; s = s->btps_next)
+ {
+ BlockNumber blkno;
+ BTPageOpaque opaque;
+
+ blkno = s->btps_blkno;
+ opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
+
+ /*
+ * We have to link the last page on this level to somewhere.
+ *
+ * If we're at the top, it's the root, so attach it to the metapage.
+ * Otherwise, add an entry for it to its parent using its low key.
+ * This may cause the last page of the parent level to split, but
+ * that's not a problem -- we haven't gotten to it yet.
+ */
+ if (s->btps_next == NULL)
+ {
+ opaque->btpo_flags |= BTP_ROOT;
+ rootblkno = blkno;
+ rootlevel = s->btps_level;
+ }
+ else
+ {
+ Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
+ IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
+ BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
+ P_LEFTMOST(opaque));
+ Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
+ !P_LEFTMOST(opaque));
+ BTreeTupleSetDownLink(s->btps_lowkey, blkno);
+ _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
+ pfree(s->btps_lowkey);
+ s->btps_lowkey = NULL;
+ }
+
+ /*
+ * This is the rightmost page, so the ItemId array needs to be slid
+ * back one slot. Then we can dump out the page.
+ */
+ _bt_slideleft(s->btps_page);
+ _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
+ s->btps_page = NULL; /* writepage freed the workspace */
+ }
+
+ /*
+ * As the last step in the process, construct the metapage and make it
+ * point to the new root (unless we had no data at all, in which case it's
+ * set to point to "P_NONE"). This changes the index to the "valid" state
+ * by filling in a valid magic number in the metapage.
+ */
+ metapage = (Page) palloc(BLCKSZ);
+ _bt_initmetapage(metapage, rootblkno, rootlevel,
+ wstate->inskey->allequalimage);
+ _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
+}
+
+/*
+ * Read tuples in correct sort order from tuplesort, and load them into
+ * btree leaves.
+ */
+static void
+_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
+{
+ BTPageState *state = NULL;
+ bool merge = (btspool2 != NULL);
+ IndexTuple itup,
+ itup2 = NULL;
+ bool load1;
+ TupleDesc tupdes = RelationGetDescr(wstate->index);
+ int i,
+ keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
+ SortSupport sortKeys;
+ int64 tuples_done = 0;
+ bool deduplicate;
+
+ deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
+ BTGetDeduplicateItems(wstate->index);
+
+ if (merge)
+ {
+ /*
+ * Another BTSpool for dead tuples exists. Now we have to merge
+ * btspool and btspool2.
+ */
+
+ /* the preparation of merge */
+ itup = tuplesort_getindextuple(btspool->sortstate, true);
+ itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
+
+ /* Prepare SortSupport data for each column */
+ sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
+
+ for (i = 0; i < keysz; i++)
+ {
+ SortSupport sortKey = sortKeys + i;
+ ScanKey scanKey = wstate->inskey->scankeys + i;
+ int16 strategy;
+
+ sortKey->ssup_cxt = CurrentMemoryContext;
+ sortKey->ssup_collation = scanKey->sk_collation;
+ sortKey->ssup_nulls_first =
+ (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
+ sortKey->ssup_attno = scanKey->sk_attno;
+ /* Abbreviation is not supported here */
+ sortKey->abbreviate = false;
+
+ AssertState(sortKey->ssup_attno != 0);
+
+ strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
+ BTGreaterStrategyNumber : BTLessStrategyNumber;
+
+ PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
+ }
+
+ for (;;)
+ {
+ load1 = true; /* load BTSpool next ? */
+ if (itup2 == NULL)
+ {
+ if (itup == NULL)
+ break;
+ }
+ else if (itup != NULL)
+ {
+ int32 compare = 0;
+
+ for (i = 1; i <= keysz; i++)
+ {
+ SortSupport entry;
+ Datum attrDatum1,
+ attrDatum2;
+ bool isNull1,
+ isNull2;
+
+ entry = sortKeys + i - 1;
+ attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
+ attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
+
+ compare = ApplySortComparator(attrDatum1, isNull1,
+ attrDatum2, isNull2,
+ entry);
+ if (compare > 0)
+ {
+ load1 = false;
+ break;
+ }
+ else if (compare < 0)
+ break;
+ }
+
+ /*
+ * If key values are equal, we sort on ItemPointer. This is
+ * required for btree indexes, since heap TID is treated as an
+ * implicit last key attribute in order to ensure that all
+ * keys in the index are physically unique.
+ */
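+ /*
+ * For example, two equal-keyed entries pointing at heap TIDs (0,1)
+ * and (3,7) sort with (0,1) first: ItemPointerCompare() orders by
+ * block number and then by offset number.
+ */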
+ if (compare == 0)
+ {
+ compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
+ Assert(compare != 0);
+ if (compare > 0)
+ load1 = false;
+ }
+ }
+ else
+ load1 = false;
+
+ /* When we see first tuple, create first index page */
+ if (state == NULL)
+ state = _bt_pagestate(wstate, 0);
+
+ if (load1)
+ {
+ _bt_buildadd(wstate, state, itup, 0);
+ itup = tuplesort_getindextuple(btspool->sortstate, true);
+ }
+ else
+ {
+ _bt_buildadd(wstate, state, itup2, 0);
+ itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
+ }
+
+ /* Report progress */
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
+ ++tuples_done);
+ }
+ pfree(sortKeys);
+ }
+ else if (deduplicate)
+ {
+ /* merge is unnecessary, deduplicate into posting lists */
+ BTDedupState dstate;
+
+ dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
+ dstate->deduplicate = true; /* unused */
+ dstate->nmaxitems = 0; /* unused */
+ dstate->maxpostingsize = 0; /* set later */
+ /* Metadata about base tuple of current pending posting list */
+ dstate->base = NULL;
+ dstate->baseoff = InvalidOffsetNumber; /* unused */
+ dstate->basetupsize = 0;
+ /* Metadata about current pending posting list TIDs */
+ dstate->htids = NULL;
+ dstate->nhtids = 0;
+ dstate->nitems = 0;
+ dstate->phystupsize = 0; /* unused */
+ dstate->nintervals = 0; /* unused */
+
+ while ((itup = tuplesort_getindextuple(btspool->sortstate,
+ true)) != NULL)
+ {
+ /* When we see first tuple, create first index page */
+ if (state == NULL)
+ {
+ state = _bt_pagestate(wstate, 0);
+
+ /*
+ * Limit size of posting list tuples to 1/10 space we want to
+ * leave behind on the page, plus space for final item's line
+ * pointer. This is equal to the space that we'd like to
+ * leave behind on each leaf page when fillfactor is 90,
+ * allowing us to get close to fillfactor% space utilization
+ * when there happen to be a great many duplicates. (This
+ * makes higher leaf fillfactor settings ineffective when
+ * building indexes that have many duplicates, but packing
+ * leaf pages full with few very large tuples doesn't seem
+ * like a useful goal.)
+ */
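+ /*
+ * For example, with 8KB pages and 8-byte MAXALIGN this works out
+ * to MAXALIGN_DOWN(819) - 4 = 812 bytes per posting list tuple.
+ */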
+ dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
+ sizeof(ItemIdData);
+ Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) &&
+ dstate->maxpostingsize <= INDEX_SIZE_MASK);
+ dstate->htids = palloc(dstate->maxpostingsize);
+
+ /* start new pending posting list with itup copy */
+ _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
+ InvalidOffsetNumber);
+ }
+ else if (_bt_keep_natts_fast(wstate->index, dstate->base,
+ itup) > keysz &&
+ _bt_dedup_save_htid(dstate, itup))
+ {
+ /*
+ * Tuple is equal to base tuple of pending posting list. Heap
+ * TID from itup has been saved in state.
+ */
+ }
+ else
+ {
+ /*
+ * Tuple is not equal to pending posting list tuple, or
+ * _bt_dedup_save_htid() opted to not merge current item into
+ * pending posting list.
+ */
+ _bt_sort_dedup_finish_pending(wstate, state, dstate);
+ pfree(dstate->base);
+
+ /* start new pending posting list with itup copy */
+ _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
+ InvalidOffsetNumber);
+ }
+
+ /* Report progress */
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
+ ++tuples_done);
+ }
+
+ if (state)
+ {
+ /*
+ * Handle the last item (there must be a last item when the
+ * tuplesort returned one or more tuples)
+ */
+ _bt_sort_dedup_finish_pending(wstate, state, dstate);
+ pfree(dstate->base);
+ pfree(dstate->htids);
+ }
+
+ pfree(dstate);
+ }
+ else
+ {
+ /* merging and deduplication are both unnecessary */
+ while ((itup = tuplesort_getindextuple(btspool->sortstate,
+ true)) != NULL)
+ {
+ /* When we see first tuple, create first index page */
+ if (state == NULL)
+ state = _bt_pagestate(wstate, 0);
+
+ _bt_buildadd(wstate, state, itup, 0);
+
+ /* Report progress */
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
+ ++tuples_done);
+ }
+ }
+
+ /* Close down final pages and write the metapage */
+ _bt_uppershutdown(wstate, state);
+
+ /*
+ * Although we WAL-logged index pages, we must nonetheless fsync index files.
+ * Since we're building outside shared buffers, a CHECKPOINT occurring
+ * during the build has no way to flush the previously written data to
+ * disk (indeed it won't know the index even exists). A crash later on
+ * would replay WAL from the checkpoint, therefore it wouldn't replay our
+ * earlier WAL entries. If we do not fsync those pages here, they might
+ * still not be on disk when the crash occurs.
+ */
+ if (wstate->btws_use_wal)
+ {
+ RelationOpenSmgr(wstate->index);
+ smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
+ }
+}
+
+/*
+ * Create parallel context, and launch workers for leader.
+ *
+ * buildstate argument should be initialized (with the exception of the
+ * tuplesort state in spools, which may later be created based on shared
+ * state initially set up here).
+ *
+ * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
+ *
+ * request is the target number of parallel worker processes to launch.
+ *
+ * Sets buildstate's BTLeader, which caller must use to shut down parallel
+ * mode by passing it to _bt_end_parallel() at the very end of its index
+ * build. If not even a single worker process can be launched, this is
+ * never set, and caller should proceed with a serial index build.
+ */
+static void
+_bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
+{
+ ParallelContext *pcxt;
+ int scantuplesortstates;
+ Snapshot snapshot;
+ Size estbtshared;
+ Size estsort;
+ BTShared *btshared;
+ Sharedsort *sharedsort;
+ Sharedsort *sharedsort2;
+ BTSpool *btspool = buildstate->spool;
+ BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
+ WalUsage *walusage;
+ BufferUsage *bufferusage;
+ bool leaderparticipates = true;
+ int querylen;
+
+#ifdef DISABLE_LEADER_PARTICIPATION
+ leaderparticipates = false;
+#endif
+
+ /*
+ * Enter parallel mode, and create context for parallel build of btree
+ * index
+ */
+ EnterParallelMode();
+ Assert(request > 0);
+ pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
+ request);
+
+ scantuplesortstates = leaderparticipates ? request + 1 : request;
+
+ /*
+ * Prepare for scan of the base relation. In a normal index build, we use
+ * SnapshotAny because we must retrieve all tuples and do our own time
+ * qual checks (because we have to index RECENTLY_DEAD tuples). In a
+ * concurrent build, we take a regular MVCC snapshot and index whatever's
+ * live according to that.
+ */
+ if (!isconcurrent)
+ snapshot = SnapshotAny;
+ else
+ snapshot = RegisterSnapshot(GetTransactionSnapshot());
+
+ /*
+ * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
+ * PARALLEL_KEY_TUPLESORT tuplesort workspace
+ */
+ estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
+ shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
+ estsort = tuplesort_estimate_shared(scantuplesortstates);
+ shm_toc_estimate_chunk(&pcxt->estimator, estsort);
+
+ /*
+ * Unique case requires a second spool, and so we may have to account for
+ * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
+ */
+ if (!btspool->isunique)
+ shm_toc_estimate_keys(&pcxt->estimator, 2);
+ else
+ {
+ shm_toc_estimate_chunk(&pcxt->estimator, estsort);
+ shm_toc_estimate_keys(&pcxt->estimator, 3);
+ }
+
+ /*
+ * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
+ * and PARALLEL_KEY_BUFFER_USAGE.
+ *
+ * If there are no extensions loaded that care, we could skip this. We
+ * have no way of knowing whether anyone's looking at pgWalUsage or
+ * pgBufferUsage, so do it unconditionally.
+ */
+ shm_toc_estimate_chunk(&pcxt->estimator,
+ mul_size(sizeof(WalUsage), pcxt->nworkers));
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+ shm_toc_estimate_chunk(&pcxt->estimator,
+ mul_size(sizeof(BufferUsage), pcxt->nworkers));
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+ /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
+ if (debug_query_string)
+ {
+ querylen = strlen(debug_query_string);
+ shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+ }
+ else
+ querylen = 0; /* keep compiler quiet */
+
+ /* Everyone's had a chance to ask for space, so now create the DSM */
+ InitializeParallelDSM(pcxt);
+
+ /* If no DSM segment was available, back out (do serial build) */
+ if (pcxt->seg == NULL)
+ {
+ if (IsMVCCSnapshot(snapshot))
+ UnregisterSnapshot(snapshot);
+ DestroyParallelContext(pcxt);
+ ExitParallelMode();
+ return;
+ }
+
+ /* Store shared build state, for which we reserved space */
+ btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
+ /* Initialize immutable state */
+ btshared->heaprelid = RelationGetRelid(btspool->heap);
+ btshared->indexrelid = RelationGetRelid(btspool->index);
+ btshared->isunique = btspool->isunique;
+ btshared->isconcurrent = isconcurrent;
+ btshared->scantuplesortstates = scantuplesortstates;
+ ConditionVariableInit(&btshared->workersdonecv);
+ SpinLockInit(&btshared->mutex);
+ /* Initialize mutable state */
+ btshared->nparticipantsdone = 0;
+ btshared->reltuples = 0.0;
+ btshared->havedead = false;
+ btshared->indtuples = 0.0;
+ btshared->brokenhotchain = false;
+ table_parallelscan_initialize(btspool->heap,
+ ParallelTableScanFromBTShared(btshared),
+ snapshot);
+
+ /*
+ * Store shared tuplesort-private state, for which we reserved space.
+ * Then, initialize opaque state using tuplesort routine.
+ */
+ sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
+ tuplesort_initialize_shared(sharedsort, scantuplesortstates,
+ pcxt->seg);
+
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
+
+ /* Unique case requires a second spool, and associated shared state */
+ if (!btspool->isunique)
+ sharedsort2 = NULL;
+ else
+ {
+ /*
+ * Store additional shared tuplesort-private state, for which we
+ * reserved space. Then, initialize opaque state using tuplesort
+ * routine.
+ */
+ sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
+ tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
+ pcxt->seg);
+
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
+ }
+
+ /* Store query string for workers */
+ if (debug_query_string)
+ {
+ char *sharedquery;
+
+ sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
+ memcpy(sharedquery, debug_query_string, querylen + 1);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
+ }
+
+ /*
+ * Allocate space for each worker's WalUsage and BufferUsage; no need to
+ * initialize.
+ */
+ walusage = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(WalUsage), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
+ bufferusage = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(BufferUsage), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
+
+ /* Launch workers, saving status for leader/caller */
+ LaunchParallelWorkers(pcxt);
+ btleader->pcxt = pcxt;
+ btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
+ if (leaderparticipates)
+ btleader->nparticipanttuplesorts++;
+ btleader->btshared = btshared;
+ btleader->sharedsort = sharedsort;
+ btleader->sharedsort2 = sharedsort2;
+ btleader->snapshot = snapshot;
+ btleader->walusage = walusage;
+ btleader->bufferusage = bufferusage;
+
+ /* If no workers were successfully launched, back out (do serial build) */
+ if (pcxt->nworkers_launched == 0)
+ {
+ _bt_end_parallel(btleader);
+ return;
+ }
+
+ /* Save leader state now that it's clear build will be parallel */
+ buildstate->btleader = btleader;
+
+ /* Join heap scan ourselves */
+ if (leaderparticipates)
+ _bt_leader_participate_as_worker(buildstate);
+
+ /*
+ * Caller needs to wait for all launched workers when we return. Make
+ * sure that the failure-to-start case will not hang forever.
+ */
+ WaitForParallelWorkersToAttach(pcxt);
+}
+
+/*
+ * Shut down workers, destroy parallel context, and end parallel mode.
+ */
+static void
+_bt_end_parallel(BTLeader *btleader)
+{
+ int i;
+
+ /* Shutdown worker processes */
+ WaitForParallelWorkersToFinish(btleader->pcxt);
+
+ /*
+ * Next, accumulate WAL usage. (This must wait for the workers to finish,
+ * or we might get incomplete data.)
+ */
+ for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
+ InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
+
+ /* Free last reference to MVCC snapshot, if one was used */
+ if (IsMVCCSnapshot(btleader->snapshot))
+ UnregisterSnapshot(btleader->snapshot);
+ DestroyParallelContext(btleader->pcxt);
+ ExitParallelMode();
+}
+
+/*
+ * Returns size of shared memory required to store state for a parallel
+ * btree index build based on the snapshot its parallel scan will use.
+ */
+static Size
+_bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
+{
+ /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
+ return add_size(BUFFERALIGN(sizeof(BTShared)),
+ table_parallelscan_estimate(heap, snapshot));
+}
+
+/*
+ * Within leader, wait for end of heap scan.
+ *
+ * When called, parallel heap scan started by _bt_begin_parallel() will
+ * already be underway within worker processes (when leader participates
+ * as a worker, we should end up here just as workers are finishing).
+ *
+ * Fills in fields needed for ambuild statistics, and lets caller set
+ * field indicating that some worker encountered a broken HOT chain.
+ *
+ * Returns the total number of heap tuples scanned.
+ */
+static double
+_bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
+{
+ BTShared *btshared = buildstate->btleader->btshared;
+ int nparticipanttuplesorts;
+ double reltuples;
+
+ nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
+ for (;;)
+ {
+ SpinLockAcquire(&btshared->mutex);
+ if (btshared->nparticipantsdone == nparticipanttuplesorts)
+ {
+ buildstate->havedead = btshared->havedead;
+ buildstate->indtuples = btshared->indtuples;
+ *brokenhotchain = btshared->brokenhotchain;
+ reltuples = btshared->reltuples;
+ SpinLockRelease(&btshared->mutex);
+ break;
+ }
+ SpinLockRelease(&btshared->mutex);
+
+ ConditionVariableSleep(&btshared->workersdonecv,
+ WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
+ }
+
+ ConditionVariableCancelSleep();
+
+ return reltuples;
+}
+
+/*
+ * Within leader, participate as a parallel worker.
+ */
+static void
+_bt_leader_participate_as_worker(BTBuildState *buildstate)
+{
+ BTLeader *btleader = buildstate->btleader;
+ BTSpool *leaderworker;
+ BTSpool *leaderworker2;
+ int sortmem;
+
+ /* Allocate memory and initialize private spool */
+ leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
+ leaderworker->heap = buildstate->spool->heap;
+ leaderworker->index = buildstate->spool->index;
+ leaderworker->isunique = buildstate->spool->isunique;
+
+ /* Initialize second spool, if required */
+ if (!btleader->btshared->isunique)
+ leaderworker2 = NULL;
+ else
+ {
+ /* Allocate memory for worker's own private secondary spool */
+ leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
+
+ /* Initialize worker's own secondary spool */
+ leaderworker2->heap = leaderworker->heap;
+ leaderworker2->index = leaderworker->index;
+ leaderworker2->isunique = false;
+ }
+
+ /*
+ * Might as well use a reliable figure when doling out maintenance_work_mem
+ * (when the requested number of workers was not launched, this will be
+ * somewhat higher than it is for the other workers).
+ */
+ sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
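+
+ /*
+ * For example, if four workers were requested but only two launched, each
+ * launched worker divides maintenance_work_mem by five (the planned number
+ * of participants), while the leader divides by three here, giving it a
+ * somewhat larger share.
+ */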
+
+ /* Perform work common to all participants */
+ _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
+ btleader->sharedsort, btleader->sharedsort2,
+ sortmem, true);
+
+#ifdef BTREE_BUILD_STATS
+ if (log_btree_build_stats)
+ {
+ ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
+ ResetUsage();
+ }
+#endif /* BTREE_BUILD_STATS */
+}
+
+/*
+ * Perform work within a launched parallel process.
+ */
+void
+_bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
+{
+ char *sharedquery;
+ BTSpool *btspool;
+ BTSpool *btspool2;
+ BTShared *btshared;
+ Sharedsort *sharedsort;
+ Sharedsort *sharedsort2;
+ Relation heapRel;
+ Relation indexRel;
+ LOCKMODE heapLockmode;
+ LOCKMODE indexLockmode;
+ WalUsage *walusage;
+ BufferUsage *bufferusage;
+ int sortmem;
+
+#ifdef BTREE_BUILD_STATS
+ if (log_btree_build_stats)
+ ResetUsage();
+#endif /* BTREE_BUILD_STATS */
+
+ /*
+ * The only possible status flag that can be set for the parallel worker is
+ * PROC_IN_SAFE_IC.
+ */
+ Assert((MyProc->statusFlags == 0) ||
+ (MyProc->statusFlags == PROC_IN_SAFE_IC));
+
+ /* Set debug_query_string for individual workers first */
+ sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
+ debug_query_string = sharedquery;
+
+ /* Report the query string from leader */
+ pgstat_report_activity(STATE_RUNNING, debug_query_string);
+
+ /* Look up nbtree shared state */
+ btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
+
+ /* Open relations using lock modes known to be obtained by index.c */
+ if (!btshared->isconcurrent)
+ {
+ heapLockmode = ShareLock;
+ indexLockmode = AccessExclusiveLock;
+ }
+ else
+ {
+ heapLockmode = ShareUpdateExclusiveLock;
+ indexLockmode = RowExclusiveLock;
+ }
+
+ /* Open relations within worker */
+ heapRel = table_open(btshared->heaprelid, heapLockmode);
+ indexRel = index_open(btshared->indexrelid, indexLockmode);
+
+ /* Initialize worker's own spool */
+ btspool = (BTSpool *) palloc0(sizeof(BTSpool));
+ btspool->heap = heapRel;
+ btspool->index = indexRel;
+ btspool->isunique = btshared->isunique;
+
+ /* Look up shared state private to tuplesort.c */
+ sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
+ tuplesort_attach_shared(sharedsort, seg);
+ if (!btshared->isunique)
+ {
+ btspool2 = NULL;
+ sharedsort2 = NULL;
+ }
+ else
+ {
+ /* Allocate memory for worker's own private secondary spool */
+ btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
+
+ /* Initialize worker's own secondary spool */
+ btspool2->heap = btspool->heap;
+ btspool2->index = btspool->index;
+ btspool2->isunique = false;
+ /* Look up shared state private to tuplesort.c */
+ sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
+ tuplesort_attach_shared(sharedsort2, seg);
+ }
+
+ /* Prepare to track buffer usage during parallel execution */
+ InstrStartParallelQuery();
+
+ /* Perform sorting of spool, and possibly a spool2 */
+ sortmem = maintenance_work_mem / btshared->scantuplesortstates;
+ _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
+ sharedsort2, sortmem, false);
+
+ /* Report WAL/buffer usage during parallel execution */
+ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
+ walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
+ InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
+ &walusage[ParallelWorkerNumber]);
+
+#ifdef BTREE_BUILD_STATS
+ if (log_btree_build_stats)
+ {
+ ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
+ ResetUsage();
+ }
+#endif /* BTREE_BUILD_STATS */
+
+ index_close(indexRel, indexLockmode);
+ table_close(heapRel, heapLockmode);
+}
+
+/*
+ * Perform a worker's portion of a parallel sort.
+ *
+ * This generates a tuplesort for passed btspool, and a second tuplesort
+ * state if a second btspool is needed (i.e. for unique index builds). All
+ * other spool fields should already be set when this is called.
+ *
+ * sortmem is the amount of working memory to use within each worker,
+ * expressed in KBs.
+ *
+ * When this returns, workers are done, and need only release resources.
+ */
+static void
+_bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
+ BTShared *btshared, Sharedsort *sharedsort,
+ Sharedsort *sharedsort2, int sortmem, bool progress)
+{
+ SortCoordinate coordinate;
+ BTBuildState buildstate;
+ TableScanDesc scan;
+ double reltuples;
+ IndexInfo *indexInfo;
+
+ /* Initialize local tuplesort coordination state */
+ coordinate = palloc0(sizeof(SortCoordinateData));
+ coordinate->isWorker = true;
+ coordinate->nParticipants = -1;
+ coordinate->sharedsort = sharedsort;
+
+ /* Begin "partial" tuplesort */
+ btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
+ btspool->index,
+ btspool->isunique,
+ sortmem, coordinate,
+ false);
+
+ /*
+ * Just as with serial case, there may be a second spool. If so, a
+ * second, dedicated spool2 partial tuplesort is required.
+ */
+ if (btspool2)
+ {
+ SortCoordinate coordinate2;
+
+ /*
+ * We expect that the second one (for dead tuples) won't get very
+ * full, so we give it only work_mem (unless sortmem is less for
+ * worker). Worker processes are generally permitted to allocate
+ * work_mem independently.
+ */
+ coordinate2 = palloc0(sizeof(SortCoordinateData));
+ coordinate2->isWorker = true;
+ coordinate2->nParticipants = -1;
+ coordinate2->sharedsort = sharedsort2;
+ btspool2->sortstate =
+ tuplesort_begin_index_btree(btspool->heap, btspool->index, false,
+ Min(sortmem, work_mem), coordinate2,
+ false);
+ }
+
+ /* Fill in buildstate for _bt_build_callback() */
+ buildstate.isunique = btshared->isunique;
+ buildstate.havedead = false;
+ buildstate.heap = btspool->heap;
+ buildstate.spool = btspool;
+ buildstate.spool2 = btspool2;
+ buildstate.indtuples = 0;
+ buildstate.btleader = NULL;
+
+ /* Join parallel scan */
+ indexInfo = BuildIndexInfo(btspool->index);
+ indexInfo->ii_Concurrent = btshared->isconcurrent;
+ scan = table_beginscan_parallel(btspool->heap,
+ ParallelTableScanFromBTShared(btshared));
+ reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
+ true, progress, _bt_build_callback,
+ (void *) &buildstate, scan);
+
+ /* Execute this worker's part of the sort */
+ if (progress)
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
+ PROGRESS_BTREE_PHASE_PERFORMSORT_1);
+ tuplesort_performsort(btspool->sortstate);
+ if (btspool2)
+ {
+ if (progress)
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
+ PROGRESS_BTREE_PHASE_PERFORMSORT_2);
+ tuplesort_performsort(btspool2->sortstate);
+ }
+
+ /*
+ * Done. Record ambuild statistics, and whether we encountered a broken
+ * HOT chain.
+ */
+ SpinLockAcquire(&btshared->mutex);
+ btshared->nparticipantsdone++;
+ btshared->reltuples += reltuples;
+ if (buildstate.havedead)
+ btshared->havedead = true;
+ btshared->indtuples += buildstate.indtuples;
+ if (indexInfo->ii_BrokenHotChain)
+ btshared->brokenhotchain = true;
+ SpinLockRelease(&btshared->mutex);
+
+ /* Notify leader */
+ ConditionVariableSignal(&btshared->workersdonecv);
+
+ /* We can end tuplesorts immediately */
+ tuplesort_end(btspool->sortstate);
+ if (btspool2)
+ tuplesort_end(btspool2->sortstate);
+}
diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c
new file mode 100644
index 0000000..3485e93
--- /dev/null
+++ b/src/backend/access/nbtree/nbtsplitloc.c
@@ -0,0 +1,1190 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtsplitloc.c
+ * Choose split point code for Postgres btree implementation.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtsplitloc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "storage/lmgr.h"
+
+typedef enum
+{
+ /* strategy for searching through materialized list of split points */
+ SPLIT_DEFAULT, /* give some weight to truncation */
+ SPLIT_MANY_DUPLICATES, /* find minimally distinguishing point */
+ SPLIT_SINGLE_VALUE /* leave left page almost full */
+} FindSplitStrat;
+
+typedef struct
+{
+ /* details of free space left by split */
+ int16 curdelta; /* current leftfree/rightfree delta */
+ int16 leftfree; /* space left on left page post-split */
+ int16 rightfree; /* space left on right page post-split */
+
+ /* split point identifying fields (returned by _bt_findsplitloc) */
+ OffsetNumber firstrightoff; /* first origpage item on rightpage */
+ bool newitemonleft; /* new item goes on left, or right? */
+
+} SplitPoint;
+
+typedef struct
+{
+ /* context data for _bt_recsplitloc */
+ Relation rel; /* index relation */
+ Page origpage; /* page undergoing split */
+ IndexTuple newitem; /* new item (cause of page split) */
+ Size newitemsz; /* size of newitem (includes line pointer) */
+ bool is_leaf; /* T if splitting a leaf page */
+ bool is_rightmost; /* T if splitting rightmost page on level */
+ OffsetNumber newitemoff; /* where the new item is to be inserted */
+ int leftspace; /* space available for items on left page */
+ int rightspace; /* space available for items on right page */
+ int olddataitemstotal; /* space taken by old items */
+ Size minfirstrightsz; /* smallest firstright size */
+
+ /* candidate split point data */
+ int maxsplits; /* maximum number of splits */
+ int nsplits; /* current number of splits */
+ SplitPoint *splits; /* all candidate split points for page */
+ int interval; /* current range of acceptable split points */
+} FindSplitData;
+
+static void _bt_recsplitloc(FindSplitData *state,
+ OffsetNumber firstrightoff, bool newitemonleft,
+ int olddataitemstoleft,
+ Size firstrightofforigpagetuplesz);
+static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult,
+ bool usemult);
+static int _bt_splitcmp(const void *arg1, const void *arg2);
+static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
+ int leaffillfactor, bool *usemult);
+static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
+static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
+ bool *newitemonleft, FindSplitStrat strategy);
+static int _bt_defaultinterval(FindSplitData *state);
+static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage,
+ SplitPoint *rightpage, FindSplitStrat *strategy);
+static void _bt_interval_edges(FindSplitData *state,
+ SplitPoint **leftinterval, SplitPoint **rightinterval);
+static inline int _bt_split_penalty(FindSplitData *state, SplitPoint *split);
+static inline IndexTuple _bt_split_lastleft(FindSplitData *state,
+ SplitPoint *split);
+static inline IndexTuple _bt_split_firstright(FindSplitData *state,
+ SplitPoint *split);
+
+
+/*
+ * _bt_findsplitloc() -- find an appropriate place to split a page.
+ *
+ * The main goal here is to equalize the free space that will be on each
+ * split page, *after accounting for the inserted tuple*. (If we fail to
+ * account for it, we might find ourselves with too little room on the page
+ * that it needs to go into!)
+ *
+ * If the page is the rightmost page on its level, we instead try to arrange
+ * to leave the left split page fillfactor% full. In this way, when we are
+ * inserting successively increasing keys (consider sequences, timestamps,
+ * etc) we will end up with a tree whose pages are about fillfactor% full,
+ * instead of the 50% full result that we'd get without this special case.
+ * This is the same as nbtsort.c produces for a newly-created tree. Note
+ * that leaf and nonleaf pages use different fillfactors. Note also that
+ * there are a number of further special cases where fillfactor is not
+ * applied in the standard way.
+ *
+ * We are passed the intended insert position of the new tuple, expressed as
+ * the offsetnumber of the tuple it must go in front of (this could be
+ * maxoff+1 if the tuple is to go at the end). The new tuple itself is also
+ * passed, since it's needed to give some weight to how effective suffix
+ * truncation will be. The implementation picks the split point that
+ * maximizes the effectiveness of suffix truncation from a small list of
+ * alternative candidate split points that leave each side of the split with
+ * about the same share of free space. Suffix truncation is secondary to
+ * equalizing free space, except in cases with large numbers of duplicates.
+ * Note that it is always assumed that caller goes on to perform truncation,
+ * even with pg_upgrade'd indexes where that isn't actually the case
+ * (!heapkeyspace indexes). See nbtree/README for more information about
+ * suffix truncation.
+ *
+ * We return the index of the first existing tuple that should go on the
+ * righthand page (which is called firstrightoff), plus a boolean
+ * indicating whether the new tuple goes on the left or right page. You
+ * can think of the returned state as a point _between_ two adjacent data
+ * items (lastleft and firstright data items) on an imaginary version of
+ * origpage that already includes newitem. The bool is necessary to
+ * disambiguate the case where firstrightoff == newitemoff (i.e. it is
+ * sometimes needed to determine if the firstright tuple for the split is
+ * newitem rather than the tuple from origpage at offset firstrightoff).
+ */
+OffsetNumber
+_bt_findsplitloc(Relation rel,
+ Page origpage,
+ OffsetNumber newitemoff,
+ Size newitemsz,
+ IndexTuple newitem,
+ bool *newitemonleft)
+{
+ BTPageOpaque opaque;
+ int leftspace,
+ rightspace,
+ olddataitemstotal,
+ olddataitemstoleft,
+ perfectpenalty,
+ leaffillfactor;
+ FindSplitData state;
+ FindSplitStrat strategy;
+ ItemId itemid;
+ OffsetNumber offnum,
+ maxoff,
+ firstrightoff;
+ double fillfactormult;
+ bool usemult;
+ SplitPoint leftpage,
+ rightpage;
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
+ maxoff = PageGetMaxOffsetNumber(origpage);
+
+ /* Total free space available on a btree page, after fixed overhead */
+ leftspace = rightspace =
+ PageGetPageSize(origpage) - SizeOfPageHeaderData -
+ MAXALIGN(sizeof(BTPageOpaqueData));
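+
+ /*
+ * (For 8KB pages this comes to 8192 - 24 - 16 = 8152 usable bytes,
+ * assuming the usual page header and btree special space sizes.)
+ */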
+
+ /* The right page will have the same high key as the old page */
+ if (!P_RIGHTMOST(opaque))
+ {
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
+ sizeof(ItemIdData));
+ }
+
+ /* Count up total space in data items before actually scanning 'em */
+ olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
+ leaffillfactor = BTGetFillFactor(rel);
+
+ /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
+ newitemsz += sizeof(ItemIdData);
+ state.rel = rel;
+ state.origpage = origpage;
+ state.newitem = newitem;
+ state.newitemsz = newitemsz;
+ state.is_leaf = P_ISLEAF(opaque);
+ state.is_rightmost = P_RIGHTMOST(opaque);
+ state.leftspace = leftspace;
+ state.rightspace = rightspace;
+ state.olddataitemstotal = olddataitemstotal;
+ state.minfirstrightsz = SIZE_MAX;
+ state.newitemoff = newitemoff;
+
+ /* newitem cannot be a posting list item */
+ Assert(!BTreeTupleIsPosting(newitem));
+
+ /*
+ * nsplits should never exceed maxoff because there will be at most as
+ * many candidate split points as there are points _between_ tuples, once
+ * you imagine that the new item is already on the original page (the
+ * final number of splits may be slightly lower because not all points
+ * between tuples will be legal).
+ */
+ state.maxsplits = maxoff;
+ state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
+ state.nsplits = 0;
+
+ /*
+ * Scan through the data items and calculate space usage for a split at
+ * each possible position
+ */
+ olddataitemstoleft = 0;
+
+ for (offnum = P_FIRSTDATAKEY(opaque);
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ Size itemsz;
+
+ itemid = PageGetItemId(origpage, offnum);
+ itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
+
+ /*
+ * When item offset number is not newitemoff, neither side of the
+ * split can be newitem. Record a split after the previous data item
+ * from original page, but before the current data item from original
+ * page. (_bt_recsplitloc() will reject the split when there are no
+ * previous items, which we rely on.)
+ */
+ if (offnum < newitemoff)
+ _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
+ else if (offnum > newitemoff)
+ _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
+ else
+ {
+ /*
+ * Record a split after all "offnum < newitemoff" original page
+ * data items, but before newitem
+ */
+ _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
+
+ /*
+ * Record a split after newitem, but before data item from
+ * original page at offset newitemoff/current offset
+ */
+ _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
+ }
+
+ olddataitemstoleft += itemsz;
+ }
+
+ /*
+ * Record a split after all original page data items, but before newitem.
+ * (Though only when it's possible that newitem will end up alone on new
+ * right page.)
+ */
+ Assert(olddataitemstoleft == olddataitemstotal);
+ if (newitemoff > maxoff)
+ _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
+
+ /*
+ * I believe it is not possible to fail to find a feasible split, but just
+ * in case ...
+ */
+ if (state.nsplits == 0)
+ elog(ERROR, "could not find a feasible split point for index \"%s\"",
+ RelationGetRelationName(rel));
+
+ /*
+ * Start search for a split point among list of legal split points. Give
+ * primary consideration to equalizing available free space in each half
+ * of the split initially (start with default strategy), while applying
+ * rightmost and split-after-new-item optimizations where appropriate.
+ * Either of the two other fallback strategies may be required for cases
+ * with a large number of duplicates around the original/space-optimal
+ * split point.
+ *
+ * Default strategy gives some weight to suffix truncation in deciding a
+ * split point on leaf pages. It attempts to select a split point where a
+ * distinguishing attribute appears earlier in the new high key for the
+ * left side of the split, in order to maximize the number of trailing
+ * attributes that can be truncated away. Only candidate split points
+ * that imply an acceptable balance of free space on each side are
+ * considered. See _bt_defaultinterval().
+ */
+ if (!state.is_leaf)
+ {
+ /* fillfactormult only used on rightmost page */
+ usemult = state.is_rightmost;
+ fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
+ }
+ else if (state.is_rightmost)
+ {
+ /* Rightmost leaf page -- fillfactormult always used */
+ usemult = true;
+ fillfactormult = leaffillfactor / 100.0;
+ }
+ else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
+ {
+ /*
+ * New item inserted at rightmost point among a localized grouping on
+ * a leaf page -- apply "split after new item" optimization, either by
+ * applying leaf fillfactor multiplier, or by choosing the exact split
+ * point that leaves newitem as lastleft. (usemult is set for us.)
+ */
+ if (usemult)
+ {
+ /* fillfactormult should be set based on leaf fillfactor */
+ fillfactormult = leaffillfactor / 100.0;
+ }
+ else
+ {
+ /* find precise split point after newitemoff */
+ for (int i = 0; i < state.nsplits; i++)
+ {
+ SplitPoint *split = state.splits + i;
+
+ if (split->newitemonleft &&
+ newitemoff == split->firstrightoff)
+ {
+ pfree(state.splits);
+ *newitemonleft = true;
+ return newitemoff;
+ }
+ }
+
+ /*
+ * Cannot legally split after newitemoff; proceed with split
+ * without using fillfactor multiplier. This is defensive, and
+ * should never be needed in practice.
+ */
+ fillfactormult = 0.50;
+ }
+ }
+ else
+ {
+ /* Other leaf page. 50:50 page split. */
+ usemult = false;
+ /* fillfactormult not used, but be tidy */
+ fillfactormult = 0.50;
+ }
+
+ /*
+ * Save leftmost and rightmost splits for page before original ordinal
+ * sort order is lost by delta/fillfactormult sort
+ */
+ leftpage = state.splits[0];
+ rightpage = state.splits[state.nsplits - 1];
+
+ /* Give split points a fillfactormult-wise delta, and sort on deltas */
+ _bt_deltasortsplits(&state, fillfactormult, usemult);
+
+ /* Determine split interval for default strategy */
+ state.interval = _bt_defaultinterval(&state);
+
+ /*
+ * Determine if default strategy/split interval will produce a
+ * sufficiently distinguishing split, or if we should change strategies.
+ * Alternative strategies change the range of split points that are
+ * considered acceptable (split interval), and possibly change
+ * fillfactormult, in order to deal with pages with a large number of
+ * duplicates gracefully.
+ *
+ * Pass low and high splits for the entire page (actually, they're for an
+ * imaginary version of the page that includes newitem). These are used
+ * when the initial split interval encloses split points that are full of
+ * duplicates, and we need to consider if it's even possible to avoid
+ * appending a heap TID.
+ */
+ perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
+
+ if (strategy == SPLIT_DEFAULT)
+ {
+ /*
+ * Default strategy worked out (always works out with internal page).
+ * Original split interval still stands.
+ */
+ }
+
+ /*
+ * Many duplicates strategy is used when a heap TID would otherwise be
+ * appended, but the page isn't completely full of logical duplicates.
+ *
+ * The split interval is widened to include all legal candidate split
+ * points. There might be as few as two distinct values in the whole-page
+ * split interval, though it's also possible that most of the values on
+ * the page are unique. The final split point will either be to the
+ * immediate left or to the immediate right of the group of duplicate
+ * tuples that enclose the first/delta-optimal split point (perfect
+ * penalty was set so that the lowest delta split point that avoids
+ * appending a heap TID will be chosen). Maximizing the number of
+ * attributes that can be truncated away is not a goal of the many
+ * duplicates strategy.
+ *
+ * Single value strategy is used when it is impossible to avoid appending
+ * a heap TID. It arranges to leave the left page very full. This
+ * maximizes space utilization in cases where tuples with the same
+ * attribute values span many pages. Newly inserted duplicates will tend
+ * to have higher heap TID values, so we'll end up splitting to the right
+ * consistently. (Single value strategy is harmless though not
+ * particularly useful with !heapkeyspace indexes.)
+ */
+ else if (strategy == SPLIT_MANY_DUPLICATES)
+ {
+ Assert(state.is_leaf);
+ /* Shouldn't try to truncate away extra user attributes */
+ Assert(perfectpenalty ==
+ IndexRelationGetNumberOfKeyAttributes(state.rel));
+ /* No need to resort splits -- no change in fillfactormult/deltas */
+ state.interval = state.nsplits;
+ }
+ else if (strategy == SPLIT_SINGLE_VALUE)
+ {
+ Assert(state.is_leaf);
+ /* Split near the end of the page */
+ usemult = true;
+ fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
+ /* Resort split points with new delta */
+ _bt_deltasortsplits(&state, fillfactormult, usemult);
+ /* Appending a heap TID is unavoidable, so interval of 1 is fine */
+ state.interval = 1;
+ }
+
+ /*
+ * Search among acceptable split points (using final split interval) for
+ * the entry that has the lowest penalty, and is therefore expected to
+ * maximize fan-out. Sets *newitemonleft for us.
+ */
+ firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
+ strategy);
+ pfree(state.splits);
+
+ return firstrightoff;
+}
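+
+/*
+ * Illustrative sketch only -- this hypothetical caller is not part of
+ * PostgreSQL.  It shows how the result of _bt_findsplitloc() is expected to
+ * be consumed: items at offsets >= firstrightoff move to the new right page
+ * (plus newitem, unless newitemonleft), and the firstright tuple is
+ * suffix-truncated to form the left page's new high key.
+ */
+#ifdef NOT_USED
+static void
+example_choose_split(Relation rel, Page origpage, OffsetNumber newitemoff,
+	Size newitemsz, IndexTuple newitem)
+{
+	OffsetNumber firstrightoff;
+	bool newitemonleft;
+
+	/* newitemsz must already be MAXALIGN()'d, as noted above */
+	firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
+		newitem, &newitemonleft);
+
+	/*
+	 * A real caller would now distribute the items across both halves and
+	 * derive the new high key from the firstright tuple (or from newitem,
+	 * when !newitemonleft && firstrightoff == newitemoff).
+	 */
+}
+#endif							/* NOT_USED */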
+
+/*
+ * Subroutine to record a particular point between two tuples (possibly the
+ * new item) on page (ie, combination of firstrightoff and newitemonleft
+ * settings) in *state for later analysis. This is also a convenient point to
+ * check if the split is legal (if it isn't, it won't be recorded).
+ *
+ * firstrightoff is the offset of the first item on the original page that
+ * goes to the right page, and firstrightofforigpagetuplesz is the size of
+ * that tuple. firstrightoff can be > max offset, which means that all the
+ * old items go to the left page and only the new item goes to the right page.
+ * We don't actually use firstrightofforigpagetuplesz in that case (actually,
+ * we don't use it for _any_ split where the firstright tuple happens to be
+ * newitem).
+ *
+ * olddataitemstoleft is the total size of all old items to the left of the
+ * split point that is recorded here when legal. Should not include
+ * newitemsz, since that is handled here.
+ */
+static void
+_bt_recsplitloc(FindSplitData *state,
+ OffsetNumber firstrightoff,
+ bool newitemonleft,
+ int olddataitemstoleft,
+ Size firstrightofforigpagetuplesz)
+{
+ int16 leftfree,
+ rightfree;
+ Size firstrightsz;
+ Size postingsz = 0;
+ bool newitemisfirstright;
+
+ /* Is the new item going to be split point's firstright tuple? */
+ newitemisfirstright = (firstrightoff == state->newitemoff &&
+ !newitemonleft);
+
+ if (newitemisfirstright)
+ firstrightsz = state->newitemsz;
+ else
+ {
+ firstrightsz = firstrightofforigpagetuplesz;
+
+ /*
+ * Calculate suffix truncation space saving when firstright tuple is a
+ * posting list tuple, though only when the tuple is over 64 bytes
+ * including line pointer overhead (arbitrary). This avoids accessing
+ * the tuple in cases where its posting list must be very small (if
+ * tuple has one at all).
+ *
+ * Note: We don't do this in the case where firstright tuple is
+ * newitem, since newitem cannot have a posting list.
+ */
+ if (state->is_leaf && firstrightsz > 64)
+ {
+ ItemId itemid;
+ IndexTuple newhighkey;
+
+ itemid = PageGetItemId(state->origpage, firstrightoff);
+ newhighkey = (IndexTuple) PageGetItem(state->origpage, itemid);
+
+ if (BTreeTupleIsPosting(newhighkey))
+ postingsz = IndexTupleSize(newhighkey) -
+ BTreeTupleGetPostingOffset(newhighkey);
+ }
+ }
+
+ /* Account for all the old tuples */
+ leftfree = state->leftspace - olddataitemstoleft;
+ rightfree = state->rightspace -
+ (state->olddataitemstotal - olddataitemstoleft);
+
+ /*
+ * The first item on the right page becomes the high key of the left page;
+ * therefore it counts against left space as well as right space (we
+ * cannot assume that suffix truncation will make it any smaller). When
+ * index has included attributes, then those attributes of left page high
+ * key will be truncated leaving that page with slightly more free space.
+ * However, that shouldn't affect our ability to find a valid split
+ * location, since we err in the direction of being pessimistic about free
+ * space on the left half. Besides, even when suffix truncation of
+ * non-TID attributes occurs, the new high key often won't even be a
+ * single MAXALIGN() quantum smaller than the firstright tuple it's based
+ * on.
+ *
+ * If we are on the leaf level, assume that suffix truncation cannot avoid
+ * adding a heap TID to the left half's new high key when splitting at the
+ * leaf level. In practice the new high key will often be smaller and
+ * will rarely be larger, but conservatively assume the worst case. We do
+ * go to the trouble of subtracting away posting list overhead, though
+ * only when it looks like it will make an appreciable difference.
+ * (Posting lists are the only case where truncation will typically make
+ * the final high key far smaller than firstright, so being a bit more
+ * precise there noticeably improves the balance of free space.)
+ */
+ if (state->is_leaf)
+ leftfree -= (int16) (firstrightsz +
+ MAXALIGN(sizeof(ItemPointerData)) -
+ postingsz);
+ else
+ leftfree -= (int16) firstrightsz;
+
+ /* account for the new item */
+ if (newitemonleft)
+ leftfree -= (int16) state->newitemsz;
+ else
+ rightfree -= (int16) state->newitemsz;
+
+ /*
+ * If we are not on the leaf level, we will be able to discard the key
+ * data from the first item that winds up on the right page.
+ */
+ if (!state->is_leaf)
+ rightfree += (int16) firstrightsz -
+ (int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
+
+ /* Record split if legal */
+ if (leftfree >= 0 && rightfree >= 0)
+ {
+ Assert(state->nsplits < state->maxsplits);
+
+ /* Determine smallest firstright tuple size among legal splits */
+ state->minfirstrightsz = Min(state->minfirstrightsz, firstrightsz);
+
+ state->splits[state->nsplits].curdelta = 0;
+ state->splits[state->nsplits].leftfree = leftfree;
+ state->splits[state->nsplits].rightfree = rightfree;
+ state->splits[state->nsplits].firstrightoff = firstrightoff;
+ state->splits[state->nsplits].newitemonleft = newitemonleft;
+ state->nsplits++;
+ }
+}
+
+/*
+ * Subroutine to assign space deltas to materialized array of candidate split
+ * points based on current fillfactor, and to sort array using that fillfactor
+ */
+static void
+_bt_deltasortsplits(FindSplitData *state, double fillfactormult,
+ bool usemult)
+{
+ for (int i = 0; i < state->nsplits; i++)
+ {
+ SplitPoint *split = state->splits + i;
+ int16 delta;
+
+ if (usemult)
+ delta = fillfactormult * split->leftfree -
+ (1.0 - fillfactormult) * split->rightfree;
+ else
+ delta = split->leftfree - split->rightfree;
+
+ if (delta < 0)
+ delta = -delta;
+
+ /* Save delta */
+ split->curdelta = delta;
+ }
+
+ qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp);
+}
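+
+/*
+ * Worked example (illustrative figures only): with usemult and
+ * fillfactormult = 0.90 (a rightmost leaf split at the default leaf
+ * fillfactor), a candidate leaving leftfree = 1000 and rightfree = 7000
+ * gets delta = |0.90 * 1000 - 0.10 * 7000| = 200, whereas a plain 50:50
+ * split would score the same candidate as |1000 - 7000| = 6000.  Sorting
+ * on these deltas puts the candidates closest to the target free space
+ * division at the start of the array.
+ */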
+
+/*
+ * qsort-style comparator used by _bt_deltasortsplits()
+ */
+static int
+_bt_splitcmp(const void *arg1, const void *arg2)
+{
+ SplitPoint *split1 = (SplitPoint *) arg1;
+ SplitPoint *split2 = (SplitPoint *) arg2;
+
+ if (split1->curdelta > split2->curdelta)
+ return 1;
+ if (split1->curdelta < split2->curdelta)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Subroutine to determine whether or not a non-rightmost leaf page should be
+ * split immediately after the would-be original page offset for the
+ * new/incoming tuple (or should have leaf fillfactor applied when new item is
+ * to the right on original page). This is appropriate when there is a
+ * pattern of localized monotonically increasing insertions into a composite
+ * index, where leading attribute values form local groupings, and we
+ * anticipate further insertions of the same/current grouping (new item's
+ * grouping) in the near future. This can be thought of as a variation on
+ * applying leaf fillfactor during rightmost leaf page splits, since cases
+ * that benefit will converge on packing leaf pages leaffillfactor% full over
+ * time.
+ *
+ * We may leave extra free space remaining on the rightmost page of a "most
+ * significant column" grouping of tuples if that grouping never ends up
+ * having future insertions that use the free space. That effect is
+ * self-limiting; a future grouping that becomes the "nearest on the right"
+ * grouping of the affected grouping usually puts the extra free space to good
+ * use.
+ *
+ * Caller uses optimization when routine returns true, though the exact action
+ * taken by caller varies. Caller uses original leaf page fillfactor in
+ * standard way rather than using the new item offset directly when *usemult
+ * was also set to true here. Otherwise, caller applies optimization by
+ * locating the legal split point that makes the new tuple the lastleft tuple
+ * for the split.
+ */
+static bool
+_bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
+ int leaffillfactor, bool *usemult)
+{
+ int16 nkeyatts;
+ ItemId itemid;
+ IndexTuple tup;
+ int keepnatts;
+
+ Assert(state->is_leaf && !state->is_rightmost);
+
+ nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
+
+ /* Single key indexes not considered here */
+ if (nkeyatts == 1)
+ return false;
+
+ /* Ascending insertion pattern never inferred when new item is first */
+ if (state->newitemoff == P_FIRSTKEY)
+ return false;
+
+ /*
+ * Only apply optimization on pages with equisized tuples, since ordinal
+ * keys are likely to be fixed-width. Testing if the new tuple is
+ * variable width directly might also work, but that fails to apply the
+ * optimization to indexes with a numeric_ops attribute.
+ *
+ * Conclude that page has equisized tuples when the new item is the same
+ * width as the smallest item observed during pass over page, and other
+ * non-pivot tuples must be the same width as well. (Note that the
+ * possibly-truncated existing high key isn't counted in
+ * olddataitemstotal, and must be subtracted from maxoff.)
+ */
+ if (state->newitemsz != state->minfirstrightsz)
+ return false;
+ if (state->newitemsz * (maxoff - 1) != state->olddataitemstotal)
+ return false;
+
+ /*
+ * Avoid applying optimization when tuples are wider than a tuple
+ * consisting of two non-NULL int8/int64 attributes (or four non-NULL
+ * int4/int32 attributes)
+ */
+ if (state->newitemsz >
+ MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * 2) +
+ sizeof(ItemIdData))
+ return false;
+
+ /*
+ * At least the first attribute's value must be equal to the corresponding
+ * value in previous tuple to apply optimization. New item cannot be a
+ * duplicate, either.
+ *
+ * Handle case where new item is to the right of all items on the existing
+ * page. This is suggestive of monotonically increasing insertions in
+ * itself, so the "heap TID adjacency" test is not applied here.
+ */
+ if (state->newitemoff > maxoff)
+ {
+ itemid = PageGetItemId(state->origpage, maxoff);
+ tup = (IndexTuple) PageGetItem(state->origpage, itemid);
+ keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
+
+ if (keepnatts > 1 && keepnatts <= nkeyatts)
+ {
+ *usemult = true;
+ return true;
+ }
+
+ return false;
+ }
+
+ /*
+ * "Low cardinality leading column, high cardinality suffix column"
+ * indexes with a random insertion pattern (e.g., an index with a boolean
+ * column, such as an index on '(book_is_in_print, book_isbn)') present us
+ * with a risk of consistently misapplying the optimization. We're
+ * willing to accept very occasional misapplication of the optimization,
+ * provided the cases where we get it wrong are rare and self-limiting.
+ *
+ * Heap TID adjacency strongly suggests that the item just to the left was
+ * inserted very recently, which limits overapplication of the
+ * optimization. Besides, all inappropriate cases triggered here will
+ * still split in the middle of the page on average.
+ */
+ itemid = PageGetItemId(state->origpage, OffsetNumberPrev(state->newitemoff));
+ tup = (IndexTuple) PageGetItem(state->origpage, itemid);
+ /* Do cheaper test first */
+ if (BTreeTupleIsPosting(tup) ||
+ !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
+ return false;
+ /* Check same conditions as rightmost item case, too */
+ keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
+
+ if (keepnatts > 1 && keepnatts <= nkeyatts)
+ {
+ double interp = (double) state->newitemoff / ((double) maxoff + 1);
+ double leaffillfactormult = (double) leaffillfactor / 100.0;
+
+ /*
+ * Don't allow caller to split after a new item when it will result in
+ * a split point to the right of the point that a leaf fillfactor
+ * split would use -- have caller apply leaf fillfactor instead
+ */
+ *usemult = interp > leaffillfactormult;
+
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Subroutine for determining if two heap TIDs are "adjacent".
+ *
+ * Adjacent means that the high TID is very likely to have been inserted into
+ * heap relation immediately after the low TID, probably during the current
+ * transaction.
+ */
+static bool
+_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
+{
+ BlockNumber lowblk,
+ highblk;
+
+ lowblk = ItemPointerGetBlockNumber(lowhtid);
+ highblk = ItemPointerGetBlockNumber(highhtid);
+
+ /* Make optimistic assumption of adjacency when heap blocks match */
+ if (lowblk == highblk)
+ return true;
+
+ /* When heap block one up, second offset should be FirstOffsetNumber */
+ if (lowblk + 1 == highblk &&
+ ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber)
+ return true;
+
+ return false;
+}
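+
+/*
+ * Examples (illustrative): heap TIDs (42,7) and (42,12) count as adjacent
+ * because they share a heap block; (42,200) and (43,1) count as adjacent
+ * because the second TID is the first offset of the very next block;
+ * (42,7) and (43,5), or (42,7) and (44,1), do not.
+ */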
+
+/*
+ * Subroutine to find the "best" split point among candidate split points.
+ * The best split point is the split point with the lowest penalty among split
+ * points that fall within current/final split interval. Penalty is an
+ * abstract score, with a definition that varies depending on whether we're
+ * splitting a leaf page or an internal page. See _bt_split_penalty() for
+ * details.
+ *
+ * "perfectpenalty" is assumed to be the lowest possible penalty among
+ * candidate split points. This allows us to return early without wasting
+ * cycles on calculating the first differing attribute for all candidate
+ * splits when that clearly cannot improve our choice (or when we only want a
+ * minimally distinguishing split point, and don't want to make the split any
+ * more unbalanced than is necessary).
+ *
+ * We return the index of the first existing tuple that should go on the right
+ * page, plus a boolean indicating if new item is on left of split point.
+ */
+static OffsetNumber
+_bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
+ bool *newitemonleft, FindSplitStrat strategy)
+{
+ int bestpenalty,
+ lowsplit;
+ int highsplit = Min(state->interval, state->nsplits);
+ SplitPoint *final;
+
+ bestpenalty = INT_MAX;
+ lowsplit = 0;
+ for (int i = lowsplit; i < highsplit; i++)
+ {
+ int penalty;
+
+ penalty = _bt_split_penalty(state, state->splits + i);
+
+ if (penalty < bestpenalty)
+ {
+ bestpenalty = penalty;
+ lowsplit = i;
+ }
+
+ if (penalty <= perfectpenalty)
+ break;
+ }
+
+ final = &state->splits[lowsplit];
+
+ /*
+ * There is a risk that the "many duplicates" strategy will repeatedly do
+ * the wrong thing when there are monotonically decreasing insertions to
+ * the right of a large group of duplicates. Repeated splits could leave
+ * a succession of right half pages with free space that can never be
+ * used. This must be avoided.
+ *
+ * Consider the example of the leftmost page in a single integer attribute
+ * NULLS FIRST index which is almost filled with NULLs. Monotonically
+ * decreasing integer insertions might cause the same leftmost page to
+ * split repeatedly at the same point. Each split derives its new high
+ * key from the lowest current value to the immediate right of the large
+ * group of NULLs, which will always be higher than all future integer
+ * insertions, directing all future integer insertions to the same
+ * leftmost page.
+ */
+ if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost &&
+ !final->newitemonleft && final->firstrightoff >= state->newitemoff &&
+ final->firstrightoff < state->newitemoff + 9)
+ {
+ /*
+ * Avoid the problem by performing a 50:50 split when the new item is
+ * just to the right of the would-be "many duplicates" split point.
+ * (Note that the test used for an insert that is "just to the right"
+ * of the split point is conservative.)
+ */
+ final = &state->splits[0];
+ }
+
+ *newitemonleft = final->newitemonleft;
+ return final->firstrightoff;
+}
+
+#define LEAF_SPLIT_DISTANCE 0.050
+#define INTERNAL_SPLIT_DISTANCE 0.075
+
+/*
+ * Return a split interval to use for the default strategy. This is a limit
+ * on the number of candidate split points to give further consideration to.
+ * Only a fraction of all candidate splits points (those located at the start
+ * of the now-sorted splits array) fall within the split interval. Split
+ * interval is applied within _bt_bestsplitloc().
+ *
+ * Split interval represents an acceptable range of split points -- those that
+ * have leftfree and rightfree values that are acceptably balanced. The final
+ * split point chosen is the split point with the lowest "penalty" among split
+ * points in this split interval (unless we change our entire strategy, in
+ * which case the interval also changes -- see _bt_strategy()).
+ *
+ * The "Prefix B-Trees" paper calls split interval sigma l for leaf splits,
+ * and sigma b for internal ("branch") splits. It's hard to provide a
+ * theoretical justification for the size of the split interval, though it's
+ * clear that a small split interval can make tuples on level L+1 much smaller
+ * on average, without noticeably affecting space utilization on level L.
+ * (Note that the way that we calculate split interval might need to change if
+ * suffix truncation is taught to truncate tuples "within" the last
+ * attribute/datum for data types like text, which is more or less how it is
+ * assumed to work in the paper.)
+ */
+static int
+_bt_defaultinterval(FindSplitData *state)
+{
+ SplitPoint *spaceoptimal;
+ int16 tolerance,
+ lowleftfree,
+ lowrightfree,
+ highleftfree,
+ highrightfree;
+
+ /*
+ * Determine leftfree and rightfree values that are higher and lower than
+ * we're willing to tolerate. Note that the final split interval will be
+ * about 10% of nsplits in the common case where all non-pivot tuples
+ * (data items) from a leaf page are uniformly sized. We're a bit more
+ * aggressive when splitting internal pages.
+ */
+ if (state->is_leaf)
+ tolerance = state->olddataitemstotal * LEAF_SPLIT_DISTANCE;
+ else
+ tolerance = state->olddataitemstotal * INTERNAL_SPLIT_DISTANCE;
+
+ /* First candidate split point is the most evenly balanced */
+ spaceoptimal = state->splits;
+ lowleftfree = spaceoptimal->leftfree - tolerance;
+ lowrightfree = spaceoptimal->rightfree - tolerance;
+ highleftfree = spaceoptimal->leftfree + tolerance;
+ highrightfree = spaceoptimal->rightfree + tolerance;
+
+ /*
+ * Iterate through split points, starting from the split immediately after
+ * 'spaceoptimal'. Find the first split point that divides free space so
+ * unevenly that including it in the split interval would be unacceptable.
+ */
+ for (int i = 1; i < state->nsplits; i++)
+ {
+ SplitPoint *split = state->splits + i;
+
+ /* Cannot use curdelta here, since its value is often weighted */
+ if (split->leftfree < lowleftfree || split->rightfree < lowrightfree ||
+ split->leftfree > highleftfree || split->rightfree > highrightfree)
+ return i;
+ }
+
+ return state->nsplits;
+}
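+
+/*
+ * Worked example (illustrative figures only): for a leaf page whose data
+ * items total 8000 bytes, tolerance is 8000 * 0.050 = 400 bytes.  Only
+ * candidate splits whose leftfree and rightfree both stay within 400 bytes
+ * of the space-optimal split's values remain inside the default interval.
+ */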
+
+/*
+ * Subroutine to decide whether split should use default strategy/initial
+ * split interval, or whether it should finish splitting the page using
+ * alternative strategies (this is only possible with leaf pages).
+ *
+ * Caller uses alternative strategy (or sticks with default strategy) based
+ * on how *strategy is set here. Return value is "perfect penalty", which is
+ * passed to _bt_bestsplitloc() as a final constraint on how far caller is
+ * willing to go to avoid appending a heap TID when using the many duplicates
+ * strategy (it also saves _bt_bestsplitloc() useless cycles).
+ */
+static int
+_bt_strategy(FindSplitData *state, SplitPoint *leftpage,
+ SplitPoint *rightpage, FindSplitStrat *strategy)
+{
+ IndexTuple leftmost,
+ rightmost;
+ SplitPoint *leftinterval,
+ *rightinterval;
+ int perfectpenalty;
+ int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
+
+ /* Assume that alternative strategy won't be used for now */
+ *strategy = SPLIT_DEFAULT;
+
+ /*
+ * Use smallest observed firstright item size for entire page (actually,
+ * entire imaginary version of page that includes newitem) as perfect
+ * penalty on internal pages. This can save cycles in the common case
+ * where most or all splits (not just splits within interval) have
+ * firstright tuples that are the same size.
+ */
+ if (!state->is_leaf)
+ return state->minfirstrightsz;
+
+ /*
+ * Use leftmost and rightmost tuples from leftmost and rightmost splits in
+ * current split interval
+ */
+ _bt_interval_edges(state, &leftinterval, &rightinterval);
+ leftmost = _bt_split_lastleft(state, leftinterval);
+ rightmost = _bt_split_firstright(state, rightinterval);
+
+ /*
+ * If initial split interval can produce a split point that will at least
+ * avoid appending a heap TID in new high key, we're done. Finish split
+ * with default strategy and initial split interval.
+ */
+ perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
+ if (perfectpenalty <= indnkeyatts)
+ return perfectpenalty;
+
+ /*
+ * Work out how caller should finish split when even their "perfect"
+ * penalty for initial/default split interval indicates that the interval
+ * does not contain even a single split that avoids appending a heap TID.
+ *
+ * Use the leftmost split's lastleft tuple and the rightmost split's
+ * firstright tuple to assess every possible split.
+ */
+ leftmost = _bt_split_lastleft(state, leftpage);
+ rightmost = _bt_split_firstright(state, rightpage);
+
+ /*
+ * If page (including new item) has many duplicates but is not entirely
+ * full of duplicates, a many duplicates strategy split will be performed.
+ * If page is entirely full of duplicates, a single value strategy split
+ * will be performed.
+ */
+ perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
+ if (perfectpenalty <= indnkeyatts)
+ {
+ *strategy = SPLIT_MANY_DUPLICATES;
+
+ /*
+ * Many duplicates strategy should split at either side of the group of
+ * duplicates that enclose the delta-optimal split point. Return
+ * indnkeyatts rather than the true perfect penalty to make that
+ * happen. (If perfectpenalty was returned here then low cardinality
+ * composite indexes could have continual unbalanced splits.)
+ *
+ * Note that caller won't go through with a many duplicates split in
+ * rare cases where it looks like there are ever-decreasing insertions
+ * to the immediate right of the split point. This must happen just
+ * before a final decision is made, within _bt_bestsplitloc().
+ */
+ return indnkeyatts;
+ }
+
+ /*
+ * Single value strategy is only appropriate with ever-increasing heap
+ * TIDs; otherwise, original default strategy split should proceed to
+ * avoid pathological performance. Use page high key to infer if this is
+ * the rightmost page among pages that store the same duplicate value.
+ * This should not prevent insertions of heap TIDs that are slightly out
+ * of order from using single value strategy, since that's expected with
+ * concurrent inserters of the same duplicate value.
+ */
+ else if (state->is_rightmost)
+ *strategy = SPLIT_SINGLE_VALUE;
+ else
+ {
+ ItemId itemid;
+ IndexTuple hikey;
+
+ itemid = PageGetItemId(state->origpage, P_HIKEY);
+ hikey = (IndexTuple) PageGetItem(state->origpage, itemid);
+ perfectpenalty = _bt_keep_natts_fast(state->rel, hikey,
+ state->newitem);
+ if (perfectpenalty <= indnkeyatts)
+ *strategy = SPLIT_SINGLE_VALUE;
+ else
+ {
+ /*
+ * Have caller finish split using default strategy, since page
+ * does not appear to be the rightmost page for duplicates of the
+ * value the page is filled with
+ */
+ }
+ }
+
+ return perfectpenalty;
+}
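+
+/*
+ * Example (illustrative): on a leaf page filled entirely with one value,
+ * lastleft and firstright compare as equal on every key attribute for all
+ * candidate splits, so appending a heap TID is unavoidable; if the page is
+ * also the rightmost page for that value, single value strategy is chosen.
+ * When the page merely has a large group of duplicates around the
+ * space-optimal split point, many duplicates strategy widens the split
+ * interval instead.
+ */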
+
+/*
+ * Subroutine to locate leftmost and rightmost splits for current/default
+ * split interval. Note that it will be the same split iff there is only one
+ * split in interval.
+ */
+static void
+_bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval,
+ SplitPoint **rightinterval)
+{
+ int highsplit = Min(state->interval, state->nsplits);
+ SplitPoint *deltaoptimal;
+
+ deltaoptimal = state->splits;
+ *leftinterval = NULL;
+ *rightinterval = NULL;
+
+ /*
+ * Delta is an absolute distance to optimal split point, so both the
+ * leftmost and rightmost split point will usually be at the end of the
+ * array
+ */
+ for (int i = highsplit - 1; i >= 0; i--)
+ {
+ SplitPoint *distant = state->splits + i;
+
+ if (distant->firstrightoff < deltaoptimal->firstrightoff)
+ {
+ if (*leftinterval == NULL)
+ *leftinterval = distant;
+ }
+ else if (distant->firstrightoff > deltaoptimal->firstrightoff)
+ {
+ if (*rightinterval == NULL)
+ *rightinterval = distant;
+ }
+ else if (!distant->newitemonleft && deltaoptimal->newitemonleft)
+ {
+ /*
+ * "incoming tuple will become firstright" (distant) is to the
+ * left of "incoming tuple will become lastleft" (delta-optimal)
+ */
+ Assert(distant->firstrightoff == state->newitemoff);
+ if (*leftinterval == NULL)
+ *leftinterval = distant;
+ }
+ else if (distant->newitemonleft && !deltaoptimal->newitemonleft)
+ {
+ /*
+ * "incoming tuple will become lastleft" (distant) is to the right
+ * of "incoming tuple will become firstright" (delta-optimal)
+ */
+ Assert(distant->firstrightoff == state->newitemoff);
+ if (*rightinterval == NULL)
+ *rightinterval = distant;
+ }
+ else
+ {
+ /* There were only one or two splits in the initial split interval */
+ Assert(distant == deltaoptimal);
+ if (*leftinterval == NULL)
+ *leftinterval = distant;
+ if (*rightinterval == NULL)
+ *rightinterval = distant;
+ }
+
+ if (*leftinterval && *rightinterval)
+ return;
+ }
+
+ Assert(false);
+}
+
+/*
+ * Subroutine to find penalty for caller's candidate split point.
+ *
+ * On leaf pages, penalty is the attribute number that distinguishes each side
+ * of a split. It's the last attribute that needs to be included in new high
+ * key for left page. It can be greater than the number of key attributes in
+ * cases where a heap TID will need to be appended during truncation.
+ *
+ * On internal pages, penalty is simply the size of the firstright tuple for
+ * the split (including line pointer overhead). This tuple will become the
+ * new high key for the left page.
+ */
+static inline int
+_bt_split_penalty(FindSplitData *state, SplitPoint *split)
+{
+ IndexTuple lastleft;
+ IndexTuple firstright;
+
+ if (!state->is_leaf)
+ {
+ ItemId itemid;
+
+ if (!split->newitemonleft &&
+ split->firstrightoff == state->newitemoff)
+ return state->newitemsz;
+
+ itemid = PageGetItemId(state->origpage, split->firstrightoff);
+
+ return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
+ }
+
+ lastleft = _bt_split_lastleft(state, split);
+ firstright = _bt_split_firstright(state, split);
+
+ return _bt_keep_natts_fast(state->rel, lastleft, firstright);
+}
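+
+/*
+ * Example (illustrative): for a leaf split in a two key attribute index
+ * where lastleft is ('usa', 'new york') and firstright is ('usa',
+ * 'philadelphia'), the penalty is 2, since the second attribute is the
+ * first one that distinguishes the halves.  If lastleft and firstright
+ * were equal on both attributes, the penalty would be 3 (nkeyatts + 1),
+ * meaning truncation must append a heap TID.
+ */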
+
+/*
+ * Subroutine to get a lastleft IndexTuple for a split point
+ */
+static inline IndexTuple
+_bt_split_lastleft(FindSplitData *state, SplitPoint *split)
+{
+ ItemId itemid;
+
+ if (split->newitemonleft && split->firstrightoff == state->newitemoff)
+ return state->newitem;
+
+ itemid = PageGetItemId(state->origpage,
+ OffsetNumberPrev(split->firstrightoff));
+ return (IndexTuple) PageGetItem(state->origpage, itemid);
+}
+
+/*
+ * Subroutine to get a firstright IndexTuple for a split point
+ */
+static inline IndexTuple
+_bt_split_firstright(FindSplitData *state, SplitPoint *split)
+{
+ ItemId itemid;
+
+ if (!split->newitemonleft && split->firstrightoff == state->newitemoff)
+ return state->newitem;
+
+ itemid = PageGetItemId(state->origpage, split->firstrightoff);
+ return (IndexTuple) PageGetItem(state->origpage, itemid);
+}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
new file mode 100644
index 0000000..d524310
--- /dev/null
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -0,0 +1,2751 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtutils.c
+ * Utility code for Postgres btree implementation.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtutils.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <time.h>
+
+#include "access/nbtree.h"
+#include "access/reloptions.h"
+#include "access/relscan.h"
+#include "catalog/catalog.h"
+#include "commands/progress.h"
+#include "lib/qunique.h"
+#include "miscadmin.h"
+#include "utils/array.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+
+typedef struct BTSortArrayContext
+{
+ FmgrInfo flinfo;
+ Oid collation;
+ bool reverse;
+} BTSortArrayContext;
+
+static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
+ StrategyNumber strat,
+ Datum *elems, int nelems);
+static int _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
+ bool reverse,
+ Datum *elems, int nelems);
+static int _bt_compare_array_elements(const void *a, const void *b, void *arg);
+static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
+ ScanKey leftarg, ScanKey rightarg,
+ bool *result);
+static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
+static void _bt_mark_scankey_required(ScanKey skey);
+static bool _bt_check_rowcompare(ScanKey skey,
+ IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
+ ScanDirection dir, bool *continuescan);
+static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
+ IndexTuple firstright, BTScanInsert itup_key);
+
+
+/*
+ * _bt_mkscankey
+ * Build an insertion scan key that contains comparison data from itup
+ * as well as comparator routines appropriate to the key datatypes.
+ *
+ * When itup is a non-pivot tuple, the returned insertion scan key is
+ * suitable for finding a place for it to go on the leaf level. Pivot
+ * tuples can be used to re-find leaf page with matching high key, but
+ * then caller needs to set scan key's pivotsearch field to true. This
+ * allows caller to search for a leaf page with a matching high key,
+ * which is usually to the left of the first leaf page a non-pivot match
+ * might appear on.
+ *
+ * The result is intended for use with _bt_compare() and _bt_truncate().
+ * Callers that don't need to fill out the insertion scankey arguments
+ * (e.g. they use an ad-hoc comparison routine, or only need a scankey
+ * for _bt_truncate()) can pass a NULL index tuple. The scankey will
+ * be initialized as if an "all truncated" pivot tuple was passed
+ * instead.
+ *
+ * Note that we may occasionally have to share lock the metapage to
+ * determine whether or not the keys in the index are expected to be
+ * unique (i.e. if this is a "heapkeyspace" index). We assume a
+ * heapkeyspace index when caller passes a NULL tuple, allowing index
+ * build callers to avoid accessing the non-existent metapage. We
+ * also assume that the index is _not_ allequalimage when a NULL tuple
+ * is passed; CREATE INDEX callers call _bt_allequalimage() to set the
+ * field themselves.
+ */
+BTScanInsert
+_bt_mkscankey(Relation rel, IndexTuple itup)
+{
+ BTScanInsert key;
+ ScanKey skey;
+ TupleDesc itupdesc;
+ int indnkeyatts;
+ int16 *indoption;
+ int tupnatts;
+ int i;
+
+ itupdesc = RelationGetDescr(rel);
+ indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+ indoption = rel->rd_indoption;
+ tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;
+
+ Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));
+
+ /*
+ * We'll execute search using scan key constructed on key columns.
+ * Truncated attributes and non-key attributes are omitted from the final
+ * scan key.
+ */
+ key = palloc(offsetof(BTScanInsertData, scankeys) +
+ sizeof(ScanKeyData) * indnkeyatts);
+ if (itup)
+ _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
+ else
+ {
+ /* Utility statement callers can set these fields themselves */
+ key->heapkeyspace = true;
+ key->allequalimage = false;
+ }
+ key->anynullkeys = false; /* initial assumption */
+ key->nextkey = false;
+ key->pivotsearch = false;
+ key->keysz = Min(indnkeyatts, tupnatts);
+ key->scantid = key->heapkeyspace && itup ?
+ BTreeTupleGetHeapTID(itup) : NULL;
+ skey = key->scankeys;
+ for (i = 0; i < indnkeyatts; i++)
+ {
+ FmgrInfo *procinfo;
+ Datum arg;
+ bool null;
+ int flags;
+
+ /*
+ * We can use the cached (default) support procs since no cross-type
+ * comparison can be needed.
+ */
+ procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
+
+ /*
+ * Key arguments built from truncated attributes (or when caller
+ * provides no tuple) are defensively represented as NULL values. They
+ * should never be used.
+ */
+ if (i < tupnatts)
+ arg = index_getattr(itup, i + 1, itupdesc, &null);
+ else
+ {
+ arg = (Datum) 0;
+ null = true;
+ }
+ flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
+ ScanKeyEntryInitializeWithInfo(&skey[i],
+ flags,
+ (AttrNumber) (i + 1),
+ InvalidStrategy,
+ InvalidOid,
+ rel->rd_indcollation[i],
+ procinfo,
+ arg);
+ /* Record if any key attribute is NULL (or truncated) */
+ if (null)
+ key->anynullkeys = true;
+ }
+
+ return key;
+}
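+
+/*
+ * Illustrative sketch only -- this hypothetical caller is not part of
+ * PostgreSQL.  It shows the expected lifecycle of an insertion scan key:
+ * build it from the incoming non-pivot tuple, use it with _bt_compare()
+ * and _bt_truncate(), then free it.
+ */
+#ifdef NOT_USED
+static void
+example_use_insertion_key(Relation rel, IndexTuple itup)
+{
+	BTScanInsert itup_key;
+
+	itup_key = _bt_mkscankey(rel, itup);
+
+	/* ... descend the tree and compare against itup_key->scankeys ... */
+
+	pfree(itup_key);
+}
+#endif							/* NOT_USED */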
+
+/*
+ * free a retracement stack made by _bt_search.
+ */
+void
+_bt_freestack(BTStack stack)
+{
+ BTStack ostack;
+
+ while (stack != NULL)
+ {
+ ostack = stack;
+ stack = stack->bts_parent;
+ pfree(ostack);
+ }
+}
+
+
+/*
+ * _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys
+ *
+ * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and
+ * set up BTArrayKeyInfo info for each one that is an equality-type key.
+ * Prepare modified scan keys in so->arrayKeyData, which will hold the current
+ * array elements during each primitive indexscan operation. For inequality
+ * array keys, it's sufficient to find the extreme element value and replace
+ * the whole array with that scalar value.
+ *
+ * Note: the reason we need so->arrayKeyData, rather than just scribbling
+ * on scan->keyData, is that callers are permitted to call btrescan without
+ * supplying a new set of scankey data.
+ */
+void
+_bt_preprocess_array_keys(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ int numberOfKeys = scan->numberOfKeys;
+ int16 *indoption = scan->indexRelation->rd_indoption;
+ int numArrayKeys;
+ ScanKey cur;
+ int i;
+ MemoryContext oldContext;
+
+ /* Quick check to see if there are any array keys */
+ numArrayKeys = 0;
+ for (i = 0; i < numberOfKeys; i++)
+ {
+ cur = &scan->keyData[i];
+ if (cur->sk_flags & SK_SEARCHARRAY)
+ {
+ numArrayKeys++;
+ Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL)));
+ /* If any arrays are null as a whole, we can quit right now. */
+ if (cur->sk_flags & SK_ISNULL)
+ {
+ so->numArrayKeys = -1;
+ so->arrayKeyData = NULL;
+ return;
+ }
+ }
+ }
+
+ /* Quit if nothing to do. */
+ if (numArrayKeys == 0)
+ {
+ so->numArrayKeys = 0;
+ so->arrayKeyData = NULL;
+ return;
+ }
+
+ /*
+ * Make a scan-lifespan context to hold array-associated data, or reset it
+ * if we already have one from a previous rescan cycle.
+ */
+ if (so->arrayContext == NULL)
+ so->arrayContext = AllocSetContextCreate(CurrentMemoryContext,
+ "BTree array context",
+ ALLOCSET_SMALL_SIZES);
+ else
+ MemoryContextReset(so->arrayContext);
+
+ oldContext = MemoryContextSwitchTo(so->arrayContext);
+
+ /* Create modifiable copy of scan->keyData in the workspace context */
+ so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
+ memcpy(so->arrayKeyData,
+ scan->keyData,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+
+ /* Allocate space for per-array data in the workspace context */
+ so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo));
+
+ /* Now process each array key */
+ numArrayKeys = 0;
+ for (i = 0; i < numberOfKeys; i++)
+ {
+ ArrayType *arrayval;
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ int num_elems;
+ Datum *elem_values;
+ bool *elem_nulls;
+ int num_nonnulls;
+ int j;
+
+ cur = &so->arrayKeyData[i];
+ if (!(cur->sk_flags & SK_SEARCHARRAY))
+ continue;
+
+ /*
+ * First, deconstruct the array into elements. Anything allocated
+ * here (including a possibly detoasted array value) is in the
+ * workspace context.
+ */
+ arrayval = DatumGetArrayTypeP(cur->sk_argument);
+ /* We could cache this data, but not clear it's worth it */
+ get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
+ &elmlen, &elmbyval, &elmalign);
+ deconstruct_array(arrayval,
+ ARR_ELEMTYPE(arrayval),
+ elmlen, elmbyval, elmalign,
+ &elem_values, &elem_nulls, &num_elems);
+
+ /*
+ * Compress out any null elements. We can ignore them since we assume
+ * all btree operators are strict.
+ */
+ num_nonnulls = 0;
+ for (j = 0; j < num_elems; j++)
+ {
+ if (!elem_nulls[j])
+ elem_values[num_nonnulls++] = elem_values[j];
+ }
+
+ /* We could pfree(elem_nulls) now, but not worth the cycles */
+
+ /* If there are no non-nulls, the scan qual is unsatisfiable */
+ if (num_nonnulls == 0)
+ {
+ numArrayKeys = -1;
+ break;
+ }
+
+ /*
+ * If the comparison operator is not equality, then the array qual
+ * degenerates to a simple comparison against the smallest or largest
+ * non-null array element, as appropriate.
+ */
+ switch (cur->sk_strategy)
+ {
+ case BTLessStrategyNumber:
+ case BTLessEqualStrategyNumber:
+ cur->sk_argument =
+ _bt_find_extreme_element(scan, cur,
+ BTGreaterStrategyNumber,
+ elem_values, num_nonnulls);
+ continue;
+ case BTEqualStrategyNumber:
+ /* proceed with rest of loop */
+ break;
+ case BTGreaterEqualStrategyNumber:
+ case BTGreaterStrategyNumber:
+ cur->sk_argument =
+ _bt_find_extreme_element(scan, cur,
+ BTLessStrategyNumber,
+ elem_values, num_nonnulls);
+ continue;
+ default:
+ elog(ERROR, "unrecognized StrategyNumber: %d",
+ (int) cur->sk_strategy);
+ break;
+ }
+
+ /*
+ * Sort the non-null elements and eliminate any duplicates. We must
+ * sort in the same ordering used by the index column, so that the
+ * successive primitive indexscans produce data in index order.
+ */
+ num_elems = _bt_sort_array_elements(scan, cur,
+ (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0,
+ elem_values, num_nonnulls);
+
+ /*
+ * And set up the BTArrayKeyInfo data.
+ */
+ so->arrayKeys[numArrayKeys].scan_key = i;
+ so->arrayKeys[numArrayKeys].num_elems = num_elems;
+ so->arrayKeys[numArrayKeys].elem_values = elem_values;
+ numArrayKeys++;
+ }
+
+ so->numArrayKeys = numArrayKeys;
+
+ MemoryContextSwitchTo(oldContext);
+}
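+
+/*
+ * Example (illustrative): a qual like "x < ANY ('{5,9,2}')" is satisfied
+ * whenever x is below the greatest element, so the whole array is replaced
+ * by the scalar comparison "x < 9".  An equality array such as
+ * "x = ANY ('{5,9,2}')" is instead sorted and de-duplicated, and each
+ * element drives one primitive index scan.
+ */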
+
+/*
+ * _bt_find_extreme_element() -- get least or greatest array element
+ *
+ * scan and skey identify the index column, whose opfamily determines the
+ * comparison semantics. strat should be BTLessStrategyNumber to get the
+ * least element, or BTGreaterStrategyNumber to get the greatest.
+ */
+static Datum
+_bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
+ StrategyNumber strat,
+ Datum *elems, int nelems)
+{
+ Relation rel = scan->indexRelation;
+ Oid elemtype,
+ cmp_op;
+ RegProcedure cmp_proc;
+ FmgrInfo flinfo;
+ Datum result;
+ int i;
+
+ /*
+ * Determine the nominal datatype of the array elements. We have to
+ * support the convention that sk_subtype == InvalidOid means the opclass
+ * input type; this is a hack to simplify life for ScanKeyInit().
+ */
+ elemtype = skey->sk_subtype;
+ if (elemtype == InvalidOid)
+ elemtype = rel->rd_opcintype[skey->sk_attno - 1];
+
+ /*
+ * Look up the appropriate comparison operator in the opfamily.
+ *
+ * Note: it's possible that this would fail, if the opfamily is
+ * incomplete, but it seems quite unlikely that an opfamily would omit
+ * non-cross-type comparison operators for any datatype that it supports
+ * at all.
+ */
+ cmp_op = get_opfamily_member(rel->rd_opfamily[skey->sk_attno - 1],
+ elemtype,
+ elemtype,
+ strat);
+ if (!OidIsValid(cmp_op))
+ elog(ERROR, "missing operator %d(%u,%u) in opfamily %u",
+ strat, elemtype, elemtype,
+ rel->rd_opfamily[skey->sk_attno - 1]);
+ cmp_proc = get_opcode(cmp_op);
+ if (!RegProcedureIsValid(cmp_proc))
+ elog(ERROR, "missing oprcode for operator %u", cmp_op);
+
+ fmgr_info(cmp_proc, &flinfo);
+
+ Assert(nelems > 0);
+ result = elems[0];
+ for (i = 1; i < nelems; i++)
+ {
+ if (DatumGetBool(FunctionCall2Coll(&flinfo,
+ skey->sk_collation,
+ elems[i],
+ result)))
+ result = elems[i];
+ }
+
+ return result;
+}
+
+/*
+ * _bt_sort_array_elements() -- sort and de-dup array elements
+ *
+ * The array elements are sorted in-place, and the new number of elements
+ * after duplicate removal is returned.
+ *
+ * scan and skey identify the index column, whose opfamily determines the
+ * comparison semantics. If reverse is true, we sort in descending order.
+ */
+static int
+_bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
+ bool reverse,
+ Datum *elems, int nelems)
+{
+ Relation rel = scan->indexRelation;
+ Oid elemtype;
+ RegProcedure cmp_proc;
+ BTSortArrayContext cxt;
+
+ if (nelems <= 1)
+ return nelems; /* no work to do */
+
+ /*
+ * Determine the nominal datatype of the array elements. We have to
+ * support the convention that sk_subtype == InvalidOid means the opclass
+ * input type; this is a hack to simplify life for ScanKeyInit().
+ */
+ elemtype = skey->sk_subtype;
+ if (elemtype == InvalidOid)
+ elemtype = rel->rd_opcintype[skey->sk_attno - 1];
+
+ /*
+ * Look up the appropriate comparison function in the opfamily.
+ *
+ * Note: it's possible that this would fail, if the opfamily is
+ * incomplete, but it seems quite unlikely that an opfamily would omit
+ * non-cross-type support functions for any datatype that it supports at
+ * all.
+ */
+ cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
+ elemtype,
+ elemtype,
+ BTORDER_PROC);
+ if (!RegProcedureIsValid(cmp_proc))
+ elog(ERROR, "missing support function %d(%u,%u) in opfamily %u",
+ BTORDER_PROC, elemtype, elemtype,
+ rel->rd_opfamily[skey->sk_attno - 1]);
+
+ /* Sort the array elements */
+ fmgr_info(cmp_proc, &cxt.flinfo);
+ cxt.collation = skey->sk_collation;
+ cxt.reverse = reverse;
+ qsort_arg((void *) elems, nelems, sizeof(Datum),
+ _bt_compare_array_elements, (void *) &cxt);
+
+ /* Now scan the sorted elements and remove duplicates */
+ return qunique_arg(elems, nelems, sizeof(Datum),
+ _bt_compare_array_elements, &cxt);
+}
+
+/*
+ * qsort_arg comparator for sorting array elements
+ */
+static int
+_bt_compare_array_elements(const void *a, const void *b, void *arg)
+{
+ Datum da = *((const Datum *) a);
+ Datum db = *((const Datum *) b);
+ BTSortArrayContext *cxt = (BTSortArrayContext *) arg;
+ int32 compare;
+
+ compare = DatumGetInt32(FunctionCall2Coll(&cxt->flinfo,
+ cxt->collation,
+ da, db));
+ if (cxt->reverse)
+ INVERT_COMPARE_RESULT(compare);
+ return compare;
+}
+
+/*
+ * _bt_start_array_keys() -- Initialize array keys at start of a scan
+ *
+ * Set up the cur_elem counters and fill in the first sk_argument value for
+ * each array scankey. We can't do this until we know the scan direction.
+ */
+void
+_bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ int i;
+
+ for (i = 0; i < so->numArrayKeys; i++)
+ {
+ BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
+ ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
+
+ Assert(curArrayKey->num_elems > 0);
+ if (ScanDirectionIsBackward(dir))
+ curArrayKey->cur_elem = curArrayKey->num_elems - 1;
+ else
+ curArrayKey->cur_elem = 0;
+ skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
+ }
+}
+
+/*
+ * _bt_advance_array_keys() -- Advance to next set of array elements
+ *
+ * Returns true if there is another set of values to consider, false if not.
+ * On true result, the scankeys are initialized with the next set of values.
+ */
+bool
+_bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ bool found = false;
+ int i;
+
+ /*
+ * We must advance the last array key most quickly, since it will
+ * correspond to the lowest-order index column among the available
+ * qualifications. This is necessary to ensure correct ordering of output
+ * when there are multiple array keys.
+ */
+ for (i = so->numArrayKeys - 1; i >= 0; i--)
+ {
+ BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
+ ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
+ int cur_elem = curArrayKey->cur_elem;
+ int num_elems = curArrayKey->num_elems;
+
+ if (ScanDirectionIsBackward(dir))
+ {
+ if (--cur_elem < 0)
+ {
+ cur_elem = num_elems - 1;
+ found = false; /* need to advance next array key */
+ }
+ else
+ found = true;
+ }
+ else
+ {
+ if (++cur_elem >= num_elems)
+ {
+ cur_elem = 0;
+ found = false; /* need to advance next array key */
+ }
+ else
+ found = true;
+ }
+
+ curArrayKey->cur_elem = cur_elem;
+ skey->sk_argument = curArrayKey->elem_values[cur_elem];
+ if (found)
+ break;
+ }
+
+ /* advance parallel scan */
+ if (scan->parallel_scan != NULL)
+ _bt_parallel_advance_array_keys(scan);
+
+ return found;
+}
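+
+/*
+ * Example (illustrative): with "x = ANY ('{1,2}') AND y = ANY ('{10,20}')",
+ * a forward scan visits the array key combinations in odometer order:
+ * (1,10), (1,20), (2,10), (2,20).  The last (lowest-order) array key
+ * advances first; once it wraps around, the next higher-order key advances.
+ */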
+
+/*
+ * _bt_mark_array_keys() -- Handle array keys during btmarkpos
+ *
+ * Save the current state of the array keys as the "mark" position.
+ */
+void
+_bt_mark_array_keys(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ int i;
+
+ for (i = 0; i < so->numArrayKeys; i++)
+ {
+ BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
+
+ curArrayKey->mark_elem = curArrayKey->cur_elem;
+ }
+}
+
+/*
+ * _bt_restore_array_keys() -- Handle array keys during btrestrpos
+ *
+ * Restore the array keys to where they were when the mark was set.
+ */
+void
+_bt_restore_array_keys(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ bool changed = false;
+ int i;
+
+ /* Restore each array key to its position when the mark was set */
+ for (i = 0; i < so->numArrayKeys; i++)
+ {
+ BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
+ ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
+ int mark_elem = curArrayKey->mark_elem;
+
+ if (curArrayKey->cur_elem != mark_elem)
+ {
+ curArrayKey->cur_elem = mark_elem;
+ skey->sk_argument = curArrayKey->elem_values[mark_elem];
+ changed = true;
+ }
+ }
+
+ /*
+ * If we changed any keys, we must redo _bt_preprocess_keys. That might
+ * sound like overkill, but in cases with multiple keys per index column
+ * it seems necessary to do the full set of pushups.
+ */
+ if (changed)
+ {
+ _bt_preprocess_keys(scan);
+ /* The mark should have been set on a consistent set of keys... */
+ Assert(so->qual_ok);
+ }
+}
+
+
+/*
+ * _bt_preprocess_keys() -- Preprocess scan keys
+ *
+ * The given search-type keys (in scan->keyData[] or so->arrayKeyData[])
+ * are copied to so->keyData[] with possible transformation.
+ * scan->numberOfKeys is the number of input keys, so->numberOfKeys gets
+ * the number of output keys (possibly less, never greater).
+ *
+ * The output keys are marked with additional sk_flags bits beyond the
+ * system-standard bits supplied by the caller. The DESC and NULLS_FIRST
+ * indoption bits for the relevant index attribute are copied into the flags.
+ * Also, for a DESC column, we commute (flip) all the sk_strategy numbers
+ * so that the index sorts in the desired direction.
+ *
+ * One key purpose of this routine is to discover which scan keys must be
+ * satisfied to continue the scan. It also attempts to eliminate redundant
+ * keys and detect contradictory keys. (If the index opfamily provides
+ * incomplete sets of cross-type operators, we may fail to detect redundant
+ * or contradictory keys, but we can survive that.)
+ *
+ * The output keys must be sorted by index attribute. Presently we expect
+ * (but verify) that the input keys are already so sorted --- this is done
+ * by match_clauses_to_index() in indxpath.c. Some reordering of the keys
+ * within each attribute may be done as a byproduct of the processing here,
+ * but no other code depends on that.
+ *
+ * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
+ * if they must be satisfied in order to continue the scan forward or backward
+ * respectively. _bt_checkkeys uses these flags. For example, if the quals
+ * are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple
+ * (1,2,7), but we must continue the scan in case there are tuples (1,3,z).
+ * But once we reach tuples like (1,4,z) we can stop scanning because no
+ * later tuples could match. This is reflected by marking the x and y keys,
+ * but not the z key, with SK_BT_REQFWD. In general, the keys for leading
+ * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD.
+ * For the first attribute without an "=" key, any "<" and "<=" keys are
+ * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD.
+ * This can be seen to be correct by considering the above example. Note
+ * in particular that if there are no keys for a given attribute, the keys for
+ * subsequent attributes can never be required; for instance "WHERE y = 4"
+ * requires a full-index scan.
+ *
+ * If possible, redundant keys are eliminated: we keep only the tightest
+ * >/>= bound and the tightest </<= bound, and if there's an = key then
+ * that's the only one returned. (So, we return either a single = key,
+ * or one or two boundary-condition keys for each attr.) However, if we
+ * cannot compare two keys for lack of a suitable cross-type operator,
+ * we cannot eliminate either. If there are two such keys of the same
+ * operator strategy, the second one is just pushed into the output array
+ * without further processing here. We may also emit both >/>= or both
+ * </<= keys if we can't compare them. The logic about required keys still
+ * works if we don't eliminate redundant keys.
+ *
+ * Note that one reason we need direction-sensitive required-key flags is
+ * precisely that we may not be able to eliminate redundant keys. Suppose
+ * we have "x > 4::int AND x > 10::bigint", and we are unable to determine
+ * which key is more restrictive for lack of a suitable cross-type operator.
+ * _bt_first will arbitrarily pick one of the keys to do the initial
+ * positioning with. If it picks x > 4, then the x > 10 condition will fail
+ * until we reach index entries > 10; but we can't stop the scan just because
+ * x > 10 is failing. On the other hand, if we are scanning backwards, then
+ * failure of either key is indeed enough to stop the scan. (In general, when
+ * inequality keys are present, the initial-positioning code only promises to
+ * position before the first possible match, not exactly at the first match,
+ * for a forward scan; or after the last match for a backward scan.)
+ *
+ * As a byproduct of this work, we can detect contradictory quals such
+ * as "x = 1 AND x > 2". If we see that, we return so->qual_ok = false,
+ * indicating the scan need not be run at all since no tuples can match.
+ * (In this case we do not bother completing the output key array!)
+ * Again, missing cross-type operators might cause us to fail to prove the
+ * quals contradictory when they really are, but the scan will work correctly.
+ *
+ * Row comparison keys are currently also treated without any smarts:
+ * we just transfer them into the preprocessed array without any
+ * editorialization. We can treat them the same as an ordinary inequality
+ * comparison on the row's first index column, for the purposes of the logic
+ * about required keys.
+ *
+ * Note: the reason we have to copy the preprocessed scan keys into private
+ * storage is that we are modifying the array based on comparisons of the
+ * key argument values, which could change on a rescan or after moving to
+ * new elements of array keys. Therefore we can't overwrite the source data.
+ */
+void
+_bt_preprocess_keys(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ int numberOfKeys = scan->numberOfKeys;
+ int16 *indoption = scan->indexRelation->rd_indoption;
+ int new_numberOfKeys;
+ int numberOfEqualCols;
+ ScanKey inkeys;
+ ScanKey outkeys;
+ ScanKey cur;
+ ScanKey xform[BTMaxStrategyNumber];
+ bool test_result;
+ int i,
+ j;
+ AttrNumber attno;
+
+ /* initialize result variables */
+ so->qual_ok = true;
+ so->numberOfKeys = 0;
+
+ if (numberOfKeys < 1)
+ return; /* done if qual-less scan */
+
+ /*
+ * Read so->arrayKeyData if array keys are present, else scan->keyData
+ */
+ if (so->arrayKeyData != NULL)
+ inkeys = so->arrayKeyData;
+ else
+ inkeys = scan->keyData;
+
+ outkeys = so->keyData;
+ cur = &inkeys[0];
+ /* we check that input keys are correctly ordered */
+ if (cur->sk_attno < 1)
+ elog(ERROR, "btree index keys must be ordered by attribute");
+
+ /* We can short-circuit most of the work if there's just one key */
+ if (numberOfKeys == 1)
+ {
+ /* Apply indoption to scankey (might change sk_strategy!) */
+ if (!_bt_fix_scankey_strategy(cur, indoption))
+ so->qual_ok = false;
+ memcpy(outkeys, cur, sizeof(ScanKeyData));
+ so->numberOfKeys = 1;
+ /* We can mark the qual as required if it's for first index col */
+ if (cur->sk_attno == 1)
+ _bt_mark_scankey_required(outkeys);
+ return;
+ }
+
+ /*
+ * Otherwise, do the full set of pushups.
+ */
+ new_numberOfKeys = 0;
+ numberOfEqualCols = 0;
+
+ /*
+ * Initialize for processing of keys for attr 1.
+ *
+ * xform[i] points to the currently best scan key of strategy type i+1; it
+ * is NULL if we haven't yet found such a key for this attr.
+ */
+ attno = 1;
+ memset(xform, 0, sizeof(xform));
+
+ /*
+ * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
+ * handle after-last-key processing. Actual exit from the loop is at the
+ * "break" statement below.
+ */
+ for (i = 0;; cur++, i++)
+ {
+ if (i < numberOfKeys)
+ {
+ /* Apply indoption to scankey (might change sk_strategy!) */
+ if (!_bt_fix_scankey_strategy(cur, indoption))
+ {
+ /* NULL can't be matched, so give up */
+ so->qual_ok = false;
+ return;
+ }
+ }
+
+ /*
+ * If we are at the end of the keys for a particular attr, finish up
+ * processing and emit the cleaned-up keys.
+ */
+ if (i == numberOfKeys || cur->sk_attno != attno)
+ {
+ int priorNumberOfEqualCols = numberOfEqualCols;
+
+ /* check input keys are correctly ordered */
+ if (i < numberOfKeys && cur->sk_attno < attno)
+ elog(ERROR, "btree index keys must be ordered by attribute");
+
+ /*
+ * If = has been specified, all other keys can be eliminated as
+ * redundant. If we have a case like key = 1 AND key > 2, we can
+ * set qual_ok to false and abandon further processing.
+ *
+ * We also have to deal with the case of "key IS NULL", which is
+ * unsatisfiable in combination with any other index condition. By
+ * the time we get here, that's been classified as an equality
+ * check, and we've rejected any combination of it with a regular
+ * equality condition; but not with other types of conditions.
+ */
+ if (xform[BTEqualStrategyNumber - 1])
+ {
+ ScanKey eq = xform[BTEqualStrategyNumber - 1];
+
+ for (j = BTMaxStrategyNumber; --j >= 0;)
+ {
+ ScanKey chk = xform[j];
+
+ if (!chk || j == (BTEqualStrategyNumber - 1))
+ continue;
+
+ if (eq->sk_flags & SK_SEARCHNULL)
+ {
+ /* IS NULL is contradictory to anything else */
+ so->qual_ok = false;
+ return;
+ }
+
+ if (_bt_compare_scankey_args(scan, chk, eq, chk,
+ &test_result))
+ {
+ if (!test_result)
+ {
+ /* keys proven mutually contradictory */
+ so->qual_ok = false;
+ return;
+ }
+ /* else discard the redundant non-equality key */
+ xform[j] = NULL;
+ }
+ /* else, cannot determine redundancy, keep both keys */
+ }
+ /* track number of attrs for which we have "=" keys */
+ numberOfEqualCols++;
+ }
+
+ /* try to keep only one of <, <= */
+ if (xform[BTLessStrategyNumber - 1]
+ && xform[BTLessEqualStrategyNumber - 1])
+ {
+ ScanKey lt = xform[BTLessStrategyNumber - 1];
+ ScanKey le = xform[BTLessEqualStrategyNumber - 1];
+
+ if (_bt_compare_scankey_args(scan, le, lt, le,
+ &test_result))
+ {
+ if (test_result)
+ xform[BTLessEqualStrategyNumber - 1] = NULL;
+ else
+ xform[BTLessStrategyNumber - 1] = NULL;
+ }
+ }
+
+ /* try to keep only one of >, >= */
+ if (xform[BTGreaterStrategyNumber - 1]
+ && xform[BTGreaterEqualStrategyNumber - 1])
+ {
+ ScanKey gt = xform[BTGreaterStrategyNumber - 1];
+ ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1];
+
+ if (_bt_compare_scankey_args(scan, ge, gt, ge,
+ &test_result))
+ {
+ if (test_result)
+ xform[BTGreaterEqualStrategyNumber - 1] = NULL;
+ else
+ xform[BTGreaterStrategyNumber - 1] = NULL;
+ }
+ }
+
+ /*
+ * Emit the cleaned-up keys into the outkeys[] array, and then
+ * mark them if they are required. They are required (possibly
+ * only in one direction) if all attrs before this one had "=".
+ */
+ for (j = BTMaxStrategyNumber; --j >= 0;)
+ {
+ if (xform[j])
+ {
+ ScanKey outkey = &outkeys[new_numberOfKeys++];
+
+ memcpy(outkey, xform[j], sizeof(ScanKeyData));
+ if (priorNumberOfEqualCols == attno - 1)
+ _bt_mark_scankey_required(outkey);
+ }
+ }
+
+ /*
+ * Exit loop here if done.
+ */
+ if (i == numberOfKeys)
+ break;
+
+ /* Re-initialize for new attno */
+ attno = cur->sk_attno;
+ memset(xform, 0, sizeof(xform));
+ }
+
+ /* check strategy this key's operator corresponds to */
+ j = cur->sk_strategy - 1;
+
+ /* if row comparison, push it directly to the output array */
+ if (cur->sk_flags & SK_ROW_HEADER)
+ {
+ ScanKey outkey = &outkeys[new_numberOfKeys++];
+
+ memcpy(outkey, cur, sizeof(ScanKeyData));
+ if (numberOfEqualCols == attno - 1)
+ _bt_mark_scankey_required(outkey);
+
+ /*
+ * We don't support RowCompare using equality; such a qual would
+ * mess up the numberOfEqualCols tracking.
+ */
+ Assert(j != (BTEqualStrategyNumber - 1));
+ continue;
+ }
+
+ /* have we seen one of these before? */
+ if (xform[j] == NULL)
+ {
+ /* nope, so remember this scankey */
+ xform[j] = cur;
+ }
+ else
+ {
+ /* yup, keep only the more restrictive key */
+ if (_bt_compare_scankey_args(scan, cur, cur, xform[j],
+ &test_result))
+ {
+ if (test_result)
+ xform[j] = cur;
+ else if (j == (BTEqualStrategyNumber - 1))
+ {
+ /* key == a && key == b, but a != b */
+ so->qual_ok = false;
+ return;
+ }
+ /* else old key is more restrictive, keep it */
+ }
+ else
+ {
+ /*
+ * We can't determine which key is more restrictive. Keep the
+ * previous one in xform[j] and push this one directly to the
+ * output array.
+ */
+ ScanKey outkey = &outkeys[new_numberOfKeys++];
+
+ memcpy(outkey, cur, sizeof(ScanKeyData));
+ if (numberOfEqualCols == attno - 1)
+ _bt_mark_scankey_required(outkey);
+ }
+ }
+ }
+
+ so->numberOfKeys = new_numberOfKeys;
+}
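+
+/*
+ * For example (hypothetical quals): given "x > 4 AND x > 10 AND x < 20 AND
+ * x = 15" on a single int4 index column, the per-key loop first reduces the
+ * two ">" keys to the tighter "x > 10", and the end-of-attribute pass then
+ * discards both remaining inequalities as redundant against "x = 15" (since
+ * 15 > 10 and 15 < 20 hold), leaving a single output key marked
+ * SK_BT_REQFWD | SK_BT_REQBKWD. Given "x = 15 AND x > 20" instead, the same
+ * comparison proves the keys contradictory and so->qual_ok is set to false.
+ */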
+
+/*
+ * Compare two scankey values using a specified operator.
+ *
+ * The test we want to perform is logically "leftarg op rightarg", where
+ * leftarg and rightarg are the sk_argument values in those ScanKeys, and
+ * the comparison operator is the one in the op ScanKey. However, in
+ * cross-data-type situations we may need to look up the correct operator in
+ * the index's opfamily: it is the one having amopstrategy = op->sk_strategy
+ * and amoplefttype/amoprighttype equal to the two argument datatypes.
+ *
+ * If the opfamily doesn't supply a complete set of cross-type operators we
+ * may not be able to make the comparison. If we can make the comparison
+ * we store the operator result in *result and return true. We return false
+ * if the comparison could not be made.
+ *
+ * Note: op always points at the same ScanKey as either leftarg or rightarg.
+ * Since we don't scribble on the scankeys, this aliasing should cause no
+ * trouble.
+ *
+ * Note: this routine needs to be insensitive to any DESC option applied
+ * to the index column. For example, "x < 4" is a tighter constraint than
+ * "x < 5" regardless of which way the index is sorted.
+ */
+static bool
+_bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
+ ScanKey leftarg, ScanKey rightarg,
+ bool *result)
+{
+ Relation rel = scan->indexRelation;
+ Oid lefttype,
+ righttype,
+ optype,
+ opcintype,
+ cmp_op;
+ StrategyNumber strat;
+
+ /*
+ * First, deal with cases where one or both args are NULL. This should
+ * only happen when the scankeys represent IS NULL/NOT NULL conditions.
+ */
+ if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ISNULL)
+ {
+ bool leftnull,
+ rightnull;
+
+ if (leftarg->sk_flags & SK_ISNULL)
+ {
+ Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
+ leftnull = true;
+ }
+ else
+ leftnull = false;
+ if (rightarg->sk_flags & SK_ISNULL)
+ {
+ Assert(rightarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
+ rightnull = true;
+ }
+ else
+ rightnull = false;
+
+ /*
+ * We treat NULL as either greater than or less than all other values.
+ * Since true > false, the tests below work correctly for NULLS LAST
+ * logic. If the index is NULLS FIRST, we need to flip the strategy.
+ */
+ strat = op->sk_strategy;
+ if (op->sk_flags & SK_BT_NULLS_FIRST)
+ strat = BTCommuteStrategyNumber(strat);
+
+ switch (strat)
+ {
+ case BTLessStrategyNumber:
+ *result = (leftnull < rightnull);
+ break;
+ case BTLessEqualStrategyNumber:
+ *result = (leftnull <= rightnull);
+ break;
+ case BTEqualStrategyNumber:
+ *result = (leftnull == rightnull);
+ break;
+ case BTGreaterEqualStrategyNumber:
+ *result = (leftnull >= rightnull);
+ break;
+ case BTGreaterStrategyNumber:
+ *result = (leftnull > rightnull);
+ break;
+ default:
+ elog(ERROR, "unrecognized StrategyNumber: %d", (int) strat);
+ *result = false; /* keep compiler quiet */
+ break;
+ }
+ return true;
+ }
+
+ /*
+ * The opfamily we need to worry about is identified by the index column.
+ */
+ Assert(leftarg->sk_attno == rightarg->sk_attno);
+
+ opcintype = rel->rd_opcintype[leftarg->sk_attno - 1];
+
+ /*
+ * Determine the actual datatypes of the ScanKey arguments. We have to
+ * support the convention that sk_subtype == InvalidOid means the opclass
+ * input type; this is a hack to simplify life for ScanKeyInit().
+ */
+ lefttype = leftarg->sk_subtype;
+ if (lefttype == InvalidOid)
+ lefttype = opcintype;
+ righttype = rightarg->sk_subtype;
+ if (righttype == InvalidOid)
+ righttype = opcintype;
+ optype = op->sk_subtype;
+ if (optype == InvalidOid)
+ optype = opcintype;
+
+ /*
+ * If leftarg and rightarg match the types expected for the "op" scankey,
+ * we can use its already-looked-up comparison function.
+ */
+ if (lefttype == opcintype && righttype == optype)
+ {
+ *result = DatumGetBool(FunctionCall2Coll(&op->sk_func,
+ op->sk_collation,
+ leftarg->sk_argument,
+ rightarg->sk_argument));
+ return true;
+ }
+
+ /*
+ * Otherwise, we need to go to the syscache to find the appropriate
+ * operator. (This cannot result in infinite recursion, since no
+ * indexscan initiated by syscache lookup will use cross-data-type
+ * operators.)
+ *
+ * If the sk_strategy was flipped by _bt_fix_scankey_strategy, we have to
+ * un-flip it to get the correct opfamily member.
+ */
+ strat = op->sk_strategy;
+ if (op->sk_flags & SK_BT_DESC)
+ strat = BTCommuteStrategyNumber(strat);
+
+ cmp_op = get_opfamily_member(rel->rd_opfamily[leftarg->sk_attno - 1],
+ lefttype,
+ righttype,
+ strat);
+ if (OidIsValid(cmp_op))
+ {
+ RegProcedure cmp_proc = get_opcode(cmp_op);
+
+ if (RegProcedureIsValid(cmp_proc))
+ {
+ *result = DatumGetBool(OidFunctionCall2Coll(cmp_proc,
+ op->sk_collation,
+ leftarg->sk_argument,
+ rightarg->sk_argument));
+ return true;
+ }
+ }
+
+ /* Can't make the comparison */
+ *result = false; /* suppress compiler warnings */
+ return false;
+}
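+
+/*
+ * Cross-type illustration (hypothetical quals): on a bigint column,
+ * deciding whether "x <= 10::int8" is redundant against "x < 4::int4"
+ * requires evaluating "4::int4 <= 10::int8". The int4 argument doesn't
+ * match the opclass input type, so we fall through to the
+ * get_opfamily_member() lookup for the cross-type "<=" operator (which
+ * integer_ops provides). In an opfamily lacking that member we would
+ * return false here, and the caller would simply keep both keys.
+ */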
+
+/*
+ * Adjust a scankey's strategy and flags setting as needed for indoptions.
+ *
+ * We copy the appropriate indoption value into the scankey sk_flags
+ * (shifting to avoid clobbering system-defined flag bits). Also, if
+ * the DESC option is set, commute (flip) the operator strategy number.
+ *
+ * A secondary purpose is to check for IS NULL/NOT NULL scankeys and set up
+ * the strategy field correctly for them.
+ *
+ * Lastly, for ordinary scankeys (not IS NULL/NOT NULL), we check for a
+ * NULL comparison value. Since all btree operators are assumed strict,
+ * a NULL means that the qual cannot be satisfied. We return true if the
+ * comparison value isn't NULL, or false if the scan should be abandoned.
+ *
+ * This function is applied to the *input* scankey structure; therefore
+ * on a rescan we will be looking at already-processed scankeys. Hence
+ * we have to be careful not to re-commute the strategy if we already did it.
+ * It's a bit ugly to modify the caller's copy of the scankey but in practice
+ * there shouldn't be any problem, since the index's indoptions are certainly
+ * not going to change while the scankey survives.
+ */
+static bool
+_bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
+{
+ int addflags;
+
+ addflags = indoption[skey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
+
+ /*
+ * We treat all btree operators as strict (even if they're not so marked
+ * in pg_proc). This means that it is impossible for an operator condition
+ * with a NULL comparison constant to succeed, and we can reject it right
+ * away.
+ *
+ * However, we now also support "x IS NULL" clauses as search conditions,
+ * so in that case keep going. The planner has not filled in any
+ * particular strategy in this case, so set it to BTEqualStrategyNumber
+ * --- we can treat IS NULL as an equality operator for purposes of search
+ * strategy.
+ *
+ * Likewise, "x IS NOT NULL" is supported. We treat that as either "less
+ * than NULL" in a NULLS LAST index, or "greater than NULL" in a NULLS
+ * FIRST index.
+ *
+ * Note: someday we might have to fill in sk_collation from the index
+ * column's collation. At the moment this is a non-issue because we'll
+ * never actually call the comparison operator on a NULL.
+ */
+ if (skey->sk_flags & SK_ISNULL)
+ {
+ /* SK_ISNULL shouldn't be set in a row header scankey */
+ Assert(!(skey->sk_flags & SK_ROW_HEADER));
+
+ /* Set indoption flags in scankey (might be done already) */
+ skey->sk_flags |= addflags;
+
+ /* Set correct strategy for IS NULL or NOT NULL search */
+ if (skey->sk_flags & SK_SEARCHNULL)
+ {
+ skey->sk_strategy = BTEqualStrategyNumber;
+ skey->sk_subtype = InvalidOid;
+ skey->sk_collation = InvalidOid;
+ }
+ else if (skey->sk_flags & SK_SEARCHNOTNULL)
+ {
+ if (skey->sk_flags & SK_BT_NULLS_FIRST)
+ skey->sk_strategy = BTGreaterStrategyNumber;
+ else
+ skey->sk_strategy = BTLessStrategyNumber;
+ skey->sk_subtype = InvalidOid;
+ skey->sk_collation = InvalidOid;
+ }
+ else
+ {
+ /* regular qual, so it cannot be satisfied */
+ return false;
+ }
+
+ /* Needn't do the rest */
+ return true;
+ }
+
+ /* Adjust strategy for DESC, if we didn't already */
+ if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC))
+ skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy);
+ skey->sk_flags |= addflags;
+
+ /* If it's a row header, fix row member flags and strategies similarly */
+ if (skey->sk_flags & SK_ROW_HEADER)
+ {
+ ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
+
+ for (;;)
+ {
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ addflags = indoption[subkey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
+ if ((addflags & SK_BT_DESC) && !(subkey->sk_flags & SK_BT_DESC))
+ subkey->sk_strategy = BTCommuteStrategyNumber(subkey->sk_strategy);
+ subkey->sk_flags |= addflags;
+ if (subkey->sk_flags & SK_ROW_END)
+ break;
+ subkey++;
+ }
+ }
+
+ return true;
+}
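+
+/*
+ * For example (illustration): with an index column declared "x DESC NULLS
+ * LAST", a qual "x < 10" arrives with BTLessStrategyNumber; we add
+ * SK_BT_DESC to sk_flags and commute the strategy to
+ * BTGreaterStrategyNumber, so later code can reason purely in terms of the
+ * index's physical sort order. An "x IS NOT NULL" qual on the same column
+ * becomes a BTLessStrategyNumber key ("less than NULL"), since NULLs sort
+ * last there.
+ */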
+
+/*
+ * Mark a scankey as "required to continue the scan".
+ *
+ * Depending on the operator type, the key may be required for both scan
+ * directions or just one. Also, if the key is a row comparison header,
+ * we have to mark its first subsidiary ScanKey as required. (Subsequent
+ * subsidiary ScanKeys are normally for lower-order columns, and thus
+ * cannot be required, since they're after the first non-equality scankey.)
+ *
+ * Note: when we set required-key flag bits in a subsidiary scankey, we are
+ * scribbling on a data structure belonging to the index AM's caller, not on
+ * our private copy. This should be OK because the marking will not change
+ * from scan to scan within a query, and so we'd just re-mark the same way
+ * anyway on a rescan. Something to keep an eye on though.
+ */
+static void
+_bt_mark_scankey_required(ScanKey skey)
+{
+ int addflags;
+
+ switch (skey->sk_strategy)
+ {
+ case BTLessStrategyNumber:
+ case BTLessEqualStrategyNumber:
+ addflags = SK_BT_REQFWD;
+ break;
+ case BTEqualStrategyNumber:
+ addflags = SK_BT_REQFWD | SK_BT_REQBKWD;
+ break;
+ case BTGreaterEqualStrategyNumber:
+ case BTGreaterStrategyNumber:
+ addflags = SK_BT_REQBKWD;
+ break;
+ default:
+ elog(ERROR, "unrecognized StrategyNumber: %d",
+ (int) skey->sk_strategy);
+ addflags = 0; /* keep compiler quiet */
+ break;
+ }
+
+ skey->sk_flags |= addflags;
+
+ if (skey->sk_flags & SK_ROW_HEADER)
+ {
+ ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
+
+ /* First subkey should be same column/operator as the header */
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ Assert(subkey->sk_attno == skey->sk_attno);
+ Assert(subkey->sk_strategy == skey->sk_strategy);
+ subkey->sk_flags |= addflags;
+ }
+}
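+
+/*
+ * For example (hypothetical qual): marking "(a, b) >= (5, 10)" as required
+ * sets SK_BT_REQBKWD on the row header and on its first member (the one for
+ * column a) only; the member for column b is left unmarked, since b lies
+ * beyond the first non-equality column and so can never be required.
+ */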
+
+/*
+ * Test whether an indextuple satisfies all the scankey conditions.
+ *
+ * Return true if so, false if not. If the tuple fails to pass the qual,
+ * we also determine whether there's any need to continue the scan beyond
+ * this tuple, and set *continuescan accordingly. See comments for
+ * _bt_preprocess_keys(), above, about how this is done.
+ *
+ * Forward scan callers can pass a high key tuple in the hopes of having
+ * us set *continuescan to false, and avoiding an unnecessary visit to
+ * the page to the right.
+ *
+ * scan: index scan descriptor (containing a search-type scankey)
+ * tuple: index tuple to test
+ * tupnatts: number of attributes in tuple (high key may be truncated)
+ * dir: direction we are scanning in
+ * continuescan: output parameter (will be set correctly in all cases)
+ */
+bool
+_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
+ ScanDirection dir, bool *continuescan)
+{
+ TupleDesc tupdesc;
+ BTScanOpaque so;
+ int keysz;
+ int ikey;
+ ScanKey key;
+
+ Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
+
+ *continuescan = true; /* default assumption */
+
+ tupdesc = RelationGetDescr(scan->indexRelation);
+ so = (BTScanOpaque) scan->opaque;
+ keysz = so->numberOfKeys;
+
+ for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
+ {
+ Datum datum;
+ bool isNull;
+ Datum test;
+
+ if (key->sk_attno > tupnatts)
+ {
+ /*
+ * This attribute is truncated (must be high key). The value for
+ * this attribute in the first non-pivot tuple on the page to the
+ * right could be any possible value. Assume that truncated
+ * attribute passes the qual.
+ */
+ Assert(ScanDirectionIsForward(dir));
+ Assert(BTreeTupleIsPivot(tuple));
+ continue;
+ }
+
+ /* row-comparison keys need special processing */
+ if (key->sk_flags & SK_ROW_HEADER)
+ {
+ if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
+ continuescan))
+ continue;
+ return false;
+ }
+
+ datum = index_getattr(tuple,
+ key->sk_attno,
+ tupdesc,
+ &isNull);
+
+ if (key->sk_flags & SK_ISNULL)
+ {
+ /* Handle IS NULL/NOT NULL tests */
+ if (key->sk_flags & SK_SEARCHNULL)
+ {
+ if (isNull)
+ continue; /* tuple satisfies this qual */
+ }
+ else
+ {
+ Assert(key->sk_flags & SK_SEARCHNOTNULL);
+ if (!isNull)
+ continue; /* tuple satisfies this qual */
+ }
+
+ /*
+ * Tuple fails this qual. If it's a required qual for the current
+ * scan direction, then we can conclude no further tuples will
+ * pass, either.
+ */
+ if ((key->sk_flags & SK_BT_REQFWD) &&
+ ScanDirectionIsForward(dir))
+ *continuescan = false;
+ else if ((key->sk_flags & SK_BT_REQBKWD) &&
+ ScanDirectionIsBackward(dir))
+ *continuescan = false;
+
+ /*
+ * In any case, this indextuple doesn't match the qual.
+ */
+ return false;
+ }
+
+ if (isNull)
+ {
+ if (key->sk_flags & SK_BT_NULLS_FIRST)
+ {
+ /*
+ * Since NULLs are sorted before non-NULLs, we know we have
+ * reached the lower limit of the range of values for this
+ * index attr. On a backward scan, we can stop if this qual
+ * is one of the "must match" subset. We can stop regardless
+ * of whether the qual is > or <, so long as it's required,
+ * because it's not possible for any future tuples to pass. On
+ * a forward scan, however, we must keep going, because we may
+ * have initially positioned to the start of the index.
+ */
+ if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
+ ScanDirectionIsBackward(dir))
+ *continuescan = false;
+ }
+ else
+ {
+ /*
+ * Since NULLs are sorted after non-NULLs, we know we have
+ * reached the upper limit of the range of values for this
+ * index attr. On a forward scan, we can stop if this qual is
+ * one of the "must match" subset. We can stop regardless of
+ * whether the qual is > or <, so long as it's required,
+ * because it's not possible for any future tuples to pass. On
+ * a backward scan, however, we must keep going, because we
+ * may have initially positioned to the end of the index.
+ */
+ if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
+ ScanDirectionIsForward(dir))
+ *continuescan = false;
+ }
+
+ /*
+ * In any case, this indextuple doesn't match the qual.
+ */
+ return false;
+ }
+
+ test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
+ datum, key->sk_argument);
+
+ if (!DatumGetBool(test))
+ {
+ /*
+ * Tuple fails this qual. If it's a required qual for the current
+ * scan direction, then we can conclude no further tuples will
+ * pass, either.
+ *
+ * Note: because we stop the scan as soon as any required equality
+ * qual fails, it is critical that equality quals be used for the
+ * initial positioning in _bt_first() when they are available. See
+ * comments in _bt_first().
+ */
+ if ((key->sk_flags & SK_BT_REQFWD) &&
+ ScanDirectionIsForward(dir))
+ *continuescan = false;
+ else if ((key->sk_flags & SK_BT_REQBKWD) &&
+ ScanDirectionIsBackward(dir))
+ *continuescan = false;
+
+ /*
+ * In any case, this indextuple doesn't match the qual.
+ */
+ return false;
+ }
+ }
+
+ /* If we get here, the tuple passes all index quals. */
+ return true;
+}
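+
+/*
+ * Example of the high key usage mentioned above (hypothetical values): with
+ * the required forward-scan qual "x < 100", a forward scan that has
+ * exhausted the current leaf page can test the page's high key, say
+ * x = 250, before stepping right. The high key fails the required qual, so
+ * *continuescan is set to false and the scan ends without visiting the
+ * right sibling. Truncated high key attributes are simply assumed to pass,
+ * so suffix truncation never ends a scan too early.
+ */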
+
+/*
+ * Test whether an indextuple satisfies a row-comparison scan condition.
+ *
+ * Return true if so, false if not. If not, also clear *continuescan if
+ * it's not possible for any future tuples in the current scan direction
+ * to pass the qual.
+ *
+ * This is a subroutine for _bt_checkkeys, which see for more info.
+ */
+static bool
+_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
+ TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
+{
+ ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
+ int32 cmpresult = 0;
+ bool result;
+
+ /* First subkey should be same as the header says */
+ Assert(subkey->sk_attno == skey->sk_attno);
+
+ /* Loop over columns of the row condition */
+ for (;;)
+ {
+ Datum datum;
+ bool isNull;
+
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+
+ if (subkey->sk_attno > tupnatts)
+ {
+ /*
+ * This attribute is truncated (must be high key). The value for
+ * this attribute in the first non-pivot tuple on the page to the
+ * right could be any possible value. Assume that truncated
+ * attribute passes the qual.
+ */
+ Assert(ScanDirectionIsForward(dir));
+ Assert(BTreeTupleIsPivot(tuple));
+ cmpresult = 0;
+ if (subkey->sk_flags & SK_ROW_END)
+ break;
+ subkey++;
+ continue;
+ }
+
+ datum = index_getattr(tuple,
+ subkey->sk_attno,
+ tupdesc,
+ &isNull);
+
+ if (isNull)
+ {
+ if (subkey->sk_flags & SK_BT_NULLS_FIRST)
+ {
+ /*
+ * Since NULLs are sorted before non-NULLs, we know we have
+ * reached the lower limit of the range of values for this
+ * index attr. On a backward scan, we can stop if this qual
+ * is one of the "must match" subset. We can stop regardless
+ * of whether the qual is > or <, so long as it's required,
+ * because it's not possible for any future tuples to pass. On
+ * a forward scan, however, we must keep going, because we may
+ * have initially positioned to the start of the index.
+ */
+ if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
+ ScanDirectionIsBackward(dir))
+ *continuescan = false;
+ }
+ else
+ {
+ /*
+ * Since NULLs are sorted after non-NULLs, we know we have
+ * reached the upper limit of the range of values for this
+ * index attr. On a forward scan, we can stop if this qual is
+ * one of the "must match" subset. We can stop regardless of
+ * whether the qual is > or <, so long as it's required,
+ * because it's not possible for any future tuples to pass. On
+ * a backward scan, however, we must keep going, because we
+ * may have initially positioned to the end of the index.
+ */
+ if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
+ ScanDirectionIsForward(dir))
+ *continuescan = false;
+ }
+
+ /*
+ * In any case, this indextuple doesn't match the qual.
+ */
+ return false;
+ }
+
+ if (subkey->sk_flags & SK_ISNULL)
+ {
+ /*
+ * Unlike the simple-scankey case, this isn't a disallowed case.
+ * But it can never match. If all the earlier row comparison
+ * columns are required for the scan direction, we can stop the
+ * scan, because there can't be another tuple that will succeed.
+ */
+ if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
+ subkey--;
+ if ((subkey->sk_flags & SK_BT_REQFWD) &&
+ ScanDirectionIsForward(dir))
+ *continuescan = false;
+ else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
+ ScanDirectionIsBackward(dir))
+ *continuescan = false;
+ return false;
+ }
+
+ /* Perform the test --- three-way comparison not bool operator */
+ cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
+ subkey->sk_collation,
+ datum,
+ subkey->sk_argument));
+
+ if (subkey->sk_flags & SK_BT_DESC)
+ INVERT_COMPARE_RESULT(cmpresult);
+
+ /* Done comparing if unequal, else advance to next column */
+ if (cmpresult != 0)
+ break;
+
+ if (subkey->sk_flags & SK_ROW_END)
+ break;
+ subkey++;
+ }
+
+ /*
+ * At this point cmpresult indicates the overall result of the row
+ * comparison, and subkey points to the deciding column (or the last
+ * column if the result is "=").
+ */
+ switch (subkey->sk_strategy)
+ {
+ /* EQ and NE cases aren't allowed here */
+ case BTLessStrategyNumber:
+ result = (cmpresult < 0);
+ break;
+ case BTLessEqualStrategyNumber:
+ result = (cmpresult <= 0);
+ break;
+ case BTGreaterEqualStrategyNumber:
+ result = (cmpresult >= 0);
+ break;
+ case BTGreaterStrategyNumber:
+ result = (cmpresult > 0);
+ break;
+ default:
+ elog(ERROR, "unrecognized RowCompareType: %d",
+ (int) subkey->sk_strategy);
+ result = false; /* keep compiler quiet */
+ break;
+ }
+
+ if (!result)
+ {
+ /*
+ * Tuple fails this qual. If it's a required qual for the current
+ * scan direction, then we can conclude no further tuples will pass,
+ * either. Note we have to look at the deciding column, not
+ * necessarily the first or last column of the row condition.
+ */
+ if ((subkey->sk_flags & SK_BT_REQFWD) &&
+ ScanDirectionIsForward(dir))
+ *continuescan = false;
+ else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
+ ScanDirectionIsBackward(dir))
+ *continuescan = false;
+ }
+
+ return result;
+}
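+
+/*
+ * Worked example (hypothetical qual): testing "(a, b) > (1, 10)" against an
+ * index tuple with a = 1 and b = 5, column a compares equal so we advance;
+ * column b yields cmpresult < 0 and the ">" check fails, so the tuple is
+ * rejected. Because only the first row member (for column a) can carry
+ * required-key flags, *continuescan stays true here; a later tuple such as
+ * (1, 11) could still pass.
+ */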
+
+/*
+ * _bt_killitems - set LP_DEAD state for items an indexscan caller has
+ * told us were killed
+ *
+ * scan->opaque, referenced locally through so, contains information about the
+ * current page and killed tuples thereon (generally, this should only be
+ * called if so->numKilled > 0).
+ *
+ * The caller does not have a lock on the page and may or may not have the
+ * page pinned in a buffer. Note that read-lock is sufficient for setting
+ * LP_DEAD status (which is only a hint).
+ *
+ * We match items by heap TID before assuming they are the right ones to
+ * delete. We cope with cases where items have moved right due to insertions.
+ * If an item has moved off the current page due to a split, we'll fail to
+ * find it and do nothing (this is not an error case --- we assume the item
+ * will eventually get marked in a future indexscan).
+ *
+ * Note that if we hold a pin on the target page continuously from initially
+ * reading the items until applying this function, VACUUM cannot have deleted
+ * any items from the page, and so there is no need to search left from the
+ * recorded offset. (This observation also guarantees that the item is still
+ * the right one to delete, which might otherwise be questionable since heap
+ * TIDs can get recycled.) This holds true even if the page has been modified
+ * by inserts and page splits, so there is no need to consult the LSN.
+ *
+ * If the pin was released after reading the page, then we re-read it. If it
+ * has been modified since we read it (as determined by the LSN), we dare not
+ * flag any entries because it is possible that the old entry was vacuumed
+ * away and the TID was re-used by a completely different heap tuple.
+ */
+void
+_bt_killitems(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber minoff;
+ OffsetNumber maxoff;
+ int i;
+ int numKilled = so->numKilled;
+ bool killedsomething = false;
+ bool droppedpin PG_USED_FOR_ASSERTS_ONLY;
+
+ Assert(BTScanPosIsValid(so->currPos));
+
+ /*
+ * Always reset the scan state, so we don't look for same items on other
+ * pages.
+ */
+ so->numKilled = 0;
+
+ if (BTScanPosIsPinned(so->currPos))
+ {
+ /*
+ * We have held the pin on this page since we read the index tuples,
+ * so all we need to do is lock it. The pin will have prevented
+ * re-use of any TID on the page, so there is no need to check the
+ * LSN.
+ */
+ droppedpin = false;
+ _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
+
+ page = BufferGetPage(so->currPos.buf);
+ }
+ else
+ {
+ Buffer buf;
+
+ droppedpin = true;
+ /* Attempt to re-read the buffer, getting pin and lock. */
+ buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
+
+ page = BufferGetPage(buf);
+ if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
+ so->currPos.buf = buf;
+ else
+ {
+ /* Modified while not pinned means hinting is not safe. */
+ _bt_relbuf(scan->indexRelation, buf);
+ return;
+ }
+ }
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ for (i = 0; i < numKilled; i++)
+ {
+ int itemIndex = so->killedItems[i];
+ BTScanPosItem *kitem = &so->currPos.items[itemIndex];
+ OffsetNumber offnum = kitem->indexOffset;
+
+ Assert(itemIndex >= so->currPos.firstItem &&
+ itemIndex <= so->currPos.lastItem);
+ if (offnum < minoff)
+ continue; /* pure paranoia */
+ while (offnum <= maxoff)
+ {
+ ItemId iid = PageGetItemId(page, offnum);
+ IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
+ bool killtuple = false;
+
+ if (BTreeTupleIsPosting(ituple))
+ {
+ int pi = i + 1;
+ int nposting = BTreeTupleGetNPosting(ituple);
+ int j;
+
+ /*
+ * We rely on the convention that heap TIDs in the scanpos
+ * items array are stored in ascending heap TID order for a
+ * group of TIDs that originally came from a posting list
+ * tuple. This convention even applies during backwards
+ * scans, where returning the TIDs in descending order might
+ * seem more natural. This is about effectiveness, not
+ * correctness.
+ *
+ * Note that the page may have been modified in almost any way
+ * since we first read it (in the !droppedpin case), so it's
+ * possible that this posting list tuple wasn't a posting list
+ * tuple when we first encountered its heap TIDs.
+ */
+ for (j = 0; j < nposting; j++)
+ {
+ ItemPointer item = BTreeTupleGetPostingN(ituple, j);
+
+ if (!ItemPointerEquals(item, &kitem->heapTid))
+ break; /* out of posting list loop */
+
+ /*
+ * kitem must have matching offnum when heap TIDs match,
+ * though only in the common case where the page can't
+ * have been concurrently modified
+ */
+ Assert(kitem->indexOffset == offnum || !droppedpin);
+
+ /*
+ * Read-ahead to later kitems here.
+ *
+ * We rely on the assumption that not advancing kitem here
+ * will prevent us from considering the posting list tuple
+ * fully dead by not matching its next heap TID in the next
+ * loop iteration.
+ *
+ * If, on the other hand, this is the final heap TID in
+ * the posting list tuple, then tuple gets killed
+ * regardless (i.e. we handle the case where the last
+ * kitem is also the last heap TID in the last index tuple
+ * correctly -- posting tuple still gets killed).
+ */
+ if (pi < numKilled)
+ kitem = &so->currPos.items[so->killedItems[pi++]];
+ }
+
+ /*
+ * Don't bother advancing the outermost loop's int iterator to
+ * avoid processing killed items that relate to the same
+ * offnum/posting list tuple. This micro-optimization hardly
+ * seems worth it. (Further iterations of the outermost loop
+ * will fail to match on this same posting list's first heap
+ * TID instead, so we'll advance to the next offnum/index
+ * tuple pretty quickly.)
+ */
+ if (j == nposting)
+ killtuple = true;
+ }
+ else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
+ killtuple = true;
+
+ /*
+ * Mark index item as dead, if it isn't already. Since this
+ * happens while holding a buffer lock possibly in shared mode,
+ * it's possible that multiple processes attempt to do this
+ * simultaneously, leading to multiple full-page images being sent
+ * to WAL (if wal_log_hints or data checksums are enabled), which
+ * is undesirable.
+ */
+ if (killtuple && !ItemIdIsDead(iid))
+ {
+ /* found the item/all posting list items */
+ ItemIdMarkDead(iid);
+ killedsomething = true;
+ break; /* out of inner search loop */
+ }
+ offnum = OffsetNumberNext(offnum);
+ }
+ }
+
+ /*
+ * Since this can be redone later if needed, mark as dirty hint.
+ *
+ * Whenever we mark anything LP_DEAD, we also set the page's
+ * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we
+ * only rely on the page-level flag in !heapkeyspace indexes.)
+ */
+ if (killedsomething)
+ {
+ opaque->btpo_flags |= BTP_HAS_GARBAGE;
+ MarkBufferDirtyHint(so->currPos.buf, true);
+ }
+
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+}
+
+
+/*
+ * The following routines manage a shared-memory area in which we track
+ * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
+ * operations. There is a single counter which increments each time we
+ * start a vacuum to assign it a cycle ID. Since multiple vacuums could
+ * be active concurrently, we have to track the cycle ID for each active
+ * vacuum; this requires at most MaxBackends entries (usually far fewer).
+ * We assume at most one vacuum can be active for a given index.
+ *
+ * Access to the shared memory area is controlled by BtreeVacuumLock.
+ * In principle we could use a separate lmgr locktag for each index,
+ * but a single LWLock is much cheaper, and given the short time that
+ * the lock is ever held, the concurrency hit should be minimal.
+ */
+
+typedef struct BTOneVacInfo
+{
+ LockRelId relid; /* global identifier of an index */
+ BTCycleId cycleid; /* cycle ID for its active VACUUM */
+} BTOneVacInfo;
+
+typedef struct BTVacInfo
+{
+ BTCycleId cycle_ctr; /* cycle ID most recently assigned */
+ int num_vacuums; /* number of currently active VACUUMs */
+ int max_vacuums; /* allocated length of vacuums[] array */
+ BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER];
+} BTVacInfo;
+
+static BTVacInfo *btvacinfo;
+
+
+/*
+ * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
+ * or zero if there is no active VACUUM
+ *
+ * Note: for correct interlocking, the caller must already hold pin and
+ * exclusive lock on each buffer it will store the cycle ID into. This
+ * ensures that even if a VACUUM starts immediately afterwards, it cannot
+ * process those pages until the page split is complete.
+ */
+BTCycleId
+_bt_vacuum_cycleid(Relation rel)
+{
+ BTCycleId result = 0;
+ int i;
+
+ /* Share lock is enough since this is a read-only operation */
+ LWLockAcquire(BtreeVacuumLock, LW_SHARED);
+
+ for (i = 0; i < btvacinfo->num_vacuums; i++)
+ {
+ BTOneVacInfo *vac = &btvacinfo->vacuums[i];
+
+ if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
+ vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
+ {
+ result = vac->cycleid;
+ break;
+ }
+ }
+
+ LWLockRelease(BtreeVacuumLock);
+ return result;
+}
+
+/*
+ * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
+ *
+ * Note: the caller must guarantee that it will eventually call
+ * _bt_end_vacuum, else we'll permanently leak an array slot. To ensure
+ * that this happens even in elog(FATAL) scenarios, the appropriate coding
+ * is not just a PG_TRY, but
+ * PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
+ */
+BTCycleId
+_bt_start_vacuum(Relation rel)
+{
+ BTCycleId result;
+ int i;
+ BTOneVacInfo *vac;
+
+ LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
+
+ /*
+ * Assign the next cycle ID, being careful to avoid zero as well as the
+ * reserved high values.
+ */
+ result = ++(btvacinfo->cycle_ctr);
+ if (result == 0 || result > MAX_BT_CYCLE_ID)
+ result = btvacinfo->cycle_ctr = 1;
+
+ /* Let's just make sure there's no entry already for this index */
+ for (i = 0; i < btvacinfo->num_vacuums; i++)
+ {
+ vac = &btvacinfo->vacuums[i];
+ if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
+ vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
+ {
+ /*
+ * Unlike most places in the backend, we have to explicitly
+ * release our LWLock before throwing an error. This is because
+ * we expect _bt_end_vacuum() to be called before transaction
+ * abort cleanup can run to release LWLocks.
+ */
+ LWLockRelease(BtreeVacuumLock);
+ elog(ERROR, "multiple active vacuums for index \"%s\"",
+ RelationGetRelationName(rel));
+ }
+ }
+
+ /* OK, add an entry */
+ if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
+ {
+ LWLockRelease(BtreeVacuumLock);
+ elog(ERROR, "out of btvacinfo slots");
+ }
+ vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
+ vac->relid = rel->rd_lockInfo.lockRelId;
+ vac->cycleid = result;
+ btvacinfo->num_vacuums++;
+
+ LWLockRelease(BtreeVacuumLock);
+ return result;
+}
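+
+/*
+ * Usage sketch of the cleanup pattern described above (illustration only,
+ * guarded out of the build; example_vacuum_with_cycleid is a hypothetical
+ * caller and assumes storage/ipc.h is included for the ENSURE macros):
+ */
+#ifdef NOT_USED
+static void
+example_vacuum_with_cycleid(Relation rel)
+{
+ BTCycleId cycleid;
+
+ /* Make sure the shared-memory slot is released even on elog(FATAL) */
+ PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
+ {
+ cycleid = _bt_start_vacuum(rel);
+
+ /* ... scan the index here, stamping split pages with cycleid ... */
+ }
+ PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
+ _bt_end_vacuum(rel);
+}
+#endif /* NOT_USED */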
+
+/*
+ * _bt_end_vacuum --- mark a btree VACUUM operation as done
+ *
+ * Note: this is deliberately coded not to complain if no entry is found;
+ * this allows the caller to put PG_TRY around the start_vacuum operation.
+ */
+void
+_bt_end_vacuum(Relation rel)
+{
+ int i;
+
+ LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
+
+ /* Find the array entry */
+ for (i = 0; i < btvacinfo->num_vacuums; i++)
+ {
+ BTOneVacInfo *vac = &btvacinfo->vacuums[i];
+
+ if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
+ vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
+ {
+ /* Remove it by shifting down the last entry */
+ *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
+ btvacinfo->num_vacuums--;
+ break;
+ }
+ }
+
+ LWLockRelease(BtreeVacuumLock);
+}
+
+/*
+ * _bt_end_vacuum wrapped as an on_shmem_exit callback function
+ */
+void
+_bt_end_vacuum_callback(int code, Datum arg)
+{
+ _bt_end_vacuum((Relation) DatumGetPointer(arg));
+}
+
+/*
+ * BTreeShmemSize --- report amount of shared memory space needed
+ */
+Size
+BTreeShmemSize(void)
+{
+ Size size;
+
+ size = offsetof(BTVacInfo, vacuums);
+ size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
+ return size;
+}
+
+/*
+ * BTreeShmemInit --- initialize this module's shared memory
+ */
+void
+BTreeShmemInit(void)
+{
+ bool found;
+
+ btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
+ BTreeShmemSize(),
+ &found);
+
+ if (!IsUnderPostmaster)
+ {
+ /* Initialize shared memory area */
+ Assert(!found);
+
+ /*
+ * It doesn't really matter what the cycle counter starts at, but
+ * having it always start the same doesn't seem good. Seed with
+ * low-order bits of time() instead.
+ */
+ btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
+
+ btvacinfo->num_vacuums = 0;
+ btvacinfo->max_vacuums = MaxBackends;
+ }
+ else
+ Assert(found);
+}
+
+bytea *
+btoptions(Datum reloptions, bool validate)
+{
+ static const relopt_parse_elt tab[] = {
+ {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
+ {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
+ offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
+ {"deduplicate_items", RELOPT_TYPE_BOOL,
+ offsetof(BTOptions, deduplicate_items)}
+ };
+
+ return (bytea *) build_reloptions(reloptions, validate,
+ RELOPT_KIND_BTREE,
+ sizeof(BTOptions),
+ tab, lengthof(tab));
+}
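+
+/*
+ * For example, "CREATE INDEX ... WITH (fillfactor = 90, deduplicate_items =
+ * off)" reaches this routine as a reloptions Datum; build_reloptions()
+ * validates the option names against tab[] and fills the corresponding
+ * BTOptions fields, leaving any unmentioned options (here,
+ * vacuum_cleanup_index_scale_factor) at their defaults.
+ */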
+
+/*
+ * btproperty() -- Check boolean properties of indexes.
+ *
+ * This is optional, but handling AMPROP_RETURNABLE here saves opening the rel
+ * to call btcanreturn.
+ */
+bool
+btproperty(Oid index_oid, int attno,
+ IndexAMProperty prop, const char *propname,
+ bool *res, bool *isnull)
+{
+ switch (prop)
+ {
+ case AMPROP_RETURNABLE:
+ /* answer only for columns, not AM or whole index */
+ if (attno == 0)
+ return false;
+ /* otherwise, btree can always return data */
+ *res = true;
+ return true;
+
+ default:
+ return false; /* punt to generic code */
+ }
+}
+
+/*
+ * btbuildphasename() -- Return name of index build phase.
+ */
+char *
+btbuildphasename(int64 phasenum)
+{
+ switch (phasenum)
+ {
+ case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE:
+ return "initializing";
+ case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN:
+ return "scanning table";
+ case PROGRESS_BTREE_PHASE_PERFORMSORT_1:
+ return "sorting live tuples";
+ case PROGRESS_BTREE_PHASE_PERFORMSORT_2:
+ return "sorting dead tuples";
+ case PROGRESS_BTREE_PHASE_LEAF_LOAD:
+ return "loading tuples in tree";
+ default:
+ return NULL;
+ }
+}
+
+/*
+ * _bt_truncate() -- create tuple without unneeded suffix attributes.
+ *
+ * Returns truncated pivot index tuple allocated in caller's memory context,
+ * with key attributes copied from caller's firstright argument. If rel is
+ * an INCLUDE index, non-key attributes will definitely be truncated away,
+ * since they're not part of the key space. More aggressive suffix
+ * truncation can take place when it's clear that the returned tuple does not
+ * need one or more suffix key attributes. We only need to keep firstright
+ * attributes up to and including the first non-lastleft-equal attribute.
+ * Caller's insertion scankey is used to compare the tuples; the scankey's
+ * argument values are not considered here.
+ *
+ * Note that returned tuple's t_tid offset will hold the number of attributes
+ * present, so the original item pointer offset is not represented. Caller
+ * should only change truncated tuple's downlink. Note also that truncated
+ * key attributes are treated as containing "minus infinity" values by
+ * _bt_compare().
+ *
+ * In the worst case (when a heap TID must be appended to distinguish lastleft
+ * from firstright), the size of the returned tuple is the size of firstright
+ * plus the size of an additional MAXALIGN()'d item pointer. This guarantee
+ * is important, since callers need to stay under the 1/3 of a page
+ * restriction on tuple size. If this routine is ever taught to truncate
+ * within an attribute/datum, it will need to avoid returning an enlarged
+ * tuple to caller when truncation + TOAST compression ends up enlarging the
+ * final datum.
+ */
+IndexTuple
+_bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
+ BTScanInsert itup_key)
+{
+ TupleDesc itupdesc = RelationGetDescr(rel);
+ int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+ int keepnatts;
+ IndexTuple pivot;
+ IndexTuple tidpivot;
+ ItemPointer pivotheaptid;
+ Size newsize;
+
+ /*
+ * We should only ever truncate non-pivot tuples from leaf pages. It's
+ * never okay to truncate when splitting an internal page.
+ */
+ Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));
+
+ /* Determine how many attributes must be kept in truncated tuple */
+ keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
+
+#ifdef DEBUG_NO_TRUNCATE
+ /* Force truncation to be ineffective for testing purposes */
+ keepnatts = nkeyatts + 1;
+#endif
+
+ pivot = index_truncate_tuple(itupdesc, firstright,
+ Min(keepnatts, nkeyatts));
+
+ if (BTreeTupleIsPosting(pivot))
+ {
+ /*
+ * index_truncate_tuple() just returns a straight copy of firstright
+ * when it has no attributes to truncate. When that happens, we may
+ * need to truncate away a posting list here instead.
+ */
+ Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1);
+ Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts);
+ pivot->t_info &= ~INDEX_SIZE_MASK;
+ pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
+ }
+
+ /*
+ * If there is a distinguishing key attribute within pivot tuple, we're
+ * done
+ */
+ if (keepnatts <= nkeyatts)
+ {
+ BTreeTupleSetNAtts(pivot, keepnatts, false);
+ return pivot;
+ }
+
+ /*
+ * We have to store a heap TID in the new pivot tuple, since no non-TID
+ * key attribute value in firstright distinguishes the right side of the
+ * split from the left side. nbtree conceptualizes this case as an
+ * inability to truncate away any key attributes, since heap TID is
+ * treated as just another key attribute (despite lacking a pg_attribute
+ * entry).
+ *
+ * Use enlarged space that holds a copy of pivot. We need the extra space
+ * to store a heap TID at the end (using the special pivot tuple
+ * representation). Note that the original pivot already has firstright's
+ * possible posting list/non-key attribute values removed at this point.
+ */
+ newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData));
+ tidpivot = palloc0(newsize);
+ memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot)));
+ /* Cannot leak memory here */
+ pfree(pivot);
+
+ /*
+ * Store all of firstright's key attribute values plus a tiebreaker heap
+ * TID value in enlarged pivot tuple
+ */
+ tidpivot->t_info &= ~INDEX_SIZE_MASK;
+ tidpivot->t_info |= newsize;
+ BTreeTupleSetNAtts(tidpivot, nkeyatts, true);
+ pivotheaptid = BTreeTupleGetHeapTID(tidpivot);
+
+ /*
+ * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
+ * consider suffix truncation. It seems like a good idea to follow that
+ * example in cases where no truncation takes place -- use lastleft's heap
+ * TID. (This is also the closest value to negative infinity that's
+ * legally usable.)
+ */
+ ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);
+
+ /*
+ * We're done. Assert() that heap TID invariants hold before returning.
+ *
+ * Lehman and Yao require that the downlink to the right page, which is to
+ * be inserted into the parent page in the second phase of a page split, be
+ * a strict lower bound on items on the right page, and a non-strict upper
+ * bound for items on the left page. Assert that heap TIDs follow these
+ * invariants, since a heap TID value is apparently needed as a
+ * tiebreaker.
+ */
+#ifndef DEBUG_NO_TRUNCATE
+ Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
+ BTreeTupleGetHeapTID(firstright)) < 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(lastleft)) >= 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(firstright)) < 0);
+#else
+
+ /*
+ * Those invariants aren't guaranteed to hold for lastleft + firstright
+ * heap TID attribute values when they're considered here only because
+ * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
+ * needed as a tiebreaker). DEBUG_NO_TRUNCATE must therefore use a heap
+ * TID value that always works as a strict lower bound for items to the
+ * right. In particular, it must avoid using firstright's leading key
+ * attribute values along with lastleft's heap TID value when lastleft's
+ * TID happens to be greater than firstright's TID.
+ */
+ ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
+
+ /*
+ * Pivot heap TID should never be fully equal to firstright. Note that
+ * the pivot heap TID will still end up equal to lastleft's heap TID when
+ * that's the only usable value.
+ */
+ ItemPointerSetOffsetNumber(pivotheaptid,
+ OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(firstright)) < 0);
+#endif
+
+ return tidpivot;
+}
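+
+/*
+ * Worked example (hypothetical tuples): on a three-key-column index with
+ * lastleft = (1, 5, 'foo') and firstright = (1, 7, 'bar'), the second
+ * attribute already distinguishes the tuples, so _bt_keep_natts() returns 2
+ * and the new pivot is firstright truncated to (1, 7); the truncated third
+ * attribute behaves as "minus infinity" in later _bt_compare() calls. If
+ * every key attribute were equal, keepnatts would come back as nkeyatts + 1
+ * and lastleft's (maximum) heap TID would be appended as the tiebreaker, as
+ * done above.
+ */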
+
+/*
+ * _bt_keep_natts - how many key attributes to keep when truncating.
+ *
+ * Caller provides two tuples that enclose a split point. Caller's insertion
+ * scankey is used to compare the tuples; the scankey's argument values are
+ * not considered here.
+ *
+ * This can return a number of attributes that is one greater than the
+ * number of key attributes for the index relation. This indicates that the
+ * caller must use a heap TID as a unique-ifier in new pivot tuple.
+ */
+static int
+_bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
+ BTScanInsert itup_key)
+{
+ int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+ TupleDesc itupdesc = RelationGetDescr(rel);
+ int keepnatts;
+ ScanKey scankey;
+
+ /*
+ * _bt_compare() treats truncated key attributes as having the value minus
+ * infinity, which would break searches within !heapkeyspace indexes. We
+ * must still truncate away non-key attribute values, though.
+ */
+ if (!itup_key->heapkeyspace)
+ return nkeyatts;
+
+ scankey = itup_key->scankeys;
+ keepnatts = 1;
+ for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++)
+ {
+ Datum datum1,
+ datum2;
+ bool isNull1,
+ isNull2;
+
+ datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
+ datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
+
+ if (isNull1 != isNull2)
+ break;
+
+ if (!isNull1 &&
+ DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
+ scankey->sk_collation,
+ datum1,
+ datum2)) != 0)
+ break;
+
+ keepnatts++;
+ }
+
+ /*
+ * Assert that _bt_keep_natts_fast() agrees with us in passing. This is
+ * expected in an allequalimage index.
+ */
+ Assert(!itup_key->allequalimage ||
+ keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright));
+
+ return keepnatts;
+}
+
+/*
+ * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts.
+ *
+ * This is exported so that a candidate split point can have its effect on
+ * suffix truncation inexpensively evaluated ahead of time when finding a
+ * split location. A naive bitwise approach to datum comparisons is used to
+ * save cycles.
+ *
+ * The approach taken here usually provides the same answer as _bt_keep_natts
+ * will (for the same pair of tuples from a heapkeyspace index), since the
+ * majority of btree opclasses can never indicate that two datums are equal
+ * unless they're bitwise equal after detoasting. When an index only has
+ * "equal image" columns, routine is guaranteed to give the same result as
+ * _bt_keep_natts would.
+ *
+ * Callers can rely on the fact that attributes considered equal here are
+ * definitely also equal according to _bt_keep_natts, even when the index uses
+ * an opclass or collation that is not "allequalimage"/deduplication-safe.
+ * This weaker guarantee is good enough for the nbtsplitloc.c caller, since false
+ * negatives generally only have the effect of making leaf page splits use a
+ * more balanced split point.
+ */
+int
+_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
+{
+ TupleDesc itupdesc = RelationGetDescr(rel);
+ int keysz = IndexRelationGetNumberOfKeyAttributes(rel);
+ int keepnatts;
+
+ keepnatts = 1;
+ for (int attnum = 1; attnum <= keysz; attnum++)
+ {
+ Datum datum1,
+ datum2;
+ bool isNull1,
+ isNull2;
+ Form_pg_attribute att;
+
+ datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
+ datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
+ att = TupleDescAttr(itupdesc, attnum - 1);
+
+ if (isNull1 != isNull2)
+ break;
+
+ if (!isNull1 &&
+ !datum_image_eq(datum1, datum2, att->attbyval, att->attlen))
+ break;
+
+ keepnatts++;
+ }
+
+ return keepnatts;
+}
+
+/*
+ * _bt_check_natts() -- Verify tuple has expected number of attributes.
+ *
+ * Returns value indicating if the expected number of attributes were found
+ * for a particular offset on page. This can be used as a general purpose
+ * sanity check.
+ *
+ * Testing a tuple directly with BTreeTupleGetNAtts() should generally be
+ * preferred to calling here. That's usually more convenient, and is always
+ * more explicit. Call here instead when offnum's tuple may be a negative
+ * infinity tuple that uses the pre-v11 on-disk representation, or when a
+ * low-context check is appropriate. This routine is as strict as possible about
+ * what is expected on each version of btree.
+ */
+bool
+_bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
+{
+ int16 natts = IndexRelationGetNumberOfAttributes(rel);
+ int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ IndexTuple itup;
+ int tupnatts;
+
+ /*
+ * We cannot reliably test a deleted or half-dead page, since they have
+ * dummy high keys
+ */
+ if (P_IGNORE(opaque))
+ return true;
+
+ Assert(offnum >= FirstOffsetNumber &&
+ offnum <= PageGetMaxOffsetNumber(page));
+
+ /*
+ * Mask allocated for number of keys in index tuple must be able to fit
+ * maximum possible number of index attributes
+ */
+ StaticAssertStmt(BT_OFFSET_MASK >= INDEX_MAX_KEYS,
+ "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS");
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ tupnatts = BTreeTupleGetNAtts(itup, rel);
+
+ /* !heapkeyspace indexes do not support deduplication */
+ if (!heapkeyspace && BTreeTupleIsPosting(itup))
+ return false;
+
+ /* Posting list tuples should never have "pivot heap TID" bit set */
+ if (BTreeTupleIsPosting(itup) &&
+ (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
+ BT_PIVOT_HEAP_TID_ATTR) != 0)
+ return false;
+
+ /* INCLUDE indexes do not support deduplication */
+ if (natts != nkeyatts && BTreeTupleIsPosting(itup))
+ return false;
+
+ if (P_ISLEAF(opaque))
+ {
+ if (offnum >= P_FIRSTDATAKEY(opaque))
+ {
+ /*
+ * Non-pivot tuple should never be explicitly marked as a pivot
+ * tuple
+ */
+ if (BTreeTupleIsPivot(itup))
+ return false;
+
+ /*
+ * Leaf tuples that are not the page high key (non-pivot tuples)
+ * should never be truncated. (Note that tupnatts must have been
+ * inferred, even with a posting list tuple, because only pivot
+ * tuples store tupnatts directly.)
+ */
+ return tupnatts == natts;
+ }
+ else
+ {
+ /*
+ * Rightmost page doesn't contain a page high key, so tuple was
+ * checked above as ordinary leaf tuple
+ */
+ Assert(!P_RIGHTMOST(opaque));
+
+ /*
+ * !heapkeyspace high key tuple contains only key attributes. Note
+ * that tupnatts will only have been explicitly represented in
+ * !heapkeyspace indexes that happen to have non-key attributes.
+ */
+ if (!heapkeyspace)
+ return tupnatts == nkeyatts;
+
+ /* Use generic heapkeyspace pivot tuple handling */
+ }
+ }
+ else /* !P_ISLEAF(opaque) */
+ {
+ if (offnum == P_FIRSTDATAKEY(opaque))
+ {
+ /*
+ * The first tuple on any internal page (possibly the first after
+ * its high key) is its negative infinity tuple. Negative
+ * infinity tuples are always truncated to zero attributes. They
+ * are a particular kind of pivot tuple.
+ */
+ if (heapkeyspace)
+ return tupnatts == 0;
+
+ /*
+ * The number of attributes won't be explicitly represented if the
+ * negative infinity tuple was generated during a page split that
+ * occurred with a version of Postgres before v11. There must be
+ * a problem when there is an explicit representation that is
+ * non-zero, or when there is no explicit representation and the
+ * tuple is evidently not a pre-pg_upgrade tuple.
+ *
+ * Prior to v11, downlinks always had P_HIKEY as their offset.
+ * Accept that as an alternative indication of a valid
+ * !heapkeyspace negative infinity tuple.
+ */
+ return tupnatts == 0 ||
+ ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
+ }
+ else
+ {
+ /*
+ * !heapkeyspace downlink tuple with separator key contains only
+ * key attributes. Note that tupnatts will only have been
+ * explicitly represented in !heapkeyspace indexes that happen to
+ * have non-key attributes.
+ */
+ if (!heapkeyspace)
+ return tupnatts == nkeyatts;
+
+ /* Use generic heapkeyspace pivot tuple handling */
+ }
+
+ }
+
+ /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
+ Assert(heapkeyspace);
+
+ /*
+ * Explicit representation of the number of attributes is mandatory with
+ * heapkeyspace index pivot tuples, regardless of whether or not there are
+ * non-key attributes.
+ */
+ if (!BTreeTupleIsPivot(itup))
+ return false;
+
+ /* Pivot tuple should not use posting list representation (redundant) */
+ if (BTreeTupleIsPosting(itup))
+ return false;
+
+ /*
+ * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
+ * when any other key attribute is truncated
+ */
+ if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
+ return false;
+
+ /*
+ * Pivot tuple must have at least one untruncated key attribute (minus
+ * infinity pivot tuples are the only exception). Pivot tuples can never
+ * represent that there is a value present for a key attribute that
+ * exceeds pg_index.indnkeyatts for the index.
+ */
+ return tupnatts > 0 && tupnatts <= nkeyatts;
+}
+
+/*
+ * _bt_check_third_page() -- check whether tuple fits on a btree page at all.
+ *
+ * We actually need to be able to fit three items on every page, so restrict
+ * any one item to 1/3 the per-page available space. Note that itemsz should
+ * not include the ItemId overhead.
+ *
+ * It might be useful to apply TOAST methods rather than throw an error here.
+ * Using out of line storage would break assumptions made by suffix truncation
+ * and by contrib/amcheck, though.
+ */
+void
+_bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
+ Page page, IndexTuple newtup)
+{
+ Size itemsz;
+ BTPageOpaque opaque;
+
+ itemsz = MAXALIGN(IndexTupleSize(newtup));
+
+ /* Double check item size against limit */
+ if (itemsz <= BTMaxItemSize(page))
+ return;
+
+ /*
+ * Tuple is probably too large to fit on page, but it's possible that the
+ * index uses version 2 or version 3, or that page is an internal page, in
+ * which case a slightly higher limit applies.
+ */
+ if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
+ return;
+
+ /*
+ * Internal page insertions cannot fail here, because that would mean that
+ * an earlier leaf level insertion that should have failed didn't
+ */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_ISLEAF(opaque))
+ elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
+ itemsz, RelationGetRelationName(rel));
+
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
+ itemsz,
+ needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
+ needheaptidspace ? BTMaxItemSize(page) :
+ BTMaxItemSizeNoHeapTid(page),
+ RelationGetRelationName(rel)),
+ errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
+ ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
+ ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
+ RelationGetRelationName(heap)),
+ errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
+ "Consider a function index of an MD5 hash of the value, "
+ "or use full text indexing."),
+ errtableconstraint(heap, RelationGetRelationName(rel))));
+}
+
+/*
+ * Are all attributes in rel "equality is image equality" attributes?
+ *
+ * We use each attribute's BTEQUALIMAGE_PROC opclass procedure. If any
+ * opclass either lacks a BTEQUALIMAGE_PROC procedure or returns false, we
+ * return false; otherwise we return true.
+ *
+ * Returned boolean value is stored in index metapage during index builds.
+ * Deduplication can only be used when we return true.
+ */
+bool
+_bt_allequalimage(Relation rel, bool debugmessage)
+{
+ bool allequalimage = true;
+
+ /* INCLUDE indexes don't support deduplication */
+ if (IndexRelationGetNumberOfAttributes(rel) !=
+ IndexRelationGetNumberOfKeyAttributes(rel))
+ return false;
+
+ /*
+ * There is no special reason why deduplication cannot work with system
+ * relations (i.e. with system catalog indexes and TOAST indexes). We
+ * deem deduplication unsafe for these indexes all the same, since the
+ * alternative is to force users to always use deduplication, without
+ * being able to opt out. (ALTER INDEX is not supported with system
+ * indexes, so users would have no way to set the deduplicate_items
+ * storage parameter to 'off'.)
+ */
+ if (IsSystemRelation(rel))
+ return false;
+
+ for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
+ {
+ Oid opfamily = rel->rd_opfamily[i];
+ Oid opcintype = rel->rd_opcintype[i];
+ Oid collation = rel->rd_indcollation[i];
+ Oid equalimageproc;
+
+ equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
+ BTEQUALIMAGE_PROC);
+
+ /*
+ * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
+ * be unsafe. Otherwise, actually call proc and see what it says.
+ */
+ if (!OidIsValid(equalimageproc) ||
+ !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
+ ObjectIdGetDatum(opcintype))))
+ {
+ allequalimage = false;
+ break;
+ }
+ }
+
+ /*
+ * Don't elog() until here to avoid reporting on a system relation index
+ * or an INCLUDE index
+ */
+ if (debugmessage)
+ {
+ if (allequalimage)
+ elog(DEBUG1, "index \"%s\" can safely use deduplication",
+ RelationGetRelationName(rel));
+ else
+ elog(DEBUG1, "index \"%s\" cannot use deduplication",
+ RelationGetRelationName(rel));
+ }
+
+ return allequalimage;
+}
diff --git a/src/backend/access/nbtree/nbtvalidate.c b/src/backend/access/nbtree/nbtvalidate.c
new file mode 100644
index 0000000..7acb64e
--- /dev/null
+++ b/src/backend/access/nbtree/nbtvalidate.c
@@ -0,0 +1,380 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtvalidate.c
+ * Opclass validator for btree.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtvalidate.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/amvalidate.h"
+#include "access/htup_details.h"
+#include "access/nbtree.h"
+#include "access/xact.h"
+#include "catalog/pg_am.h"
+#include "catalog/pg_amop.h"
+#include "catalog/pg_amproc.h"
+#include "catalog/pg_opclass.h"
+#include "catalog/pg_opfamily.h"
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/regproc.h"
+#include "utils/syscache.h"
+
+
+/*
+ * Validator for a btree opclass.
+ *
+ * Some of the checks done here cover the whole opfamily, and therefore are
+ * redundant when checking each opclass in a family. But they don't run long
+ * enough to be much of a problem, so we accept the duplication rather than
+ * complicate the amvalidate API.
+ */
+bool
+btvalidate(Oid opclassoid)
+{
+ bool result = true;
+ HeapTuple classtup;
+ Form_pg_opclass classform;
+ Oid opfamilyoid;
+ Oid opcintype;
+ char *opclassname;
+ HeapTuple familytup;
+ Form_pg_opfamily familyform;
+ char *opfamilyname;
+ CatCList *proclist,
+ *oprlist;
+ List *grouplist;
+ OpFamilyOpFuncGroup *opclassgroup;
+ List *familytypes;
+ int usefulgroups;
+ int i;
+ ListCell *lc;
+
+ /* Fetch opclass information */
+ classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid));
+ if (!HeapTupleIsValid(classtup))
+ elog(ERROR, "cache lookup failed for operator class %u", opclassoid);
+ classform = (Form_pg_opclass) GETSTRUCT(classtup);
+
+ opfamilyoid = classform->opcfamily;
+ opcintype = classform->opcintype;
+ opclassname = NameStr(classform->opcname);
+
+ /* Fetch opfamily information */
+ familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid));
+ if (!HeapTupleIsValid(familytup))
+ elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid);
+ familyform = (Form_pg_opfamily) GETSTRUCT(familytup);
+
+ opfamilyname = NameStr(familyform->opfname);
+
+ /* Fetch all operators and support functions of the opfamily */
+ oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid));
+ proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid));
+
+ /* Check individual support functions */
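+ /*
+ * Only the three-way comparator (BTORDER_PROC) is strictly required for
+ * a btree opfamily; the other support functions whose signatures are
+ * validated here are optional, as the completeness checks further down
+ * reflect.
+ */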
+ for (i = 0; i < proclist->n_members; i++)
+ {
+ HeapTuple proctup = &proclist->members[i]->tuple;
+ Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup);
+ bool ok;
+
+ /* Check procedure numbers and function signatures */
+ switch (procform->amprocnum)
+ {
+ case BTORDER_PROC:
+ ok = check_amproc_signature(procform->amproc, INT4OID, true,
+ 2, 2, procform->amproclefttype,
+ procform->amprocrighttype);
+ break;
+ case BTSORTSUPPORT_PROC:
+ ok = check_amproc_signature(procform->amproc, VOIDOID, true,
+ 1, 1, INTERNALOID);
+ break;
+ case BTINRANGE_PROC:
+ ok = check_amproc_signature(procform->amproc, BOOLOID, true,
+ 5, 5,
+ procform->amproclefttype,
+ procform->amproclefttype,
+ procform->amprocrighttype,
+ BOOLOID, BOOLOID);
+ break;
+ case BTEQUALIMAGE_PROC:
+ ok = check_amproc_signature(procform->amproc, BOOLOID, true,
+ 1, 1, OIDOID);
+ break;
+ case BTOPTIONS_PROC:
+ ok = check_amoptsproc_signature(procform->amproc);
+ break;
+ default:
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d",
+ opfamilyname, "btree",
+ format_procedure(procform->amproc),
+ procform->amprocnum)));
+ result = false;
+ continue; /* don't want additional message */
+ }
+
+ if (!ok)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d",
+ opfamilyname, "btree",
+ format_procedure(procform->amproc),
+ procform->amprocnum)));
+ result = false;
+ }
+ }
+
+ /* Check individual operators */
+ for (i = 0; i < oprlist->n_members; i++)
+ {
+ HeapTuple oprtup = &oprlist->members[i]->tuple;
+ Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup);
+
+ /* Check that only allowed strategy numbers exist */
+ if (oprform->amopstrategy < 1 ||
+ oprform->amopstrategy > BTMaxStrategyNumber)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d",
+ opfamilyname, "btree",
+ format_operator(oprform->amopopr),
+ oprform->amopstrategy)));
+ result = false;
+ }
+
+ /* btree doesn't support ORDER BY operators */
+ if (oprform->amoppurpose != AMOP_SEARCH ||
+ OidIsValid(oprform->amopsortfamily))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s",
+ opfamilyname, "btree",
+ format_operator(oprform->amopopr))));
+ result = false;
+ }
+
+ /* Check operator signature --- same for all btree strategies */
+ if (!check_amop_signature(oprform->amopopr, BOOLOID,
+ oprform->amoplefttype,
+ oprform->amoprighttype))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature",
+ opfamilyname, "btree",
+ format_operator(oprform->amopopr))));
+ result = false;
+ }
+ }
+
+ /* Now check for inconsistent groups of operators/functions */
+ grouplist = identify_opfamily_groups(oprlist, proclist);
+ usefulgroups = 0;
+ opclassgroup = NULL;
+ familytypes = NIL;
+ foreach(lc, grouplist)
+ {
+ OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc);
+
+ /*
+ * It is possible for an in_range support function to have a RHS type
+ * that is otherwise irrelevant to the opfamily --- for instance, SQL
+ * requires the datetime_ops opclass to have range support with an
+ * interval offset. So, if this group appears to contain only an
+ * in_range function, ignore it: it doesn't represent a pair of
+ * supported types.
+ */
+ if (thisgroup->operatorset == 0 &&
+ thisgroup->functionset == (1 << BTINRANGE_PROC))
+ continue;
+
+ /* Else count it as a relevant group */
+ usefulgroups++;
+
+ /* Remember the group exactly matching the test opclass */
+ if (thisgroup->lefttype == opcintype &&
+ thisgroup->righttype == opcintype)
+ opclassgroup = thisgroup;
+
+ /*
+ * Identify all distinct data types handled in this opfamily. This
+ * implementation is O(N^2), but there aren't likely to be enough
+ * types in the family for it to matter.
+ */
+ familytypes = list_append_unique_oid(familytypes, thisgroup->lefttype);
+ familytypes = list_append_unique_oid(familytypes, thisgroup->righttype);
+
+ /*
+ * Complain if there seems to be an incomplete set of either operators
+ * or support functions for this datatype pair. The sortsupport,
+ * in_range, and equalimage functions are considered optional.
+ */
+ if (thisgroup->operatorset !=
+ ((1 << BTLessStrategyNumber) |
+ (1 << BTLessEqualStrategyNumber) |
+ (1 << BTEqualStrategyNumber) |
+ (1 << BTGreaterEqualStrategyNumber) |
+ (1 << BTGreaterStrategyNumber)))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s is missing operator(s) for types %s and %s",
+ opfamilyname, "btree",
+ format_type_be(thisgroup->lefttype),
+ format_type_be(thisgroup->righttype))));
+ result = false;
+ }
+ if ((thisgroup->functionset & (1 << BTORDER_PROC)) == 0)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s is missing support function for types %s and %s",
+ opfamilyname, "btree",
+ format_type_be(thisgroup->lefttype),
+ format_type_be(thisgroup->righttype))));
+ result = false;
+ }
+ }
+
+ /* Check that the originally-named opclass is supported */
+ /* (if group is there, we already checked it adequately above) */
+ if (!opclassgroup)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator class \"%s\" of access method %s is missing operator(s)",
+ opclassname, "btree")));
+ result = false;
+ }
+
+ /*
+ * Complain if the opfamily doesn't have entries for all possible
+ * combinations of its supported datatypes. While missing cross-type
+ * operators are not fatal, they do limit the planner's ability to derive
+ * additional qual clauses from equivalence classes, so it seems
+ * reasonable to insist that all built-in btree opfamilies be complete.
+ */
+ if (usefulgroups != (list_length(familytypes) * list_length(familytypes)))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s is missing cross-type operator(s)",
+ opfamilyname, "btree")));
+ result = false;
+ }
+
+ ReleaseCatCacheList(proclist);
+ ReleaseCatCacheList(oprlist);
+ ReleaseSysCache(familytup);
+ ReleaseSysCache(classtup);
+
+ return result;
+}
+
+/*
+ * Prechecking function for adding operators/functions to a btree opfamily.
+ */
+void
+btadjustmembers(Oid opfamilyoid,
+ Oid opclassoid,
+ List *operators,
+ List *functions)
+{
+ Oid opcintype;
+ ListCell *lc;
+
+ /*
+ * Btree operators and comparison support functions are always "loose"
+ * members of the opfamily if they are cross-type. If they are not
+ * cross-type, we prefer to tie them to the appropriate opclass ... but if
+ * the user hasn't created one, we can't do that, and must fall back to
+ * using the opfamily dependency. (We mustn't force creation of an
+ * opclass in such a case, as leaving an incomplete opclass lying around
+ * would be bad. Throwing an error is another undesirable alternative.)
+ *
+ * This behavior results in a bit of a dump/reload hazard, in that the
+ * order of restoring objects could affect what dependencies we end up
+ * with. pg_dump's existing behavior will preserve the dependency choices
+ * in most cases, but not if a cross-type operator has been bound tightly
+ * into an opclass. That's a mistake anyway, so silently "fixing" it
+ * isn't awful.
+ *
+ * Optional support functions are always "loose" family members.
+ *
+ * To avoid repeated lookups, we remember the most recently used opclass's
+ * input type.
+ */
+ if (OidIsValid(opclassoid))
+ {
+ /* During CREATE OPERATOR CLASS, need CCI to see the pg_opclass row */
+ CommandCounterIncrement();
+ opcintype = get_opclass_input_type(opclassoid);
+ }
+ else
+ opcintype = InvalidOid;
+
+ /*
+ * We handle operators and support functions almost identically, so rather
+ * than duplicate this code block, just join the lists.
+ */
+ foreach(lc, list_concat_copy(operators, functions))
+ {
+ OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+ if (op->is_func && op->number != BTORDER_PROC)
+ {
+ /* Optional support proc, so always a soft family dependency */
+ op->ref_is_hard = false;
+ op->ref_is_family = true;
+ op->refobjid = opfamilyoid;
+ }
+ else if (op->lefttype != op->righttype)
+ {
+ /* Cross-type, so always a soft family dependency */
+ op->ref_is_hard = false;
+ op->ref_is_family = true;
+ op->refobjid = opfamilyoid;
+ }
+ else
+ {
+ /* Not cross-type; is there a suitable opclass? */
+ if (op->lefttype != opcintype)
+ {
+ /* Avoid repeating this expensive lookup, even if it fails */
+ opcintype = op->lefttype;
+ opclassoid = opclass_for_family_datatype(BTREE_AM_OID,
+ opfamilyoid,
+ opcintype);
+ }
+ if (OidIsValid(opclassoid))
+ {
+ /* Hard dependency on opclass */
+ op->ref_is_hard = true;
+ op->ref_is_family = false;
+ op->refobjid = opclassoid;
+ }
+ else
+ {
+ /* We're stuck, so make a soft dependency on the opfamily */
+ op->ref_is_hard = false;
+ op->ref_is_family = true;
+ op->refobjid = opfamilyoid;
+ }
+ }
+ }
+}
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
new file mode 100644
index 0000000..786c08c
--- /dev/null
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -0,0 +1,1126 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtxlog.c
+ * WAL replay logic for btrees.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtxlog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/bufmask.h"
+#include "access/nbtree.h"
+#include "access/nbtxlog.h"
+#include "access/transam.h"
+#include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "storage/procarray.h"
+#include "utils/memutils.h"
+
+static MemoryContext opCtx; /* working memory for operations */
+
+/*
+ * _bt_restore_page -- re-enter all the index tuples on a page
+ *
+ * The page is freshly init'd, and *from (length len) is a copy of what
+ * had been its upper part (pd_upper to pd_special). We assume that the
+ * tuples had been added to the page in item-number order, and therefore
+ * the one with highest item number appears first (lowest on the page).
+ */
+static void
+_bt_restore_page(Page page, char *from, int len)
+{
+ IndexTupleData itupdata;
+ Size itemsz;
+ char *end = from + len;
+ Item items[MaxIndexTuplesPerPage];
+ uint16 itemsizes[MaxIndexTuplesPerPage];
+ int i;
+ int nitems;
+
+ /*
+ * To get the items back in the original order, we add them to the page in
+ * reverse. To figure out where one tuple ends and another begins, we
+ * have to scan them in forward order first.
+ */
+ i = 0;
+ while (from < end)
+ {
+ /*
+ * As we step through the items, 'from' won't always be properly
+ * aligned, so we need to use memcpy(). Further, we use Item (which
+ * is just a char*) here for our items array for the same reason;
+ * wouldn't want the compiler or anyone thinking that an item is
+ * aligned when it isn't.
+ */
+ memcpy(&itupdata, from, sizeof(IndexTupleData));
+ itemsz = IndexTupleSize(&itupdata);
+ itemsz = MAXALIGN(itemsz);
+
+ items[i] = (Item) from;
+ itemsizes[i] = itemsz;
+ i++;
+
+ from += itemsz;
+ }
+ nitems = i;
+
+ for (i = nitems - 1; i >= 0; i--)
+ {
+ if (PageAddItem(page, items[i], itemsizes[i], nitems - i,
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "_bt_restore_page: cannot add item to page");
+ }
+}
+
+static void
+_bt_restore_meta(XLogReaderState *record, uint8 block_id)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ Buffer metabuf;
+ Page metapg;
+ BTMetaPageData *md;
+ BTPageOpaque pageop;
+ xl_btree_metadata *xlrec;
+ char *ptr;
+ Size len;
+
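+ /*
+ * The metapage is rebuilt from scratch: reinitialize the buffer for
+ * block_id and fill in the fields from the xl_btree_metadata struct
+ * carried as that block's data payload.
+ */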
+ metabuf = XLogInitBufferForRedo(record, block_id);
+ ptr = XLogRecGetBlockData(record, block_id, &len);
+
+ Assert(len == sizeof(xl_btree_metadata));
+ Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE);
+ xlrec = (xl_btree_metadata *) ptr;
+ metapg = BufferGetPage(metabuf);
+
+ _bt_pageinit(metapg, BufferGetPageSize(metabuf));
+
+ md = BTPageGetMeta(metapg);
+ md->btm_magic = BTREE_MAGIC;
+ md->btm_version = xlrec->version;
+ md->btm_root = xlrec->root;
+ md->btm_level = xlrec->level;
+ md->btm_fastroot = xlrec->fastroot;
+ md->btm_fastlevel = xlrec->fastlevel;
+ /* Cannot log BTREE_MIN_VERSION index metapage without upgrade */
+ Assert(md->btm_version >= BTREE_NOVAC_VERSION);
+ md->btm_last_cleanup_num_delpages = xlrec->last_cleanup_num_delpages;
+ md->btm_last_cleanup_num_heap_tuples = -1.0;
+ md->btm_allequalimage = xlrec->allequalimage;
+
+ pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
+ pageop->btpo_flags = BTP_META;
+
+ /*
+ * Set pd_lower just past the end of the metadata. This is essential,
+ * because without doing so, metadata will be lost if xlog.c compresses
+ * the page.
+ */
+ ((PageHeader) metapg)->pd_lower =
+ ((char *) md + sizeof(BTMetaPageData)) - (char *) metapg;
+
+ PageSetLSN(metapg, lsn);
+ MarkBufferDirty(metabuf);
+ UnlockReleaseBuffer(metabuf);
+}
+
+/*
+ * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page
+ *
+ * This is a common subroutine of the redo functions of all the WAL record
+ * types that can insert a downlink: insert, split, and newroot.
+ */
+static void
+_bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ Buffer buf;
+
+ if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO)
+ {
+ Page page = (Page) BufferGetPage(buf);
+ BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ Assert(P_INCOMPLETE_SPLIT(pageop));
+ pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buf);
+ }
+ if (BufferIsValid(buf))
+ UnlockReleaseBuffer(buf);
+}
+
+static void
+btree_xlog_insert(bool isleaf, bool ismeta, bool posting,
+ XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+
+ /*
+ * Insertion to an internal page finishes an incomplete split at the child
+ * level. Clear the incomplete-split flag in the child. Note: during
+ * normal operation, the child and parent pages are locked at the same
+ * time (the locks are coupled), so that clearing the flag and inserting
+ * the downlink appear atomic to other backends. We don't bother with
+ * that during replay, because readers don't care about the
+ * incomplete-split flag and there cannot be updates happening.
+ */
+ if (!isleaf)
+ _bt_clear_incomplete_split(record, 1);
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ Size datalen;
+ char *datapos = XLogRecGetBlockData(record, 0, &datalen);
+
+ page = BufferGetPage(buffer);
+
+ if (!posting)
+ {
+ /* Simple retail insertion */
+ if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add new item");
+ }
+ else
+ {
+ ItemId itemid;
+ IndexTuple oposting,
+ newitem,
+ nposting;
+ uint16 postingoff;
+
+ /*
+ * A posting list split occurred during leaf page insertion. WAL
+ * record data will start with an offset number representing the
+ * point in an existing posting list that a split occurs at.
+ *
+ * Use _bt_swap_posting() to repeat posting list split steps from
+ * primary. Note that newitem from WAL record is 'orignewitem',
+ * not the final version of newitem that is actually inserted on
+ * page.
+ */
+ postingoff = *((uint16 *) datapos);
+ datapos += sizeof(uint16);
+ datalen -= sizeof(uint16);
+
+ itemid = PageGetItemId(page, OffsetNumberPrev(xlrec->offnum));
+ oposting = (IndexTuple) PageGetItem(page, itemid);
+
+ /* Use mutable, aligned newitem copy in _bt_swap_posting() */
+ Assert(isleaf && postingoff > 0);
+ newitem = CopyIndexTuple((IndexTuple) datapos);
+ nposting = _bt_swap_posting(newitem, oposting, postingoff);
+
+ /* Replace existing posting list with post-split version */
+ memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
+
+ /* Insert "final" new item (not orignewitem from WAL stream) */
+ Assert(IndexTupleSize(newitem) == datalen);
+ if (PageAddItem(page, (Item) newitem, datalen, xlrec->offnum,
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add posting split new item");
+ }
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+
+ /*
+ * Note: in normal operation, we'd update the metapage while still holding
+ * lock on the page we inserted into. But during replay it's not
+ * necessary to hold that lock, since no other index updates can be
+ * happening concurrently, and readers will cope fine with following an
+ * obsolete link from the metapage.
+ */
+ if (ismeta)
+ _bt_restore_meta(record, 2);
+}
+
+static void
+btree_xlog_split(bool newitemonleft, XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
+ bool isleaf = (xlrec->level == 0);
+ Buffer buf;
+ Buffer rbuf;
+ Page rpage;
+ BTPageOpaque ropaque;
+ char *datapos;
+ Size datalen;
+ BlockNumber origpagenumber;
+ BlockNumber rightpagenumber;
+ BlockNumber spagenumber;
+
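+ /*
+ * Block 0 is the original page (the left half after the split), block 1
+ * is the new right sibling, and block 2, when present, is the page that
+ * was to the right of the original page before the split. Block 2 is
+ * absent when the original page was the rightmost page on its level.
+ */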
+ XLogRecGetBlockTag(record, 0, NULL, NULL, &origpagenumber);
+ XLogRecGetBlockTag(record, 1, NULL, NULL, &rightpagenumber);
+ if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &spagenumber))
+ spagenumber = P_NONE;
+
+ /*
+ * Clear the incomplete split flag on the appropriate child page one level
+ * down when origpage/buf is an internal page (there must have been
+ * cascading page splits during original execution in the event of an
+ * internal page split). This is like the corresponding btree_xlog_insert
+ * call for internal pages. We're not clearing the incomplete split flag
+ * for the current page split here (you can think of this as part of the
+ * insert of newitem that the page split action needs to perform in
+ * passing).
+ *
+ * Like in btree_xlog_insert, this can be done before locking other pages.
+ * We never need to couple cross-level locks in REDO routines.
+ */
+ if (!isleaf)
+ _bt_clear_incomplete_split(record, 3);
+
+ /* Reconstruct right (new) sibling page from scratch */
+ rbuf = XLogInitBufferForRedo(record, 1);
+ datapos = XLogRecGetBlockData(record, 1, &datalen);
+ rpage = (Page) BufferGetPage(rbuf);
+
+ _bt_pageinit(rpage, BufferGetPageSize(rbuf));
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ ropaque->btpo_prev = origpagenumber;
+ ropaque->btpo_next = spagenumber;
+ ropaque->btpo_level = xlrec->level;
+ ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
+ ropaque->btpo_cycleid = 0;
+
+ _bt_restore_page(rpage, datapos, datalen);
+
+ PageSetLSN(rpage, lsn);
+ MarkBufferDirty(rbuf);
+
+ /* Now reconstruct original page (left half of split) */
+ if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO)
+ {
+ /*
+ * To keep the tuples in the same physical order as during original execution, we
+ * initialize a temporary empty page for the left page and add all the
+ * items to that in item number order. This mirrors how _bt_split()
+ * works. Retaining the same physical order makes WAL consistency
+ * checking possible. See also _bt_restore_page(), which does the
+ * same for the right page.
+ */
+ Page origpage = (Page) BufferGetPage(buf);
+ BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
+ OffsetNumber off;
+ IndexTuple newitem = NULL,
+ left_hikey = NULL,
+ nposting = NULL;
+ Size newitemsz = 0,
+ left_hikeysz = 0;
+ Page leftpage;
+ OffsetNumber leftoff,
+ replacepostingoff = InvalidOffsetNumber;
+
+ datapos = XLogRecGetBlockData(record, 0, &datalen);
+
+ if (newitemonleft || xlrec->postingoff != 0)
+ {
+ newitem = (IndexTuple) datapos;
+ newitemsz = MAXALIGN(IndexTupleSize(newitem));
+ datapos += newitemsz;
+ datalen -= newitemsz;
+
+ if (xlrec->postingoff != 0)
+ {
+ ItemId itemid;
+ IndexTuple oposting;
+
+ /* Posting list must be at offset number before new item's */
+ replacepostingoff = OffsetNumberPrev(xlrec->newitemoff);
+
+ /* Use mutable, aligned newitem copy in _bt_swap_posting() */
+ newitem = CopyIndexTuple(newitem);
+ itemid = PageGetItemId(origpage, replacepostingoff);
+ oposting = (IndexTuple) PageGetItem(origpage, itemid);
+ nposting = _bt_swap_posting(newitem, oposting,
+ xlrec->postingoff);
+ }
+ }
+
+ /*
+ * Extract left hikey and its size. We assume that 16-bit alignment
+ * is enough to apply IndexTupleSize (since it's fetching from a
+ * uint16 field).
+ */
+ left_hikey = (IndexTuple) datapos;
+ left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
+ datapos += left_hikeysz;
+ datalen -= left_hikeysz;
+
+ Assert(datalen == 0);
+
+ leftpage = PageGetTempPageCopySpecial(origpage);
+
+ /* Add high key tuple from WAL record to temp page */
+ leftoff = P_HIKEY;
+ if (PageAddItem(leftpage, (Item) left_hikey, left_hikeysz, P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add high key to left page after split");
+ leftoff = OffsetNumberNext(leftoff);
+
+ for (off = P_FIRSTDATAKEY(oopaque); off < xlrec->firstrightoff; off++)
+ {
+ ItemId itemid;
+ Size itemsz;
+ IndexTuple item;
+
+ /* Add replacement posting list when required */
+ if (off == replacepostingoff)
+ {
+ Assert(newitemonleft ||
+ xlrec->firstrightoff == xlrec->newitemoff);
+ if (PageAddItem(leftpage, (Item) nposting,
+ MAXALIGN(IndexTupleSize(nposting)), leftoff,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add new posting list item to left page after split");
+ leftoff = OffsetNumberNext(leftoff);
+ continue; /* don't insert oposting */
+ }
+
+ /* add the new item if it was inserted on left page */
+ else if (newitemonleft && off == xlrec->newitemoff)
+ {
+ if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add new item to left page after split");
+ leftoff = OffsetNumberNext(leftoff);
+ }
+
+ itemid = PageGetItemId(origpage, off);
+ itemsz = ItemIdGetLength(itemid);
+ item = (IndexTuple) PageGetItem(origpage, itemid);
+ if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add old item to left page after split");
+ leftoff = OffsetNumberNext(leftoff);
+ }
+
+ /* cope with possibility that newitem goes at the end */
+ if (newitemonleft && off == xlrec->newitemoff)
+ {
+ if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add new item to left page after split");
+ leftoff = OffsetNumberNext(leftoff);
+ }
+
+ PageRestoreTempPage(leftpage, origpage);
+
+ /* Fix opaque fields */
+ oopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
+ if (isleaf)
+ oopaque->btpo_flags |= BTP_LEAF;
+ oopaque->btpo_next = rightpagenumber;
+ oopaque->btpo_cycleid = 0;
+
+ PageSetLSN(origpage, lsn);
+ MarkBufferDirty(buf);
+ }
+
+ /* Fix left-link of the page to the right of the new right sibling */
+ if (spagenumber != P_NONE)
+ {
+ Buffer sbuf;
+
+ if (XLogReadBufferForRedo(record, 2, &sbuf) == BLK_NEEDS_REDO)
+ {
+ Page spage = (Page) BufferGetPage(sbuf);
+ BTPageOpaque spageop = (BTPageOpaque) PageGetSpecialPointer(spage);
+
+ spageop->btpo_prev = rightpagenumber;
+
+ PageSetLSN(spage, lsn);
+ MarkBufferDirty(sbuf);
+ }
+ if (BufferIsValid(sbuf))
+ UnlockReleaseBuffer(sbuf);
+ }
+
+ /*
+ * Finally, release the remaining buffers. sbuf, rbuf, and buf must be
+ * released together, so that readers cannot observe inconsistencies.
+ */
+ UnlockReleaseBuffer(rbuf);
+ if (BufferIsValid(buf))
+ UnlockReleaseBuffer(buf);
+}
+
+static void
+btree_xlog_dedup(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_dedup *xlrec = (xl_btree_dedup *) XLogRecGetData(record);
+ Buffer buf;
+
+ if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO)
+ {
+ char *ptr = XLogRecGetBlockData(record, 0, NULL);
+ Page page = (Page) BufferGetPage(buf);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ BTDedupState state;
+ BTDedupInterval *intervals;
+ Page newpage;
+
+ state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+ state->deduplicate = true; /* unused */
+ state->nmaxitems = 0; /* unused */
+ /* Conservatively use larger maxpostingsize than primary */
+ state->maxpostingsize = BTMaxItemSize(page);
+ state->base = NULL;
+ state->baseoff = InvalidOffsetNumber;
+ state->basetupsize = 0;
+ state->htids = palloc(state->maxpostingsize);
+ state->nhtids = 0;
+ state->nitems = 0;
+ state->phystupsize = 0;
+ state->nintervals = 0;
+
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ newpage = PageGetTempPageCopySpecial(page);
+
+ if (!P_RIGHTMOST(opaque))
+ {
+ ItemId itemid = PageGetItemId(page, P_HIKEY);
+ Size itemsz = ItemIdGetLength(itemid);
+ IndexTuple item = (IndexTuple) PageGetItem(page, itemid);
+
+ if (PageAddItem(newpage, (Item) item, itemsz, P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add highkey");
+ }
+
+ intervals = (BTDedupInterval *) ptr;
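+
+ /*
+ * Visit the page's tuples in offset order. The intervals logged by the
+ * primary dictate which tuples are merged into the current pending
+ * posting list and which ones begin a new pending tuple.
+ */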
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+
+ if (offnum == minoff)
+ _bt_dedup_start_pending(state, itup, offnum);
+ else if (state->nintervals < xlrec->nintervals &&
+ state->baseoff == intervals[state->nintervals].baseoff &&
+ state->nitems < intervals[state->nintervals].nitems)
+ {
+ if (!_bt_dedup_save_htid(state, itup))
+ elog(ERROR, "deduplication failed to add heap tid to pending posting list");
+ }
+ else
+ {
+ _bt_dedup_finish_pending(newpage, state);
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ }
+
+ _bt_dedup_finish_pending(newpage, state);
+ Assert(state->nintervals == xlrec->nintervals);
+ Assert(memcmp(state->intervals, intervals,
+ state->nintervals * sizeof(BTDedupInterval)) == 0);
+
+ if (P_HAS_GARBAGE(opaque))
+ {
+ BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
+
+ nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+ }
+
+ PageRestoreTempPage(newpage, page);
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buf);
+ }
+
+ if (BufferIsValid(buf))
+ UnlockReleaseBuffer(buf);
+}
+
+static void
+btree_xlog_updates(Page page, OffsetNumber *updatedoffsets,
+ xl_btree_update *updates, int nupdated)
+{
+ BTVacuumPosting vacposting;
+ IndexTuple origtuple;
+ ItemId itemid;
+ Size itemsz;
+
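+ /*
+ * For each updated offset, rebuild a BTVacuumPosting work item from its
+ * variable-length xl_btree_update entry, compute the shortened posting
+ * list with _bt_update_posting(), and overwrite the original tuple in
+ * place.
+ */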
+ for (int i = 0; i < nupdated; i++)
+ {
+ itemid = PageGetItemId(page, updatedoffsets[i]);
+ origtuple = (IndexTuple) PageGetItem(page, itemid);
+
+ vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
+ updates->ndeletedtids * sizeof(uint16));
+ vacposting->updatedoffset = updatedoffsets[i];
+ vacposting->itup = origtuple;
+ vacposting->ndeletedtids = updates->ndeletedtids;
+ memcpy(vacposting->deletetids,
+ (char *) updates + SizeOfBtreeUpdate,
+ updates->ndeletedtids * sizeof(uint16));
+
+ _bt_update_posting(vacposting);
+
+ /* Overwrite updated version of tuple */
+ itemsz = MAXALIGN(IndexTupleSize(vacposting->itup));
+ if (!PageIndexTupleOverwrite(page, updatedoffsets[i],
+ (Item) vacposting->itup, itemsz))
+ elog(PANIC, "failed to update partially dead item");
+
+ pfree(vacposting->itup);
+ pfree(vacposting);
+
+ /* advance to next xl_btree_update from array */
+ updates = (xl_btree_update *)
+ ((char *) updates + SizeOfBtreeUpdate +
+ updates->ndeletedtids * sizeof(uint16));
+ }
+}
+
+static void
+btree_xlog_vacuum(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+ BTPageOpaque opaque;
+
+ /*
+ * We need to take a cleanup lock here, just like btvacuumpage(). However,
+ * it isn't necessary to exhaustively get a cleanup lock on every block in
+ * the index during recovery (just getting a cleanup lock on pages with
+ * items to kill suffices). See nbtree/README for details.
+ */
+ if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer)
+ == BLK_NEEDS_REDO)
+ {
+ char *ptr = XLogRecGetBlockData(record, 0, NULL);
+
+ page = (Page) BufferGetPage(buffer);
+
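+ /*
+ * The block data is laid out as the array of deleted offsets, followed
+ * by the array of updated offsets, followed by the variable-length
+ * xl_btree_update entries describing each update.
+ */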
+ if (xlrec->nupdated > 0)
+ {
+ OffsetNumber *updatedoffsets;
+ xl_btree_update *updates;
+
+ updatedoffsets = (OffsetNumber *)
+ (ptr + xlrec->ndeleted * sizeof(OffsetNumber));
+ updates = (xl_btree_update *) ((char *) updatedoffsets +
+ xlrec->nupdated *
+ sizeof(OffsetNumber));
+
+ btree_xlog_updates(page, updatedoffsets, updates, xlrec->nupdated);
+ }
+
+ if (xlrec->ndeleted > 0)
+ PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);
+
+ /*
+ * Mark the page as not containing any LP_DEAD items --- see comments
+ * in _bt_delitems_vacuum().
+ */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
+btree_xlog_delete(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+ BTPageOpaque opaque;
+
+ /*
+ * If we have any conflict processing to do, it must happen before we
+ * update the page
+ */
+ if (InHotStandby)
+ {
+ RelFileNode rnode;
+
+ XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
+
+ ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
+ }
+
+ /*
+ * We don't need to take a cleanup lock to apply these changes. See
+ * nbtree/README for details.
+ */
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ char *ptr = XLogRecGetBlockData(record, 0, NULL);
+
+ page = (Page) BufferGetPage(buffer);
+
+ if (xlrec->nupdated > 0)
+ {
+ OffsetNumber *updatedoffsets;
+ xl_btree_update *updates;
+
+ updatedoffsets = (OffsetNumber *)
+ (ptr + xlrec->ndeleted * sizeof(OffsetNumber));
+ updates = (xl_btree_update *) ((char *) updatedoffsets +
+ xlrec->nupdated *
+ sizeof(OffsetNumber));
+
+ btree_xlog_updates(page, updatedoffsets, updates, xlrec->nupdated);
+ }
+
+ if (xlrec->ndeleted > 0)
+ PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);
+
+ /* Mark the page as not containing any LP_DEAD items */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
+btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+ BTPageOpaque pageop;
+ IndexTupleData trunctuple;
+
+ /*
+ * In normal operation, we would lock all the pages this WAL record
+ * touches before changing any of them. In WAL replay, it should be okay
+ * to lock just one page at a time, since no concurrent index updates can
+ * be happening, and readers should not care whether they arrive at the
+ * target page or not (since it's surely empty).
+ */
+
+ /* to-be-deleted subtree's parent page */
+ if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
+ {
+ OffsetNumber poffset;
+ ItemId itemid;
+ IndexTuple itup;
+ OffsetNumber nextoffset;
+ BlockNumber rightsib;
+
+ page = (Page) BufferGetPage(buffer);
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
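+ /*
+ * Make the pivot tuple at poffset (the downlink to the branch being
+ * deleted) point to the branch's right sibling, then delete the right
+ * sibling's now-redundant parent entry. This mirrors what
+ * _bt_mark_page_halfdead() did during original execution.
+ */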
+ poffset = xlrec->poffset;
+
+ nextoffset = OffsetNumberNext(poffset);
+ itemid = PageGetItemId(page, nextoffset);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ rightsib = BTreeTupleGetDownLink(itup);
+
+ itemid = PageGetItemId(page, poffset);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ BTreeTupleSetDownLink(itup, rightsib);
+ nextoffset = OffsetNumberNext(poffset);
+ PageIndexTupleDelete(page, nextoffset);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+
+ /*
+ * Don't need to couple cross-level locks in REDO routines, so release
+ * lock on internal page immediately
+ */
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+
+ /* Rewrite the leaf page as a halfdead page */
+ buffer = XLogInitBufferForRedo(record, 0);
+ page = (Page) BufferGetPage(buffer);
+
+ _bt_pageinit(page, BufferGetPageSize(buffer));
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ pageop->btpo_prev = xlrec->leftblk;
+ pageop->btpo_next = xlrec->rightblk;
+ pageop->btpo_level = 0;
+ pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
+ pageop->btpo_cycleid = 0;
+
+ /*
+ * Construct a dummy high key item that points to top parent page (value
+ * is InvalidBlockNumber when the top parent page is the leaf page itself)
+ */
+ MemSet(&trunctuple, 0, sizeof(IndexTupleData));
+ trunctuple.t_info = sizeof(IndexTupleData);
+ BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);
+
+ if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "could not add dummy high key to half-dead page");
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ UnlockReleaseBuffer(buffer);
+}
+
+
+static void
+btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record);
+ BlockNumber leftsib;
+ BlockNumber rightsib;
+ uint32 level;
+ bool isleaf;
+ FullTransactionId safexid;
+ Buffer leftbuf;
+ Buffer target;
+ Buffer rightbuf;
+ Page page;
+ BTPageOpaque pageop;
+
+ leftsib = xlrec->leftsib;
+ rightsib = xlrec->rightsib;
+ level = xlrec->level;
+ isleaf = (level == 0);
+ safexid = xlrec->safexid;
+
+ /* No leaftopparent for level 0 (leaf page) or level 1 target */
+ Assert(!BlockNumberIsValid(xlrec->leaftopparent) || level > 1);
+
+ /*
+ * In normal operation, we would lock all the pages this WAL record
+ * touches before changing any of them. In WAL replay, we at least lock
+ * the pages in the same standard left-to-right order (leftsib, target,
+ * rightsib), and don't release the sibling locks until the target is
+ * marked deleted.
+ */
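+
+ /*
+ * Block 0 is the target page itself, block 1 its left sibling (present
+ * only when there is one), block 2 its right sibling, block 3 the leaf
+ * page to rewrite as half-dead (present only when the target is a
+ * parent page in the deleted subtree rather than the leaf itself), and
+ * block 4 the metapage (only for XLOG_BTREE_UNLINK_PAGE_META records).
+ */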
+
+ /* Fix right-link of left sibling, if any */
+ if (leftsib != P_NONE)
+ {
+ if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(leftbuf);
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ pageop->btpo_next = rightsib;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(leftbuf);
+ }
+ }
+ else
+ leftbuf = InvalidBuffer;
+
+ /* Rewrite target page as empty deleted page */
+ target = XLogInitBufferForRedo(record, 0);
+ page = (Page) BufferGetPage(target);
+
+ _bt_pageinit(page, BufferGetPageSize(target));
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ pageop->btpo_prev = leftsib;
+ pageop->btpo_next = rightsib;
+ pageop->btpo_level = level;
+ BTPageSetDeleted(page, safexid);
+ if (isleaf)
+ pageop->btpo_flags |= BTP_LEAF;
+ pageop->btpo_cycleid = 0;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(target);
+
+ /* Fix left-link of right sibling */
+ if (XLogReadBufferForRedo(record, 2, &rightbuf) == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(rightbuf);
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ pageop->btpo_prev = leftsib;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(rightbuf);
+ }
+
+ /* Release siblings */
+ if (BufferIsValid(leftbuf))
+ UnlockReleaseBuffer(leftbuf);
+ if (BufferIsValid(rightbuf))
+ UnlockReleaseBuffer(rightbuf);
+
+ /* Release target */
+ UnlockReleaseBuffer(target);
+
+ /*
+ * If we deleted a parent of the targeted leaf page, instead of the leaf
+ * itself, update the leaf to point to the next remaining child in the
+ * to-be-deleted subtree
+ */
+ if (XLogRecHasBlockRef(record, 3))
+ {
+ /*
+ * There is no real data on the page, so we just re-create it from
+ * scratch using the information from the WAL record.
+ *
+ * Note that we don't end up here when the target page is also the
+ * leafbuf page. There is no need to add a dummy hikey item with a
+ * top parent link when deleting leafbuf because it's the last page
+ * we'll delete in the subtree undergoing deletion.
+ */
+ Buffer leafbuf;
+ IndexTupleData trunctuple;
+
+ Assert(!isleaf);
+
+ leafbuf = XLogInitBufferForRedo(record, 3);
+ page = (Page) BufferGetPage(leafbuf);
+
+ _bt_pageinit(page, BufferGetPageSize(leafbuf));
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
+ pageop->btpo_prev = xlrec->leafleftsib;
+ pageop->btpo_next = xlrec->leafrightsib;
+ pageop->btpo_level = 0;
+ pageop->btpo_cycleid = 0;
+
+ /* Add a dummy hikey item */
+ MemSet(&trunctuple, 0, sizeof(IndexTupleData));
+ trunctuple.t_info = sizeof(IndexTupleData);
+ BTreeTupleSetTopParent(&trunctuple, xlrec->leaftopparent);
+
+ if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "could not add dummy high key to half-dead page");
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(leafbuf);
+ UnlockReleaseBuffer(leafbuf);
+ }
+
+ /* Update metapage if needed */
+ if (info == XLOG_BTREE_UNLINK_PAGE_META)
+ _bt_restore_meta(record, 4);
+}
+
+static void
+btree_xlog_newroot(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+ BTPageOpaque pageop;
+ char *ptr;
+ Size len;
+
+ buffer = XLogInitBufferForRedo(record, 0);
+ page = (Page) BufferGetPage(buffer);
+
+ _bt_pageinit(page, BufferGetPageSize(buffer));
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ pageop->btpo_flags = BTP_ROOT;
+ pageop->btpo_prev = pageop->btpo_next = P_NONE;
+ pageop->btpo_level = xlrec->level;
+ if (xlrec->level == 0)
+ pageop->btpo_flags |= BTP_LEAF;
+ pageop->btpo_cycleid = 0;
+
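+ /*
+ * A new root above the leaf level comes from a root split: restore its
+ * items, which are carried as block 0's data, and clear the
+ * incomplete-split flag on the left child (block 1).
+ */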
+ if (xlrec->level > 0)
+ {
+ ptr = XLogRecGetBlockData(record, 0, &len);
+ _bt_restore_page(page, ptr, len);
+
+ /* Clear the incomplete-split flag in left child */
+ _bt_clear_incomplete_split(record, 1);
+ }
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ UnlockReleaseBuffer(buffer);
+
+ _bt_restore_meta(record, 2);
+}
+
+/*
+ * In general VACUUM must defer recycling as a way of avoiding certain race
+ * conditions. Deleted pages contain a safexid value that is used by VACUUM
+ * to determine whether or not it's safe to place a page that was deleted by
+ * VACUUM earlier into the FSM now. See nbtree/README.
+ *
+ * As far as any backend operating during original execution is concerned, the
+ * FSM is a cache of recycle-safe pages; the mere presence of the page in the
+ * FSM indicates that the page must already be safe to recycle (actually,
+ * _bt_getbuf() verifies it's safe using BTPageIsRecyclable(), but that's just
+ * because it would be unwise to completely trust the FSM, given its current
+ * limitations).
+ *
+ * This isn't sufficient to prevent similar concurrent recycling race
+ * conditions during Hot Standby, though. For that we need to log a
+ * xl_btree_reuse_page record at the point that a page is actually recycled
+ * and reused for an entirely unrelated page inside _bt_split(). These
+ * records include the same safexid value from the original deleted page,
+ * stored in the record's latestRemovedFullXid field.
+ *
+ * The GlobalVisCheckRemovableFullXid() test in BTPageIsRecyclable() is used
+ * to determine if it's safe to recycle a page. This mirrors our own test:
+ * the PGPROC->xmin > limitXmin test inside GetConflictingVirtualXIDs().
+ * Consequently, one XID value achieves the same exclusion effect on primary
+ * and standby.
+ */
+static void
+btree_xlog_reuse_page(XLogReaderState *record)
+{
+ xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);
+
+ if (InHotStandby)
+ ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid,
+ xlrec->node);
+}
+
+void
+btree_redo(XLogReaderState *record)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+ MemoryContext oldCtx;
+
+ oldCtx = MemoryContextSwitchTo(opCtx);
+ switch (info)
+ {
+ case XLOG_BTREE_INSERT_LEAF:
+ btree_xlog_insert(true, false, false, record);
+ break;
+ case XLOG_BTREE_INSERT_UPPER:
+ btree_xlog_insert(false, false, false, record);
+ break;
+ case XLOG_BTREE_INSERT_META:
+ btree_xlog_insert(false, true, false, record);
+ break;
+ case XLOG_BTREE_SPLIT_L:
+ btree_xlog_split(true, record);
+ break;
+ case XLOG_BTREE_SPLIT_R:
+ btree_xlog_split(false, record);
+ break;
+ case XLOG_BTREE_INSERT_POST:
+ btree_xlog_insert(true, false, true, record);
+ break;
+ case XLOG_BTREE_DEDUP:
+ btree_xlog_dedup(record);
+ break;
+ case XLOG_BTREE_VACUUM:
+ btree_xlog_vacuum(record);
+ break;
+ case XLOG_BTREE_DELETE:
+ btree_xlog_delete(record);
+ break;
+ case XLOG_BTREE_MARK_PAGE_HALFDEAD:
+ btree_xlog_mark_page_halfdead(info, record);
+ break;
+ case XLOG_BTREE_UNLINK_PAGE:
+ case XLOG_BTREE_UNLINK_PAGE_META:
+ btree_xlog_unlink_page(info, record);
+ break;
+ case XLOG_BTREE_NEWROOT:
+ btree_xlog_newroot(record);
+ break;
+ case XLOG_BTREE_REUSE_PAGE:
+ btree_xlog_reuse_page(record);
+ break;
+ case XLOG_BTREE_META_CLEANUP:
+ _bt_restore_meta(record, 0);
+ break;
+ default:
+ elog(PANIC, "btree_redo: unknown op code %u", info);
+ }
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(opCtx);
+}
+
+void
+btree_xlog_startup(void)
+{
+ opCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "Btree recovery temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+}
+
+void
+btree_xlog_cleanup(void)
+{
+ MemoryContextDelete(opCtx);
+ opCtx = NULL;
+}
+
+/*
+ * Mask a btree page before performing consistency checks on it.
+ */
+void
+btree_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+ BTPageOpaque maskopaq;
+
+ mask_page_lsn_and_checksum(page);
+
+ mask_page_hint_bits(page);
+ mask_unused_space(page);
+
+ maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (P_ISLEAF(maskopaq))
+ {
+ /*
+ * In btree leaf pages, it is possible to modify the LP_FLAGS without
+ * emitting any WAL record. Hence, mask the line pointer flags. See
+ * _bt_killitems(), _bt_check_unique() for details.
+ */
+ mask_lp_flags(page);
+ }
+
+ /*
+ * BTP_HAS_GARBAGE is just an un-logged hint bit. So, mask it. See
+ * _bt_delete_or_dedup_one_page(), _bt_killitems(), and _bt_check_unique()
+ * for details.
+ */
+ maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ /*
+ * During replay of a btree page split, we don't set the BTP_SPLIT_END
+ * flag of the right sibling and initialize the cycle_id to 0 for the same
+ * page. See btree_xlog_split() for details.
+ */
+ maskopaq->btpo_flags &= ~BTP_SPLIT_END;
+ maskopaq->btpo_cycleid = 0;
+}