Diffstat (limited to 'src/backend/access/nbtree/nbtinsert.c')
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c  3009
1 files changed, 3009 insertions, 0 deletions
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
new file mode 100644
index 0000000..f6f4af8
--- /dev/null
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -0,0 +1,3009 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtinsert.c
+ * Item insertion in Lehman and Yao btrees for Postgres.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtinsert.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/nbtxlog.h"
+#include "access/transam.h"
+#include "access/xloginsert.h"
+#include "common/pg_prng.h"
+#include "lib/qunique.h"
+#include "miscadmin.h"
+#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "storage/smgr.h"
+
+/* Minimum tree height for application of fastpath optimization */
+#define BTREE_FASTPATH_MIN_LEVEL 2
+
+
+static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate);
+static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
+ Relation heapRel,
+ IndexUniqueCheck checkUnique, bool *is_unique,
+ uint32 *speculativeToken);
+static OffsetNumber _bt_findinsertloc(Relation rel,
+ BTInsertState insertstate,
+ bool checkingunique,
+ bool indexUnchanged,
+ BTStack stack,
+ Relation heapRel);
+static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack);
+static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
+ Buffer buf,
+ Buffer cbuf,
+ BTStack stack,
+ IndexTuple itup,
+ Size itemsz,
+ OffsetNumber newitemoff,
+ int postingoff,
+ bool split_only_page);
+static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
+ Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
+ IndexTuple newitem, IndexTuple orignewitem,
+ IndexTuple nposting, uint16 postingoff);
+static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
+ BTStack stack, bool isroot, bool isonly);
+static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
+static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
+ OffsetNumber itup_off, bool newfirstdataitem);
+static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
+ BTInsertState insertstate,
+ bool simpleonly, bool checkingunique,
+ bool uniquedup, bool indexUnchanged);
+static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
+ OffsetNumber *deletable, int ndeletable,
+ IndexTuple newitem, OffsetNumber minoff,
+ OffsetNumber maxoff);
+static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
+ int ndeletable, IndexTuple newitem,
+ int *nblocks);
+static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
+
+/*
+ * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
+ *
+ * This routine is called by the public interface routine, btinsert.
+ * By here, itup is filled in, including the TID.
+ *
+ * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
+ * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
+ * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
+ * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
+ * don't actually insert.
+ *
+ * indexUnchanged executor hint indicates if itup is from an
+ * UPDATE that didn't logically change the indexed value, but
+ * must nevertheless have a new entry to point to a successor
+ * version.
+ *
+ * The result value is only significant for UNIQUE_CHECK_PARTIAL:
+ * it must be true if the entry is known unique, else false.
+ * (In the current implementation we'll also return true after a
+ * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
+ * that's just a coding artifact.)
+ */
+bool
+_bt_doinsert(Relation rel, IndexTuple itup,
+ IndexUniqueCheck checkUnique, bool indexUnchanged,
+ Relation heapRel)
+{
+ bool is_unique = false;
+ BTInsertStateData insertstate;
+ BTScanInsert itup_key;
+ BTStack stack;
+ bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
+
+ /* we need an insertion scan key to do our search, so build one */
+ itup_key = _bt_mkscankey(rel, itup);
+
+ if (checkingunique)
+ {
+ if (!itup_key->anynullkeys)
+ {
+ /* No (heapkeyspace) scantid until uniqueness established */
+ itup_key->scantid = NULL;
+ }
+ else
+ {
+ /*
+ * Scan key for new tuple contains NULL key values. Bypass
+ * checkingunique steps. They are unnecessary because core code
+ * considers NULL unequal to every value, including NULL.
+ *
+ * This optimization avoids O(N^2) behavior within the
+ * _bt_findinsertloc() heapkeyspace path when a unique index has a
+ * large number of "duplicates" with NULL key values.
+ */
+ checkingunique = false;
+ /* Tuple is unique in the sense that core code cares about */
+ Assert(checkUnique != UNIQUE_CHECK_EXISTING);
+ is_unique = true;
+ }
+ }
+
+ /*
+ * Fill in the BTInsertState working area, to track the current page and
+ * position within the page to insert on.
+ *
+ * Note that itemsz is passed down to lower level code that deals with
+ * inserting the item. It must be MAXALIGN()'d. This ensures that space
+ * accounting code consistently considers the alignment overhead that we
+ * expect PageAddItem() will add later. (Actually, index_form_tuple() is
+ * already conservative about alignment, but we don't rely on that from
+ * this distance. Besides, preserving the "true" tuple size in index
+ * tuple headers for the benefit of nbtsplitloc.c might happen someday.
+ * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
+ */
+ insertstate.itup = itup;
+ insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
+ insertstate.itup_key = itup_key;
+ insertstate.bounds_valid = false;
+ insertstate.buf = InvalidBuffer;
+ insertstate.postingoff = 0;
+
+search:
+
+ /*
+ * Find and lock the leaf page that the tuple should be added to by
+ * searching from the root page. insertstate.buf will hold a buffer that
+ * is locked in exclusive mode afterwards.
+ */
+ stack = _bt_search_insert(rel, &insertstate);
+
+ /*
+ * checkingunique inserts are not allowed to go ahead when two tuples with
+ * equal key attribute values would be visible to new MVCC snapshots once
+ * the xact commits. Check for conflicts in the locked page/buffer (if
+ * needed) here.
+ *
+ * It might be necessary to check a page to the right in _bt_check_unique,
+ * though that should be very rare. In practice the first page the value
+ * could be on (with scantid omitted) is almost always also the only page
+ * that a matching tuple might be found on. This is due to the behavior
+ * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
+ * only be allowed to cross a page boundary when there is no candidate
+ * leaf page split point that avoids it. Also, _bt_check_unique can use
+ * the leaf page high key to determine that there will be no duplicates on
+ * the right sibling without actually visiting it (it uses the high key in
+ * cases where the new item happens to belong at the far right of the leaf
+ * page).
+ *
+ * NOTE: obviously, _bt_check_unique can only detect keys that are already
+ * in the index; so it cannot defend against concurrent insertions of the
+ * same key. We protect against that by means of holding a write lock on
+ * the first page the value could be on, with omitted/-inf value for the
+ * implicit heap TID tiebreaker attribute. Any other would-be inserter of
+ * the same key must acquire a write lock on the same page, so only one
+ * would-be inserter can be making the check at one time. Furthermore,
+ * once we are past the check we hold write locks continuously until we
+ * have performed our insertion, so no later inserter can fail to see our
+ * insertion. (This requires some care in _bt_findinsertloc.)
+ *
+ * If we must wait for another xact, we release the lock while waiting,
+ * and then must perform a new search.
+ *
+ * For a partial uniqueness check, we don't wait for the other xact. Just
+ * let the tuple in and return false for possibly non-unique, or true for
+ * definitely unique.
+ */
+ if (checkingunique)
+ {
+ TransactionId xwait;
+ uint32 speculativeToken;
+
+ xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
+ &is_unique, &speculativeToken);
+
+ if (unlikely(TransactionIdIsValid(xwait)))
+ {
+ /* Have to wait for the other guy ... */
+ _bt_relbuf(rel, insertstate.buf);
+ insertstate.buf = InvalidBuffer;
+
+ /*
+ * If it's a speculative insertion, wait for it to finish (ie. to
+ * go ahead with the insertion, or kill the tuple). Otherwise
+ * wait for the transaction to finish as usual.
+ */
+ if (speculativeToken)
+ SpeculativeInsertionWait(xwait, speculativeToken);
+ else
+ XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
+
+ /* start over... */
+ if (stack)
+ _bt_freestack(stack);
+ goto search;
+ }
+
+ /* Uniqueness is established -- restore heap tid as scantid */
+ if (itup_key->heapkeyspace)
+ itup_key->scantid = &itup->t_tid;
+ }
+
+ if (checkUnique != UNIQUE_CHECK_EXISTING)
+ {
+ OffsetNumber newitemoff;
+
+ /*
+ * The only conflict predicate locking cares about for indexes is when
+ * an index tuple insert conflicts with an existing lock. We don't
+ * know the actual page we're going to insert on for sure just yet in
+ * checkingunique and !heapkeyspace cases, but it's okay to use the
+ * first page the value could be on (with scantid omitted) instead.
+ */
+ CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
+
+ /*
+ * Do the insertion. Note that insertstate contains cached binary
+ * search bounds established within _bt_check_unique when insertion is
+ * checkingunique.
+ */
+ newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
+ indexUnchanged, stack, heapRel);
+ _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
+ itup, insertstate.itemsz, newitemoff,
+ insertstate.postingoff, false);
+ }
+ else
+ {
+ /* just release the buffer */
+ _bt_relbuf(rel, insertstate.buf);
+ }
+
+ /* be tidy */
+ if (stack)
+ _bt_freestack(stack);
+ pfree(itup_key);
+
+ return is_unique;
+}
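+
+/*
+ * For orientation, the overall call sequence performed by _bt_doinsert()
+ * above can be sketched as follows (a recap of the steps already shown,
+ * not additional executable code):
+ *
+ *     itup_key = _bt_mkscankey(rel, itup);             build insertion scankey
+ *     stack = _bt_search_insert(rel, &insertstate);    lock first legal leaf
+ *     if (checkingunique)
+ *         xwait = _bt_check_unique(...);               may wait, then retry
+ *     newitemoff = _bt_findinsertloc(rel, ...);        may step right
+ *     _bt_insertonpg(rel, itup_key, ...);              insert, split if needed
+ */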
+
+/*
+ * _bt_search_insert() -- _bt_search() wrapper for inserts
+ *
+ * Search the tree for a particular scankey, or more precisely for the first
+ * leaf page it could be on. Try to make use of the fastpath optimization's
+ * rightmost leaf page cache before actually searching the tree from the root
+ * page, though.
+ *
+ * Return value is a stack of parent-page pointers (though see notes about
+ * fastpath optimization and page splits below). insertstate->buf is set to
+ * the address of the leaf-page buffer, which is write-locked and pinned in
+ * all cases (if necessary by creating a new empty root page for caller).
+ *
+ * The fastpath optimization avoids most of the work of searching the tree
+ * repeatedly when a single backend inserts successive new tuples on the
+ * rightmost leaf page of an index. A backend cache of the rightmost leaf
+ * page is maintained within _bt_insertonpg(), and used here. The cache is
+ * invalidated here when an insert of a non-pivot tuple must take place on a
+ * non-rightmost leaf page.
+ *
+ * The optimization helps with indexes on an auto-incremented field. It also
+ * helps with indexes on datetime columns, as well as indexes with lots of
+ * NULL values. (NULLs usually get inserted in the rightmost page for single
+ * column indexes, since they usually get treated as coming after everything
+ * else in the key space. Individual NULL tuples will generally be placed on
+ * the rightmost leaf page due to the influence of the heap TID column.)
+ *
+ * Note that we avoid applying the optimization when there is insufficient
+ * space on the rightmost page to fit caller's new item. This is necessary
+ * because we'll need to return a real descent stack when a page split is
+ * expected (actually, caller can cope with a leaf page split that uses a NULL
+ * stack, but that's very slow and so must be avoided). Note also that the
+ * fastpath optimization acquires the lock on the page conditionally as a way
+ * of reducing extra contention when there are concurrent insertions into the
+ * rightmost page (we give up if we'd have to wait for the lock). We assume
+ * that it isn't useful to apply the optimization when there is contention,
+ * since each per-backend cache won't stay valid for long.
+ */
+static BTStack
+_bt_search_insert(Relation rel, BTInsertState insertstate)
+{
+ Assert(insertstate->buf == InvalidBuffer);
+ Assert(!insertstate->bounds_valid);
+ Assert(insertstate->postingoff == 0);
+
+ if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
+ {
+ /* Simulate a _bt_getbuf() call with conditional locking */
+ insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
+ if (_bt_conditionallockbuf(rel, insertstate->buf))
+ {
+ Page page;
+ BTPageOpaque opaque;
+
+ _bt_checkpage(rel, insertstate->buf);
+ page = BufferGetPage(insertstate->buf);
+ opaque = BTPageGetOpaque(page);
+
+ /*
+ * Check if the page is still the rightmost leaf page and has
+ * enough free space to accommodate the new tuple. Also check
+ * that the insertion scan key is strictly greater than the first
+ * non-pivot tuple on the page. (Note that we expect itup_key's
+ * scantid to be unset when our caller is a checkingunique
+ * inserter.)
+ */
+ if (P_RIGHTMOST(opaque) &&
+ P_ISLEAF(opaque) &&
+ !P_IGNORE(opaque) &&
+ PageGetFreeSpace(page) > insertstate->itemsz &&
+ PageGetMaxOffsetNumber(page) >= P_HIKEY &&
+ _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
+ {
+ /*
+ * Caller can use the fastpath optimization because cached
+ * block is still rightmost leaf page, which can fit caller's
+ * new tuple without splitting. Keep block in local cache for
+ * next insert, and have caller use NULL stack.
+ *
+ * Note that _bt_insert_parent() has an assertion that catches
+ * leaf page splits that somehow follow from a fastpath insert
+ * (it should only be passed a NULL stack when it must deal
+ * with a concurrent root page split, and never because a NULL
+ * stack was returned here).
+ */
+ return NULL;
+ }
+
+ /* Page unsuitable for caller, drop lock and pin */
+ _bt_relbuf(rel, insertstate->buf);
+ }
+ else
+ {
+ /* Lock unavailable, drop pin */
+ ReleaseBuffer(insertstate->buf);
+ }
+
+ /* Forget block, since cache doesn't appear to be useful */
+ RelationSetTargetBlock(rel, InvalidBlockNumber);
+ }
+
+ /* Cannot use optimization -- descend tree, return proper descent stack */
+ return _bt_search(rel, insertstate->itup_key, &insertstate->buf, BT_WRITE,
+ NULL);
+}
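+
+/*
+ * In outline, the fastpath taken above requires all of the following (just a
+ * summary of the checks made in _bt_search_insert(), not new conditions): a
+ * cached target block, a successfully acquired conditional write lock, a
+ * page that is still the rightmost non-ignorable leaf page, enough free
+ * space for the incoming tuple, and an insertion scan key that compares
+ * strictly greater than the page's first data item.  Failing any of these
+ * forgets the cached block and falls back to an ordinary _bt_search()
+ * descent from the root.
+ */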
+
+/*
+ * _bt_check_unique() -- Check for violation of unique index constraint
+ *
+ * Returns InvalidTransactionId if there is no conflict, else an xact ID
+ * we must wait for to see if it commits a conflicting tuple. If an actual
+ * conflict is detected, no return --- just ereport(). If an xact ID is
+ * returned, and the conflicting tuple still has a speculative insertion in
+ * progress, *speculativeToken is set to non-zero, and the caller can wait for
+ * the verdict on the insertion using SpeculativeInsertionWait().
+ *
+ * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
+ * InvalidTransactionId because we don't want to wait. In this case we
+ * set *is_unique to false if there is a potential conflict, and the
+ * core code must redo the uniqueness check later.
+ *
+ * As a side-effect, sets state in insertstate that can later be used by
+ * _bt_findinsertloc() to reuse most of the binary search work we do
+ * here.
+ *
+ * This code treats NULLs as equal, unlike the default semantics for unique
+ * indexes. So do not call here when there are NULL values in scan key and
+ * the index uses the default NULLS DISTINCT mode.
+ */
+static TransactionId
+_bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
+ IndexUniqueCheck checkUnique, bool *is_unique,
+ uint32 *speculativeToken)
+{
+ IndexTuple itup = insertstate->itup;
+ IndexTuple curitup = NULL;
+ ItemId curitemid = NULL;
+ BTScanInsert itup_key = insertstate->itup_key;
+ SnapshotData SnapshotDirty;
+ OffsetNumber offset;
+ OffsetNumber maxoff;
+ Page page;
+ BTPageOpaque opaque;
+ Buffer nbuf = InvalidBuffer;
+ bool found = false;
+ bool inposting = false;
+ bool prevalldead = true;
+ int curposti = 0;
+
+ /* Assume unique until we find a duplicate */
+ *is_unique = true;
+
+ InitDirtySnapshot(SnapshotDirty);
+
+ page = BufferGetPage(insertstate->buf);
+ opaque = BTPageGetOpaque(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * Find the first tuple with the same key.
+ *
+ * This also saves the binary search bounds in insertstate. We use them
+ * in the fastpath below, but also in the _bt_findinsertloc() call later.
+ */
+ Assert(!insertstate->bounds_valid);
+ offset = _bt_binsrch_insert(rel, insertstate);
+
+ /*
+ * Scan over all equal tuples, looking for live conflicts.
+ */
+ Assert(!insertstate->bounds_valid || insertstate->low == offset);
+ Assert(!itup_key->anynullkeys);
+ Assert(itup_key->scantid == NULL);
+ for (;;)
+ {
+ /*
+ * Each iteration of the loop processes one heap TID, not one index
+ * tuple. Current offset number for page isn't usually advanced on
+ * iterations that process heap TIDs from posting list tuples.
+ *
+ * "inposting" state is set when _inside_ a posting list --- not when
+ * we're at the start (or end) of a posting list. We advance curposti
+ * at the end of the iteration when inside a posting list tuple. In
+ * general, every loop iteration either advances the page offset or
+ * advances curposti --- an iteration that handles the rightmost/max
+ * heap TID in a posting list finally advances the page offset (and
+ * unsets "inposting").
+ *
+ * Make sure the offset points to an actual index tuple before trying
+ * to examine it...
+ */
+ if (offset <= maxoff)
+ {
+ /*
+ * Fastpath: In most cases, we can use cached search bounds to
+ * limit our consideration to items that are definitely
+ * duplicates. This fastpath doesn't apply when the original page
+ * is empty, or when initial offset is past the end of the
+ * original page, which may indicate that we need to examine a
+ * second or subsequent page.
+ *
+ * Note that this optimization allows us to avoid calling
+ * _bt_compare() directly when there are no duplicates, as long as
+ * the offset where the key will go is not at the end of the page.
+ */
+ if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
+ {
+ Assert(insertstate->bounds_valid);
+ Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
+ Assert(insertstate->low <= insertstate->stricthigh);
+ Assert(_bt_compare(rel, itup_key, page, offset) < 0);
+ break;
+ }
+
+ /*
+ * We can skip items that are already marked killed.
+ *
+ * In the presence of heavy update activity an index may contain
+ * many killed items with the same key; running _bt_compare() on
+ * each killed item gets expensive. Just advance over killed
+ * items as quickly as we can. We only apply _bt_compare() when
+ * we get to a non-killed item. We could reuse the bounds to
+ * avoid _bt_compare() calls for known equal tuples, but it
+ * doesn't seem worth it.
+ */
+ if (!inposting)
+ curitemid = PageGetItemId(page, offset);
+ if (inposting || !ItemIdIsDead(curitemid))
+ {
+ ItemPointerData htid;
+ bool all_dead = false;
+
+ if (!inposting)
+ {
+ /* Plain tuple, or first TID in posting list tuple */
+ if (_bt_compare(rel, itup_key, page, offset) != 0)
+ break; /* we're past all the equal tuples */
+
+ /* Advance curitup */
+ curitup = (IndexTuple) PageGetItem(page, curitemid);
+ Assert(!BTreeTupleIsPivot(curitup));
+ }
+
+ /* okay, we gotta fetch the heap tuple using htid ... */
+ if (!BTreeTupleIsPosting(curitup))
+ {
+ /* ... htid is from simple non-pivot tuple */
+ Assert(!inposting);
+ htid = curitup->t_tid;
+ }
+ else if (!inposting)
+ {
+ /* ... htid is first TID in new posting list */
+ inposting = true;
+ prevalldead = true;
+ curposti = 0;
+ htid = *BTreeTupleGetPostingN(curitup, 0);
+ }
+ else
+ {
+ /* ... htid is second or subsequent TID in posting list */
+ Assert(curposti > 0);
+ htid = *BTreeTupleGetPostingN(curitup, curposti);
+ }
+
+ /*
+ * If we are doing a recheck, we expect to find the tuple we
+ * are rechecking. It's not a duplicate, but we have to keep
+ * scanning.
+ */
+ if (checkUnique == UNIQUE_CHECK_EXISTING &&
+ ItemPointerCompare(&htid, &itup->t_tid) == 0)
+ {
+ found = true;
+ }
+
+ /*
+ * Check if there are any table tuples for this index entry
+ * satisfying SnapshotDirty. This is necessary because for AMs
+ * with optimizations like heap's HOT, we have just a single
+ * index entry for the entire chain.
+ */
+ else if (table_index_fetch_tuple_check(heapRel, &htid,
+ &SnapshotDirty,
+ &all_dead))
+ {
+ TransactionId xwait;
+
+ /*
+ * It is a duplicate. If we are only doing a partial
+ * check, then don't bother checking if the tuple is being
+ * updated in another transaction. Just return the fact
+ * that it is a potential conflict and leave the full
+ * check till later. Don't invalidate binary search
+ * bounds.
+ */
+ if (checkUnique == UNIQUE_CHECK_PARTIAL)
+ {
+ if (nbuf != InvalidBuffer)
+ _bt_relbuf(rel, nbuf);
+ *is_unique = false;
+ return InvalidTransactionId;
+ }
+
+ /*
+ * If this tuple is being updated by other transaction
+ * then we have to wait for its commit/abort.
+ */
+ xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
+ SnapshotDirty.xmin : SnapshotDirty.xmax;
+
+ if (TransactionIdIsValid(xwait))
+ {
+ if (nbuf != InvalidBuffer)
+ _bt_relbuf(rel, nbuf);
+ /* Tell _bt_doinsert to wait... */
+ *speculativeToken = SnapshotDirty.speculativeToken;
+ /* Caller releases lock on buf immediately */
+ insertstate->bounds_valid = false;
+ return xwait;
+ }
+
+ /*
+ * Otherwise we have a definite conflict. But before
+ * complaining, look to see if the tuple we want to insert
+ * is itself now committed dead --- if so, don't complain.
+ * This is a waste of time in normal scenarios but we must
+ * do it to support CREATE INDEX CONCURRENTLY.
+ *
+ * We must follow HOT-chains here because during
+ * concurrent index build, we insert the root TID though
+ * the actual tuple may be somewhere in the HOT-chain.
+ * While following the chain we might not stop at the
+ * exact tuple which triggered the insert, but that's OK
+ * because if we find a live tuple anywhere in this chain,
+ * we have a unique key conflict. The other live tuple is
+ * not part of this chain because it had a different index
+ * entry.
+ */
+ htid = itup->t_tid;
+ if (table_index_fetch_tuple_check(heapRel, &htid,
+ SnapshotSelf, NULL))
+ {
+ /* Normal case --- it's still live */
+ }
+ else
+ {
+ /*
+ * It's been deleted, so no error, and no need to
+ * continue searching
+ */
+ break;
+ }
+
+ /*
+ * Check for a conflict-in as we would if we were going to
+ * write to this page. We aren't actually going to write,
+ * but we want a chance to report SSI conflicts that would
+ * otherwise be masked by this unique constraint
+ * violation.
+ */
+ CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
+
+ /*
+ * This is a definite conflict. Break the tuple down into
+ * datums and report the error. But first, make sure we
+ * release the buffer locks we're holding ---
+ * BuildIndexValueDescription could make catalog accesses,
+ * which in the worst case might touch this same index and
+ * cause deadlocks.
+ */
+ if (nbuf != InvalidBuffer)
+ _bt_relbuf(rel, nbuf);
+ _bt_relbuf(rel, insertstate->buf);
+ insertstate->buf = InvalidBuffer;
+ insertstate->bounds_valid = false;
+
+ {
+ Datum values[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ char *key_desc;
+
+ index_deform_tuple(itup, RelationGetDescr(rel),
+ values, isnull);
+
+ key_desc = BuildIndexValueDescription(rel, values,
+ isnull);
+
+ ereport(ERROR,
+ (errcode(ERRCODE_UNIQUE_VIOLATION),
+ errmsg("duplicate key value violates unique constraint \"%s\"",
+ RelationGetRelationName(rel)),
+ key_desc ? errdetail("Key %s already exists.",
+ key_desc) : 0,
+ errtableconstraint(heapRel,
+ RelationGetRelationName(rel))));
+ }
+ }
+ else if (all_dead && (!inposting ||
+ (prevalldead &&
+ curposti == BTreeTupleGetNPosting(curitup) - 1)))
+ {
+ /*
+ * The conflicting tuple (or all HOT chains pointed to by
+ * all posting list TIDs) is dead to everyone, so mark the
+ * index entry killed.
+ */
+ ItemIdMarkDead(curitemid);
+ opaque->btpo_flags |= BTP_HAS_GARBAGE;
+
+ /*
+ * Mark buffer with a dirty hint, since state is not
+ * crucial. Be sure to mark the proper buffer dirty.
+ */
+ if (nbuf != InvalidBuffer)
+ MarkBufferDirtyHint(nbuf, true);
+ else
+ MarkBufferDirtyHint(insertstate->buf, true);
+ }
+
+ /*
+ * Remember if posting list tuple has even a single HOT chain
+ * whose members are not all dead
+ */
+ if (!all_dead && inposting)
+ prevalldead = false;
+ }
+ }
+
+ if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
+ {
+ /* Advance to next TID in same posting list */
+ curposti++;
+ continue;
+ }
+ else if (offset < maxoff)
+ {
+ /* Advance to next tuple */
+ curposti = 0;
+ inposting = false;
+ offset = OffsetNumberNext(offset);
+ }
+ else
+ {
+ int highkeycmp;
+
+ /* If scankey == hikey we gotta check the next page too */
+ if (P_RIGHTMOST(opaque))
+ break;
+ highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
+ Assert(highkeycmp <= 0);
+ if (highkeycmp != 0)
+ break;
+ /* Advance to next non-dead page --- there must be one */
+ for (;;)
+ {
+ BlockNumber nblkno = opaque->btpo_next;
+
+ nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
+ page = BufferGetPage(nbuf);
+ opaque = BTPageGetOpaque(page);
+ if (!P_IGNORE(opaque))
+ break;
+ if (P_RIGHTMOST(opaque))
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+ }
+ /* Will also advance to next tuple */
+ curposti = 0;
+ inposting = false;
+ maxoff = PageGetMaxOffsetNumber(page);
+ offset = P_FIRSTDATAKEY(opaque);
+ /* Don't invalidate binary search bounds */
+ }
+ }
+
+ /*
+ * If we are doing a recheck then we should have found the tuple we are
+ * checking. Otherwise there's something very wrong --- probably, the
+ * index is on a non-immutable expression.
+ */
+ if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to re-find tuple within index \"%s\"",
+ RelationGetRelationName(rel)),
+ errhint("This may be because of a non-immutable index expression."),
+ errtableconstraint(heapRel,
+ RelationGetRelationName(rel))));
+
+ if (nbuf != InvalidBuffer)
+ _bt_relbuf(rel, nbuf);
+
+ return InvalidTransactionId;
+}
+
+
+/*
+ * _bt_findinsertloc() -- Finds an insert location for a tuple
+ *
+ * On entry, insertstate buffer contains the page the new tuple belongs
+ * on. It is exclusive-locked and pinned by the caller.
+ *
+ * If 'checkingunique' is true, the buffer on entry is the first page
+ * that contains duplicates of the new key. If there are duplicates on
+ * multiple pages, the correct insertion position might be some page to
+ * the right, rather than the first page. In that case, this function
+ * moves right to the correct target page.
+ *
+ * (In a !heapkeyspace index, there can be multiple pages with the same
+ * high key, on which the new tuple could legitimately be placed. In
+ * that case, the caller passes the first page containing duplicates,
+ * just like when checkingunique=true. If that page doesn't have enough
+ * room for the new tuple, this function moves right, trying to find a
+ * legal page that does.)
+ *
+ * If 'indexUnchanged' is true, this is for an UPDATE that didn't
+ * logically change the indexed value, but must nevertheless have a new
+ * entry to point to a successor version. This hint from the executor
+ * will influence our behavior when the page might have to be split and
+ * we must consider our options. Bottom-up index deletion can avoid
+ * pathological version-driven page splits, but we only want to go to the
+ * trouble of trying it when we already have moderate confidence that
+ * it's appropriate. The hint should not significantly affect our
+ * behavior over time unless practically all inserts on to the leaf page
+ * get the hint.
+ *
+ * On exit, insertstate buffer contains the chosen insertion page, and
+ * the offset within that page is returned. If _bt_findinsertloc needed
+ * to move right, the lock and pin on the original page are released, and
+ * the new buffer is exclusively locked and pinned instead.
+ *
+ * If insertstate contains cached binary search bounds, we will take
+ * advantage of them. This avoids repeating comparisons that we made in
+ * _bt_check_unique() already.
+ */
+static OffsetNumber
+_bt_findinsertloc(Relation rel,
+ BTInsertState insertstate,
+ bool checkingunique,
+ bool indexUnchanged,
+ BTStack stack,
+ Relation heapRel)
+{
+ BTScanInsert itup_key = insertstate->itup_key;
+ Page page = BufferGetPage(insertstate->buf);
+ BTPageOpaque opaque;
+ OffsetNumber newitemoff;
+
+ opaque = BTPageGetOpaque(page);
+
+ /* Check 1/3 of a page restriction */
+ if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
+ _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
+ insertstate->itup);
+
+ Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
+ Assert(!insertstate->bounds_valid || checkingunique);
+ Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
+ Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
+ Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
+
+ if (itup_key->heapkeyspace)
+ {
+ /* Keep track of whether checkingunique duplicate seen */
+ bool uniquedup = indexUnchanged;
+
+ /*
+ * If we're inserting into a unique index, we may have to walk right
+ * through leaf pages to find the one leaf page that we must insert on
+ * to.
+ *
+ * This is needed for checkingunique callers because a scantid was not
+ * used when we called _bt_search(). scantid can only be set after
+ * _bt_check_unique() has checked for duplicates. The buffer
+ * initially stored in insertstate->buf has the page where the first
+ * duplicate key might be found, which isn't always the page that new
+ * tuple belongs on. The heap TID attribute for new tuple (scantid)
+ * could force us to insert on a sibling page, though that should be
+ * very rare in practice.
+ */
+ if (checkingunique)
+ {
+ if (insertstate->low < insertstate->stricthigh)
+ {
+ /* Encountered a duplicate in _bt_check_unique() */
+ Assert(insertstate->bounds_valid);
+ uniquedup = true;
+ }
+
+ for (;;)
+ {
+ /*
+ * Does the new tuple belong on this page?
+ *
+ * The earlier _bt_check_unique() call may well have
+ * established a strict upper bound on the offset for the new
+ * item. If it's not the last item of the page (i.e. if there
+ * is at least one tuple on the page that goes after the tuple
+ * we're inserting) then we know that the tuple belongs on
+ * this page. We can skip the high key check.
+ */
+ if (insertstate->bounds_valid &&
+ insertstate->low <= insertstate->stricthigh &&
+ insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
+ break;
+
+ /* Test '<=', not '!=', since scantid is set now */
+ if (P_RIGHTMOST(opaque) ||
+ _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
+ break;
+
+ _bt_stepright(rel, insertstate, stack);
+ /* Update local state after stepping right */
+ page = BufferGetPage(insertstate->buf);
+ opaque = BTPageGetOpaque(page);
+ /* Assume duplicates (if checkingunique) */
+ uniquedup = true;
+ }
+ }
+
+ /*
+ * If the target page cannot fit newitem, try to avoid splitting the
+ * page on insert by performing deletion or deduplication now
+ */
+ if (PageGetFreeSpace(page) < insertstate->itemsz)
+ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
+ checkingunique, uniquedup,
+ indexUnchanged);
+ }
+ else
+ {
+ /*----------
+ * This is a !heapkeyspace (version 2 or 3) index. The current page
+ * is the first page that we could insert the new tuple to, but there
+ * may be other pages to the right that we could opt to use instead.
+ *
+ * If the new key is equal to one or more existing keys, we can
+ * legitimately place it anywhere in the series of equal keys. In
+ * fact, if the new key is equal to the page's "high key" we can place
+ * it on the next page. If it is equal to the high key, and there's
+ * not room to insert the new tuple on the current page without
+ * splitting, then we move right hoping to find more free space and
+ * avoid a split.
+ *
+ * Keep scanning right until we
+ * (a) find a page with enough free space,
+ * (b) reach the last page where the tuple can legally go, or
+ * (c) get tired of searching.
+ * (c) is not flippant; it is important because if there are many
+ * pages' worth of equal keys, it's better to split one of the early
+ * pages than to scan all the way to the end of the run of equal keys
+ * on every insert. We implement "get tired" as a random choice,
+ * since stopping after scanning a fixed number of pages wouldn't work
+ * well (we'd never reach the right-hand side of previously split
+ * pages). The probability of moving right is set at 0.99, which may
+ * seem too high to change the behavior much, but it does an excellent
+ * job of preventing O(N^2) behavior with many equal keys.
+ *----------
+ */
+ while (PageGetFreeSpace(page) < insertstate->itemsz)
+ {
+ /*
+ * Before considering moving right, see if we can obtain enough
+ * space by erasing LP_DEAD items
+ */
+ if (P_HAS_GARBAGE(opaque))
+ {
+ /* Perform simple deletion */
+ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
+ false, false, false);
+
+ if (PageGetFreeSpace(page) >= insertstate->itemsz)
+ break; /* OK, now we have enough space */
+ }
+
+ /*
+ * Nope, so check conditions (b) and (c) enumerated above
+ *
+ * The earlier _bt_check_unique() call may well have established a
+ * strict upper bound on the offset for the new item. If it's not
+ * the last item of the page (i.e. if there is at least one tuple
+ * on the page that's greater than the tuple we're inserting)
+ * then we know that the tuple belongs on this page. We can skip
+ * the high key check.
+ */
+ if (insertstate->bounds_valid &&
+ insertstate->low <= insertstate->stricthigh &&
+ insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
+ break;
+
+ if (P_RIGHTMOST(opaque) ||
+ _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
+ pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
+ break;
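+
+ /*
+ * The random test above means the rightward scan "gets tired" with
+ * probability about 1/100 at each page, i.e. the probability of moving
+ * right is roughly 0.99; on average a long run of equal keys is followed
+ * for about 100 pages before we accept a split here.  (This is just the
+ * arithmetic behind the 0.99 probability discussed above.)
+ */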
+
+ _bt_stepright(rel, insertstate, stack);
+ /* Update local state after stepping right */
+ page = BufferGetPage(insertstate->buf);
+ opaque = BTPageGetOpaque(page);
+ }
+ }
+
+ /*
+ * We should now be on the correct page. Find the offset within the page
+ * for the new tuple. (Possibly reusing earlier search bounds.)
+ */
+ Assert(P_RIGHTMOST(opaque) ||
+ _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
+
+ newitemoff = _bt_binsrch_insert(rel, insertstate);
+
+ if (insertstate->postingoff == -1)
+ {
+ /*
+ * There is an overlapping posting list tuple with its LP_DEAD bit
+ * set. We don't want to unnecessarily unset its LP_DEAD bit while
+ * performing a posting list split, so perform simple index tuple
+ * deletion early.
+ */
+ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
+ false, false, false);
+
+ /*
+ * Do new binary search. New insert location cannot overlap with any
+ * posting list now.
+ */
+ Assert(!insertstate->bounds_valid);
+ insertstate->postingoff = 0;
+ newitemoff = _bt_binsrch_insert(rel, insertstate);
+ Assert(insertstate->postingoff == 0);
+ }
+
+ return newitemoff;
+}
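+
+/*
+ * A note on state left behind by _bt_findinsertloc() (a recap of the code
+ * above, not an additional contract): besides returning the offset for the
+ * new tuple, it leaves insertstate->postingoff greater than zero when the
+ * new tuple's heap TID falls within an existing posting list tuple at that
+ * offset; _bt_insertonpg() performs the posting list split in that case.
+ */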
+
+/*
+ * Step right to next non-dead page, during insertion.
+ *
+ * This is a bit more complicated than moving right in a search. We must
+ * write-lock the target page before releasing write lock on current page;
+ * else someone else's _bt_check_unique scan could fail to see our insertion.
+ * Write locks on intermediate dead pages won't do because we don't know when
+ * they will get de-linked from the tree.
+ *
+ * This is more aggressive than it needs to be for non-unique !heapkeyspace
+ * indexes.
+ */
+static void
+_bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
+{
+ Page page;
+ BTPageOpaque opaque;
+ Buffer rbuf;
+ BlockNumber rblkno;
+
+ page = BufferGetPage(insertstate->buf);
+ opaque = BTPageGetOpaque(page);
+
+ rbuf = InvalidBuffer;
+ rblkno = opaque->btpo_next;
+ for (;;)
+ {
+ rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
+ page = BufferGetPage(rbuf);
+ opaque = BTPageGetOpaque(page);
+
+ /*
+ * If this page was incompletely split, finish the split now. We do
+ * this while holding a lock on the left sibling, which is not good
+ * because finishing the split could be a fairly lengthy operation.
+ * But this should happen very seldom.
+ */
+ if (P_INCOMPLETE_SPLIT(opaque))
+ {
+ _bt_finish_split(rel, rbuf, stack);
+ rbuf = InvalidBuffer;
+ continue;
+ }
+
+ if (!P_IGNORE(opaque))
+ break;
+ if (P_RIGHTMOST(opaque))
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+
+ rblkno = opaque->btpo_next;
+ }
+ /* rbuf locked; unlock buf, update state for caller */
+ _bt_relbuf(rel, insertstate->buf);
+ insertstate->buf = rbuf;
+ insertstate->bounds_valid = false;
+}
+
+/*----------
+ * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
+ *
+ * This recursive procedure does the following things:
+ *
+ * + if postingoff != 0, splits existing posting list tuple
+ * (since it overlaps with new 'itup' tuple).
+ * + if necessary, splits the target page, using 'itup_key' for
+ * suffix truncation on leaf pages (caller passes NULL for
+ * non-leaf pages).
+ * + inserts the new tuple (might be split from posting list).
+ * + if the page was split, pops the parent stack, and finds the
+ * right place to insert the new child pointer (by walking
+ * right using information stored in the parent stack).
+ * + invokes itself with the appropriate tuple for the right
+ * child page on the parent.
+ * + updates the metapage if a true root or fast root is split.
+ *
+ * On entry, we must have the correct buffer in which to do the
+ * insertion, and the buffer must be pinned and write-locked. On return,
+ * we will have dropped both the pin and the lock on the buffer.
+ *
+ * This routine only performs retail tuple insertions. 'itup' should
+ * always be either a non-highkey leaf item, or a downlink (new high
+ * key items are created indirectly, when a page is split). When
+ * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
+ * we're inserting the downlink for. This function will clear the
+ * INCOMPLETE_SPLIT flag on it, and release the buffer.
+ *----------
+ */
+static void
+_bt_insertonpg(Relation rel,
+ BTScanInsert itup_key,
+ Buffer buf,
+ Buffer cbuf,
+ BTStack stack,
+ IndexTuple itup,
+ Size itemsz,
+ OffsetNumber newitemoff,
+ int postingoff,
+ bool split_only_page)
+{
+ Page page;
+ BTPageOpaque opaque;
+ bool isleaf,
+ isroot,
+ isrightmost,
+ isonly;
+ IndexTuple oposting = NULL;
+ IndexTuple origitup = NULL;
+ IndexTuple nposting = NULL;
+
+ page = BufferGetPage(buf);
+ opaque = BTPageGetOpaque(page);
+ isleaf = P_ISLEAF(opaque);
+ isroot = P_ISROOT(opaque);
+ isrightmost = P_RIGHTMOST(opaque);
+ isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
+
+ /* child buffer must be given iff inserting on an internal page */
+ Assert(isleaf == !BufferIsValid(cbuf));
+ /* tuple must have appropriate number of attributes */
+ Assert(!isleaf ||
+ BTreeTupleGetNAtts(itup, rel) ==
+ IndexRelationGetNumberOfAttributes(rel));
+ Assert(isleaf ||
+ BTreeTupleGetNAtts(itup, rel) <=
+ IndexRelationGetNumberOfKeyAttributes(rel));
+ Assert(!BTreeTupleIsPosting(itup));
+ Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
+ /* Caller must always finish incomplete split for us */
+ Assert(!P_INCOMPLETE_SPLIT(opaque));
+
+ /*
+ * Every internal page should have exactly one negative infinity item at
+ * all times. Only _bt_split() and _bt_newroot() should add items that
+ * become negative infinity items through truncation, since they're the
+ * only routines that allocate new internal pages.
+ */
+ Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
+
+ /*
+ * Do we need to split an existing posting list item?
+ */
+ if (postingoff != 0)
+ {
+ ItemId itemid = PageGetItemId(page, newitemoff);
+
+ /*
+ * The new tuple is a duplicate with a heap TID that falls inside the
+ * range of an existing posting list tuple on a leaf page. Prepare to
+ * split an existing posting list. Overwriting the posting list with
+ * its post-split version is treated as an extra step in either the
+ * insert or page split critical section.
+ */
+ Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
+ oposting = (IndexTuple) PageGetItem(page, itemid);
+
+ /*
+ * postingoff value comes from earlier call to _bt_binsrch_posting().
+ * Its binary search might think that a plain tuple must be a posting
+ * list tuple that needs to be split. This can happen with corruption
+ * involving an existing plain tuple that is a duplicate of the new
+ * item, up to and including its table TID. Check for that here in
+ * passing.
+ *
+ * Also verify that our caller has made sure that the existing posting
+ * list tuple does not have its LP_DEAD bit set.
+ */
+ if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
+ ItemPointerGetBlockNumber(&itup->t_tid),
+ ItemPointerGetOffsetNumber(&itup->t_tid),
+ newitemoff, BufferGetBlockNumber(buf),
+ RelationGetRelationName(rel))));
+
+ /* use a mutable copy of itup as our itup from here on */
+ origitup = itup;
+ itup = CopyIndexTuple(origitup);
+ nposting = _bt_swap_posting(itup, oposting, postingoff);
+ /* itup now contains rightmost/max TID from oposting */
+
+ /* Alter offset so that newitem goes after posting list */
+ newitemoff = OffsetNumberNext(newitemoff);
+ }
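+
+ /*
+ * When a posting list split was prepared above, itup is now a copy of the
+ * caller's tuple carrying oposting's former rightmost/max heap TID, while
+ * nposting (built by _bt_swap_posting) is the replacement posting list
+ * that absorbs the caller's TID; it overwrites oposting later, in either
+ * the insert or page split critical section.
+ */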
+
+ /*
+ * Do we need to split the page to fit the item on it?
+ *
+ * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
+ * so this comparison is correct even though we appear to be accounting
+ * only for the item and not for its line pointer.
+ */
+ if (PageGetFreeSpace(page) < itemsz)
+ {
+ Buffer rbuf;
+
+ Assert(!split_only_page);
+
+ /* split the buffer into left and right halves */
+ rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup,
+ origitup, nposting, postingoff);
+ PredicateLockPageSplit(rel,
+ BufferGetBlockNumber(buf),
+ BufferGetBlockNumber(rbuf));
+
+ /*----------
+ * By here,
+ *
+ * + our target page has been split;
+ * + the original tuple has been inserted;
+ * + we have write locks on both the old (left half)
+ * and new (right half) buffers, after the split; and
+ * + we know the key we want to insert into the parent
+ * (it's the "high key" on the left child page).
+ *
+ * We're ready to do the parent insertion. We need to hold onto the
+ * locks for the child pages until we locate the parent, but we can
+ * at least release the lock on the right child before doing the
+ * actual insertion. The lock on the left child will be released
+ * last of all by parent insertion, where it is the 'cbuf' of parent
+ * page.
+ *----------
+ */
+ _bt_insert_parent(rel, buf, rbuf, stack, isroot, isonly);
+ }
+ else
+ {
+ Buffer metabuf = InvalidBuffer;
+ Page metapg = NULL;
+ BTMetaPageData *metad = NULL;
+ BlockNumber blockcache;
+
+ /*
+ * If we are doing this insert because we split a page that was the
+ * only one on its tree level, but was not the root, it may have been
+ * the "fast root". We need to ensure that the fast root link points
+ * at or above the current page. We can safely acquire a lock on the
+ * metapage here --- see comments for _bt_newroot().
+ */
+ if (unlikely(split_only_page))
+ {
+ Assert(!isleaf);
+ Assert(BufferIsValid(cbuf));
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+
+ if (metad->btm_fastlevel >= opaque->btpo_level)
+ {
+ /* no update wanted */
+ _bt_relbuf(rel, metabuf);
+ metabuf = InvalidBuffer;
+ }
+ }
+
+ /* Do the update. No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ if (postingoff != 0)
+ memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
+
+ if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
+ false) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add new item to block %u in index \"%s\"",
+ BufferGetBlockNumber(buf), RelationGetRelationName(rel));
+
+ MarkBufferDirty(buf);
+
+ if (BufferIsValid(metabuf))
+ {
+ /* upgrade meta-page if needed */
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
+ _bt_upgrademetapage(metapg);
+ metad->btm_fastroot = BufferGetBlockNumber(buf);
+ metad->btm_fastlevel = opaque->btpo_level;
+ MarkBufferDirty(metabuf);
+ }
+
+ /*
+ * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
+ * finishes a split
+ */
+ if (!isleaf)
+ {
+ Page cpage = BufferGetPage(cbuf);
+ BTPageOpaque cpageop = BTPageGetOpaque(cpage);
+
+ Assert(P_INCOMPLETE_SPLIT(cpageop));
+ cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+ MarkBufferDirty(cbuf);
+ }
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_insert xlrec;
+ xl_btree_metadata xlmeta;
+ uint8 xlinfo;
+ XLogRecPtr recptr;
+ uint16 upostingoff;
+
+ xlrec.offnum = newitemoff;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
+
+ if (isleaf && postingoff == 0)
+ {
+ /* Simple leaf insert */
+ xlinfo = XLOG_BTREE_INSERT_LEAF;
+ }
+ else if (postingoff != 0)
+ {
+ /*
+ * Leaf insert with posting list split. Must include
+ * postingoff field before newitem/orignewitem.
+ */
+ Assert(isleaf);
+ xlinfo = XLOG_BTREE_INSERT_POST;
+ }
+ else
+ {
+ /* Internal page insert, which finishes a split on cbuf */
+ xlinfo = XLOG_BTREE_INSERT_UPPER;
+ XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
+
+ if (BufferIsValid(metabuf))
+ {
+ /* Actually, it's an internal page insert + meta update */
+ xlinfo = XLOG_BTREE_INSERT_META;
+
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ xlmeta.version = metad->btm_version;
+ xlmeta.root = metad->btm_root;
+ xlmeta.level = metad->btm_level;
+ xlmeta.fastroot = metad->btm_fastroot;
+ xlmeta.fastlevel = metad->btm_fastlevel;
+ xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
+ xlmeta.allequalimage = metad->btm_allequalimage;
+
+ XLogRegisterBuffer(2, metabuf,
+ REGBUF_WILL_INIT | REGBUF_STANDARD);
+ XLogRegisterBufData(2, (char *) &xlmeta,
+ sizeof(xl_btree_metadata));
+ }
+ }
+
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ if (postingoff == 0)
+ {
+ /* Just log itup from caller */
+ XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
+ }
+ else
+ {
+ /*
+ * Insert with posting list split (XLOG_BTREE_INSERT_POST
+ * record) case.
+ *
+ * Log postingoff. Also log origitup, not itup. REDO routine
+ * must reconstruct final itup (as well as nposting) using
+ * _bt_swap_posting().
+ */
+ upostingoff = postingoff;
+
+ XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
+ XLogRegisterBufData(0, (char *) origitup,
+ IndexTupleSize(origitup));
+ }
+
+ recptr = XLogInsert(RM_BTREE_ID, xlinfo);
+
+ if (BufferIsValid(metabuf))
+ PageSetLSN(metapg, recptr);
+ if (!isleaf)
+ PageSetLSN(BufferGetPage(cbuf), recptr);
+
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* Release subsidiary buffers */
+ if (BufferIsValid(metabuf))
+ _bt_relbuf(rel, metabuf);
+ if (!isleaf)
+ _bt_relbuf(rel, cbuf);
+
+ /*
+ * Cache the block number if this is the rightmost leaf page. Cache
+ * may be used by a future inserter within _bt_search_insert().
+ */
+ blockcache = InvalidBlockNumber;
+ if (isrightmost && isleaf && !isroot)
+ blockcache = BufferGetBlockNumber(buf);
+
+ /* Release buffer for insertion target block */
+ _bt_relbuf(rel, buf);
+
+ /*
+ * If we decided to cache the insertion target block before releasing
+ * its buffer lock, then cache it now. Check the height of the tree
+ * first, though. We don't go for the optimization with small
+ * indexes. Defer final check to this point to ensure that we don't
+ * call _bt_getrootheight while holding a buffer lock.
+ */
+ if (BlockNumberIsValid(blockcache) &&
+ _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
+ RelationSetTargetBlock(rel, blockcache);
+ }
+
+ /* be tidy */
+ if (postingoff != 0)
+ {
+ /* itup is actually a modified copy of caller's original */
+ pfree(nposting);
+ pfree(itup);
+ }
+}
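+
+/*
+ * Recap of the WAL record types emitted by _bt_insertonpg() above (a summary
+ * of the xlinfo choices in the code, nothing new): XLOG_BTREE_INSERT_LEAF
+ * for a plain leaf insert, XLOG_BTREE_INSERT_POST when the insert also
+ * splits a posting list, XLOG_BTREE_INSERT_UPPER for an internal-page insert
+ * that finishes a child page split, and XLOG_BTREE_INSERT_META when that
+ * internal insert also has to update the metapage's fast root.
+ */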
+
+/*
+ * _bt_split() -- split a page in the btree.
+ *
+ * On entry, buf is the page to split, and is pinned and write-locked.
+ * newitemoff etc. tell us about the new item that must be inserted
+ * along with the data from the original page.
+ *
+ * itup_key is used for suffix truncation on leaf pages (internal
+ * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
+ * is the left-sibling of the page we're inserting the downlink for.
+ * This function will clear the INCOMPLETE_SPLIT flag on it, and
+ * release the buffer.
+ *
+ * orignewitem, nposting, and postingoff are needed when an insert of
+ * orignewitem results in both a posting list split and a page split.
+ * These extra posting list split details are used here in the same
+ * way as they are used in the more common case where a posting list
+ * split does not coincide with a page split. We need to deal with
+ * posting list splits directly in order to ensure that everything
+ * that follows from the insert of orignewitem is handled as a single
+ * atomic operation (though caller's insert of a new pivot/downlink
+ * into parent page will still be a separate operation). See
+ * nbtree/README for details on the design of posting list splits.
+ *
+ * Returns the new right sibling of buf, pinned and write-locked.
+ * The pin and lock on buf are maintained.
+ */
+static Buffer
+_bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
+ OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
+ IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
+{
+ Buffer rbuf;
+ Page origpage;
+ Page leftpage,
+ rightpage;
+ BlockNumber origpagenumber,
+ rightpagenumber;
+ BTPageOpaque ropaque,
+ lopaque,
+ oopaque;
+ Buffer sbuf = InvalidBuffer;
+ Page spage = NULL;
+ BTPageOpaque sopaque = NULL;
+ Size itemsz;
+ ItemId itemid;
+ IndexTuple firstright,
+ lefthighkey;
+ OffsetNumber firstrightoff;
+ OffsetNumber afterleftoff,
+ afterrightoff,
+ minusinfoff;
+ OffsetNumber origpagepostingoff;
+ OffsetNumber maxoff;
+ OffsetNumber i;
+ bool newitemonleft,
+ isleaf,
+ isrightmost;
+
+ /*
+ * origpage is the original page to be split. leftpage is a temporary
+ * buffer that receives the left-sibling data, which will be copied back
+ * into origpage on success. rightpage is the new page that will receive
+ * the right-sibling data.
+ *
+ * leftpage is allocated after choosing a split point. rightpage's new
+ * buffer isn't acquired until after leftpage is initialized and has new
+ * high key, the last point where splitting the page may fail (barring
+ * corruption). Failing before acquiring new buffer won't have lasting
+ * consequences, since origpage won't have been modified and leftpage is
+ * only workspace.
+ */
+ origpage = BufferGetPage(buf);
+ oopaque = BTPageGetOpaque(origpage);
+ isleaf = P_ISLEAF(oopaque);
+ isrightmost = P_RIGHTMOST(oopaque);
+ maxoff = PageGetMaxOffsetNumber(origpage);
+ origpagenumber = BufferGetBlockNumber(buf);
+
+ /*
+ * Choose a point to split origpage at.
+ *
+ * A split point can be thought of as a point _between_ two existing data
+ * items on origpage (the lastleft and firstright tuples), provided you
+ * pretend that the new item that didn't fit is already on origpage.
+ *
+ * Since origpage does not actually contain newitem, the representation of
+ * split points needs to work with two boundary cases: splits where
+ * newitem is lastleft, and splits where newitem is firstright.
+ * newitemonleft resolves the ambiguity that would otherwise exist when
+ * newitemoff == firstrightoff. In all other cases it's clear which side
+ * of the split every tuple goes on from context. newitemonleft is
+ * usually (but not always) redundant information.
+ *
+ * firstrightoff is supposed to be an origpage offset number, but it's
+ * possible that its value will be maxoff+1, which is "past the end" of
+ * origpage. This happens in the rare case where newitem goes after all
+ * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
+ * origpage at the point that leaves newitem alone on new right page. Any
+ * "!newitemonleft && newitemoff == firstrightoff" split point makes
+ * newitem the firstright tuple, though, so this case isn't a special
+ * case.
+ */
+ firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
+ newitem, &newitemonleft);
+
+ /* Allocate temp buffer for leftpage */
+ leftpage = PageGetTempPage(origpage);
+ _bt_pageinit(leftpage, BufferGetPageSize(buf));
+ lopaque = BTPageGetOpaque(leftpage);
+
+ /*
+ * leftpage won't be the root when we're done. Also, clear the SPLIT_END
+ * and HAS_GARBAGE flags.
+ */
+ lopaque->btpo_flags = oopaque->btpo_flags;
+ lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
+ /* set flag in leftpage indicating that rightpage has no downlink yet */
+ lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
+ lopaque->btpo_prev = oopaque->btpo_prev;
+ /* handle btpo_next after rightpage buffer acquired */
+ lopaque->btpo_level = oopaque->btpo_level;
+ /* handle btpo_cycleid after rightpage buffer acquired */
+
+ /*
+ * Copy the original page's LSN into leftpage, which will become the
+ * updated version of the page. We need this because XLogInsert will
+ * examine the LSN and possibly dump it in a page image.
+ */
+ PageSetLSN(leftpage, PageGetLSN(origpage));
+
+ /*
+ * Determine page offset number of existing overlapped-with-orignewitem
+ * posting list when it is necessary to perform a posting list split in
+ * passing. Note that newitem was already changed by caller (newitem no
+ * longer has the orignewitem TID).
+ *
+ * This page offset number (origpagepostingoff) will be used to pretend
+ * that the posting split has already taken place, even though the
+ * required modifications to origpage won't occur until we reach the
+ * critical section. The lastleft and firstright tuples of our page split
+ * point should, in effect, come from an imaginary version of origpage
+ * that has the nposting tuple instead of the original posting list tuple.
+ *
+ * Note: _bt_findsplitloc() should have compensated for coinciding posting
+ * list splits in just the same way, at least in theory. It doesn't
+ * bother with that, though. In practice it won't affect its choice of
+ * split point.
+ */
+ origpagepostingoff = InvalidOffsetNumber;
+ if (postingoff != 0)
+ {
+ Assert(isleaf);
+ Assert(ItemPointerCompare(&orignewitem->t_tid,
+ &newitem->t_tid) < 0);
+ Assert(BTreeTupleIsPosting(nposting));
+ origpagepostingoff = OffsetNumberPrev(newitemoff);
+ }
+
+ /*
+ * The high key for the new left page is a possibly-truncated copy of
+ * firstright on the leaf level (it's "firstright itself" on internal
+ * pages; see !isleaf comments below). This may seem to be contrary to
+ * Lehman & Yao's approach of using a copy of lastleft as the new high key
+ * when splitting on the leaf level. It isn't, though.
+ *
+ * Suffix truncation will leave the left page's high key fully equal to
+ * lastleft when lastleft and firstright are equal prior to heap TID (that
+ * is, the tiebreaker TID value comes from lastleft). It isn't actually
+ * necessary for a new leaf high key to be a copy of lastleft for the L&Y
+ * "subtree" invariant to hold. It's sufficient to make sure that the new
+ * leaf high key is strictly less than firstright, and greater than or
+ * equal to (not necessarily equal to) lastleft. In other words, when
+ * suffix truncation isn't possible during a leaf page split, we take
+ * L&Y's exact approach to generating a new high key for the left page.
+ * (Actually, that is slightly inaccurate. We don't just use a copy of
+ * lastleft. A tuple with all the keys from firstright but the max heap
+ * TID from lastleft is used, to avoid introducing a special case.)
+ */
+ if (!newitemonleft && newitemoff == firstrightoff)
+ {
+ /* incoming tuple becomes firstright */
+ itemsz = newitemsz;
+ firstright = newitem;
+ }
+ else
+ {
+ /* existing item at firstrightoff becomes firstright */
+ itemid = PageGetItemId(origpage, firstrightoff);
+ itemsz = ItemIdGetLength(itemid);
+ firstright = (IndexTuple) PageGetItem(origpage, itemid);
+ if (firstrightoff == origpagepostingoff)
+ firstright = nposting;
+ }
+
+ if (isleaf)
+ {
+ IndexTuple lastleft;
+
+ /* Attempt suffix truncation for leaf page splits */
+ if (newitemonleft && newitemoff == firstrightoff)
+ {
+ /* incoming tuple becomes lastleft */
+ lastleft = newitem;
+ }
+ else
+ {
+ OffsetNumber lastleftoff;
+
+ /* existing item before firstrightoff becomes lastleft */
+ lastleftoff = OffsetNumberPrev(firstrightoff);
+ Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
+ itemid = PageGetItemId(origpage, lastleftoff);
+ lastleft = (IndexTuple) PageGetItem(origpage, itemid);
+ if (lastleftoff == origpagepostingoff)
+ lastleft = nposting;
+ }
+
+ lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
+ itemsz = IndexTupleSize(lefthighkey);
+ }
+ else
+ {
+ /*
+ * Don't perform suffix truncation on a copy of firstright to make
+ * left page high key for internal page splits. Must use firstright
+ * as new high key directly.
+ *
+ * Each distinct separator key value originates as a leaf level high
+ * key; all other separator keys/pivot tuples are copied from one
+ * level down. A separator key in a grandparent page must be
+ * identical to high key in rightmost parent page of the subtree to
+ * its left, which must itself be identical to high key in rightmost
+ * child page of that same subtree (this even applies to separator
+ * from grandparent's high key). There must always be an unbroken
+ * "seam" of identical separator keys that guide index scans at every
+ * level, starting from the grandparent. That's why suffix truncation
+ * is unsafe here.
+ *
+ * Internal page splits will truncate firstright into a "negative
+ * infinity" data item when it gets inserted on the new right page
+ * below, though. This happens during the call to _bt_pgaddtup() for
+ * the new first data item for right page. Do not confuse this
+ * mechanism with suffix truncation. It is just a convenient way of
+ * implementing page splits that split the internal page "inside"
+ * firstright. The lefthighkey separator key cannot appear a second
+ * time in the right page (only firstright's downlink goes in right
+ * page).
+ */
+ lefthighkey = firstright;
+ }
+
+ /*
+ * Add new high key to leftpage
+ */
+ afterleftoff = P_HIKEY;
+
+ Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
+ Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
+ IndexRelationGetNumberOfKeyAttributes(rel));
+ Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
+ if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
+ false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add high key to the left sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ afterleftoff = OffsetNumberNext(afterleftoff);
+
+ /*
+ * Acquire a new right page to split into, now that left page has a new
+ * high key. From here on, it's not okay to throw an error without
+ * zeroing rightpage first. This coding rule ensures that we won't
+ * confuse future VACUUM operations, which might otherwise try to re-find
+ * a downlink to a leftover junk page as the page undergoes deletion.
+ *
+ * It would be reasonable to start the critical section just after the new
+ * rightpage buffer is acquired instead; that would allow us to avoid
+ * leftover junk pages without bothering to zero rightpage. We do it this
+ * way because it avoids an unnecessary PANIC when either origpage or its
+ * existing sibling page is corrupt.
+ */
+ rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ rightpage = BufferGetPage(rbuf);
+ rightpagenumber = BufferGetBlockNumber(rbuf);
+ /* rightpage was initialized by _bt_getbuf */
+ ropaque = BTPageGetOpaque(rightpage);
+
+ /*
+ * Finish off remaining leftpage special area fields. They cannot be set
+ * before both origpage (leftpage) and rightpage buffers are acquired and
+ * locked.
+ *
+ * btpo_cycleid is only used with leaf pages, though we set it here in all
+ * cases just to be consistent.
+ */
+ lopaque->btpo_next = rightpagenumber;
+ lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
+
+ /*
+ * rightpage won't be the root when we're done. Also, clear the SPLIT_END
+ * and HAS_GARBAGE flags.
+ */
+ ropaque->btpo_flags = oopaque->btpo_flags;
+ ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
+ ropaque->btpo_prev = origpagenumber;
+ ropaque->btpo_next = oopaque->btpo_next;
+ ropaque->btpo_level = oopaque->btpo_level;
+ ropaque->btpo_cycleid = lopaque->btpo_cycleid;
+
+ /*
+ * Add new high key to rightpage where necessary.
+ *
+ * If the page we're splitting is not the rightmost page at its level in
+ * the tree, then the first entry on the page is the high key from
+ * origpage.
+ */
+ afterrightoff = P_HIKEY;
+
+ if (!isrightmost)
+ {
+ IndexTuple righthighkey;
+
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ itemsz = ItemIdGetLength(itemid);
+ righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
+ Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
+ Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
+ IndexRelationGetNumberOfKeyAttributes(rel));
+ if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
+ false, false) == InvalidOffsetNumber)
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add high key to the right sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterrightoff = OffsetNumberNext(afterrightoff);
+ }
+
+ /*
+ * Internal page splits truncate first data item on right page -- it
+ * becomes "minus infinity" item for the page. Set this up here.
+ */
+ minusinfoff = InvalidOffsetNumber;
+ if (!isleaf)
+ minusinfoff = afterrightoff;
+
+ /*
+ * Now transfer all the data items (non-pivot tuples in isleaf case, or
+ * additional pivot tuples in !isleaf case) to the appropriate page.
+ *
+ * Note: we *must* insert at least the right page's items in item-number
+ * order, for the benefit of _bt_restore_page().
+ */
+ for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
+ {
+ IndexTuple dataitem;
+
+ itemid = PageGetItemId(origpage, i);
+ itemsz = ItemIdGetLength(itemid);
+ dataitem = (IndexTuple) PageGetItem(origpage, itemid);
+
+ /* replace original item with nposting due to posting split? */
+ if (i == origpagepostingoff)
+ {
+ Assert(BTreeTupleIsPosting(dataitem));
+ Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
+ dataitem = nposting;
+ }
+
+ /* does new item belong before this one? */
+ else if (i == newitemoff)
+ {
+ if (newitemonleft)
+ {
+ Assert(newitemoff <= firstrightoff);
+ if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
+ false))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the left sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterleftoff = OffsetNumberNext(afterleftoff);
+ }
+ else
+ {
+ Assert(newitemoff >= firstrightoff);
+ if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
+ afterrightoff == minusinfoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the right sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterrightoff = OffsetNumberNext(afterrightoff);
+ }
+ }
+
+ /* decide which page to put it on */
+ if (i < firstrightoff)
+ {
+ if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add old item to the left sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterleftoff = OffsetNumberNext(afterleftoff);
+ }
+ else
+ {
+ if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
+ afterrightoff == minusinfoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add old item to the right sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterrightoff = OffsetNumberNext(afterrightoff);
+ }
+ }
+
+ /* Handle case where newitem goes at the end of rightpage */
+ if (i <= newitemoff)
+ {
+ /*
+ * Can't have newitemonleft here; that would imply we were told to put
+ * *everything* on the left page, which cannot fit (if it could, we'd
+ * not be splitting the page).
+ */
+ Assert(!newitemonleft && newitemoff == maxoff + 1);
+ if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
+ afterrightoff == minusinfoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the right sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterrightoff = OffsetNumberNext(afterrightoff);
+ }
+
+ /*
+ * We have to grab the original right sibling (if any) and update its prev
+ * link. We are guaranteed that this is deadlock-free, since we couple
+ * the locks in the standard order: left to right.
+ */
+ if (!isrightmost)
+ {
+ sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
+ spage = BufferGetPage(sbuf);
+ sopaque = BTPageGetOpaque(spage);
+ if (sopaque->btpo_prev != origpagenumber)
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("right sibling's left-link doesn't match: "
+ "block %u links to %u instead of expected %u in index \"%s\"",
+ oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
+ RelationGetRelationName(rel))));
+ }
+
+ /*
+ * Check to see if we can set the SPLIT_END flag in the right-hand
+ * split page; this can save some I/O for vacuum since it need not
+ * proceed to the right sibling. We can set the flag if the right
+ * sibling has a different cycleid: that means it could not be part of
+ * a group of pages that were all split off from the same ancestor
+ * page. If you're confused, imagine that page A splits to A B and
+ * then again, yielding A C B, while vacuum is in progress. Tuples
+ * originally in A could now be in either B or C, hence vacuum must
+ * examine both pages. But if D, our right sibling, has a different
+ * cycleid then it could not contain any tuples that were in A when
+ * the vacuum started.
+ */
+ if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
+ ropaque->btpo_flags |= BTP_SPLIT_END;
+ }
+
+ /*
+ * Right sibling is locked, new siblings are prepared, but original page
+ * is not updated yet.
+ *
+ * NO EREPORT(ERROR) till right sibling is updated. We can get away with
+ * not starting the critical section till here because we haven't been
+ * scribbling on the original page yet; see comments above.
+ */
+ START_CRIT_SECTION();
+
+ /*
+ * By here, the original data page has been split into two new halves, and
+ * these are correct. The algorithm requires that the left page never
+ * move during a split, so we copy the new left page back on top of the
+ * original. We need to do this before writing the WAL record, so that
+ * XLogInsert can WAL log an image of the page if necessary.
+ */
+ PageRestoreTempPage(leftpage, origpage);
+ /* leftpage, lopaque must not be used below here */
+
+ MarkBufferDirty(buf);
+ MarkBufferDirty(rbuf);
+
+ if (!isrightmost)
+ {
+ sopaque->btpo_prev = rightpagenumber;
+ MarkBufferDirty(sbuf);
+ }
+
+ /*
+ * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
+ * a split
+ */
+ if (!isleaf)
+ {
+ Page cpage = BufferGetPage(cbuf);
+ BTPageOpaque cpageop = BTPageGetOpaque(cpage);
+
+ cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+ MarkBufferDirty(cbuf);
+ }
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_split xlrec;
+ uint8 xlinfo;
+ XLogRecPtr recptr;
+
+ xlrec.level = ropaque->btpo_level;
+ /* See comments below on newitem, orignewitem, and posting lists */
+ xlrec.firstrightoff = firstrightoff;
+ xlrec.newitemoff = newitemoff;
+ xlrec.postingoff = 0;
+ if (postingoff != 0 && origpagepostingoff < firstrightoff)
+ xlrec.postingoff = postingoff;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
+
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
+ /* Log original right sibling, since we've changed its prev-pointer */
+ if (!isrightmost)
+ XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
+ if (!isleaf)
+ XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
+
+ /*
+ * Log the new item, if it was inserted on the left page. (If it was
+ * put on the right page, we don't need to explicitly WAL log it
+ * because it's included with all the other items on the right page.)
+ * Show the new item as belonging to the left page buffer, so that it
+ * is not stored if XLogInsert decides it needs a full-page image of
+ * the left page. We always store newitemoff in the record, though.
+ *
+ * The details are sometimes slightly different for page splits that
+ * coincide with a posting list split. If both the replacement
+ * posting list and newitem go on the right page, then we don't need
+ * to log anything extra, just like the simple !newitemonleft
+ * no-posting-split case (postingoff is set to zero in the WAL record,
+ * so recovery doesn't need to process a posting list split at all).
+ * Otherwise, we set postingoff and log orignewitem instead of
+ * newitem, despite having actually inserted newitem. REDO routine
+ * must reconstruct nposting and newitem using _bt_swap_posting().
+ *
+ * Note: It's possible that our page split point is the point that
+ * makes the posting list lastleft and newitem firstright. This is
+ * the only case where we log orignewitem/newitem despite newitem
+ * going on the right page. If XLogInsert decides that it can omit
+ * orignewitem due to logging a full-page image of the left page,
+ * everything still works out, since recovery only needs to log
+ * orignewitem for items on the left page (just like the regular
+ * newitem-logged case).
+ */
+ if (newitemonleft && xlrec.postingoff == 0)
+ XLogRegisterBufData(0, (char *) newitem, newitemsz);
+ else if (xlrec.postingoff != 0)
+ {
+ Assert(isleaf);
+ Assert(newitemonleft || firstrightoff == newitemoff);
+ Assert(newitemsz == IndexTupleSize(orignewitem));
+ XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
+ }
+
+ /* Log the left page's new high key */
+ if (!isleaf)
+ {
+ /* lefthighkey isn't local copy, get current pointer */
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
+ }
+ XLogRegisterBufData(0, (char *) lefthighkey,
+ MAXALIGN(IndexTupleSize(lefthighkey)));
+
+ /*
+ * Log the contents of the right page in the format understood by
+ * _bt_restore_page(). The whole right page will be recreated.
+ *
+ * Direct access to the page is not ideal, but it is faster; ideally the
+ * page API would provide a function for this. Note we only store the tuples
+ * themselves, knowing that they were inserted in item-number order
+ * and so the line pointers can be reconstructed. See comments for
+ * _bt_restore_page().
+ */
+ XLogRegisterBufData(1,
+ (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
+ ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
+
+ xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
+ recptr = XLogInsert(RM_BTREE_ID, xlinfo);
+
+ PageSetLSN(origpage, recptr);
+ PageSetLSN(rightpage, recptr);
+ if (!isrightmost)
+ PageSetLSN(spage, recptr);
+ if (!isleaf)
+ PageSetLSN(BufferGetPage(cbuf), recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* release the old right sibling */
+ if (!isrightmost)
+ _bt_relbuf(rel, sbuf);
+
+ /* release the child */
+ if (!isleaf)
+ _bt_relbuf(rel, cbuf);
+
+ /* be tidy */
+ if (isleaf)
+ pfree(lefthighkey);
+
+ /* split's done */
+ return rbuf;
+}
+
+/*
+ * _bt_insert_parent() -- Insert downlink into parent, completing split.
+ *
+ * On entry, buf and rbuf are the left and right split pages, which we
+ * still hold write locks on. Both locks will be released here. We
+ * release the rbuf lock once we have a write lock on the page that we
+ * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
+ * The lock on buf is released at the same point as the lock on the parent
+ * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
+ * atomic operation that completes the split by inserting a new downlink.
+ *
+ * stack - stack showing how we got here. Will be NULL when splitting true
+ * root, or during concurrent root split, where we can be inefficient
+ * isroot - we split the true root
+ * isonly - we split a page alone on its level (might have been fast root)
+ */
+static void
+_bt_insert_parent(Relation rel,
+ Buffer buf,
+ Buffer rbuf,
+ BTStack stack,
+ bool isroot,
+ bool isonly)
+{
+ /*
+ * Here we have to do something Lehman and Yao don't talk about: deal with
+ * a root split and construction of a new root. If our stack is empty
+ * then we have just split a node on what had been the root level when we
+ * descended the tree. If it was still the root then we perform a
+ * new-root construction. If it *wasn't* the root anymore, search to find
+ * the next higher level that someone constructed meanwhile, and find the
+ * right place to insert as for the normal case.
+ *
+ * If we have to search for the parent level, we do so by re-descending
+ * from the root. This is not super-efficient, but it's rare enough not
+ * to matter.
+ */
+ if (isroot)
+ {
+ Buffer rootbuf;
+
+ Assert(stack == NULL);
+ Assert(isonly);
+ /* create a new root node and update the metapage */
+ rootbuf = _bt_newroot(rel, buf, rbuf);
+ /* release the split buffers */
+ _bt_relbuf(rel, rootbuf);
+ _bt_relbuf(rel, rbuf);
+ _bt_relbuf(rel, buf);
+ }
+ else
+ {
+ BlockNumber bknum = BufferGetBlockNumber(buf);
+ BlockNumber rbknum = BufferGetBlockNumber(rbuf);
+ Page page = BufferGetPage(buf);
+ IndexTuple new_item;
+ BTStackData fakestack;
+ IndexTuple ritem;
+ Buffer pbuf;
+
+ if (stack == NULL)
+ {
+ BTPageOpaque opaque;
+
+ elog(DEBUG2, "concurrent ROOT page split");
+ opaque = BTPageGetOpaque(page);
+
+ /*
+ * We should never reach here when a leaf page split takes place
+ * despite the insert of newitem being able to apply the fastpath
+ * optimization. Make sure of that with an assertion.
+ *
+ * This is more of a performance issue than a correctness issue.
+ * The fastpath won't have a descent stack. Using a phony stack
+ * here works, but never rely on that. The fastpath should be
+ * rejected within _bt_search_insert() when the rightmost leaf
+ * page will split, since it's faster to go through _bt_search()
+ * and get a stack in the usual way.
+ */
+ Assert(!(P_ISLEAF(opaque) &&
+ BlockNumberIsValid(RelationGetTargetBlock(rel))));
+
+ /* Find the leftmost page at the next level up */
+ pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL);
+ /* Set up a phony stack entry pointing there */
+ stack = &fakestack;
+ stack->bts_blkno = BufferGetBlockNumber(pbuf);
+ stack->bts_offset = InvalidOffsetNumber;
+ stack->bts_parent = NULL;
+ _bt_relbuf(rel, pbuf);
+ }
+
+ /* get high key from left, a strict lower bound for new right page */
+ ritem = (IndexTuple) PageGetItem(page,
+ PageGetItemId(page, P_HIKEY));
+
+ /* form an index tuple that points at the new right page */
+ new_item = CopyIndexTuple(ritem);
+ BTreeTupleSetDownLink(new_item, rbknum);
+
+ /*
+ * Re-find and write lock the parent of buf.
+ *
+ * It's possible that the location of buf's downlink has changed since
+ * our initial _bt_search() descent. _bt_getstackbuf() will detect
+ * and recover from this, updating the stack, which ensures that the
+ * new downlink will be inserted at the correct offset. Even buf's
+ * parent may have changed.
+ */
+ pbuf = _bt_getstackbuf(rel, stack, bknum);
+
+ /*
+ * Unlock the right child. The left child will be unlocked in
+ * _bt_insertonpg().
+ *
+ * Unlocking the right child must be delayed until here to ensure that
+ * no concurrent VACUUM operation can become confused. Page deletion
+ * cannot be allowed to fail to re-find a downlink for the rbuf page.
+ * (Actually, this is just a vestige of how things used to work. The
+ * page deletion code is expected to check for the INCOMPLETE_SPLIT
+ * flag on the left child. It won't attempt deletion of the right
+ * child until the split is complete. Despite all this, we opt to
+ * conservatively delay unlocking the right child until here.)
+ */
+ _bt_relbuf(rel, rbuf);
+
+ if (pbuf == InvalidBuffer)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
+ RelationGetRelationName(rel), bknum, rbknum)));
+
+ /* Recursively insert into the parent */
+ _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
+ new_item, MAXALIGN(IndexTupleSize(new_item)),
+ stack->bts_offset + 1, 0, isonly);
+
+ /* be tidy */
+ pfree(new_item);
+ }
+}
+
+/*
+ * _bt_finish_split() -- Finish an incomplete split
+ *
+ * A crash or other failure can leave a split incomplete. The insertion
+ * routines won't insert on a page that is incompletely split.
+ * Before inserting on such a page, call _bt_finish_split().
+ *
+ * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
+ * and unpinned.
+ */
+void
+_bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
+{
+ Page lpage = BufferGetPage(lbuf);
+ BTPageOpaque lpageop = BTPageGetOpaque(lpage);
+ Buffer rbuf;
+ Page rpage;
+ BTPageOpaque rpageop;
+ bool wasroot;
+ bool wasonly;
+
+ Assert(P_INCOMPLETE_SPLIT(lpageop));
+
+ /* Lock right sibling, the one missing the downlink */
+ rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
+ rpage = BufferGetPage(rbuf);
+ rpageop = BTPageGetOpaque(rpage);
+
+ /* Could this be a root split? */
+ if (!stack)
+ {
+ Buffer metabuf;
+ Page metapg;
+ BTMetaPageData *metad;
+
+ /* acquire lock on the metapage */
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+
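+ /* It was a root split if the metapage still records lbuf as the root */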
+ wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
+
+ _bt_relbuf(rel, metabuf);
+ }
+ else
+ wasroot = false;
+
+ /* Was this the only page on the level before split? */
+ wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
+
+ elog(DEBUG1, "finishing incomplete split of %u/%u",
+ BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
+
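+ /* Insert the missing downlink; this releases both lbuf and rbuf */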
+ _bt_insert_parent(rel, lbuf, rbuf, stack, wasroot, wasonly);
+}
+
+/*
+ * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
+ * tuple whose downlink points to child page.
+ *
+ * Caller passes child's block number, which is used to identify
+ * associated pivot tuple in parent page using a linear search that
+ * matches on pivot's downlink/block number. The expected location of
+ * the pivot tuple is taken from the stack one level above the child
+ * page. This is used as a starting point. Insertions into the
+ * parent level could cause the pivot tuple to move right; deletions
+ * could cause it to move left, but not left of the page we previously
+ * found it on.
+ *
+ * Caller can use its stack to relocate the pivot tuple/downlink for
+ * any same-level page to the right of the page found by its initial
+ * descent. This is necessary because of the possibility that caller
+ * moved right to recover from a concurrent page split. It's also
+ * convenient for certain callers to be able to step right when there
+ * wasn't a concurrent page split, while still using their original
+ * stack. For example, the checkingunique _bt_doinsert() case may
+ * have to step right when there are many physical duplicates, and its
+ * scantid forces an insertion to the right of the "first page the
+ * value could be on". (This is also relied on by all of our callers
+ * when dealing with !heapkeyspace indexes.)
+ *
+ * Returns write-locked parent page buffer, or InvalidBuffer if pivot
+ * tuple not found (should not happen). Adjusts bts_blkno &
+ * bts_offset if changed. Page split caller should insert its new
+ * pivot tuple for its new right sibling page on parent page, at the
+ * offset number bts_offset + 1.
+ */
+Buffer
+_bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
+{
+ BlockNumber blkno;
+ OffsetNumber start;
+
+ blkno = stack->bts_blkno;
+ start = stack->bts_offset;
+
+ for (;;)
+ {
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+
+ buf = _bt_getbuf(rel, blkno, BT_WRITE);
+ page = BufferGetPage(buf);
+ opaque = BTPageGetOpaque(page);
+
+ if (P_INCOMPLETE_SPLIT(opaque))
+ {
+ _bt_finish_split(rel, buf, stack->bts_parent);
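+ /* _bt_finish_split() released our lock on buf; retry the same block */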
+ continue;
+ }
+
+ if (!P_IGNORE(opaque))
+ {
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ ItemId itemid;
+ IndexTuple item;
+
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * start = InvalidOffsetNumber means "search the whole page". We
+ * need this test anyway due to the possibility that the page has a
+ * high key now when it didn't before.
+ */
+ if (start < minoff)
+ start = minoff;
+
+ /*
+ * Need this check too, to guard against the possibility that the
+ * page split since we visited it originally.
+ */
+ if (start > maxoff)
+ start = OffsetNumberNext(maxoff);
+
+ /*
+ * These loops will check every item on the page --- but in an
+ * order that's attuned to the probability of where it actually
+ * is. Scan to the right first, then to the left.
+ */
+ for (offnum = start;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ itemid = PageGetItemId(page, offnum);
+ item = (IndexTuple) PageGetItem(page, itemid);
+
+ if (BTreeTupleGetDownLink(item) == child)
+ {
+ /* Return accurate pointer to where link is now */
+ stack->bts_blkno = blkno;
+ stack->bts_offset = offnum;
+ return buf;
+ }
+ }
+
+ for (offnum = OffsetNumberPrev(start);
+ offnum >= minoff;
+ offnum = OffsetNumberPrev(offnum))
+ {
+ itemid = PageGetItemId(page, offnum);
+ item = (IndexTuple) PageGetItem(page, itemid);
+
+ if (BTreeTupleGetDownLink(item) == child)
+ {
+ /* Return accurate pointer to where link is now */
+ stack->bts_blkno = blkno;
+ stack->bts_offset = offnum;
+ return buf;
+ }
+ }
+ }
+
+ /*
+ * The item we're looking for moved right at least one page.
+ *
+ * Lehman and Yao couple/chain locks when moving right here, which we
+ * can avoid. See nbtree/README.
+ */
+ if (P_RIGHTMOST(opaque))
+ {
+ _bt_relbuf(rel, buf);
+ return InvalidBuffer;
+ }
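+ /* Step right to the sibling page, and search all of it */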
+ blkno = opaque->btpo_next;
+ start = InvalidOffsetNumber;
+ _bt_relbuf(rel, buf);
+ }
+}
+
+/*
+ * _bt_newroot() -- Create a new root page for the index.
+ *
+ * We've just split the old root page and need to create a new one.
+ * In order to do this, we add a new root page to the file, then lock
+ * the metadata page and update it. This is guaranteed to be deadlock-
+ * free, because all readers release their locks on the metadata page
+ * before trying to lock the root, and all writers lock the root before
+ * trying to lock the metadata page. We have a write lock on the old
+ * root page, so we have not introduced any cycles into the waits-for
+ * graph.
+ *
+ * On entry, lbuf (the old root) and rbuf (its new peer) are write-
+ * locked. On exit, a new root page exists with entries for the
+ * two new children; the metapage is updated and unlocked/unpinned.
+ * The new root buffer is returned to the caller, which must unlock/unpin
+ * lbuf, rbuf & rootbuf.
+ */
+static Buffer
+_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
+{
+ Buffer rootbuf;
+ Page lpage,
+ rootpage;
+ BlockNumber lbkno,
+ rbkno;
+ BlockNumber rootblknum;
+ BTPageOpaque rootopaque;
+ BTPageOpaque lopaque;
+ ItemId itemid;
+ IndexTuple item;
+ IndexTuple left_item;
+ Size left_item_sz;
+ IndexTuple right_item;
+ Size right_item_sz;
+ Buffer metabuf;
+ Page metapg;
+ BTMetaPageData *metad;
+
+ lbkno = BufferGetBlockNumber(lbuf);
+ rbkno = BufferGetBlockNumber(rbuf);
+ lpage = BufferGetPage(lbuf);
+ lopaque = BTPageGetOpaque(lpage);
+
+ /* get a new root page */
+ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ rootpage = BufferGetPage(rootbuf);
+ rootblknum = BufferGetBlockNumber(rootbuf);
+
+ /* acquire lock on the metapage */
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+
+ /*
+ * Create downlink item for left page (old root). The key value used is
+ * "minus infinity", a sentinel value that's reliably less than any real
+ * key value that could appear in the left page.
+ */
+ left_item_sz = sizeof(IndexTupleData);
+ left_item = (IndexTuple) palloc(left_item_sz);
+ left_item->t_info = left_item_sz;
+ BTreeTupleSetDownLink(left_item, lbkno);
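+ /* zero key attributes is how a "minus infinity" pivot key is represented */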
+ BTreeTupleSetNAtts(left_item, 0, false);
+
+ /*
+ * Create downlink item for right page. The key for it is obtained from
+ * the "high key" position in the left page.
+ */
+ itemid = PageGetItemId(lpage, P_HIKEY);
+ right_item_sz = ItemIdGetLength(itemid);
+ item = (IndexTuple) PageGetItem(lpage, itemid);
+ right_item = CopyIndexTuple(item);
+ BTreeTupleSetDownLink(right_item, rbkno);
+
+ /* NO EREPORT(ERROR) from here till newroot op is logged */
+ START_CRIT_SECTION();
+
+ /* upgrade metapage if needed */
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
+ _bt_upgrademetapage(metapg);
+
+ /* set btree special data */
+ rootopaque = BTPageGetOpaque(rootpage);
+ rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
+ rootopaque->btpo_flags = BTP_ROOT;
+ rootopaque->btpo_level =
+ (BTPageGetOpaque(lpage))->btpo_level + 1;
+ rootopaque->btpo_cycleid = 0;
+
+ /* update metapage data */
+ metad->btm_root = rootblknum;
+ metad->btm_level = rootopaque->btpo_level;
+ metad->btm_fastroot = rootblknum;
+ metad->btm_fastlevel = rootopaque->btpo_level;
+
+ /*
+ * Insert the left page pointer into the new root page. The root page is
+ * the rightmost page on its level so there is no "high key" in it; the
+ * two items will go into positions P_HIKEY and P_FIRSTKEY.
+ *
+ * Note: we *must* insert the two items in item-number order, for the
+ * benefit of _bt_restore_page().
+ */
+ Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
+ if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add leftkey to new root page"
+ " while splitting block %u of index \"%s\"",
+ BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
+
+ /*
+ * insert the right page pointer into the new root page.
+ */
+ Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
+ Assert(BTreeTupleGetNAtts(right_item, rel) <=
+ IndexRelationGetNumberOfKeyAttributes(rel));
+ if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add rightkey to new root page"
+ " while splitting block %u of index \"%s\"",
+ BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
+
+ /* Clear the incomplete-split flag in the left child */
+ Assert(P_INCOMPLETE_SPLIT(lopaque));
+ lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+ MarkBufferDirty(lbuf);
+
+ MarkBufferDirty(rootbuf);
+ MarkBufferDirty(metabuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_newroot xlrec;
+ XLogRecPtr recptr;
+ xl_btree_metadata md;
+
+ xlrec.rootblk = rootblknum;
+ xlrec.level = metad->btm_level;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
+
+ XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
+ XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
+ XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ md.version = metad->btm_version;
+ md.root = rootblknum;
+ md.level = metad->btm_level;
+ md.fastroot = rootblknum;
+ md.fastlevel = metad->btm_level;
+ md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
+ md.allequalimage = metad->btm_allequalimage;
+
+ XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
+
+ /*
+ * Direct access to the page is not ideal, but it is faster; ideally
+ * the page API would provide a function for this.
+ */
+ XLogRegisterBufData(0,
+ (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
+ ((PageHeader) rootpage)->pd_special -
+ ((PageHeader) rootpage)->pd_upper);
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
+
+ PageSetLSN(lpage, recptr);
+ PageSetLSN(rootpage, recptr);
+ PageSetLSN(metapg, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* done with metapage */
+ _bt_relbuf(rel, metabuf);
+
+ pfree(left_item);
+ pfree(right_item);
+
+ return rootbuf;
+}
+
+/*
+ * _bt_pgaddtup() -- add a data item to a particular page during split.
+ *
+ * The difference between this routine and a bare PageAddItem call is
+ * that this code can deal with the first data item on an internal btree
+ * page in passing. This data item (which is called "firstright" within
+ * _bt_split()) has a key that must be treated as minus infinity after
+ * the split. Therefore, we truncate away all attributes when caller
+ * specifies it's the first data item on page (downlink is not changed,
+ * though). This extra step is only needed for the right page of an
+ * internal page split. There is no need to do this for the first data
+ * item on the existing/left page, since that will already have been
+ * truncated during an earlier page split.
+ *
+ * See _bt_split() for a high level explanation of why we truncate here.
+ * Note that this routine has nothing to do with suffix truncation,
+ * despite using some of the same infrastructure.
+ */
+static inline bool
+_bt_pgaddtup(Page page,
+ Size itemsize,
+ IndexTuple itup,
+ OffsetNumber itup_off,
+ bool newfirstdataitem)
+{
+ IndexTupleData trunctuple;
+
+ if (newfirstdataitem)
+ {
+ trunctuple = *itup;
+ trunctuple.t_info = sizeof(IndexTupleData);
+ BTreeTupleSetNAtts(&trunctuple, 0, false);
+ itup = &trunctuple;
+ itemsize = sizeof(IndexTupleData);
+ }
+
+ if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
+ false) == InvalidOffsetNumber))
+ return false;
+
+ return true;
+}
+
+/*
+ * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
+ *
+ * There are three operations performed here: simple index deletion, bottom-up
+ * index deletion, and deduplication. If all three operations fail to free
+ * enough space for the incoming item then caller will go on to split the
+ * page. We always consider simple deletion first. If that doesn't work out
+ * we consider alternatives. Callers that only want us to consider simple
+ * deletion (without any fallback) ask for that using the 'simpleonly'
+ * argument.
+ *
+ * We usually pick only one alternative "complex" operation when simple
+ * deletion alone won't prevent a page split. The 'checkingunique',
+ * 'uniquedup', and 'indexUnchanged' arguments are used for that.
+ *
+ * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
+ * level flag was found set. The flag was useful back when there wasn't
+ * necessarily one single page for a duplicate tuple to go on (before heap TID
+ * became a part of the key space in version 4 indexes). But we don't
+ * actually look at the flag anymore (it's not a gating condition for our
+ * caller). That would cause us to miss tuples that are safe to delete,
+ * without getting any benefit in return. We know that the alternative is to
+ * split the page; scanning the line pointer array in passing won't have
+ * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
+ * all this because !heapkeyspace indexes must still do a "getting tired"
+ * linear search, and so are likely to get some benefit from using it as a
+ * gating condition.)
+ */
+static void
+_bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
+ BTInsertState insertstate,
+ bool simpleonly, bool checkingunique,
+ bool uniquedup, bool indexUnchanged)
+{
+ OffsetNumber deletable[MaxIndexTuplesPerPage];
+ int ndeletable = 0;
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ Buffer buffer = insertstate->buf;
+ BTScanInsert itup_key = insertstate->itup_key;
+ Page page = BufferGetPage(buffer);
+ BTPageOpaque opaque = BTPageGetOpaque(page);
+
+ Assert(P_ISLEAF(opaque));
+ Assert(simpleonly || itup_key->heapkeyspace);
+ Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
+
+ /*
+ * Scan over all items to see which ones need to be deleted according to
+ * LP_DEAD flags. We'll usually manage to delete a few extra items that
+ * are not marked LP_DEAD in passing. Often the extra items that actually
+ * end up getting deleted are items that would have had their LP_DEAD bit
+ * set before long anyway (if we opted not to include them as extras).
+ */
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemId = PageGetItemId(page, offnum);
+
+ if (ItemIdIsDead(itemId))
+ deletable[ndeletable++] = offnum;
+ }
+
+ if (ndeletable > 0)
+ {
+ _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
+ insertstate->itup, minoff, maxoff);
+ insertstate->bounds_valid = false;
+
+ /* Return when a page split has already been avoided */
+ if (PageGetFreeSpace(page) >= insertstate->itemsz)
+ return;
+
+ /* Might as well assume duplicates (if checkingunique) */
+ uniquedup = true;
+ }
+
+ /*
+ * We're done with simple deletion. Return early for callers that only
+ * want simple deletion to be considered. This includes callers that
+ * explicitly ask for it, and checkingunique callers that probably don't
+ * have any version churn duplicates on the page.
+ *
+ * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
+ * return at this point (or when we go on to try either or both of our
+ * other strategies and they also fail). We do not bother expending a
+ * separate write to clear it, however. Caller will definitely clear it
+ * when it goes on to split the page (note also that the deduplication
+ * process will clear the flag in passing, just to keep things tidy).
+ */
+ if (simpleonly || (checkingunique && !uniquedup))
+ {
+ Assert(!indexUnchanged);
+ return;
+ }
+
+ /* Assume bounds about to be invalidated (this is almost certain now) */
+ insertstate->bounds_valid = false;
+
+ /*
+ * Perform a bottom-up index deletion pass when the executor hint indicated
+ * that the incoming item is logically unchanged, or for a unique index that is
+ * known to have physical duplicates for some other reason. (There is a
+ * large overlap between these two cases for a unique index. It's worth
+ * having both triggering conditions in order to apply the optimization in
+ * the event of successive related INSERT and DELETE statements.)
+ *
+ * We'll go on to do a deduplication pass when a bottom-up pass fails to
+ * delete an acceptable amount of free space (a significant fraction of
+ * the page, or space for the new item, whichever is greater).
+ *
+ * Note: Bottom-up index deletion uses the same equality/equivalence
+ * routines as deduplication internally. However, it does not merge
+ * together index tuples, so the same correctness considerations do not
+ * apply. We deliberately omit an index-is-allequalimage test here.
+ */
+ if ((indexUnchanged || uniquedup) &&
+ _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
+ return;
+
+ /* Perform deduplication pass (when enabled and index-is-allequalimage) */
+ if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
+ _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
+ insertstate->itemsz, (indexUnchanged || uniquedup));
+}
+
+/*
+ * _bt_simpledel_pass - Simple index tuple deletion pass.
+ *
+ * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
+ * of all such tuples are determined by caller (caller passes these to us as
+ * its 'deletable' argument).
+ *
+ * We might also delete extra index tuples that turn out to be safe to delete
+ * in passing (though they must be cheap to check in passing to begin with).
+ * There is no certainty that any extra tuples will be deleted, though. The
+ * high level goal of the approach we take is to get the most out of each call
+ * here (without noticeably increasing the per-call overhead compared to what
+ * we need to do just to be able to delete the page's LP_DEAD-marked index
+ * tuples).
+ *
+ * The number of extra index tuples that turn out to be deletable might
+ * greatly exceed the number of LP_DEAD-marked index tuples due to various
+ * locality related effects. For example, it's possible that the total number
+ * of table blocks (pointed to by all TIDs on the leaf page) is naturally
+ * quite low, in which case we might end up checking if it's possible to
+ * delete _most_ index tuples on the page (without the tableam needing to
+ * access additional table blocks). The tableam will sometimes stumble upon
+ * _many_ extra deletable index tuples in indexes where this pattern is
+ * common.
+ *
+ * See nbtree/README for further details on simple index tuple deletion.
+ */
+static void
+_bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
+ OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
+ OffsetNumber minoff, OffsetNumber maxoff)
+{
+ Page page = BufferGetPage(buffer);
+ BlockNumber *deadblocks;
+ int ndeadblocks;
+ TM_IndexDeleteOp delstate;
+ OffsetNumber offnum;
+
+ /* Get array of table blocks pointed to by LP_DEAD-set tuples */
+ deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
+ &ndeadblocks);
+
+ /* Initialize tableam state that describes index deletion operation */
+ delstate.irel = rel;
+ delstate.iblknum = BufferGetBlockNumber(buffer);
+ delstate.bottomup = false;
+ delstate.bottomupfreespace = 0;
+ delstate.ndeltids = 0;
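+ /* size both arrays for the worst case: every heap TID on the page */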
+ delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
+ delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
+
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+ TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
+ TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
+ BlockNumber tidblock;
+ void *match;
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
+ match = bsearch(&tidblock, deadblocks, ndeadblocks,
+ sizeof(BlockNumber), _bt_blk_cmp);
+
+ if (!match)
+ {
+ Assert(!ItemIdIsDead(itemid));
+ continue;
+ }
+
+ /*
+ * TID's table block is among those pointed to by the TIDs from
+ * LP_DEAD-bit set tuples on page -- add TID to deltids
+ */
+ odeltid->tid = itup->t_tid;
+ odeltid->id = delstate.ndeltids;
+ ostatus->idxoffnum = offnum;
+ ostatus->knowndeletable = ItemIdIsDead(itemid);
+ ostatus->promising = false; /* unused */
+ ostatus->freespace = 0; /* unused */
+
+ delstate.ndeltids++;
+ }
+ else
+ {
+ int nitem = BTreeTupleGetNPosting(itup);
+
+ for (int p = 0; p < nitem; p++)
+ {
+ ItemPointer tid = BTreeTupleGetPostingN(itup, p);
+
+ tidblock = ItemPointerGetBlockNumber(tid);
+ match = bsearch(&tidblock, deadblocks, ndeadblocks,
+ sizeof(BlockNumber), _bt_blk_cmp);
+
+ if (!match)
+ {
+ Assert(!ItemIdIsDead(itemid));
+ continue;
+ }
+
+ /*
+ * TID's table block is among those pointed to by the TIDs
+ * from LP_DEAD-bit set tuples on page -- add TID to deltids
+ */
+ odeltid->tid = *tid;
+ odeltid->id = delstate.ndeltids;
+ ostatus->idxoffnum = offnum;
+ ostatus->knowndeletable = ItemIdIsDead(itemid);
+ ostatus->promising = false; /* unused */
+ ostatus->freespace = 0; /* unused */
+
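+ /* advance local pointers to the next free deltids/status slots */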
+ odeltid++;
+ ostatus++;
+ delstate.ndeltids++;
+ }
+ }
+ }
+
+ pfree(deadblocks);
+
+ Assert(delstate.ndeltids >= ndeletable);
+
+ /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
+ _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
+
+ pfree(delstate.deltids);
+ pfree(delstate.status);
+}
+
+/*
+ * _bt_deadblocks() -- Get LP_DEAD related table blocks.
+ *
+ * Builds sorted and unique-ified array of table block numbers from index
+ * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
+ * block from incoming newitem just in case it isn't among the LP_DEAD-related
+ * table blocks.
+ *
+ * Always counting the newitem's table block as an LP_DEAD related block makes
+ * sense because the cost is consistently low; it is practically certain that
+ * the table block will not incur a buffer miss in tableam. On the other hand
+ * the benefit is often quite high. There is a decent chance that there will
+ * be some deletable items from this block, since in general most garbage
+ * tuples became garbage in the recent past (in many cases this won't be the
+ * first logical row that core code added to/modified in table block
+ * recently).
+ *
+ * Returns final array, and sets *nblocks to its final size for caller.
+ */
+static BlockNumber *
+_bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
+ IndexTuple newitem, int *nblocks)
+{
+ int spacentids,
+ ntids;
+ BlockNumber *tidblocks;
+
+ /*
+ * Accumulate each TID's block in array whose initial size has space for
+ * one table block per LP_DEAD-set tuple (plus space for the newitem table
+ * block). Array will only need to grow when there are LP_DEAD-marked
+ * posting list tuples (which is not that common).
+ */
+ spacentids = ndeletable + 1;
+ ntids = 0;
+ tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
+
+ /*
+ * First add the table block for the incoming newitem. This is the one
+ * case where simple deletion can visit a table block that doesn't have
+ * any known deletable items.
+ */
+ Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
+ tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
+
+ for (int i = 0; i < ndeletable; i++)
+ {
+ ItemId itemid = PageGetItemId(page, deletable[i]);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+
+ Assert(ItemIdIsDead(itemid));
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ if (ntids + 1 > spacentids)
+ {
+ spacentids *= 2;
+ tidblocks = (BlockNumber *)
+ repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
+ }
+
+ tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
+ }
+ else
+ {
+ int nposting = BTreeTupleGetNPosting(itup);
+
+ if (ntids + nposting > spacentids)
+ {
+ spacentids = Max(spacentids * 2, ntids + nposting);
+ tidblocks = (BlockNumber *)
+ repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
+ }
+
+ for (int j = 0; j < nposting; j++)
+ {
+ ItemPointer tid = BTreeTupleGetPostingN(itup, j);
+
+ tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
+ }
+ }
+ }
+
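+ /* Sort and de-duplicate so that callers can bsearch the block array */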
+ qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
+ *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
+
+ return tidblocks;
+}
+
+/*
+ * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
+ */
+static inline int
+_bt_blk_cmp(const void *arg1, const void *arg2)
+{
+ BlockNumber b1 = *((BlockNumber *) arg1);
+ BlockNumber b2 = *((BlockNumber *) arg2);
+
+ if (b1 < b2)
+ return -1;
+ else if (b1 > b2)
+ return 1;
+
+ return 0;
+}