Diffstat (limited to 'src/backend/access/nbtree/nbtsearch.c')
-rw-r--r--	src/backend/access/nbtree/nbtsearch.c	2501
1 file changed, 2501 insertions, 0 deletions
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
new file mode 100644
index 0000000..fdf0e56
--- /dev/null
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -0,0 +1,2501 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtsearch.c
+ * Search code for postgres btrees.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtsearch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/relscan.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/predicate.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+
+
+static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
+static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
+static int _bt_binsrch_posting(BTScanInsert key, Page page,
+ OffsetNumber offnum);
+static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
+ OffsetNumber offnum);
+static void _bt_saveitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, IndexTuple itup);
+static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, ItemPointer heapTid,
+ IndexTuple itup);
+static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum,
+ ItemPointer heapTid, int tupleOffset);
+static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
+static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
+static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
+ ScanDirection dir);
+static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
+static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
+static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
+
+
+/*
+ * _bt_drop_lock_and_maybe_pin()
+ *
+ * Unlock the buffer; and if it is safe to release the pin, do that, too. It
+ * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
+ * This will prevent vacuum from stalling in a blocked state trying to read a
+ * page when a cursor is sitting on it -- at least in many important cases.
+ *
+ * Set the buffer to invalid if the pin is released, since the buffer may be
+ * re-used. If we need to go back to this block (for example, to apply
+ * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it
+ * will remain in shared memory for as long as it takes to scan the index
+ * buffer page.
+ */
+static void
+_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
+{
+ _bt_unlockbuf(scan->indexRelation, sp->buf);
+
+ if (IsMVCCSnapshot(scan->xs_snapshot) &&
+ RelationNeedsWAL(scan->indexRelation) &&
+ !scan->xs_want_itup)
+ {
+ ReleaseBuffer(sp->buf);
+ sp->buf = InvalidBuffer;
+ }
+}
+
+/*
+ * _bt_search() -- Search the tree for a particular scankey,
+ * or more precisely for the first leaf page it could be on.
+ *
+ * The passed scankey is an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
+ * Return value is a stack of parent-page pointers (i.e. there is no entry for
+ * the leaf level/page). *bufP is set to the address of the leaf-page buffer,
+ * which is locked and pinned. No locks are held on the parent pages,
+ * however!
+ *
+ * If the snapshot parameter is not NULL, "old snapshot" checking will take
+ * place during the descent through the tree. This is not needed when
+ * positioning for an insert or delete, so NULL is used for those cases.
+ *
+ * The returned buffer is locked according to access parameter. Additionally,
+ * access = BT_WRITE will allow an empty root page to be created and returned.
+ * When access = BT_READ, an empty index will result in *bufP being set to
+ * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
+ * during the search will be finished.
+ */
+BTStack
+_bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
+ Snapshot snapshot)
+{
+ BTStack stack_in = NULL;
+ int page_access = BT_READ;
+
+ /* Get the root page to start with */
+ *bufP = _bt_getroot(rel, access);
+
+ /* If index is empty and access = BT_READ, no root page is created. */
+ if (!BufferIsValid(*bufP))
+ return (BTStack) NULL;
+
+ /* Loop iterates once per level descended in the tree */
+ for (;;)
+ {
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber offnum;
+ ItemId itemid;
+ IndexTuple itup;
+ BlockNumber child;
+ BTStack new_stack;
+
+ /*
+ * Race -- the page we just grabbed may have split since we read its
+ * downlink in its parent page (or the metapage). If it has, we may
+ * need to move right to its new sibling. Do that.
+ *
+ * In write-mode, allow _bt_moveright to finish any incomplete splits
+ * along the way. Strictly speaking, we'd only need to finish an
+ * incomplete split on the leaf page we're about to insert to, not on
+ * any of the upper levels (internal pages with incomplete splits are
+ * also taken care of in _bt_getstackbuf). But this is a good
+ * opportunity to finish splits of internal pages too.
+ */
+ *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
+ page_access, snapshot);
+
+ /* if this is a leaf page, we're done */
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_ISLEAF(opaque))
+ break;
+
+ /*
+ * Find the appropriate pivot tuple on this page. Its downlink points
+ * to the child page that we're about to descend to.
+ */
+ offnum = _bt_binsrch(rel, key, *bufP);
+ itemid = PageGetItemId(page, offnum);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
+ child = BTreeTupleGetDownLink(itup);
+
+ /*
+ * We need to save the location of the pivot tuple we chose in a new
+ * stack entry for this page/level. If caller ends up splitting a
+ * page one level down, it usually ends up inserting a new pivot
+ * tuple/downlink immediately after the location recorded here.
+ */
+ new_stack = (BTStack) palloc(sizeof(BTStackData));
+ new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
+ new_stack->bts_offset = offnum;
+ new_stack->bts_parent = stack_in;
+
+ /*
+ * Page level 1 is the lowest non-leaf level, immediately above the
+ * leaves. So, if we're on level 1 and the caller asked for the leaf
+ * page to be locked in write mode, lock the next page (which must be
+ * a leaf) in write mode.
+ */
+ if (opaque->btpo_level == 1 && access == BT_WRITE)
+ page_access = BT_WRITE;
+
+ /* drop the read lock on the page, then acquire one on its child */
+ *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
+
+ /* okay, all set to move down a level */
+ stack_in = new_stack;
+ }
+
+ /*
+ * If we're asked to lock leaf in write mode, but didn't manage to, then
+ * relock. This should only happen when the root page is a leaf page (and
+ * the only page in the index other than the metapage).
+ */
+ if (access == BT_WRITE && page_access == BT_READ)
+ {
+ /* trade in our read lock for a write lock */
+ _bt_unlockbuf(rel, *bufP);
+ _bt_lockbuf(rel, *bufP, BT_WRITE);
+
+ /*
+ * Race -- the leaf page may have split after we dropped the read lock
+ * but before we acquired a write lock. If it has, we may need to
+ * move right to its new sibling. Do that.
+ */
+ *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
+ snapshot);
+ }
+
+ return stack_in;
+}
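For reference, the read-only descent pattern that callers in this file follow (see _bt_first() below) looks roughly like the sketch here. It assumes rel, inskey, and snapshot are already set up, and it discards the parent-pointer stack immediately, since a plain index scan never revisits parent pages.

    /* Illustrative sketch only -- mirrors the usage in _bt_first() below */
    Buffer       buf;
    BTStack      stack;
    OffsetNumber offnum;

    stack = _bt_search(rel, &inskey, &buf, BT_READ, snapshot);
    _bt_freestack(stack);       /* parent-page pointers not needed for a scan */

    if (!BufferIsValid(buf))
        return false;           /* index is completely empty */

    /* buf is the pinned, read-locked leaf page; locate the start position */
    offnum = _bt_binsrch(rel, &inskey, buf);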
+
+/*
+ * _bt_moveright() -- move right in the btree if necessary.
+ *
+ * When we follow a pointer to reach a page, it is possible that
+ * the page has changed in the meanwhile. If this happens, we're
+ * guaranteed that the page has "split right" -- that is, that any
+ * data that appeared on the page originally is either on the page
+ * or strictly to the right of it.
+ *
+ * This routine decides whether or not we need to move right in the
+ * tree by examining the high key entry on the page. If that entry is
+ * strictly less than the scankey, or <= the scankey in the
+ * key.nextkey=true case, then we followed the wrong link and we need
+ * to move right.
+ *
+ * The passed insertion-type scankey can omit the rightmost column(s) of the
+ * index. (see nbtree/README)
+ *
+ * When key.nextkey is false (the usual case), we are looking for the first
+ * item >= key. When key.nextkey is true, we are looking for the first item
+ * strictly greater than key.
+ *
+ * If forupdate is true, we will attempt to finish any incomplete splits
+ * that we encounter. This is required when locking a target page for an
+ * insertion, because we don't allow inserting on a page before the split
+ * is completed. 'stack' is only used if forupdate is true.
+ *
+ * On entry, we have the buffer pinned and a lock of the type specified by
+ * 'access'. If we move right, we release the buffer and lock and acquire
+ * the same on the right sibling. Return value is the buffer we stop at.
+ *
+ * If the snapshot parameter is not NULL, "old snapshot" checking will take
+ * place during the descent through the tree. This is not needed when
+ * positioning for an insert or delete, so NULL is used for those cases.
+ */
+Buffer
+_bt_moveright(Relation rel,
+ BTScanInsert key,
+ Buffer buf,
+ bool forupdate,
+ BTStack stack,
+ int access,
+ Snapshot snapshot)
+{
+ Page page;
+ BTPageOpaque opaque;
+ int32 cmpval;
+
+ /*
+ * When nextkey = false (normal case): if the scan key that brought us to
+ * this page is > the high key stored on the page, then the page has split
+ * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
+ * have some duplicates to the right as well as the left, but that's
+ * something that's only ever dealt with on the leaf level, after
+ * _bt_search has found an initial leaf page.)
+ *
+ * When nextkey = true: move right if the scan key is >= page's high key.
+ * (Note that key.scantid cannot be set in this case.)
+ *
+ * The page could even have split more than once, so scan as far as
+ * needed.
+ *
+ * We also have to move right if we followed a link that brought us to a
+ * dead page.
+ */
+ cmpval = key->nextkey ? 0 : 1;
+
+ for (;;)
+ {
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (P_RIGHTMOST(opaque))
+ break;
+
+ /*
+ * Finish any incomplete splits we encounter along the way.
+ */
+ if (forupdate && P_INCOMPLETE_SPLIT(opaque))
+ {
+ BlockNumber blkno = BufferGetBlockNumber(buf);
+
+ /* upgrade our lock if necessary */
+ if (access == BT_READ)
+ {
+ _bt_unlockbuf(rel, buf);
+ _bt_lockbuf(rel, buf, BT_WRITE);
+ }
+
+ if (P_INCOMPLETE_SPLIT(opaque))
+ _bt_finish_split(rel, buf, stack);
+ else
+ _bt_relbuf(rel, buf);
+
+ /* re-acquire the lock in the right mode, and re-check */
+ buf = _bt_getbuf(rel, blkno, access);
+ continue;
+ }
+
+ if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
+ {
+ /* step right one page */
+ buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
+ continue;
+ }
+ else
+ break;
+ }
+
+ if (P_IGNORE(opaque))
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+
+ return buf;
+}
+
+/*
+ * _bt_binsrch() -- Do a binary search for a key on a particular page.
+ *
+ * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
+ * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
+ * particular, this means it is possible to return a value 1 greater than the
+ * number of keys on the page, if the scankey is > all keys on the page.)
+ *
+ * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
+ * of the last key < given scankey, or last key <= given scankey if nextkey
+ * is true. (Since _bt_compare treats the first data key of such a page as
+ * minus infinity, there will be at least one key < scankey, so the result
+ * always points at one of the keys on the page.) This key indicates the
+ * right place to descend to be sure we find all leaf keys >= given scankey
+ * (or leaf keys > given scankey when nextkey is true).
+ *
+ * This procedure is not responsible for walking right, it just examines
+ * the given page. _bt_binsrch() has no lock or refcount side effects
+ * on the buffer.
+ */
+static OffsetNumber
+_bt_binsrch(Relation rel,
+ BTScanInsert key,
+ Buffer buf)
+{
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber low,
+ high;
+ int32 result,
+ cmpval;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* Requesting nextkey semantics while using scantid seems nonsensical */
+ Assert(!key->nextkey || key->scantid == NULL);
+ /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
+ Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
+
+ low = P_FIRSTDATAKEY(opaque);
+ high = PageGetMaxOffsetNumber(page);
+
+ /*
+ * If there are no keys on the page, return the first available slot. Note
+ * this covers two cases: the page is really empty (no keys), or it
+ * contains only a high key. The latter case is possible after vacuuming.
+ * This can never happen on an internal page, however, since they are
+ * never empty (an internal page must have children).
+ */
+ if (unlikely(high < low))
+ return low;
+
+ /*
+ * Binary search to find the first key on the page >= scan key, or first
+ * key > scankey when nextkey is true.
+ *
+ * For nextkey=false (cmpval=1), the loop invariant is: all slots before
+ * 'low' are < scan key, all slots at or after 'high' are >= scan key.
+ *
+ * For nextkey=true (cmpval=0), the loop invariant is: all slots before
+ * 'low' are <= scan key, all slots at or after 'high' are > scan key.
+ *
+ * We can fall out when high == low.
+ */
+ high++; /* establish the loop invariant for high */
+
+ cmpval = key->nextkey ? 0 : 1; /* select comparison value */
+
+ while (high > low)
+ {
+ OffsetNumber mid = low + ((high - low) / 2);
+
+ /* We have low <= mid < high, so mid points at a real slot */
+
+ result = _bt_compare(rel, key, page, mid);
+
+ if (result >= cmpval)
+ low = mid + 1;
+ else
+ high = mid;
+ }
+
+ /*
+ * At this point we have high == low, but be careful: they could point
+ * past the last slot on the page.
+ *
+ * On a leaf page, we always return the first key >= scan key (resp. >
+ * scan key), which could be the last slot + 1.
+ */
+ if (P_ISLEAF(opaque))
+ return low;
+
+ /*
+ * On a non-leaf page, return the last key < scan key (resp. <= scan key).
+ * There must be one if _bt_compare() is playing by the rules.
+ */
+ Assert(low > P_FIRSTDATAKEY(opaque));
+
+ return OffsetNumberPrev(low);
+}
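The loop above is a textbook lower-bound binary search. As a standalone illustration (plain C, no PostgreSQL types; the function name is made up), the same invariant on a sorted int array looks like this:

    /*
     * Everything before 'low' is < key; everything at or after 'high' is
     * >= key.  The loop ends with low == high, pointing at the first
     * element >= key -- possibly one past the end of the array.
     */
    static int
    lower_bound(const int *arr, int nelem, int key)
    {
        int     low = 0;
        int     high = nelem;   /* one past the last slot */

        while (high > low)
        {
            int     mid = low + (high - low) / 2;

            if (arr[mid] < key)
                low = mid + 1;
            else
                high = mid;
        }
        return low;
    }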
+
+/*
+ * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
+ *
+ * Like _bt_binsrch(), but with support for caching the binary search
+ * bounds. Only used during insertion, and only on the leaf page that it
+ * looks like caller will insert tuple on. Exclusive-locked and pinned
+ * leaf page is contained within insertstate.
+ *
+ * Caches the bounds fields in insertstate so that a subsequent call can
+ * reuse the low and strict high bounds of original binary search. Callers
+ * that use these fields directly must be prepared for the case where low
+ * and/or stricthigh are not on the same page (one or both exceed maxoff
+ * for the page). The case where there are no items on the page (high <
+ * low) makes bounds invalid.
+ *
+ * Caller is responsible for invalidating bounds when it modifies the page
+ * before calling here a second time, and for dealing with posting list
+ * tuple matches (callers can use insertstate's postingoff field to
+ * determine which existing heap TID will need to be replaced by a posting
+ * list split).
+ */
+OffsetNumber
+_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
+{
+ BTScanInsert key = insertstate->itup_key;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber low,
+ high,
+ stricthigh;
+ int32 result,
+ cmpval;
+
+ page = BufferGetPage(insertstate->buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ Assert(P_ISLEAF(opaque));
+ Assert(!key->nextkey);
+ Assert(insertstate->postingoff == 0);
+
+ if (!insertstate->bounds_valid)
+ {
+ /* Start new binary search */
+ low = P_FIRSTDATAKEY(opaque);
+ high = PageGetMaxOffsetNumber(page);
+ }
+ else
+ {
+ /* Restore result of previous binary search against same page */
+ low = insertstate->low;
+ high = insertstate->stricthigh;
+ }
+
+ /* If there are no keys on the page, return the first available slot */
+ if (unlikely(high < low))
+ {
+ /* Caller can't reuse bounds */
+ insertstate->low = InvalidOffsetNumber;
+ insertstate->stricthigh = InvalidOffsetNumber;
+ insertstate->bounds_valid = false;
+ return low;
+ }
+
+ /*
+ * Binary search to find the first key on the page >= scan key. (nextkey
+ * is always false when inserting).
+ *
+ * The loop invariant is: all slots before 'low' are < scan key, all slots
+ * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
+ * maintained to save additional search effort for caller.
+ *
+ * We can fall out when high == low.
+ */
+ if (!insertstate->bounds_valid)
+ high++; /* establish the loop invariant for high */
+ stricthigh = high; /* high initially strictly higher */
+
+ cmpval = 1; /* !nextkey comparison value */
+
+ while (high > low)
+ {
+ OffsetNumber mid = low + ((high - low) / 2);
+
+ /* We have low <= mid < high, so mid points at a real slot */
+
+ result = _bt_compare(rel, key, page, mid);
+
+ if (result >= cmpval)
+ low = mid + 1;
+ else
+ {
+ high = mid;
+ if (result != 0)
+ stricthigh = high;
+ }
+
+ /*
+ * If tuple at offset located by binary search is a posting list whose
+ * TID range overlaps with caller's scantid, perform posting list
+ * binary search to set postingoff for caller. Caller must split the
+ * posting list when postingoff is set. This should happen
+ * infrequently.
+ */
+ if (unlikely(result == 0 && key->scantid != NULL))
+ {
+ /*
+ * postingoff should never be set more than once per leaf page
+ * binary search. That would mean that there are duplicate table
+ * TIDs in the index, which is never okay. Check for that here.
+ */
+ if (insertstate->postingoff != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
+ ItemPointerGetBlockNumber(key->scantid),
+ ItemPointerGetOffsetNumber(key->scantid),
+ low, stricthigh,
+ BufferGetBlockNumber(insertstate->buf),
+ RelationGetRelationName(rel))));
+
+ insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
+ }
+ }
+
+ /*
+ * On a leaf page, a binary search always returns the first key >= scan
+ * key (at least in !nextkey case), which could be the last slot + 1. This
+ * is also the lower bound of cached search.
+ *
+ * stricthigh may also be the last slot + 1, which prevents caller from
+ * using bounds directly, but is still useful to us if we're called a
+ * second time with cached bounds (cached low will be < stricthigh when
+ * that happens).
+ */
+ insertstate->low = low;
+ insertstate->stricthigh = stricthigh;
+ insertstate->bounds_valid = true;
+
+ return low;
+}
+
+/*----------
+ * _bt_binsrch_posting() -- posting list binary search.
+ *
+ * Helper routine for _bt_binsrch_insert().
+ *
+ * Returns offset into posting list where caller's scantid belongs.
+ *----------
+ */
+static int
+_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
+{
+ IndexTuple itup;
+ ItemId itemid;
+ int low,
+ high,
+ mid,
+ res;
+
+ /*
+ * If this isn't a posting tuple, then the index must be corrupt (if it is
+ * an ordinary non-pivot tuple then there must be an existing tuple with a
+ * heap TID that equals inserter's new heap TID/scantid). Defensively
+ * check that tuple is a posting list tuple whose posting list range
+ * includes caller's scantid.
+ *
+ * (This is also needed because contrib/amcheck's rootdescend option needs
+ * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
+ */
+ itemid = PageGetItemId(page, offnum);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ if (!BTreeTupleIsPosting(itup))
+ return 0;
+
+ Assert(key->heapkeyspace && key->allequalimage);
+
+ /*
+ * In the event that posting list tuple has LP_DEAD bit set, indicate this
+ * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
+ * second call to _bt_binsrch_insert() can take place when its caller has
+ * removed the dead item.
+ */
+ if (ItemIdIsDead(itemid))
+ return -1;
+
+ /* "high" is past end of posting list for loop invariant */
+ low = 0;
+ high = BTreeTupleGetNPosting(itup);
+ Assert(high >= 2);
+
+ while (high > low)
+ {
+ mid = low + ((high - low) / 2);
+ res = ItemPointerCompare(key->scantid,
+ BTreeTupleGetPostingN(itup, mid));
+
+ if (res > 0)
+ low = mid + 1;
+ else if (res < 0)
+ high = mid;
+ else
+ return mid;
+ }
+
+ /* Exact match not found */
+ return low;
+}
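To make the return value concrete, here is a short trace of the loop above with made-up TIDs:

    /*
     * Posting list TIDs: [(1,1), (1,5), (2,3)], scantid = (1,7).
     *   mid = 1: (1,7) > (1,5)  -> low  = 2
     *   mid = 2: (1,7) < (2,3)  -> high = 2
     * Loop ends; return 2, the posting-list offset where (1,7) belongs.
     */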
+
+/*----------
+ * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
+ *
+ * page/offnum: location of btree item to be compared to.
+ *
+ * This routine returns:
+ * <0 if scankey < tuple at offnum;
+ * 0 if scankey == tuple at offnum;
+ * >0 if scankey > tuple at offnum.
+ *
+ * NULLs in the keys are treated as sortable values. Therefore
+ * "equality" does not necessarily mean that the item should be returned
+ * to the caller as a matching key. Similarly, an insertion scankey
+ * with its scantid set is treated as equal to a posting tuple whose TID
+ * range overlaps with their scantid. There generally won't be a
+ * matching TID in the posting tuple, which caller must handle
+ * themselves (e.g., by splitting the posting list tuple).
+ *
+ * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
+ * "minus infinity": this routine will always claim it is less than the
+ * scankey. The actual key value stored is explicitly truncated to 0
+ * attributes (explicitly minus infinity) with version 3+ indexes, but
+ * that isn't relied upon. This allows us to implement the Lehman and
+ * Yao convention that the first down-link pointer is before the first
+ * key. See backend/access/nbtree/README for details.
+ *----------
+ */
+int32
+_bt_compare(Relation rel,
+ BTScanInsert key,
+ Page page,
+ OffsetNumber offnum)
+{
+ TupleDesc itupdesc = RelationGetDescr(rel);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ IndexTuple itup;
+ ItemPointer heapTid;
+ ScanKey scankey;
+ int ncmpkey;
+ int ntupatts;
+ int32 result;
+
+ Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
+ Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
+ Assert(key->heapkeyspace || key->scantid == NULL);
+
+ /*
+ * Force result ">" if target item is first data item on an internal page
+ * --- see NOTE above.
+ */
+ if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
+ return 1;
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ ntupatts = BTreeTupleGetNAtts(itup, rel);
+
+ /*
+ * The scan key is set up with the attribute number associated with each
+ * term in the key. It is important that, if the index is multi-key, the
+ * scan contain the first k key attributes, and that they be in order. If
+ * you think about how multi-key ordering works, you'll understand why
+ * this is.
+ *
+ * We don't test for violation of this condition here, however. The
+ * initial setup for the index scan had better have gotten it right (see
+ * _bt_first).
+ */
+
+ ncmpkey = Min(ntupatts, key->keysz);
+ Assert(key->heapkeyspace || ncmpkey == key->keysz);
+ Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
+ scankey = key->scankeys;
+ for (int i = 1; i <= ncmpkey; i++)
+ {
+ Datum datum;
+ bool isNull;
+
+ datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
+
+ if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
+ {
+ if (isNull)
+ result = 0; /* NULL "=" NULL */
+ else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
+ result = -1; /* NULL "<" NOT_NULL */
+ else
+ result = 1; /* NULL ">" NOT_NULL */
+ }
+ else if (isNull) /* key is NOT_NULL and item is NULL */
+ {
+ if (scankey->sk_flags & SK_BT_NULLS_FIRST)
+ result = 1; /* NOT_NULL ">" NULL */
+ else
+ result = -1; /* NOT_NULL "<" NULL */
+ }
+ else
+ {
+ /*
+ * The sk_func needs to be passed the index value as left arg and
+ * the sk_argument as right arg (they might be of different
+ * types). Since it is convenient for callers to think of
+ * _bt_compare as comparing the scankey to the index item, we have
+ * to flip the sign of the comparison result. (Unless it's a DESC
+ * column, in which case we *don't* flip the sign.)
+ */
+ result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
+ scankey->sk_collation,
+ datum,
+ scankey->sk_argument));
+
+ if (!(scankey->sk_flags & SK_BT_DESC))
+ INVERT_COMPARE_RESULT(result);
+ }
+
+ /* if the keys are unequal, return the difference */
+ if (result != 0)
+ return result;
+
+ scankey++;
+ }
+
+ /*
+ * All non-truncated attributes (other than heap TID) were found to be
+ * equal. Treat truncated attributes as minus infinity when scankey has a
+ * key attribute value that would otherwise be compared directly.
+ *
+ * Note: it doesn't matter if ntupatts includes non-key attributes;
+ * scankey won't, so explicitly excluding non-key attributes isn't
+ * necessary.
+ */
+ if (key->keysz > ntupatts)
+ return 1;
+
+ /*
+ * Use the heap TID attribute and scantid to try to break the tie. The
+ * rules are the same as any other key attribute -- only the
+ * representation differs.
+ */
+ heapTid = BTreeTupleGetHeapTID(itup);
+ if (key->scantid == NULL)
+ {
+ /*
+ * Most searches have a scankey that is considered greater than a
+ * truncated pivot tuple if and when the scankey has equal values for
+ * attributes up to and including the least significant untruncated
+ * attribute in tuple.
+ *
+ * For example, if an index has the minimum two attributes (single
+ * user key attribute, plus heap TID attribute), and a page's high key
+ * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
+ * will not descend to the page to the left. The search will descend
+ * right instead. The truncated attribute in pivot tuple means that
+ * all non-pivot tuples on the page to the left are strictly < 'foo',
+ * so it isn't necessary to descend left. In other words, search
+ * doesn't have to descend left because it isn't interested in a match
+ * that has a heap TID value of -inf.
+ *
+ * However, some searches (pivotsearch searches) actually require that
+ * we descend left when this happens. -inf is treated as a possible
+ * match for omitted scankey attribute(s). This is needed by page
+ * deletion, which must re-find leaf pages that are targets for
+ * deletion using their high keys.
+ *
+ * Note: the heap TID part of the test ensures that scankey is being
+ * compared to a pivot tuple with one or more truncated key
+ * attributes.
+ *
+ * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
+ * left here, since they have no heap TID attribute (and cannot have
+ * any -inf key values in any case, since truncation can only remove
+ * non-key attributes). !heapkeyspace searches must always be
+ * prepared to deal with matches on both sides of the pivot once the
+ * leaf level is reached.
+ */
+ if (key->heapkeyspace && !key->pivotsearch &&
+ key->keysz == ntupatts && heapTid == NULL)
+ return 1;
+
+ /* All provided scankey arguments found to be equal */
+ return 0;
+ }
+
+ /*
+ * Treat truncated heap TID as minus infinity, since scankey has a key
+ * attribute value (scantid) that would otherwise be compared directly
+ */
+ Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
+ if (heapTid == NULL)
+ return 1;
+
+ /*
+ * Scankey must be treated as equal to a posting list tuple if its scantid
+ * value falls within the range of the posting list. In all other cases
+ * there can only be a single heap TID value, which is compared directly
+ * with scantid.
+ */
+ Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
+ result = ItemPointerCompare(key->scantid, heapTid);
+ if (result <= 0 || !BTreeTupleIsPosting(itup))
+ return result;
+ else
+ {
+ result = ItemPointerCompare(key->scantid,
+ BTreeTupleGetMaxHeapTID(itup));
+ if (result > 0)
+ return 1;
+ }
+
+ return 0;
+}
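To summarize the return convention with concrete (hypothetical) values, consider a two-attribute index on (a, b) whose leaf tuple at offnum is (a=5, b=10) with heap TID (2,7):

    /*
     * scankey {a=5}, keysz=1, scantid=NULL     ->  0  (omitted attrs never compared)
     * scankey {a=5, b=10}, scantid=NULL        ->  0
     * scankey {a=5, b=10}, scantid=(2,6)       -> <0  (heap TID breaks the tie)
     * scankey {a=5, b=9}                       -> <0
     * scankey {a=6}                            -> >0
     */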
+
+/*
+ * _bt_first() -- Find the first item in a scan.
+ *
+ * We need to be clever about the direction of scan, the search
+ * conditions, and the tree ordering. We find the first item (or,
+ * if backwards scan, the last item) in the tree that satisfies the
+ * qualifications in the scan key. On success exit, the page containing
+ * the current index tuple is pinned but not locked, and data about
+ * the matching tuple(s) on the page has been loaded into so->currPos.
+ * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
+ * and if requested, scan->xs_itup points to a copy of the index tuple.
+ *
+ * If there are no matching items in the index, we return false, with no
+ * pins or locks held.
+ *
+ * Note that scan->keyData[], and the so->keyData[] scankey built from it,
+ * are both search-type scankeys (see nbtree/README for more about this).
+ * Within this routine, we build a temporary insertion-type scankey to use
+ * in locating the scan start position.
+ */
+bool
+_bt_first(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel = scan->indexRelation;
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Buffer buf;
+ BTStack stack;
+ OffsetNumber offnum;
+ StrategyNumber strat;
+ bool nextkey;
+ bool goback;
+ BTScanInsertData inskey;
+ ScanKey startKeys[INDEX_MAX_KEYS];
+ ScanKeyData notnullkeys[INDEX_MAX_KEYS];
+ int keysCount = 0;
+ int i;
+ bool status;
+ StrategyNumber strat_total;
+ BTScanPosItem *currItem;
+ BlockNumber blkno;
+
+ Assert(!BTScanPosIsValid(so->currPos));
+
+ pgstat_count_index_scan(rel);
+
+ /*
+ * Examine the scan keys and eliminate any redundant keys; also mark the
+ * keys that must be matched to continue the scan.
+ */
+ _bt_preprocess_keys(scan);
+
+ /*
+ * Quit now if _bt_preprocess_keys() discovered that the scan keys can
+ * never be satisfied (eg, x == 1 AND x > 2).
+ */
+ if (!so->qual_ok)
+ {
+ /* Notify any other workers that we're done with this scan key. */
+ _bt_parallel_done(scan);
+ return false;
+ }
+
+ /*
+ * For parallel scans, get the starting page from shared state. If the
+ * scan has not started, proceed to find the first leaf page in the usual
+ * way while keeping other participating processes waiting. If the scan
+ * has already begun, use the page number from the shared structure.
+ */
+ if (scan->parallel_scan != NULL)
+ {
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ return false;
+ else if (blkno == P_NONE)
+ {
+ _bt_parallel_done(scan);
+ return false;
+ }
+ else if (blkno != InvalidBlockNumber)
+ {
+ if (!_bt_parallel_readpage(scan, blkno, dir))
+ return false;
+ goto readcomplete;
+ }
+ }
+
+ /*----------
+ * Examine the scan keys to discover where we need to start the scan.
+ *
+ * We want to identify the keys that can be used as starting boundaries;
+ * these are =, >, or >= keys for a forward scan or =, <, <= keys for
+ * a backwards scan. We can use keys for multiple attributes so long as
+ * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
+ * a > or < boundary or find an attribute with no boundary (which can be
+ * thought of as the same as "> -infinity"), we can't use keys for any
+ * attributes to its right, because it would break our simplistic notion
+ * of what initial positioning strategy to use.
+ *
+ * When the scan keys include cross-type operators, _bt_preprocess_keys
+ * may not be able to eliminate redundant keys; in such cases we will
+ * arbitrarily pick a usable one for each attribute. This is correct
+ * but possibly not optimal behavior. (For example, with keys like
+ * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
+ * x=5 would be more efficient.) Since the situation only arises given
+ * a poorly-worded query plus an incomplete opfamily, live with it.
+ *
+ * When both equality and inequality keys appear for a single attribute
+ * (again, only possible when cross-type operators appear), we *must*
+ * select one of the equality keys for the starting point, because
+ * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
+ * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
+ * start at x=4, we will fail and stop before reaching x=10. If multiple
+ * equality quals survive preprocessing, however, it doesn't matter which
+ * one we use --- by definition, they are either redundant or
+ * contradictory.
+ *
+ * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
+ * If the index stores nulls at the end of the index we'll be starting
+ * from, and we have no boundary key for the column (which means the key
+ * we deduced NOT NULL from is an inequality key that constrains the other
+ * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
+ * use as a boundary key. If we didn't do this, we might find ourselves
+ * traversing a lot of null entries at the start of the scan.
+ *
+ * In this loop, row-comparison keys are treated the same as keys on their
+ * first (leftmost) columns. We'll add on lower-order columns of the row
+ * comparison below, if possible.
+ *
+ * The selected scan keys (at most one per index column) are remembered by
+ * storing their addresses into the local startKeys[] array.
+ *----------
+ */
+ strat_total = BTEqualStrategyNumber;
+ if (so->numberOfKeys > 0)
+ {
+ AttrNumber curattr;
+ ScanKey chosen;
+ ScanKey impliesNN;
+ ScanKey cur;
+
+ /*
+ * chosen is the so-far-chosen key for the current attribute, if any.
+ * We don't cast the decision in stone until we reach keys for the
+ * next attribute.
+ */
+ curattr = 1;
+ chosen = NULL;
+ /* Also remember any scankey that implies a NOT NULL constraint */
+ impliesNN = NULL;
+
+ /*
+ * Loop iterates from 0 to numberOfKeys inclusive; we use the last
+ * pass to handle after-last-key processing. Actual exit from the
+ * loop is at one of the "break" statements below.
+ */
+ for (cur = so->keyData, i = 0;; cur++, i++)
+ {
+ if (i >= so->numberOfKeys || cur->sk_attno != curattr)
+ {
+ /*
+ * Done looking at keys for curattr. If we didn't find a
+ * usable boundary key, see if we can deduce a NOT NULL key.
+ */
+ if (chosen == NULL && impliesNN != NULL &&
+ ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
+ ScanDirectionIsForward(dir) :
+ ScanDirectionIsBackward(dir)))
+ {
+ /* Yes, so build the key in notnullkeys[keysCount] */
+ chosen = &notnullkeys[keysCount];
+ ScanKeyEntryInitialize(chosen,
+ (SK_SEARCHNOTNULL | SK_ISNULL |
+ (impliesNN->sk_flags &
+ (SK_BT_DESC | SK_BT_NULLS_FIRST))),
+ curattr,
+ ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
+ BTGreaterStrategyNumber :
+ BTLessStrategyNumber),
+ InvalidOid,
+ InvalidOid,
+ InvalidOid,
+ (Datum) 0);
+ }
+
+ /*
+ * If we still didn't find a usable boundary key, quit; else
+ * save the boundary key pointer in startKeys.
+ */
+ if (chosen == NULL)
+ break;
+ startKeys[keysCount++] = chosen;
+
+ /*
+ * Adjust strat_total, and quit if we have stored a > or <
+ * key.
+ */
+ strat = chosen->sk_strategy;
+ if (strat != BTEqualStrategyNumber)
+ {
+ strat_total = strat;
+ if (strat == BTGreaterStrategyNumber ||
+ strat == BTLessStrategyNumber)
+ break;
+ }
+
+ /*
+ * Done if that was the last attribute, or if next key is not
+ * in sequence (implying no boundary key is available for the
+ * next attribute).
+ */
+ if (i >= so->numberOfKeys ||
+ cur->sk_attno != curattr + 1)
+ break;
+
+ /*
+ * Reset for next attr.
+ */
+ curattr = cur->sk_attno;
+ chosen = NULL;
+ impliesNN = NULL;
+ }
+
+ /*
+ * Can we use this key as a starting boundary for this attr?
+ *
+ * If not, does it imply a NOT NULL constraint? (Because
+ * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
+ * *any* inequality key works for that; we need not test.)
+ */
+ switch (cur->sk_strategy)
+ {
+ case BTLessStrategyNumber:
+ case BTLessEqualStrategyNumber:
+ if (chosen == NULL)
+ {
+ if (ScanDirectionIsBackward(dir))
+ chosen = cur;
+ else
+ impliesNN = cur;
+ }
+ break;
+ case BTEqualStrategyNumber:
+ /* override any non-equality choice */
+ chosen = cur;
+ break;
+ case BTGreaterEqualStrategyNumber:
+ case BTGreaterStrategyNumber:
+ if (chosen == NULL)
+ {
+ if (ScanDirectionIsForward(dir))
+ chosen = cur;
+ else
+ impliesNN = cur;
+ }
+ break;
+ }
+ }
+ }
+
+ /*
+ * If we found no usable boundary keys, we have to start from one end of
+ * the tree. Walk down that edge to the first or last key, and scan from
+ * there.
+ */
+ if (keysCount == 0)
+ {
+ bool match;
+
+ match = _bt_endpoint(scan, dir);
+
+ if (!match)
+ {
+ /* No match, so mark (parallel) scan finished */
+ _bt_parallel_done(scan);
+ }
+
+ return match;
+ }
+
+ /*
+ * We want to start the scan somewhere within the index. Set up an
+ * insertion scankey we can use to search for the boundary point we
+ * identified above. The insertion scankey is built using the keys
+ * identified by startKeys[]. (Remaining insertion scankey fields are
+ * initialized after initial-positioning strategy is finalized.)
+ */
+ Assert(keysCount <= INDEX_MAX_KEYS);
+ for (i = 0; i < keysCount; i++)
+ {
+ ScanKey cur = startKeys[i];
+
+ Assert(cur->sk_attno == i + 1);
+
+ if (cur->sk_flags & SK_ROW_HEADER)
+ {
+ /*
+ * Row comparison header: look to the first row member instead.
+ *
+ * The member scankeys are already in insertion format (ie, they
+ * have sk_func = 3-way-comparison function), but we have to watch
+ * out for nulls, which _bt_preprocess_keys didn't check. A null
+ * in the first row member makes the condition unmatchable, just
+ * like qual_ok = false.
+ */
+ ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
+
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ if (subkey->sk_flags & SK_ISNULL)
+ {
+ _bt_parallel_done(scan);
+ return false;
+ }
+ memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
+
+ /*
+ * If the row comparison is the last positioning key we accepted,
+ * try to add additional keys from the lower-order row members.
+ * (If we accepted independent conditions on additional index
+ * columns, we use those instead --- doesn't seem worth trying to
+ * determine which is more restrictive.) Note that this is OK
+ * even if the row comparison is of ">" or "<" type, because the
+ * condition applied to all but the last row member is effectively
+ * ">=" or "<=", and so the extra keys don't break the positioning
+ * scheme. But, by the same token, if we aren't able to use all
+ * the row members, then the part of the row comparison that we
+ * did use has to be treated as just a ">=" or "<=" condition, and
+ * so we'd better adjust strat_total accordingly.
+ */
+ if (i == keysCount - 1)
+ {
+ bool used_all_subkeys = false;
+
+ Assert(!(subkey->sk_flags & SK_ROW_END));
+ for (;;)
+ {
+ subkey++;
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ if (subkey->sk_attno != keysCount + 1)
+ break; /* out-of-sequence, can't use it */
+ if (subkey->sk_strategy != cur->sk_strategy)
+ break; /* wrong direction, can't use it */
+ if (subkey->sk_flags & SK_ISNULL)
+ break; /* can't use null keys */
+ Assert(keysCount < INDEX_MAX_KEYS);
+ memcpy(inskey.scankeys + keysCount, subkey,
+ sizeof(ScanKeyData));
+ keysCount++;
+ if (subkey->sk_flags & SK_ROW_END)
+ {
+ used_all_subkeys = true;
+ break;
+ }
+ }
+ if (!used_all_subkeys)
+ {
+ switch (strat_total)
+ {
+ case BTLessStrategyNumber:
+ strat_total = BTLessEqualStrategyNumber;
+ break;
+ case BTGreaterStrategyNumber:
+ strat_total = BTGreaterEqualStrategyNumber;
+ break;
+ }
+ }
+ break; /* done with outer loop */
+ }
+ }
+ else
+ {
+ /*
+ * Ordinary comparison key. Transform the search-style scan key
+ * to an insertion scan key by replacing the sk_func with the
+ * appropriate btree comparison function.
+ *
+ * If scankey operator is not a cross-type comparison, we can use
+ * the cached comparison function; otherwise gotta look it up in
+ * the catalogs. (That can't lead to infinite recursion, since no
+ * indexscan initiated by syscache lookup will use cross-data-type
+ * operators.)
+ *
+ * We support the convention that sk_subtype == InvalidOid means
+ * the opclass input type; this is a hack to simplify life for
+ * ScanKeyInit().
+ */
+ if (cur->sk_subtype == rel->rd_opcintype[i] ||
+ cur->sk_subtype == InvalidOid)
+ {
+ FmgrInfo *procinfo;
+
+ procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
+ ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
+ cur->sk_flags,
+ cur->sk_attno,
+ InvalidStrategy,
+ cur->sk_subtype,
+ cur->sk_collation,
+ procinfo,
+ cur->sk_argument);
+ }
+ else
+ {
+ RegProcedure cmp_proc;
+
+ cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
+ rel->rd_opcintype[i],
+ cur->sk_subtype,
+ BTORDER_PROC);
+ if (!RegProcedureIsValid(cmp_proc))
+ elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
+ BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
+ cur->sk_attno, RelationGetRelationName(rel));
+ ScanKeyEntryInitialize(inskey.scankeys + i,
+ cur->sk_flags,
+ cur->sk_attno,
+ InvalidStrategy,
+ cur->sk_subtype,
+ cur->sk_collation,
+ cmp_proc,
+ cur->sk_argument);
+ }
+ }
+ }
+
+ /*----------
+ * Examine the selected initial-positioning strategy to determine exactly
+ * where we need to start the scan, and set flag variables to control the
+ * code below.
+ *
+ * If nextkey = false, _bt_search and _bt_binsrch will locate the first
+ * item >= scan key. If nextkey = true, they will locate the first
+ * item > scan key.
+ *
+ * If goback = true, we will then step back one item, while if
+ * goback = false, we will start the scan on the located item.
+ *----------
+ */
+ switch (strat_total)
+ {
+ case BTLessStrategyNumber:
+
+ /*
+ * Find first item >= scankey, then back up one to arrive at last
+ * item < scankey. (Note: this positioning strategy is only used
+ * for a backward scan, so that is always the correct starting
+ * position.)
+ */
+ nextkey = false;
+ goback = true;
+ break;
+
+ case BTLessEqualStrategyNumber:
+
+ /*
+ * Find first item > scankey, then back up one to arrive at last
+ * item <= scankey. (Note: this positioning strategy is only used
+ * for a backward scan, so that is always the correct starting
+ * position.)
+ */
+ nextkey = true;
+ goback = true;
+ break;
+
+ case BTEqualStrategyNumber:
+
+ /*
+ * If a backward scan was specified, need to start with last equal
+ * item not first one.
+ */
+ if (ScanDirectionIsBackward(dir))
+ {
+ /*
+ * This is the same as the <= strategy. We will check at the
+ * end whether the found item is actually =.
+ */
+ nextkey = true;
+ goback = true;
+ }
+ else
+ {
+ /*
+ * This is the same as the >= strategy. We will check at the
+ * end whether the found item is actually =.
+ */
+ nextkey = false;
+ goback = false;
+ }
+ break;
+
+ case BTGreaterEqualStrategyNumber:
+
+ /*
+ * Find first item >= scankey. (This is only used for forward
+ * scans.)
+ */
+ nextkey = false;
+ goback = false;
+ break;
+
+ case BTGreaterStrategyNumber:
+
+ /*
+ * Find first item > scankey. (This is only used for forward
+ * scans.)
+ */
+ nextkey = true;
+ goback = false;
+ break;
+
+ default:
+ /* can't get here, but keep compiler quiet */
+ elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
+ return false;
+ }
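    /*
     * Worked example (illustrative values, not from the code): the qual
     * "x <= 10" is only used for a backward scan, so strat_total =
     * BTLessEqualStrategyNumber gives nextkey = true and goback = true.
     * With leaf items (..., 8, 9, 10, 10, 12, ...), _bt_binsrch() with
     * nextkey = true returns the offset of 12 (the first item > 10), and
     * the OffsetNumberPrev() step further down backs up to the second 10,
     * which is the correct place to start a backward scan.
     */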
+
+ /* Initialize remaining insertion scan key fields */
+ _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
+ inskey.anynullkeys = false; /* unused */
+ inskey.nextkey = nextkey;
+ inskey.pivotsearch = false;
+ inskey.scantid = NULL;
+ inskey.keysz = keysCount;
+
+ /*
+ * Use the manufactured insertion scan key to descend the tree and
+ * position ourselves on the target leaf page.
+ */
+ stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
+
+ /* don't need to keep the stack around... */
+ _bt_freestack(stack);
+
+ if (!BufferIsValid(buf))
+ {
+ /*
+ * We only get here if the index is completely empty. Lock relation
+ * because nothing finer to lock exists.
+ */
+ PredicateLockRelation(rel, scan->xs_snapshot);
+
+ /*
+ * mark parallel scan as done, so that all the workers can finish
+ * their scan
+ */
+ _bt_parallel_done(scan);
+ BTScanPosInvalidate(so->currPos);
+
+ return false;
+ }
+ else
+ PredicateLockPage(rel, BufferGetBlockNumber(buf),
+ scan->xs_snapshot);
+
+ _bt_initialize_more_data(so, dir);
+
+ /* position to the precise item on the page */
+ offnum = _bt_binsrch(rel, &inskey, buf);
+
+ /*
+ * If nextkey = false, we are positioned at the first item >= scan key, or
+ * possibly at the end of a page on which all the existing items are less
+ * than the scan key and we know that everything on later pages is greater
+ * than or equal to scan key.
+ *
+ * If nextkey = true, we are positioned at the first item > scan key, or
+ * possibly at the end of a page on which all the existing items are less
+ * than or equal to the scan key and we know that everything on later
+ * pages is greater than scan key.
+ *
+ * The actually desired starting point is either this item or the prior
+ * one, or in the end-of-page case it's the first item on the next page or
+ * the last item on this page. Adjust the starting offset if needed. (If
+ * this results in an offset before the first item or after the last one,
+ * _bt_readpage will report no items found, and then we'll step to the
+ * next page as needed.)
+ */
+ if (goback)
+ offnum = OffsetNumberPrev(offnum);
+
+ /* remember which buffer we have pinned, if any */
+ Assert(!BTScanPosIsValid(so->currPos));
+ so->currPos.buf = buf;
+
+ /*
+ * Now load data from the first page of the scan.
+ */
+ if (!_bt_readpage(scan, dir, offnum))
+ {
+ /*
+ * There's no actually-matching data on this page. Try to advance to
+ * the next page. Return false if there's no matching data at all.
+ */
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+ if (!_bt_steppage(scan, dir))
+ return false;
+ }
+ else
+ {
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+ }
+
+readcomplete:
+ /* OK, itemIndex says what to return */
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_heaptid = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+ return true;
+}
+
+/*
+ * _bt_next() -- Get the next item in a scan.
+ *
+ * On entry, so->currPos describes the current page, which may be pinned
+ * but is not locked, and so->currPos.itemIndex identifies which item was
+ * previously returned.
+ *
+ * On successful exit, scan->xs_ctup.t_self is set to the TID of the
+ * next heap tuple, and if requested, scan->xs_itup points to a copy of
+ * the index tuple. so->currPos is updated as needed.
+ *
+ * On failure exit (no more tuples), we release pin and set
+ * so->currPos.buf to InvalidBuffer.
+ */
+bool
+_bt_next(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTScanPosItem *currItem;
+
+ /*
+ * Advance to next tuple on current page; or if there's no more, try to
+ * step to the next page with data.
+ */
+ if (ScanDirectionIsForward(dir))
+ {
+ if (++so->currPos.itemIndex > so->currPos.lastItem)
+ {
+ if (!_bt_steppage(scan, dir))
+ return false;
+ }
+ }
+ else
+ {
+ if (--so->currPos.itemIndex < so->currPos.firstItem)
+ {
+ if (!_bt_steppage(scan, dir))
+ return false;
+ }
+ }
+
+ /* OK, itemIndex says what to return */
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_heaptid = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+ return true;
+}
+
+/*
+ * _bt_readpage() -- Load data from current index page into so->currPos
+ *
+ * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
+ * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
+ * they are updated as appropriate. All other fields of so->currPos are
+ * initialized from scratch here.
+ *
+ * We scan the current page starting at offnum and moving in the indicated
+ * direction. All items matching the scan keys are loaded into currPos.items.
+ * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
+ * that there can be no more matching tuples in the current scan direction.
+ *
+ * In the case of a parallel scan, caller must have called _bt_parallel_seize
+ * prior to calling this function; this function will invoke
+ * _bt_parallel_release before returning.
+ *
+ * Returns true if any matching items found on the page, false if none.
+ */
+static bool
+_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber minoff;
+ OffsetNumber maxoff;
+ int itemIndex;
+ bool continuescan;
+ int indnatts;
+
+ /*
+ * We must have the buffer pinned and locked, but the usual macro can't be
+ * used here; this function is what makes it good for currPos.
+ */
+ Assert(BufferIsValid(so->currPos.buf));
+
+ page = BufferGetPage(so->currPos.buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* allow the next page to be processed by a parallel worker */
+ if (scan->parallel_scan)
+ {
+ if (ScanDirectionIsForward(dir))
+ _bt_parallel_release(scan, opaque->btpo_next);
+ else
+ _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
+ }
+
+ continuescan = true; /* default assumption */
+ indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * We note the buffer's block number so that we can release the pin later.
+ * This allows us to re-read the buffer if it is needed again for hinting.
+ */
+ so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
+
+ /*
+ * We save the LSN of the page as we read it, so that we know whether it
+ * is safe to apply LP_DEAD hints to the page later. This allows us to drop
+ * the pin for MVCC scans, which allows vacuum to avoid blocking.
+ */
+ so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
+
+ /*
+ * we must save the page's right-link while scanning it; this tells us
+ * where to step right to after we're done with these items. There is no
+ * corresponding need for the left-link, since splits always go right.
+ */
+ so->currPos.nextPage = opaque->btpo_next;
+
+ /* initialize tuple workspace to empty */
+ so->currPos.nextTupleOffset = 0;
+
+ /*
+ * Now that the current page has been made consistent, the macro should be
+ * good.
+ */
+ Assert(BTScanPosIsPinned(so->currPos));
+
+ if (ScanDirectionIsForward(dir))
+ {
+ /* load items[] in ascending order */
+ itemIndex = 0;
+
+ offnum = Max(offnum, minoff);
+
+ while (offnum <= maxoff)
+ {
+ ItemId iid = PageGetItemId(page, offnum);
+ IndexTuple itup;
+
+ /*
+ * If the scan specifies not to return killed tuples, then we
+ * treat a killed tuple as not passing the qual
+ */
+ if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+ {
+ offnum = OffsetNumberNext(offnum);
+ continue;
+ }
+
+ itup = (IndexTuple) PageGetItem(page, iid);
+
+ if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
+ {
+ /* tuple passes all scan key conditions */
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Remember it */
+ _bt_saveitem(so, itemIndex, offnum, itup);
+ itemIndex++;
+ }
+ else
+ {
+ int tupleOffset;
+
+ /*
+ * Set up state to return posting list, and remember first
+ * TID
+ */
+ tupleOffset =
+ _bt_setuppostingitems(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, 0),
+ itup);
+ itemIndex++;
+ /* Remember additional TIDs */
+ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ _bt_savepostingitem(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, i),
+ tupleOffset);
+ itemIndex++;
+ }
+ }
+ }
+ /* When !continuescan, there can't be any more matches, so stop */
+ if (!continuescan)
+ break;
+
+ offnum = OffsetNumberNext(offnum);
+ }
+
+ /*
+ * We don't need to visit page to the right when the high key
+ * indicates that no more matches will be found there.
+ *
+ * Checking the high key like this works out more often than you might
+ * think. Leaf page splits pick a split point between the two most
+ * dissimilar tuples (this is weighed against the need to evenly share
+ * free space). Leaf pages with high key attribute values that can
+ * only appear on non-pivot tuples on the right sibling page are
+ * common.
+ */
+ if (continuescan && !P_RIGHTMOST(opaque))
+ {
+ ItemId iid = PageGetItemId(page, P_HIKEY);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
+ int truncatt;
+
+ truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
+ _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
+ }
+
+ if (!continuescan)
+ so->currPos.moreRight = false;
+
+ Assert(itemIndex <= MaxTIDsPerBTreePage);
+ so->currPos.firstItem = 0;
+ so->currPos.lastItem = itemIndex - 1;
+ so->currPos.itemIndex = 0;
+ }
+ else
+ {
+ /* load items[] in descending order */
+ itemIndex = MaxTIDsPerBTreePage;
+
+ offnum = Min(offnum, maxoff);
+
+ while (offnum >= minoff)
+ {
+ ItemId iid = PageGetItemId(page, offnum);
+ IndexTuple itup;
+ bool tuple_alive;
+ bool passes_quals;
+
+ /*
+ * If the scan specifies not to return killed tuples, then we
+ * treat a killed tuple as not passing the qual. Most of the
+ * time, it's a win to not bother examining the tuple's index
+ * keys, but just skip to the next tuple (previous, actually,
+ * since we're scanning backwards). However, if this is the first
+ * tuple on the page, we do check the index keys, to prevent
+ * uselessly advancing to the page to the left. This is similar
+ * to the high key optimization used by forward scans.
+ */
+ if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+ {
+ Assert(offnum >= P_FIRSTDATAKEY(opaque));
+ if (offnum > P_FIRSTDATAKEY(opaque))
+ {
+ offnum = OffsetNumberPrev(offnum);
+ continue;
+ }
+
+ tuple_alive = false;
+ }
+ else
+ tuple_alive = true;
+
+ itup = (IndexTuple) PageGetItem(page, iid);
+
+ passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
+ &continuescan);
+ if (passes_quals && tuple_alive)
+ {
+ /* tuple passes all scan key conditions */
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Remember it */
+ itemIndex--;
+ _bt_saveitem(so, itemIndex, offnum, itup);
+ }
+ else
+ {
+ int tupleOffset;
+
+ /*
+ * Set up state to return posting list, and remember first
+ * TID.
+ *
+ * Note that we deliberately save/return items from
+ * posting lists in ascending heap TID order for backwards
+ * scans. This allows _bt_killitems() to make a
+ * consistent assumption about the order of items
+ * associated with the same posting list tuple.
+ */
+ itemIndex--;
+ tupleOffset =
+ _bt_setuppostingitems(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, 0),
+ itup);
+ /* Remember additional TIDs */
+ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ itemIndex--;
+ _bt_savepostingitem(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, i),
+ tupleOffset);
+ }
+ }
+ }
+ if (!continuescan)
+ {
+ /* there can't be any more matches, so stop */
+ so->currPos.moreLeft = false;
+ break;
+ }
+
+ offnum = OffsetNumberPrev(offnum);
+ }
+
+ Assert(itemIndex >= 0);
+ so->currPos.firstItem = itemIndex;
+ so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
+ so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
+ }
+
+ return (so->currPos.firstItem <= so->currPos.lastItem);
+}
+
+/* Save an index item into so->currPos.items[itemIndex] */
+static void
+_bt_saveitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, IndexTuple itup)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
+
+ currItem->heapTid = itup->t_tid;
+ currItem->indexOffset = offnum;
+ if (so->currTuples)
+ {
+ Size itupsz = IndexTupleSize(itup);
+
+ currItem->tupleOffset = so->currPos.nextTupleOffset;
+ memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
+ so->currPos.nextTupleOffset += MAXALIGN(itupsz);
+ }
+}
+
+/*
+ * Set up state to save TIDs/items from a single posting list tuple.
+ *
+ * Saves an index item into so->currPos.items[itemIndex] for the TID that is
+ * returned to the scan first.  Second and subsequent TIDs from the posting
+ * list should be saved by calling _bt_savepostingitem().
+ *
+ * Returns an offset into tuple storage space where the base tuple is stored,
+ * when one is needed (i.e. when so->currTuples is set).
+ */
+static int
+_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
+ ItemPointer heapTid, IndexTuple itup)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ Assert(BTreeTupleIsPosting(itup));
+
+ currItem->heapTid = *heapTid;
+ currItem->indexOffset = offnum;
+ if (so->currTuples)
+ {
+ /* Save base IndexTuple (truncate posting list) */
+ IndexTuple base;
+ Size itupsz = BTreeTupleGetPostingOffset(itup);
+
+ itupsz = MAXALIGN(itupsz);
+ currItem->tupleOffset = so->currPos.nextTupleOffset;
+ base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
+ memcpy(base, itup, itupsz);
+ /* Defensively reduce work area index tuple header size */
+ base->t_info &= ~INDEX_SIZE_MASK;
+ base->t_info |= itupsz;
+ so->currPos.nextTupleOffset += itupsz;
+
+ return currItem->tupleOffset;
+ }
+
+ return 0;
+}
+
+/*
+ * Save an index item into so->currPos.items[itemIndex] for the current
+ * posting tuple.
+ *
+ * Assumes that _bt_setuppostingitems() has already been called for the
+ * current posting list tuple.  Caller passes its return value as tupleOffset.
+ */
+static inline void
+_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
+ ItemPointer heapTid, int tupleOffset)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ currItem->heapTid = *heapTid;
+ currItem->indexOffset = offnum;
+
+ /*
+ * Have index-only scans return the same base IndexTuple for every TID
+ * that originates from the same posting list
+ */
+ if (so->currTuples)
+ currItem->tupleOffset = tupleOffset;
+}
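+
+/*
+ * A rough sketch of the expected calling pattern for the two posting-list
+ * helpers above, as used from _bt_readpage().  For a posting tuple "itup"
+ * at offset "offnum" (nextItemIndex is an invented name; it advances upwards
+ * for forward scans and downwards for backward scans):
+ *
+ *    int tupleOffset;
+ *
+ *    tupleOffset = _bt_setuppostingitems(so, itemIndex, offnum,
+ *                                        BTreeTupleGetPostingN(itup, 0),
+ *                                        itup);
+ *    for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ *        _bt_savepostingitem(so, nextItemIndex, offnum,
+ *                            BTreeTupleGetPostingN(itup, i), tupleOffset);
+ *
+ * Only the first call copies anything into so->currTuples: it stores the
+ * tuple's fixed part with the posting list truncated away, and returns the
+ * offset of that truncated "base" tuple.  The later calls record just a heap
+ * TID plus that shared offset, so an index-only scan hands back the same
+ * base tuple for every TID of the posting list.
+ */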
+
+/*
+ * _bt_steppage() -- Step to next page containing valid data for scan
+ *
+ * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
+ * if it is pinned, we drop the pin before moving to the next page.
+ *
+ * On success for a scan using a non-MVCC snapshot we hold a pin, but not a
+ * read lock, on the new page.  If we do not hold the pin, we set
+ * so->currPos.buf to InvalidBuffer.  We return true to indicate success.
+ */
+static bool
+_bt_steppage(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BlockNumber blkno = InvalidBlockNumber;
+ bool status;
+
+ Assert(BTScanPosIsValid(so->currPos));
+
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _bt_killitems(scan);
+
+ /*
+ * Before we modify currPos, make a copy of the page data if there was a
+ * mark position that needs it.
+ */
+ if (so->markItemIndex >= 0)
+ {
+ /* bump pin on current buffer for assignment to mark buffer */
+ if (BTScanPosIsPinned(so->currPos))
+ IncrBufferRefCount(so->currPos.buf);
+ memcpy(&so->markPos, &so->currPos,
+ offsetof(BTScanPosData, items[1]) +
+ so->currPos.lastItem * sizeof(BTScanPosItem));
+ if (so->markTuples)
+ memcpy(so->markTuples, so->currTuples,
+ so->currPos.nextTupleOffset);
+ so->markPos.itemIndex = so->markItemIndex;
+ so->markItemIndex = -1;
+ }
+
+ if (ScanDirectionIsForward(dir))
+ {
+ /* Walk right to the next page with data */
+ if (scan->parallel_scan != NULL)
+ {
+ /*
+ * Seize the scan to get the next block number; if the scan has
+ * ended already, bail out.
+ */
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ {
+ /* release the previous buffer, if pinned */
+ BTScanPosUnpinIfPinned(so->currPos);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ }
+ else
+ {
+ /* Not parallel, so use the previously-saved nextPage link. */
+ blkno = so->currPos.nextPage;
+ }
+
+ /* Remember we left a page with data */
+ so->currPos.moreLeft = true;
+
+ /* release the previous buffer, if pinned */
+ BTScanPosUnpinIfPinned(so->currPos);
+ }
+ else
+ {
+ /* Remember we left a page with data */
+ so->currPos.moreRight = true;
+
+ if (scan->parallel_scan != NULL)
+ {
+ /*
+ * Seize the scan to get the current block number; if the scan has
+ * ended already, bail out.
+ */
+ status = _bt_parallel_seize(scan, &blkno);
+ BTScanPosUnpinIfPinned(so->currPos);
+ if (!status)
+ {
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ }
+ else
+ {
+ /* Not parallel, so just use our own notion of the current page */
+ blkno = so->currPos.currPage;
+ }
+ }
+
+ if (!_bt_readnextpage(scan, blkno, dir))
+ return false;
+
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+
+ return true;
+}
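+
+/*
+ * A note on the mark-position copy in _bt_steppage() above (the numbers
+ * here are only an example).  The memcpy size
+ *
+ *    offsetof(BTScanPosData, items[1]) + lastItem * sizeof(BTScanPosItem)
+ *
+ * is equivalent to offsetof(BTScanPosData, items) +
+ * (lastItem + 1) * sizeof(BTScanPosItem): the fixed-size head of
+ * BTScanPosData plus items[0] through items[lastItem].  With lastItem == 2,
+ * for instance, the copy covers exactly three item slots and skips the
+ * unused remainder of the items[] array.
+ */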
+
+/*
+ * _bt_readnextpage() -- Read next page containing valid data for scan
+ *
+ * On success exit, so->currPos is updated to contain data from the next
+ * interesting page, and we return true.  The caller is responsible for
+ * releasing the lock and pin on the buffer.
+ *
+ * If there are no more matching records in the given direction, we drop all
+ * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
+ */
+static bool
+_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Relation rel;
+ Page page;
+ BTPageOpaque opaque;
+ bool status;
+
+ rel = scan->indexRelation;
+
+ if (ScanDirectionIsForward(dir))
+ {
+ for (;;)
+ {
+ /*
+ * if we're at end of scan, give up and mark parallel scan as
+ * done, so that all the workers can finish their scan
+ */
+ if (blkno == P_NONE || !so->currPos.moreRight)
+ {
+ _bt_parallel_done(scan);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ /* check for interrupts while we're not holding any buffer lock */
+ CHECK_FOR_INTERRUPTS();
+ /* step right one page */
+ so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(so->currPos.buf);
+ TestForOldSnapshot(scan->xs_snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ /* check for deleted page */
+ if (!P_IGNORE(opaque))
+ {
+ PredicateLockPage(rel, blkno, scan->xs_snapshot);
+ /* see if there are any matches on this page */
+ /* note that this will clear moreRight if we can stop */
+ if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
+ break;
+ }
+ else if (scan->parallel_scan != NULL)
+ {
+ /* allow next page to be processed by a parallel worker */
+ _bt_parallel_release(scan, opaque->btpo_next);
+ }
+
+ /* nope, keep going */
+ if (scan->parallel_scan != NULL)
+ {
+ _bt_relbuf(rel, so->currPos.buf);
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ {
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ }
+ else
+ {
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, so->currPos.buf);
+ }
+ }
+ }
+ else
+ {
+ /*
+ * Should only happen in parallel cases, when some other backend
+ * advanced the scan.
+ */
+ if (so->currPos.currPage != blkno)
+ {
+ BTScanPosUnpinIfPinned(so->currPos);
+ so->currPos.currPage = blkno;
+ }
+
+ /*
+ * Walk left to the next page with data. This is much more complex
+ * than the walk-right case because of the possibility that the page
+ * to our left splits while we are in flight to it, plus the
+ * possibility that the page we were on gets deleted after we leave
+ * it. See nbtree/README for details.
+ *
+ * It might be possible to rearrange this code to have less overhead
+ * in pinning and locking, but that would require capturing the left
+ * pointer when the page is initially read, and using it here, along
+ * with big changes to _bt_walk_left() and the code below. It is not
+ * clear whether this would be a win, since if the page immediately to
+ * the left splits after we read this page and before we step left, we
+ * would need to visit more pages than with the current code.
+ *
+ * Note that if we change the code so that we drop the pin for a scan
+ * which uses a non-MVCC snapshot, we will need to modify the code for
+ * walking left, to allow for the possibility that a referenced page
+ * has been deleted. As long as the buffer is pinned or the snapshot
+ * is MVCC the page cannot move past the half-dead state to fully
+ * deleted.
+ */
+ if (BTScanPosIsPinned(so->currPos))
+ _bt_lockbuf(rel, so->currPos.buf, BT_READ);
+ else
+ so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
+
+ for (;;)
+ {
+ /* Done if we know there are no matching keys to the left */
+ if (!so->currPos.moreLeft)
+ {
+ _bt_relbuf(rel, so->currPos.buf);
+ _bt_parallel_done(scan);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+
+ /* Step to next physical page */
+ so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
+ scan->xs_snapshot);
+
+ /* if we're physically at end of index, return failure */
+ if (so->currPos.buf == InvalidBuffer)
+ {
+ _bt_parallel_done(scan);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+
+ /*
+ * Okay, we managed to move left to a non-deleted page. Done if
+ * it's not half-dead and contains matching tuples. Else loop back
+ * and do it all again.
+ */
+ page = BufferGetPage(so->currPos.buf);
+ TestForOldSnapshot(scan->xs_snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_IGNORE(opaque))
+ {
+ PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
+ /* see if there are any matches on this page */
+ /* note that this will clear moreLeft if we can stop */
+ if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
+ break;
+ }
+ else if (scan->parallel_scan != NULL)
+ {
+ /* allow next page to be processed by a parallel worker */
+ _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
+ }
+
+ /*
+ * For parallel scans, get the last page scanned as it is quite
+ * possible that by the time we try to seize the scan, some other
+ * worker has already advanced the scan to a different page. We
+ * must continue based on the latest page scanned by any worker.
+ */
+ if (scan->parallel_scan != NULL)
+ {
+ _bt_relbuf(rel, so->currPos.buf);
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ {
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+ }
+ }
+ }
+
+ return true;
+}
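+
+/*
+ * A rough sketch of the parallel-scan handshake used by _bt_readnextpage()
+ * above.  Each participating backend loops, approximately:
+ *
+ *    if (!_bt_parallel_seize(scan, &blkno))
+ *        return false;                     // another worker finished the scan
+ *    read and lock page blkno;
+ *    if (P_IGNORE(opaque))
+ *        _bt_parallel_release(scan, next); // let other workers keep advancing
+ *    else if (_bt_readpage(...))
+ *        return true;                      // currPos now holds the matches
+ *    if (scan exhausted in this direction)
+ *        _bt_parallel_done(scan);          // mark the scan complete for all
+ *
+ * Which block "next" refers to differs between forward and backward scans,
+ * as the two branches above show.
+ */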
+
+/*
+ * _bt_parallel_readpage() -- Read current page containing valid data for scan
+ *
+ * On success, the lock (and possibly also the pin) on the buffer is
+ * released, and we return true.
+ */
+static bool
+_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+ _bt_initialize_more_data(so, dir);
+
+ if (!_bt_readnextpage(scan, blkno, dir))
+ return false;
+
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+
+ return true;
+}
+
+/*
+ * _bt_walk_left() -- step left one page, if possible
+ *
+ * The given buffer must be pinned and read-locked. This will be dropped
+ * before stepping left. On return, we have pin and read lock on the
+ * returned page, instead.
+ *
+ * Returns InvalidBuffer if there is no page to the left (no lock is held
+ * in that case).
+ *
+ * When working on a non-leaf level, it is possible for the returned page
+ * to be half-dead; the caller should check that condition and step left
+ * again if it's important.
+ */
+static Buffer
+_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
+{
+ Page page;
+ BTPageOpaque opaque;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ for (;;)
+ {
+ BlockNumber obknum;
+ BlockNumber lblkno;
+ BlockNumber blkno;
+ int tries;
+
+ /* if we're at end of tree, release buf and return failure */
+ if (P_LEFTMOST(opaque))
+ {
+ _bt_relbuf(rel, buf);
+ break;
+ }
+ /* remember original page we are stepping left from */
+ obknum = BufferGetBlockNumber(buf);
+ /* step left */
+ blkno = lblkno = opaque->btpo_prev;
+ _bt_relbuf(rel, buf);
+ /* check for interrupts while we're not holding any buffer lock */
+ CHECK_FOR_INTERRUPTS();
+ buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * If this isn't the page we want, walk right till we find what we
+ * want --- but go no more than four hops (an arbitrary limit). If we
+ * don't find the correct page by then, the most likely bet is that
+ * the original page got deleted and isn't in the sibling chain at all
+ * anymore, not that its left sibling got split more than four times.
+ *
+ * Note that it is correct to test P_ISDELETED not P_IGNORE here,
+ * because half-dead pages are still in the sibling chain. Caller
+ * must reject half-dead pages if wanted.
+ */
+ tries = 0;
+ for (;;)
+ {
+ if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
+ {
+ /* Found desired page, return it */
+ return buf;
+ }
+ if (P_RIGHTMOST(opaque) || ++tries > 4)
+ break;
+ blkno = opaque->btpo_next;
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+
+ /* Return to the original page to see what's up */
+ buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_ISDELETED(opaque))
+ {
+ /*
+ * It was deleted. Move right to first nondeleted page (there
+ * must be one); that is the page that has acquired the deleted
+ * one's keyspace, so stepping left from it will take us where we
+ * want to be.
+ */
+ for (;;)
+ {
+ if (P_RIGHTMOST(opaque))
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+ blkno = opaque->btpo_next;
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_ISDELETED(opaque))
+ break;
+ }
+
+ /*
+ * Now return to top of loop, resetting obknum to point to this
+ * nondeleted page, and try again.
+ */
+ }
+ else
+ {
+ /*
+ * It wasn't deleted; the explanation had better be that the page
+ * to the left got split or deleted. Without this check, we'd go
+ * into an infinite loop if there's anything wrong.
+ */
+ if (opaque->btpo_prev == lblkno)
+ elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
+ obknum, RelationGetRelationName(rel));
+ /* Okay to try again with new lblkno value */
+ }
+ }
+
+ return InvalidBuffer;
+}
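+
+/*
+ * A worked example of the recovery cases handled above (page names P, L, L2
+ * are hypothetical).  Suppose we start on page P whose left link points to L:
+ *
+ *  - If L split into L and a new right sibling L2 while we were in flight,
+ *    P's true left sibling is now L2.  Starting from L and following right
+ *    links, we stop at the page whose btpo_next == P; the four-hop cap only
+ *    matters if L somehow split more than four times.
+ *
+ *  - If that walk right fails, we return to P.  If P itself was deleted in
+ *    the meantime, we move right to the first non-deleted page, which has
+ *    absorbed P's keyspace, and restart the step-left from there.
+ *
+ *  - If P was not deleted, its left link must have changed (L was split or
+ *    deleted), so we simply retry with P's new left link.
+ */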
+
+/*
+ * _bt_get_endpoint() -- Find the first or last page on a given tree level
+ *
+ * If the index is empty, we will return InvalidBuffer; any other failure
+ * condition causes ereport(). We will not return a dead page.
+ *
+ * The returned buffer is pinned and read-locked.
+ */
+Buffer
+_bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
+ Snapshot snapshot)
+{
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber offnum;
+ BlockNumber blkno;
+ IndexTuple itup;
+
+ /*
+ * If we are looking for a leaf page, okay to descend from fast root;
+ * otherwise better descend from true root. (There is no point in being
+ * smarter about intermediate levels.)
+ */
+ if (level == 0)
+ buf = _bt_getroot(rel, BT_READ);
+ else
+ buf = _bt_gettrueroot(rel);
+
+ if (!BufferIsValid(buf))
+ return InvalidBuffer;
+
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ for (;;)
+ {
+ /*
+ * If we landed on a deleted page, step right to find a live page
+ * (there must be one). Also, if we want the rightmost page, step
+ * right if needed to get to it (this could happen if the page split
+ * since we obtained a pointer to it).
+ */
+ while (P_IGNORE(opaque) ||
+ (rightmost && !P_RIGHTMOST(opaque)))
+ {
+ blkno = opaque->btpo_next;
+ if (blkno == P_NONE)
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+
+ /* Done? */
+ if (opaque->btpo_level == level)
+ break;
+ if (opaque->btpo_level < level)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("btree level %u not found in index \"%s\"",
+ level, RelationGetRelationName(rel))));
+
+ /* Descend to leftmost or rightmost child page */
+ if (rightmost)
+ offnum = PageGetMaxOffsetNumber(page);
+ else
+ offnum = P_FIRSTDATAKEY(opaque);
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ blkno = BTreeTupleGetDownLink(itup);
+
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+
+ return buf;
+}
+
+/*
+ * _bt_endpoint() -- Find the first or last page in the index, and scan
+ * from there to the first key satisfying all the quals.
+ *
+ * This is used by _bt_first() to set up a scan when we've determined
+ * that the scan must start at the beginning or end of the index (for
+ * a forward or backward scan respectively). Exit conditions are the
+ * same as for _bt_first().
+ */
+static bool
+_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel = scan->indexRelation;
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber start;
+ BTScanPosItem *currItem;
+
+ /*
+ * Scan down to the leftmost or rightmost leaf page. This is a simplified
+ * version of _bt_search(). We don't maintain a stack since we know we
+ * won't need it.
+ */
+ buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
+
+ if (!BufferIsValid(buf))
+ {
+ /*
+ * Empty index. Lock the whole relation, as nothing finer to lock
+ * exists.
+ */
+ PredicateLockRelation(rel, scan->xs_snapshot);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+
+ PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(P_ISLEAF(opaque));
+
+ if (ScanDirectionIsForward(dir))
+ {
+ /* There could be dead pages to the left, so not this: */
+ /* Assert(P_LEFTMOST(opaque)); */
+
+ start = P_FIRSTDATAKEY(opaque);
+ }
+ else if (ScanDirectionIsBackward(dir))
+ {
+ Assert(P_RIGHTMOST(opaque));
+
+ start = PageGetMaxOffsetNumber(page);
+ }
+ else
+ {
+ elog(ERROR, "invalid scan direction: %d", (int) dir);
+ start = 0; /* keep compiler quiet */
+ }
+
+ /* remember which buffer we have pinned */
+ so->currPos.buf = buf;
+
+ _bt_initialize_more_data(so, dir);
+
+ /*
+ * Now load data from the first page of the scan.
+ */
+ if (!_bt_readpage(scan, dir, start))
+ {
+ /*
+ * There's no actually-matching data on this page. Try to advance to
+ * the next page. Return false if there's no matching data at all.
+ */
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+ if (!_bt_steppage(scan, dir))
+ return false;
+ }
+ else
+ {
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+ }
+
+ /* OK, itemIndex says what to return */
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_heaptid = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+ return true;
+}
+
+/*
+ * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
+ * for scan direction
+ */
+static inline void
+_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
+{
+ /* initialize moreLeft/moreRight appropriately for scan direction */
+ if (ScanDirectionIsForward(dir))
+ {
+ so->currPos.moreLeft = false;
+ so->currPos.moreRight = true;
+ }
+ else
+ {
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = false;
+ }
+ so->numKilled = 0; /* just paranoia */
+ so->markItemIndex = -1; /* ditto */
+}