1 files changed, 1189 insertions, 0 deletions
diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c
new file mode 100644
index 0000000..43b6789
--- /dev/null
+++ b/src/backend/access/nbtree/nbtsplitloc.c
@@ -0,0 +1,1189 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtsplitloc.c
+ *	  Choose split point code for Postgres btree implementation.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/access/nbtree/nbtsplitloc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "storage/lmgr.h"
+
+typedef enum
+{
+	/* strategy for searching through materialized list of split points */
+	SPLIT_DEFAULT,				/* give some weight to truncation */
+	SPLIT_MANY_DUPLICATES,		/* find minimally distinguishing point */
+	SPLIT_SINGLE_VALUE			/* leave left page almost full */
+} FindSplitStrat;
+
+typedef struct
+{
+	/* details of free space left by split */
+	int16		curdelta;		/* current leftfree/rightfree delta */
+	int16		leftfree;		/* space left on left page post-split */
+	int16		rightfree;		/* space left on right page post-split */
+
+	/* split point identifying fields (returned by _bt_findsplitloc) */
+	OffsetNumber firstrightoff; /* first origpage item on rightpage */
+	bool		newitemonleft;	/* new item goes on left, or right? */
+} SplitPoint;
+
+typedef struct
+{
+	/* context data for _bt_recsplitloc */
+	Relation	rel;			/* index relation */
+	Page		origpage;		/* page undergoing split */
+	IndexTuple	newitem;		/* new item (cause of page split) */
+	Size		newitemsz;		/* size of newitem (includes line pointer) */
+	bool		is_leaf;		/* T if splitting a leaf page */
+	bool		is_rightmost;	/* T if splitting rightmost page on level */
+	OffsetNumber newitemoff;	/* where the new item is to be inserted */
+	int			leftspace;		/* space available for items on left page */
+	int			rightspace;		/* space available for items on right page */
+	int			olddataitemstotal;	/* space taken by old items */
+	Size		minfirstrightsz;	/* smallest firstright size */
+
+	/* candidate split point data */
+	int			maxsplits;		/* maximum number of splits */
+	int			nsplits;		/* current number of splits */
+	SplitPoint *splits;			/* all candidate split points for page */
+	int			interval;		/* current range of acceptable split points */
+} FindSplitData;
+
+static void _bt_recsplitloc(FindSplitData *state,
+							OffsetNumber firstrightoff, bool newitemonleft,
+							int olddataitemstoleft,
+							Size firstrightofforigpagetuplesz);
+static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult,
+								bool usemult);
+static int	_bt_splitcmp(const void *arg1, const void *arg2);
+static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
+								int leaffillfactor, bool *usemult);
+static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
+static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
+									 bool *newitemonleft, FindSplitStrat strategy);
+static int	_bt_defaultinterval(FindSplitData *state);
+static int	_bt_strategy(FindSplitData *state, SplitPoint *leftpage,
+						 SplitPoint *rightpage, FindSplitStrat *strategy);
+static void _bt_interval_edges(FindSplitData *state,
+							   SplitPoint **leftinterval, SplitPoint **rightinterval);
+static inline int _bt_split_penalty(FindSplitData *state, SplitPoint *split);
+static inline IndexTuple _bt_split_lastleft(FindSplitData *state,
+											SplitPoint *split);
+static inline IndexTuple _bt_split_firstright(FindSplitData *state,
+											  SplitPoint *split);
+
+
+/*
+ *	_bt_findsplitloc() -- find an appropriate place to split a page.
+ *
+ * The main goal here is to equalize the free space that will be on each
+ * split page, *after accounting for the inserted tuple*.  (If we fail to
+ * account for it, we might find ourselves with too little room on the page
+ * that it needs to go into!)
+ *
+ * If the page is the rightmost page on its level, we instead try to arrange
+ * to leave the left split page fillfactor% full.  In this way, when we are
+ * inserting successively increasing keys (consider sequences, timestamps,
+ * etc) we will end up with a tree whose pages are about fillfactor% full,
+ * instead of the 50% full result that we'd get without this special case.
+ * This is the same as nbtsort.c produces for a newly-created tree.  Note
+ * that leaf and nonleaf pages use different fillfactors.  Note also that
+ * there are a number of further special cases where fillfactor is not
+ * applied in the standard way.
+ *
+ * We are passed the intended insert position of the new tuple, expressed as
+ * the offsetnumber of the tuple it must go in front of (this could be
+ * maxoff+1 if the tuple is to go at the end).  The new tuple itself is also
+ * passed, since it's needed to give some weight to how effective suffix
+ * truncation will be.  The implementation picks the split point that
+ * maximizes the effectiveness of suffix truncation from a small list of
+ * alternative candidate split points that leave each side of the split with
+ * about the same share of free space.  Suffix truncation is secondary to
+ * equalizing free space, except in cases with large numbers of duplicates.
+ * Note that it is always assumed that caller goes on to perform truncation,
+ * even with pg_upgrade'd indexes where that isn't actually the case
+ * (!heapkeyspace indexes).  See nbtree/README for more information about
+ * suffix truncation.
+ *
+ * We return the index of the first existing tuple that should go on the
+ * righthand page (which is called firstrightoff), plus a boolean
+ * indicating whether the new tuple goes on the left or right page.  You
+ * can think of the returned state as a point _between_ two adjacent data
+ * items (lastleft and firstright data items) on an imaginary version of
+ * origpage that already includes newitem.  The bool is necessary to
+ * disambiguate the case where firstrightoff == newitemoff (i.e. it is
+ * sometimes needed to determine if the firstright tuple for the split is
+ * newitem rather than the tuple from origpage at offset firstrightoff).
+ */
+OffsetNumber
+_bt_findsplitloc(Relation rel,
+				 Page origpage,
+				 OffsetNumber newitemoff,
+				 Size newitemsz,
+				 IndexTuple newitem,
+				 bool *newitemonleft)
+{
+	BTPageOpaque opaque;
+	int			leftspace,
+				rightspace,
+				olddataitemstotal,
+				olddataitemstoleft,
+				perfectpenalty,
+				leaffillfactor;
+	FindSplitData state;
+	FindSplitStrat strategy;
+	ItemId		itemid;
+	OffsetNumber offnum,
+				maxoff,
+				firstrightoff;
+	double		fillfactormult;
+	bool		usemult;
+	SplitPoint	leftpage,
+				rightpage;
+
+	opaque = BTPageGetOpaque(origpage);
+	maxoff = PageGetMaxOffsetNumber(origpage);
+
+	/* Total free space available on a btree page, after fixed overhead */
+	leftspace = rightspace =
+		PageGetPageSize(origpage) - SizeOfPageHeaderData -
+		MAXALIGN(sizeof(BTPageOpaqueData));
+
+	/* The right page will have the same high key as the old page */
+	if (!P_RIGHTMOST(opaque))
+	{
+		itemid = PageGetItemId(origpage, P_HIKEY);
+		rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
+							 sizeof(ItemIdData));
+	}
+
+	/* Count up total space in data items before actually scanning 'em */
+	olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
+	leaffillfactor = BTGetFillFactor(rel);
+
+	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
+	newitemsz += sizeof(ItemIdData);
+	state.rel = rel;
+	state.origpage = origpage;
+	state.newitem = newitem;
+	state.newitemsz = newitemsz;
+	state.is_leaf = P_ISLEAF(opaque);
+	state.is_rightmost = P_RIGHTMOST(opaque);
+	state.leftspace = leftspace;
+	state.rightspace = rightspace;
+	state.olddataitemstotal = olddataitemstotal;
+	state.minfirstrightsz = SIZE_MAX;
+	state.newitemoff = newitemoff;
+
+	/* newitem cannot be a posting list item */
+	Assert(!BTreeTupleIsPosting(newitem));
+
+	/*
+	 * nsplits should never exceed maxoff because there will be at most as
+	 * many candidate split points as there are points _between_ tuples, once
+	 * you imagine that the new item is already on the original page (the
+	 * final number of splits may be slightly lower because not all points
+	 * between tuples will be legal).
+	 */
+	state.maxsplits = maxoff;
+	state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
+	state.nsplits = 0;
+
+	/*
+	 * Scan through the data items and calculate space usage for a split at
+	 * each possible position
+	 */
+	olddataitemstoleft = 0;
+
+	for (offnum = P_FIRSTDATAKEY(opaque);
+		 offnum <= maxoff;
+		 offnum = OffsetNumberNext(offnum))
+	{
+		Size		itemsz;
+
+		itemid = PageGetItemId(origpage, offnum);
+		itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
+
+		/*
+		 * When item offset number is not newitemoff, neither side of the
+		 * split can be newitem.  Record a split after the previous data item
+		 * from original page, but before the current data item from original
+		 * page. (_bt_recsplitloc() will reject the split when there are no
+		 * previous items, which we rely on.)
+		 */
+		if (offnum < newitemoff)
+			_bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
+		else if (offnum > newitemoff)
+			_bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
+		else
+		{
+			/*
+			 * Record a split after all "offnum < newitemoff" original page
+			 * data items, but before newitem
+			 */
+			_bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
+
+			/*
+			 * Record a split after newitem, but before data item from
+			 * original page at offset newitemoff/current offset
+			 */
+			_bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
+		}
+
+		olddataitemstoleft += itemsz;
+	}
+
+	/*
+	 * Record a split after all original page data items, but before newitem.
+	 * (Though only when it's possible that newitem will end up alone on new
+	 * right page.)
+	 */
+	Assert(olddataitemstoleft == olddataitemstotal);
+	if (newitemoff > maxoff)
+		_bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
+
+	/*
+	 * I believe it is not possible to fail to find a feasible split, but just
+	 * in case ...
+	 */
+	if (state.nsplits == 0)
+		elog(ERROR, "could not find a feasible split point for index \"%s\"",
+			 RelationGetRelationName(rel));
+
+	/*
+	 * Start search for a split point among list of legal split points.  Give
+	 * primary consideration to equalizing available free space in each half
+	 * of the split initially (start with default strategy), while applying
+	 * rightmost and split-after-new-item optimizations where appropriate.
+	 * Either of the two other fallback strategies may be required for cases
+	 * with a large number of duplicates around the original/space-optimal
+	 * split point.
+	 *
+	 * Default strategy gives some weight to suffix truncation in deciding a
+	 * split point on leaf pages.  It attempts to select a split point where a
+	 * distinguishing attribute appears earlier in the new high key for the
+	 * left side of the split, in order to maximize the number of trailing
+	 * attributes that can be truncated away.  Only candidate split points
+	 * that imply an acceptable balance of free space on each side are
+	 * considered.  See _bt_defaultinterval().
+	 */
+	if (!state.is_leaf)
+	{
+		/* fillfactormult only used on rightmost page */
+		usemult = state.is_rightmost;
+		fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
+	}
+	else if (state.is_rightmost)
+	{
+		/* Rightmost leaf page --  fillfactormult always used */
+		usemult = true;
+		fillfactormult = leaffillfactor / 100.0;
+	}
+	else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
+	{
+		/*
+		 * New item inserted at rightmost point among a localized grouping on
+		 * a leaf page -- apply "split after new item" optimization, either by
+		 * applying leaf fillfactor multiplier, or by choosing the exact split
+		 * point that leaves newitem as lastleft. (usemult is set for us.)
+		 */
+		if (usemult)
+		{
+			/* fillfactormult should be set based on leaf fillfactor */
+			fillfactormult = leaffillfactor / 100.0;
+		}
+		else
+		{
+			/* find precise split point after newitemoff */
+			for (int i = 0; i < state.nsplits; i++)
+			{
+				SplitPoint *split = state.splits + i;
+
+				if (split->newitemonleft &&
+					newitemoff == split->firstrightoff)
+				{
+					pfree(state.splits);
+					*newitemonleft = true;
+					return newitemoff;
+				}
+			}
+
+			/*
+			 * Cannot legally split after newitemoff; proceed with split
+			 * without using fillfactor multiplier.  This is defensive, and
+			 * should never be needed in practice.
+			 */
+			fillfactormult = 0.50;
+		}
+	}
+	else
+	{
+		/* Other leaf page.  50:50 page split. */
+		usemult = false;
+		/* fillfactormult not used, but be tidy */
+		fillfactormult = 0.50;
+	}
+
+	/*
+	 * Save leftmost and rightmost splits for page before original ordinal
+	 * sort order is lost by delta/fillfactormult sort
+	 */
+	leftpage = state.splits[0];
+	rightpage = state.splits[state.nsplits - 1];
+
+	/* Give split points a fillfactormult-wise delta, and sort on deltas */
+	_bt_deltasortsplits(&state, fillfactormult, usemult);
+
+	/* Determine split interval for default strategy */
+	state.interval = _bt_defaultinterval(&state);
+
+	/*
+	 * Determine if default strategy/split interval will produce a
+	 * sufficiently distinguishing split, or if we should change strategies.
+	 * Alternative strategies change the range of split points that are
+	 * considered acceptable (split interval), and possibly change
+	 * fillfactormult, in order to deal with pages with a large number of
+	 * duplicates gracefully.
+	 *
+	 * Pass low and high splits for the entire page (actually, they're for an
+	 * imaginary version of the page that includes newitem).  These are used
+	 * when the initial split interval encloses split points that are full of
+	 * duplicates, and we need to consider if it's even possible to avoid
+	 * appending a heap TID.
+	 */
+	perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
+
+	if (strategy == SPLIT_DEFAULT)
+	{
+		/*
+		 * Default strategy worked out (always works out with internal page).
+		 * Original split interval still stands.
+		 */
+	}
+
+	/*
+	 * Many duplicates strategy is used when a heap TID would otherwise be
+	 * appended, but the page isn't completely full of logical duplicates.
+	 *
+	 * The split interval is widened to include all legal candidate split
+	 * points.  There might be a few as two distinct values in the whole-page
+	 * split interval, though it's also possible that most of the values on
+	 * the page are unique.  The final split point will either be to the
+	 * immediate left or to the immediate right of the group of duplicate
+	 * tuples that enclose the first/delta-optimal split point (perfect
+	 * penalty was set so that the lowest delta split point that avoids
+	 * appending a heap TID will be chosen).  Maximizing the number of
+	 * attributes that can be truncated away is not a goal of the many
+	 * duplicates strategy.
+	 *
+	 * Single value strategy is used when it is impossible to avoid appending
+	 * a heap TID.  It arranges to leave the left page very full.  This
+	 * maximizes space utilization in cases where tuples with the same
+	 * attribute values span many pages.  Newly inserted duplicates will tend
+	 * to have higher heap TID values, so we'll end up splitting to the right
+	 * consistently.  (Single value strategy is harmless though not
+	 * particularly useful with !heapkeyspace indexes.)
+	 */
+	else if (strategy == SPLIT_MANY_DUPLICATES)
+	{
+		Assert(state.is_leaf);
+		/* Shouldn't try to truncate away extra user attributes */
+		Assert(perfectpenalty ==
+			   IndexRelationGetNumberOfKeyAttributes(state.rel));
+		/* No need to resort splits -- no change in fillfactormult/deltas */
+		state.interval = state.nsplits;
+	}
+	else if (strategy == SPLIT_SINGLE_VALUE)
+	{
+		Assert(state.is_leaf);
+		/* Split near the end of the page */
+		usemult = true;
+		fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
+		/* Resort split points with new delta */
+		_bt_deltasortsplits(&state, fillfactormult, usemult);
+		/* Appending a heap TID is unavoidable, so interval of 1 is fine */
+		state.interval = 1;
+	}
+
+	/*
+	 * Search among acceptable split points (using final split interval) for
+	 * the entry that has the lowest penalty, and is therefore expected to
+	 * maximize fan-out.  Sets *newitemonleft for us.
+	 */
+	firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
+									 strategy);
+	pfree(state.splits);
+
+	return firstrightoff;
+}
+
+/*
+ * Subroutine to record a particular point between two tuples (possibly the
+ * new item) on page (ie, combination of firstrightoff and newitemonleft
+ * settings) in *state for later analysis.  This is also a convenient point to
+ * check if the split is legal (if it isn't, it won't be recorded).
+ *
+ * firstrightoff is the offset of the first item on the original page that
+ * goes to the right page, and firstrightofforigpagetuplesz is the size of
+ * that tuple.  firstrightoff can be > max offset, which means that all the
+ * old items go to the left page and only the new item goes to the right page.
+ * We don't actually use firstrightofforigpagetuplesz in that case (actually,
+ * we don't use it for _any_ split where the firstright tuple happens to be
+ * newitem).
+ *
+ * olddataitemstoleft is the total size of all old items to the left of the
+ * split point that is recorded here when legal.  Should not include
+ * newitemsz, since that is handled here.
+ */
+static void
+_bt_recsplitloc(FindSplitData *state,
+				OffsetNumber firstrightoff,
+				bool newitemonleft,
+				int olddataitemstoleft,
+				Size firstrightofforigpagetuplesz)
+{
+	int16		leftfree,
+				rightfree;
+	Size		firstrightsz;
+	Size		postingsz = 0;
+	bool		newitemisfirstright;
+
+	/* Is the new item going to be split point's firstright tuple? */
+	newitemisfirstright = (firstrightoff == state->newitemoff &&
+						   !newitemonleft);
+
+	if (newitemisfirstright)
+		firstrightsz = state->newitemsz;
+	else
+	{
+		firstrightsz = firstrightofforigpagetuplesz;
+
+		/*
+		 * Calculate suffix truncation space saving when firstright tuple is a
+		 * posting list tuple, though only when the tuple is over 64 bytes
+		 * including line pointer overhead (arbitrary).  This avoids accessing
+		 * the tuple in cases where its posting list must be very small (if
+		 * tuple has one at all).
+		 *
+		 * Note: We don't do this in the case where firstright tuple is
+		 * newitem, since newitem cannot have a posting list.
+		 */
+		if (state->is_leaf && firstrightsz > 64)
+		{
+			ItemId		itemid;
+			IndexTuple	newhighkey;
+
+			itemid = PageGetItemId(state->origpage, firstrightoff);
+			newhighkey = (IndexTuple) PageGetItem(state->origpage, itemid);
+
+			if (BTreeTupleIsPosting(newhighkey))
+				postingsz = IndexTupleSize(newhighkey) -
+					BTreeTupleGetPostingOffset(newhighkey);
+		}
+	}
+
+	/* Account for all the old tuples */
+	leftfree = state->leftspace - olddataitemstoleft;
+	rightfree = state->rightspace -
+		(state->olddataitemstotal - olddataitemstoleft);
+
+	/*
+	 * The first item on the right page becomes the high key of the left page;
+	 * therefore it counts against left space as well as right space (we
+	 * cannot assume that suffix truncation will make it any smaller).  When
+	 * index has included attributes, then those attributes of left page high
+	 * key will be truncated leaving that page with slightly more free space.
+	 * However, that shouldn't affect our ability to find valid split
+	 * location, since we err in the direction of being pessimistic about free
+	 * space on the left half.  Besides, even when suffix truncation of
+	 * non-TID attributes occurs, the new high key often won't even be a
+	 * single MAXALIGN() quantum smaller than the firstright tuple it's based
+	 * on.
+	 *
+	 * If we are on the leaf level, assume that suffix truncation cannot avoid
+	 * adding a heap TID to the left half's new high key when splitting at the
+	 * leaf level.  In practice the new high key will often be smaller and
+	 * will rarely be larger, but conservatively assume the worst case.  We do
+	 * go to the trouble of subtracting away posting list overhead, though
+	 * only when it looks like it will make an appreciable difference.
+	 * (Posting lists are the only case where truncation will typically make
+	 * the final high key far smaller than firstright, so being a bit more
+	 * precise there noticeably improves the balance of free space.)
+	 */
+	if (state->is_leaf)
+		leftfree -= (int16) (firstrightsz +
+							 MAXALIGN(sizeof(ItemPointerData)) -
+							 postingsz);
+	else
+		leftfree -= (int16) firstrightsz;
+
+	/* account for the new item */
+	if (newitemonleft)
+		leftfree -= (int16) state->newitemsz;
+	else
+		rightfree -= (int16) state->newitemsz;
+
+	/*
+	 * If we are not on the leaf level, we will be able to discard the key
+	 * data from the first item that winds up on the right page.
+	 */
+	if (!state->is_leaf)
+		rightfree += (int16) firstrightsz -
+			(int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
+
+	/* Record split if legal */
+	if (leftfree >= 0 && rightfree >= 0)
+	{
+		Assert(state->nsplits < state->maxsplits);
+
+		/* Determine smallest firstright tuple size among legal splits */
+		state->minfirstrightsz = Min(state->minfirstrightsz, firstrightsz);
+
+		state->splits[state->nsplits].curdelta = 0;
+		state->splits[state->nsplits].leftfree = leftfree;
+		state->splits[state->nsplits].rightfree = rightfree;
+		state->splits[state->nsplits].firstrightoff = firstrightoff;
+		state->splits[state->nsplits].newitemonleft = newitemonleft;
+		state->nsplits++;
+	}
+}
+
+/*
+ * Subroutine to assign space deltas to materialized array of candidate split
+ * points based on current fillfactor, and to sort array using that fillfactor
+ */
+static void
+_bt_deltasortsplits(FindSplitData *state, double fillfactormult,
+					bool usemult)
+{
+	for (int i = 0; i < state->nsplits; i++)
+	{
+		SplitPoint *split = state->splits + i;
+		int16		delta;
+
+		if (usemult)
+			delta = fillfactormult * split->leftfree -
+				(1.0 - fillfactormult) * split->rightfree;
+		else
+			delta = split->leftfree - split->rightfree;
+
+		if (delta < 0)
+			delta = -delta;
+
+		/* Save delta */
+		split->curdelta = delta;
+	}
+
+	qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp);
+}
+
+/*
+ * qsort-style comparator used by _bt_deltasortsplits()
+ */
+static int
+_bt_splitcmp(const void *arg1, const void *arg2)
+{
+	SplitPoint *split1 = (SplitPoint *) arg1;
+	SplitPoint *split2 = (SplitPoint *) arg2;
+
+	if (split1->curdelta > split2->curdelta)
+		return 1;
+	if (split1->curdelta < split2->curdelta)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Subroutine to determine whether or not a non-rightmost leaf page should be
+ * split immediately after the would-be original page offset for the
+ * new/incoming tuple (or should have leaf fillfactor applied when new item is
+ * to the right on original page).  This is appropriate when there is a
+ * pattern of localized monotonically increasing insertions into a composite
+ * index, where leading attribute values form local groupings, and we
+ * anticipate further insertions of the same/current grouping (new item's
+ * grouping) in the near future.  This can be thought of as a variation on
+ * applying leaf fillfactor during rightmost leaf page splits, since cases
+ * that benefit will converge on packing leaf pages leaffillfactor% full over
+ * time.
+ *
+ * We may leave extra free space remaining on the rightmost page of a "most
+ * significant column" grouping of tuples if that grouping never ends up
+ * having future insertions that use the free space.  That effect is
+ * self-limiting; a future grouping that becomes the "nearest on the right"
+ * grouping of the affected grouping usually puts the extra free space to good
+ * use.
+ *
+ * Caller uses optimization when routine returns true, though the exact action
+ * taken by caller varies.  Caller uses original leaf page fillfactor in
+ * standard way rather than using the new item offset directly when *usemult
+ * was also set to true here.  Otherwise, caller applies optimization by
+ * locating the legal split point that makes the new tuple the lastleft tuple
+ * for the split.
+ */
+static bool
+_bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
+					int leaffillfactor, bool *usemult)
+{
+	int16		nkeyatts;
+	ItemId		itemid;
+	IndexTuple	tup;
+	int			keepnatts;
+
+	Assert(state->is_leaf && !state->is_rightmost);
+
+	nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
+
+	/* Single key indexes not considered here */
+	if (nkeyatts == 1)
+		return false;
+
+	/* Ascending insertion pattern never inferred when new item is first */
+	if (state->newitemoff == P_FIRSTKEY)
+		return false;
+
+	/*
+	 * Only apply optimization on pages with equisized tuples, since ordinal
+	 * keys are likely to be fixed-width.  Testing if the new tuple is
+	 * variable width directly might also work, but that fails to apply the
+	 * optimization to indexes with a numeric_ops attribute.
+	 *
+	 * Conclude that page has equisized tuples when the new item is the same
+	 * width as the smallest item observed during pass over page, and other
+	 * non-pivot tuples must be the same width as well.  (Note that the
+	 * possibly-truncated existing high key isn't counted in
+	 * olddataitemstotal, and must be subtracted from maxoff.)
+	 */
+	if (state->newitemsz != state->minfirstrightsz)
+		return false;
+	if (state->newitemsz * (maxoff - 1) != state->olddataitemstotal)
+		return false;
+
+	/*
+	 * Avoid applying optimization when tuples are wider than a tuple
+	 * consisting of two non-NULL int8/int64 attributes (or four non-NULL
+	 * int4/int32 attributes)
+	 */
+	if (state->newitemsz >
+		MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * 2) +
+		sizeof(ItemIdData))
+		return false;
+
+	/*
+	 * At least the first attribute's value must be equal to the corresponding
+	 * value in previous tuple to apply optimization.  New item cannot be a
+	 * duplicate, either.
+	 *
+	 * Handle case where new item is to the right of all items on the existing
+	 * page.  This is suggestive of monotonically increasing insertions in
+	 * itself, so the "heap TID adjacency" test is not applied here.
+	 */
+	if (state->newitemoff > maxoff)
+	{
+		itemid = PageGetItemId(state->origpage, maxoff);
+		tup = (IndexTuple) PageGetItem(state->origpage, itemid);
+		keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
+
+		if (keepnatts > 1 && keepnatts <= nkeyatts)
+		{
+			*usemult = true;
+			return true;
+		}
+
+		return false;
+	}
+
+	/*
+	 * "Low cardinality leading column, high cardinality suffix column"
+	 * indexes with a random insertion pattern (e.g., an index with a boolean
+	 * column, such as an index on '(book_is_in_print, book_isbn)') present us
+	 * with a risk of consistently misapplying the optimization.  We're
+	 * willing to accept very occasional misapplication of the optimization,
+	 * provided the cases where we get it wrong are rare and self-limiting.
+	 *
+	 * Heap TID adjacency strongly suggests that the item just to the left was
+	 * inserted very recently, which limits overapplication of the
+	 * optimization.  Besides, all inappropriate cases triggered here will
+	 * still split in the middle of the page on average.
+	 */
+	itemid = PageGetItemId(state->origpage, OffsetNumberPrev(state->newitemoff));
+	tup = (IndexTuple) PageGetItem(state->origpage, itemid);
+	/* Do cheaper test first */
+	if (BTreeTupleIsPosting(tup) ||
+		!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
+		return false;
+	/* Check same conditions as rightmost item case, too */
+	keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
+
+	if (keepnatts > 1 && keepnatts <= nkeyatts)
+	{
+		double		interp = (double) state->newitemoff / ((double) maxoff + 1);
+		double		leaffillfactormult = (double) leaffillfactor / 100.0;
+
+		/*
+		 * Don't allow caller to split after a new item when it will result in
+		 * a split point to the right of the point that a leaf fillfactor
+		 * split would use -- have caller apply leaf fillfactor instead
+		 */
+		*usemult = interp > leaffillfactormult;
+
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Subroutine for determining if two heap TIDS are "adjacent".
+ *
+ * Adjacent means that the high TID is very likely to have been inserted into
+ * heap relation immediately after the low TID, probably during the current
+ * transaction.
+ */
+static bool
+_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
+{
+	BlockNumber lowblk,
+				highblk;
+
+	lowblk = ItemPointerGetBlockNumber(lowhtid);
+	highblk = ItemPointerGetBlockNumber(highhtid);
+
+	/* Make optimistic assumption of adjacency when heap blocks match */
+	if (lowblk == highblk)
+		return true;
+
+	/* When heap block one up, second offset should be FirstOffsetNumber */
+	if (lowblk + 1 == highblk &&
+		ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber)
+		return true;
+
+	return false;
+}
+
+/*
+ * Subroutine to find the "best" split point among candidate split points.
+ * The best split point is the split point with the lowest penalty among split
+ * points that fall within current/final split interval.  Penalty is an
+ * abstract score, with a definition that varies depending on whether we're
+ * splitting a leaf page or an internal page.  See _bt_split_penalty() for
+ * details.
+ *
+ * "perfectpenalty" is assumed to be the lowest possible penalty among
+ * candidate split points.  This allows us to return early without wasting
+ * cycles on calculating the first differing attribute for all candidate
+ * splits when that clearly cannot improve our choice (or when we only want a
+ * minimally distinguishing split point, and don't want to make the split any
+ * more unbalanced than is necessary).
+ *
+ * We return the index of the first existing tuple that should go on the right
+ * page, plus a boolean indicating if new item is on left of split point.
+ */
+static OffsetNumber
+_bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
+				 bool *newitemonleft, FindSplitStrat strategy)
+{
+	int			bestpenalty,
+				lowsplit;
+	int			highsplit = Min(state->interval, state->nsplits);
+	SplitPoint *final;
+
+	bestpenalty = INT_MAX;
+	lowsplit = 0;
+	for (int i = lowsplit; i < highsplit; i++)
+	{
+		int			penalty;
+
+		penalty = _bt_split_penalty(state, state->splits + i);
+
+		if (penalty < bestpenalty)
+		{
+			bestpenalty = penalty;
+			lowsplit = i;
+		}
+
+		if (penalty <= perfectpenalty)
+			break;
+	}
+
+	final = &state->splits[lowsplit];
+
+	/*
+	 * There is a risk that the "many duplicates" strategy will repeatedly do
+	 * the wrong thing when there are monotonically decreasing insertions to
+	 * the right of a large group of duplicates.   Repeated splits could leave
+	 * a succession of right half pages with free space that can never be
+	 * used.  This must be avoided.
+	 *
+	 * Consider the example of the leftmost page in a single integer attribute
+	 * NULLS FIRST index which is almost filled with NULLs.  Monotonically
+	 * decreasing integer insertions might cause the same leftmost page to
+	 * split repeatedly at the same point.  Each split derives its new high
+	 * key from the lowest current value to the immediate right of the large
+	 * group of NULLs, which will always be higher than all future integer
+	 * insertions, directing all future integer insertions to the same
+	 * leftmost page.
+	 */
+	if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost &&
+		!final->newitemonleft && final->firstrightoff >= state->newitemoff &&
+		final->firstrightoff < state->newitemoff + 9)
+	{
+		/*
+		 * Avoid the problem by performing a 50:50 split when the new item is
+		 * just to the right of the would-be "many duplicates" split point.
+		 * (Note that the test used for an insert that is "just to the right"
+		 * of the split point is conservative.)
+		 */
+		final = &state->splits[0];
+	}
+
+	*newitemonleft = final->newitemonleft;
+	return final->firstrightoff;
+}
+
+#define LEAF_SPLIT_DISTANCE			0.050
+#define INTERNAL_SPLIT_DISTANCE		0.075
+
+/*
+ * Return a split interval to use for the default strategy.  This is a limit
+ * on the number of candidate split points to give further consideration to.
+ * Only a fraction of all candidate splits points (those located at the start
+ * of the now-sorted splits array) fall within the split interval.  Split
+ * interval is applied within _bt_bestsplitloc().
+ *
+ * Split interval represents an acceptable range of split points -- those that
+ * have leftfree and rightfree values that are acceptably balanced.  The final
+ * split point chosen is the split point with the lowest "penalty" among split
+ * points in this split interval (unless we change our entire strategy, in
+ * which case the interval also changes -- see _bt_strategy()).
+ *
+ * The "Prefix B-Trees" paper calls split interval sigma l for leaf splits,
+ * and sigma b for internal ("branch") splits.  It's hard to provide a
+ * theoretical justification for the size of the split interval, though it's
+ * clear that a small split interval can make tuples on level L+1 much smaller
+ * on average, without noticeably affecting space utilization on level L.
+ * (Note that the way that we calculate split interval might need to change if
+ * suffix truncation is taught to truncate tuples "within" the last
+ * attribute/datum for data types like text, which is more or less how it is
+ * assumed to work in the paper.)
+ */
+static int
+_bt_defaultinterval(FindSplitData *state)
+{
+	SplitPoint *spaceoptimal;
+	int16		tolerance,
+				lowleftfree,
+				lowrightfree,
+				highleftfree,
+				highrightfree;
+
+	/*
+	 * Determine leftfree and rightfree values that are higher and lower than
+	 * we're willing to tolerate.  Note that the final split interval will be
+	 * about 10% of nsplits in the common case where all non-pivot tuples
+	 * (data items) from a leaf page are uniformly sized.  We're a bit more
+	 * aggressive when splitting internal pages.
+	 */
+	if (state->is_leaf)
+		tolerance = state->olddataitemstotal * LEAF_SPLIT_DISTANCE;
+	else
+		tolerance = state->olddataitemstotal * INTERNAL_SPLIT_DISTANCE;
+
+	/* First candidate split point is the most evenly balanced */
+	spaceoptimal = state->splits;
+	lowleftfree = spaceoptimal->leftfree - tolerance;
+	lowrightfree = spaceoptimal->rightfree - tolerance;
+	highleftfree = spaceoptimal->leftfree + tolerance;
+	highrightfree = spaceoptimal->rightfree + tolerance;
+
+	/*
+	 * Iterate through split points, starting from the split immediately after
+	 * 'spaceoptimal'.  Find the first split point that divides free space so
+	 * unevenly that including it in the split interval would be unacceptable.
+	 */
+	for (int i = 1; i < state->nsplits; i++)
+	{
+		SplitPoint *split = state->splits + i;
+
+		/* Cannot use curdelta here, since its value is often weighted */
+		if (split->leftfree < lowleftfree || split->rightfree < lowrightfree ||
+			split->leftfree > highleftfree || split->rightfree > highrightfree)
+			return i;
+	}
+
+	return state->nsplits;
+}
+
+/*
+ * Subroutine to decide whether split should use default strategy/initial
+ * split interval, or whether it should finish splitting the page using
+ * alternative strategies (this is only possible with leaf pages).
+ *
+ * Caller uses alternative strategy (or sticks with default strategy) based
+ * on how *strategy is set here.  Return value is "perfect penalty", which is
+ * passed to _bt_bestsplitloc() as a final constraint on how far caller is
+ * willing to go to avoid appending a heap TID when using the many duplicates
+ * strategy (it also saves _bt_bestsplitloc() useless cycles).
+ */
+static int
+_bt_strategy(FindSplitData *state, SplitPoint *leftpage,
+			 SplitPoint *rightpage, FindSplitStrat *strategy)
+{
+	IndexTuple	leftmost,
+				rightmost;
+	SplitPoint *leftinterval,
+			   *rightinterval;
+	int			perfectpenalty;
+	int			indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
+
+	/* Assume that alternative strategy won't be used for now */
+	*strategy = SPLIT_DEFAULT;
+
+	/*
+	 * Use smallest observed firstright item size for entire page (actually,
+	 * entire imaginary version of page that includes newitem) as perfect
+	 * penalty on internal pages.  This can save cycles in the common case
+	 * where most or all splits (not just splits within interval) have
+	 * firstright tuples that are the same size.
+	 */
+	if (!state->is_leaf)
+		return state->minfirstrightsz;
+
+	/*
+	 * Use leftmost and rightmost tuples from leftmost and rightmost splits in
+	 * current split interval
+	 */
+	_bt_interval_edges(state, &leftinterval, &rightinterval);
+	leftmost = _bt_split_lastleft(state, leftinterval);
+	rightmost = _bt_split_firstright(state, rightinterval);
+
+	/*
+	 * If initial split interval can produce a split point that will at least
+	 * avoid appending a heap TID in new high key, we're done.  Finish split
+	 * with default strategy and initial split interval.
+	 */
+	perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
+	if (perfectpenalty <= indnkeyatts)
+		return perfectpenalty;
+
+	/*
+	 * Work out how caller should finish split when even their "perfect"
+	 * penalty for initial/default split interval indicates that the interval
+	 * does not contain even a single split that avoids appending a heap TID.
+	 *
+	 * Use the leftmost split's lastleft tuple and the rightmost split's
+	 * firstright tuple to assess every possible split.
+	 */
+	leftmost = _bt_split_lastleft(state, leftpage);
+	rightmost = _bt_split_firstright(state, rightpage);
+
+	/*
+	 * If page (including new item) has many duplicates but is not entirely
+	 * full of duplicates, a many duplicates strategy split will be performed.
+	 * If page is entirely full of duplicates, a single value strategy split
+	 * will be performed.
+	 */
+	perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
+	if (perfectpenalty <= indnkeyatts)
+	{
+		*strategy = SPLIT_MANY_DUPLICATES;
+
+		/*
+		 * Many duplicates strategy should split at either side the group of
+		 * duplicates that enclose the delta-optimal split point.  Return
+		 * indnkeyatts rather than the true perfect penalty to make that
+		 * happen.  (If perfectpenalty was returned here then low cardinality
+		 * composite indexes could have continual unbalanced splits.)
+		 *
+		 * Note that caller won't go through with a many duplicates split in
+		 * rare cases where it looks like there are ever-decreasing insertions
+		 * to the immediate right of the split point.  This must happen just
+		 * before a final decision is made, within _bt_bestsplitloc().
+		 */
+		return indnkeyatts;
+	}
+
+	/*
+	 * Single value strategy is only appropriate with ever-increasing heap
+	 * TIDs; otherwise, original default strategy split should proceed to
+	 * avoid pathological performance.  Use page high key to infer if this is
+	 * the rightmost page among pages that store the same duplicate value.
+	 * This should not prevent insertions of heap TIDs that are slightly out
+	 * of order from using single value strategy, since that's expected with
+	 * concurrent inserters of the same duplicate value.
+	 */
+	else if (state->is_rightmost)
+		*strategy = SPLIT_SINGLE_VALUE;
+	else
+	{
+		ItemId		itemid;
+		IndexTuple	hikey;
+
+		itemid = PageGetItemId(state->origpage, P_HIKEY);
+		hikey = (IndexTuple) PageGetItem(state->origpage, itemid);
+		perfectpenalty = _bt_keep_natts_fast(state->rel, hikey,
+											 state->newitem);
+		if (perfectpenalty <= indnkeyatts)
+			*strategy = SPLIT_SINGLE_VALUE;
+		else
+		{
+			/*
+			 * Have caller finish split using default strategy, since page
+			 * does not appear to be the rightmost page for duplicates of the
+			 * value the page is filled with
+			 */
+		}
+	}
+
+	return perfectpenalty;
+}
+
+/*
+ * Subroutine to locate leftmost and rightmost splits for current/default
+ * split interval.  Note that it will be the same split iff there is only one
+ * split in interval.
+ */
+static void
+_bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval,
+				   SplitPoint **rightinterval)
+{
+	int			highsplit = Min(state->interval, state->nsplits);
+	SplitPoint *deltaoptimal;
+
+	deltaoptimal = state->splits;
+	*leftinterval = NULL;
+	*rightinterval = NULL;
+
+	/*
+	 * Delta is an absolute distance to optimal split point, so both the
+	 * leftmost and rightmost split point will usually be at the end of the
+	 * array
+	 */
+	for (int i = highsplit - 1; i >= 0; i--)
+	{
+		SplitPoint *distant = state->splits + i;
+
+		if (distant->firstrightoff < deltaoptimal->firstrightoff)
+		{
+			if (*leftinterval == NULL)
+				*leftinterval = distant;
+		}
+		else if (distant->firstrightoff > deltaoptimal->firstrightoff)
+		{
+			if (*rightinterval == NULL)
+				*rightinterval = distant;
+		}
+		else if (!distant->newitemonleft && deltaoptimal->newitemonleft)
+		{
+			/*
+			 * "incoming tuple will become firstright" (distant) is to the
+			 * left of "incoming tuple will become lastleft" (delta-optimal)
+			 */
+			Assert(distant->firstrightoff == state->newitemoff);
+			if (*leftinterval == NULL)
+				*leftinterval = distant;
+		}
+		else if (distant->newitemonleft && !deltaoptimal->newitemonleft)
+		{
+			/*
+			 * "incoming tuple will become lastleft" (distant) is to the right
+			 * of "incoming tuple will become firstright" (delta-optimal)
+			 */
+			Assert(distant->firstrightoff == state->newitemoff);
+			if (*rightinterval == NULL)
+				*rightinterval = distant;
+		}
+		else
+		{
+			/* There was only one or two splits in initial split interval */
+			Assert(distant == deltaoptimal);
+			if (*leftinterval == NULL)
+				*leftinterval = distant;
+			if (*rightinterval == NULL)
+				*rightinterval = distant;
+		}
+
+		if (*leftinterval && *rightinterval)
+			return;
+	}
+
+	Assert(false);
+}
+
+/*
+ * Subroutine to find penalty for caller's candidate split point.
+ *
+ * On leaf pages, penalty is the attribute number that distinguishes each side
+ * of a split.  It's the last attribute that needs to be included in new high
+ * key for left page.  It can be greater than the number of key attributes in
+ * cases where a heap TID will need to be appended during truncation.
+ *
+ * On internal pages, penalty is simply the size of the firstright tuple for
+ * the split (including line pointer overhead).  This tuple will become the
+ * new high key for the left page.
+ */
+static inline int
+_bt_split_penalty(FindSplitData *state, SplitPoint *split)
+{
+	IndexTuple	lastleft;
+	IndexTuple	firstright;
+
+	if (!state->is_leaf)
+	{
+		ItemId		itemid;
+
+		if (!split->newitemonleft &&
+			split->firstrightoff == state->newitemoff)
+			return state->newitemsz;
+
+		itemid = PageGetItemId(state->origpage, split->firstrightoff);
+
+		return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
+	}
+
+	lastleft = _bt_split_lastleft(state, split);
+	firstright = _bt_split_firstright(state, split);
+
+	return _bt_keep_natts_fast(state->rel, lastleft, firstright);
+}
+
+/*
+ * Subroutine to get a lastleft IndexTuple for a split point
+ */
+static inline IndexTuple
+_bt_split_lastleft(FindSplitData *state, SplitPoint *split)
+{
+	ItemId		itemid;
+
+	if (split->newitemonleft && split->firstrightoff == state->newitemoff)
+		return state->newitem;
+
+	itemid = PageGetItemId(state->origpage,
+						   OffsetNumberPrev(split->firstrightoff));
+	return (IndexTuple) PageGetItem(state->origpage, itemid);
+}
+
+/*
+ * Subroutine to get a firstright IndexTuple for a split point
+ */
+static inline IndexTuple
+_bt_split_firstright(FindSplitData *state, SplitPoint *split)
+{
+	ItemId		itemid;
+
+	if (!split->newitemonleft && split->firstrightoff == state->newitemoff)
+		return state->newitem;
+
+	itemid = PageGetItemId(state->origpage, split->firstrightoff);
+	return (IndexTuple) PageGetItem(state->origpage, itemid);
+}