Diffstat (limited to 'src/backend/access/gist/gist.c')
 -rw-r--r--  src/backend/access/gist/gist.c  1734
 1 file changed, 1734 insertions(+), 0 deletions(-)
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
new file mode 100644
index 0000000..9b367b5
--- /dev/null
+++ b/src/backend/access/gist/gist.c
@@ -0,0 +1,1734 @@
+/*-------------------------------------------------------------------------
+ *
+ * gist.c
+ * interface routines for the postgres GiST index access method.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gist.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/gist_private.h"
+#include "access/gistscan.h"
+#include "access/xloginsert.h"
+#include "catalog/pg_collation.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "nodes/execnodes.h"
+#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "utils/builtins.h"
+#include "utils/index_selfuncs.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+/* non-export function prototypes */
+static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate);
+static bool gistinserttuple(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum);
+static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate,
+ IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
+ Buffer leftchild, Buffer rightchild,
+ bool unlockbuf, bool unlockleftchild);
+static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate, List *splitinfo, bool unlockbuf);
+static void gistprunepage(Relation rel, Page page, Buffer buffer,
+ Relation heapRel);
+
+
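+/*
+ * Prepend a freshly zeroed SplitedPageLayout to the list 'd'. gistSplit()
+ * below uses this to build its chain of split-page descriptors, right half
+ * first, so the finished list ends up in left-to-right page order.
+ */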
+#define ROTATEDIST(d) do { \
+ SplitedPageLayout *tmp=(SplitedPageLayout*)palloc0(sizeof(SplitedPageLayout)); \
+ tmp->block.blkno = InvalidBlockNumber; \
+ tmp->buffer = InvalidBuffer; \
+ tmp->next = (d); \
+ (d)=tmp; \
+} while(0)
+
+
+/*
+ * GiST handler function: return IndexAmRoutine with access method parameters
+ * and callbacks.
+ */
+Datum
+gisthandler(PG_FUNCTION_ARGS)
+{
+ IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
+
+ amroutine->amstrategies = 0;
+ amroutine->amsupport = GISTNProcs;
+ amroutine->amoptsprocnum = GIST_OPTIONS_PROC;
+ amroutine->amcanorder = false;
+ amroutine->amcanorderbyop = true;
+ amroutine->amcanbackward = false;
+ amroutine->amcanunique = false;
+ amroutine->amcanmulticol = true;
+ amroutine->amoptionalkey = true;
+ amroutine->amsearcharray = false;
+ amroutine->amsearchnulls = true;
+ amroutine->amstorage = true;
+ amroutine->amclusterable = true;
+ amroutine->ampredlocks = true;
+ amroutine->amcanparallel = false;
+ amroutine->amcaninclude = true;
+ amroutine->amusemaintenanceworkmem = false;
+ amroutine->amparallelvacuumoptions =
+ VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
+ amroutine->amkeytype = InvalidOid;
+
+ amroutine->ambuild = gistbuild;
+ amroutine->ambuildempty = gistbuildempty;
+ amroutine->aminsert = gistinsert;
+ amroutine->ambulkdelete = gistbulkdelete;
+ amroutine->amvacuumcleanup = gistvacuumcleanup;
+ amroutine->amcanreturn = gistcanreturn;
+ amroutine->amcostestimate = gistcostestimate;
+ amroutine->amoptions = gistoptions;
+ amroutine->amproperty = gistproperty;
+ amroutine->ambuildphasename = NULL;
+ amroutine->amvalidate = gistvalidate;
+ amroutine->amadjustmembers = gistadjustmembers;
+ amroutine->ambeginscan = gistbeginscan;
+ amroutine->amrescan = gistrescan;
+ amroutine->amgettuple = gistgettuple;
+ amroutine->amgetbitmap = gistgetbitmap;
+ amroutine->amendscan = gistendscan;
+ amroutine->ammarkpos = NULL;
+ amroutine->amrestrpos = NULL;
+ amroutine->amestimateparallelscan = NULL;
+ amroutine->aminitparallelscan = NULL;
+ amroutine->amparallelrescan = NULL;
+
+ PG_RETURN_POINTER(amroutine);
+}
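+
+/*
+ * The routine above is what the pg_am catalog points at. A sketch of how a
+ * handler like this is wired up at the SQL level (the built-in "gist" AM
+ * already exists, so these statements are illustrative only; the table and
+ * column names are hypothetical):
+ *
+ *   CREATE ACCESS METHOD gist TYPE INDEX HANDLER gisthandler;
+ *   CREATE INDEX ON points_tbl USING gist (pt);
+ */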
+
+/*
+ * Create and return a temporary memory context for use by GiST. We
+ * _always_ invoke user-provided methods in a temporary memory
+ * context, so that memory leaks in those functions cannot cause
+ * problems. Also, we use some additional temporary contexts in the
+ * GiST code itself, to avoid the need to do some awkward manual
+ * memory management.
+ */
+MemoryContext
+createTempGistContext(void)
+{
+ return AllocSetContextCreate(CurrentMemoryContext,
+ "GiST temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+}
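+
+/*
+ * Typical usage, as in gistinsert() below: switch into the temporary
+ * context before doing work that may call user-provided support functions,
+ * then switch back and reset the context when done:
+ *
+ *   oldCxt = MemoryContextSwitchTo(giststate->tempCxt);
+ *   ...work that may invoke opclass support functions...
+ *   MemoryContextSwitchTo(oldCxt);
+ *   MemoryContextReset(giststate->tempCxt);
+ */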
+
+/*
+ * gistbuildempty() -- build an empty gist index in the initialization fork
+ */
+void
+gistbuildempty(Relation index)
+{
+ Buffer buffer;
+
+ /* Initialize the root page */
+ buffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /* Initialize the page and XLOG it */
+ START_CRIT_SECTION();
+ GISTInitBuffer(buffer, F_LEAF);
+ MarkBufferDirty(buffer);
+ log_newpage_buffer(buffer, true);
+ END_CRIT_SECTION();
+
+ /* Unlock and release the buffer */
+ UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * gistinsert -- wrapper for GiST tuple insertion.
+ *
+ * This is the public interface routine for tuple insertion in GiSTs.
+ * It doesn't do any work; just locks the relation and passes the buck.
+ */
+bool
+gistinsert(Relation r, Datum *values, bool *isnull,
+ ItemPointer ht_ctid, Relation heapRel,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ IndexInfo *indexInfo)
+{
+ GISTSTATE *giststate = (GISTSTATE *) indexInfo->ii_AmCache;
+ IndexTuple itup;
+ MemoryContext oldCxt;
+
+ /* Initialize GISTSTATE cache if first call in this statement */
+ if (giststate == NULL)
+ {
+ oldCxt = MemoryContextSwitchTo(indexInfo->ii_Context);
+ giststate = initGISTstate(r);
+ giststate->tempCxt = createTempGistContext();
+ indexInfo->ii_AmCache = (void *) giststate;
+ MemoryContextSwitchTo(oldCxt);
+ }
+
+ oldCxt = MemoryContextSwitchTo(giststate->tempCxt);
+
+ itup = gistFormTuple(giststate, r,
+ values, isnull, true /* size is currently bogus */ );
+ itup->t_tid = *ht_ctid;
+
+ gistdoinsert(r, itup, 0, giststate, heapRel, false);
+
+ /* cleanup */
+ MemoryContextSwitchTo(oldCxt);
+ MemoryContextReset(giststate->tempCxt);
+
+ return false;
+}
+
+
+/*
+ * Place tuples from 'itup' into 'buffer'. If 'oldoffnum' is valid, the tuple
+ * at that offset is atomically removed along with inserting the new tuples.
+ * This is used to replace a tuple with a new one.
+ *
+ * If 'leftchildbuf' is valid, we're inserting the downlink for the page
+ * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'.
+ * The F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and its NSN is set.
+ *
+ * If 'markfollowright' is true and the page is split, the left child is
+ * marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered
+ * index build, however, there is no concurrent access and the page splitting
+ * is done in a slightly simpler fashion, and false is passed.
+ *
+ * If there is not enough room on the page, it is split. All the split
+ * pages are kept pinned and locked and returned in *splitinfo; the caller
+ * is responsible for inserting the downlinks for them. However, if
+ * 'buffer' is the root page and it needs to be split, gistplacetopage()
+ * performs the split as one atomic operation, and *splitinfo is set to NIL.
+ * In that case, we continue to hold the root page locked, and the child
+ * pages are released; note that new tuple(s) are *not* on the root page
+ * but in one of the new child pages.
+ *
+ * If 'newblkno' is not NULL, returns the block number of the page the first
+ * new/updated tuple was inserted to. Usually it's the given page, but could
+ * be its right sibling if the page was split.
+ *
+ * Returns 'true' if the page was split, 'false' otherwise.
+ */
+bool
+gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
+ Buffer buffer,
+ IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
+ BlockNumber *newblkno,
+ Buffer leftchildbuf,
+ List **splitinfo,
+ bool markfollowright,
+ Relation heapRel,
+ bool is_build)
+{
+ BlockNumber blkno = BufferGetBlockNumber(buffer);
+ Page page = BufferGetPage(buffer);
+ bool is_leaf = (GistPageIsLeaf(page)) ? true : false;
+ XLogRecPtr recptr;
+ int i;
+ bool is_split;
+
+ /*
+ * Refuse to modify a page that's incompletely split. This should not
+ * happen because we finish any incomplete splits while we walk down the
+ * tree. However, it's remotely possible that another concurrent inserter
+ * splits a parent page, and errors out before completing the split. We
+ * will just throw an error in that case, and leave any split we had in
+ * progress unfinished too. The next insert that comes along will clean up
+ * the mess.
+ */
+ if (GistFollowRight(page))
+ elog(ERROR, "concurrent GiST page split was incomplete");
+
+ /* should never try to insert to a deleted page */
+ Assert(!GistPageIsDeleted(page));
+
+ *splitinfo = NIL;
+
+ /*
+ * If this is an update (oldoffnum is valid), remove the old key: this
+ * node's key has been modified, either because a child split occurred
+ * or because we needed to adjust our key for an insert in a child node.
+ * Therefore, remove the old version of this node's key.
+ *
+ * For WAL replay, in the non-split case we handle this by setting up a
+ * one-element todelete array; in the split case, it's handled implicitly
+ * because the tuple vector passed to gistSplit won't include this tuple.
+ */
+ is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
+
+ /*
+ * If the leaf page is full, first try to delete dead tuples, and then
+ * check again whether there is enough space.
+ */
+ if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
+ {
+ gistprunepage(rel, page, buffer, heapRel);
+ is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
+ }
+
+ if (is_split)
+ {
+ /* no space for insertion */
+ IndexTuple *itvec;
+ int tlen;
+ SplitedPageLayout *dist = NULL,
+ *ptr;
+ BlockNumber oldrlink = InvalidBlockNumber;
+ GistNSN oldnsn = 0;
+ SplitedPageLayout rootpg;
+ bool is_rootsplit;
+ int npage;
+
+ is_rootsplit = (blkno == GIST_ROOT_BLKNO);
+
+ /*
+ * Form index tuples vector to split. If we're replacing an old tuple,
+ * remove the old version from the vector.
+ */
+ itvec = gistextractpage(page, &tlen);
+ if (OffsetNumberIsValid(oldoffnum))
+ {
+ /* on an inner page, remove the old version of this node's key */
+ int pos = oldoffnum - FirstOffsetNumber;
+
+ tlen--;
+ if (pos != tlen)
+ memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos));
+ }
+ itvec = gistjoinvector(itvec, &tlen, itup, ntup);
+ dist = gistSplit(rel, page, itvec, tlen, giststate);
+
+ /*
+ * Check that split didn't produce too many pages.
+ */
+ npage = 0;
+ for (ptr = dist; ptr; ptr = ptr->next)
+ npage++;
+ /* in a root split, we'll add one more page to the list below */
+ if (is_rootsplit)
+ npage++;
+ if (npage > GIST_MAX_SPLIT_PAGES)
+ elog(ERROR, "GiST page split into too many halves (%d, maximum %d)",
+ npage, GIST_MAX_SPLIT_PAGES);
+
+ /*
+ * Set up pages to work with. Allocate new buffers for all but the
+ * leftmost page. The original page becomes the new leftmost page, and
+ * is just replaced with the new contents.
+ *
+ * For a root-split, allocate new buffers for all child pages; the
+ * original page is overwritten with a new root page containing
+ * downlinks to the new child pages.
+ */
+ ptr = dist;
+ if (!is_rootsplit)
+ {
+ /* save old rightlink and NSN */
+ oldrlink = GistPageGetOpaque(page)->rightlink;
+ oldnsn = GistPageGetNSN(page);
+
+ dist->buffer = buffer;
+ dist->block.blkno = BufferGetBlockNumber(buffer);
+ dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer));
+
+ /* clean all flags except F_LEAF */
+ GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0;
+
+ ptr = ptr->next;
+ }
+ for (; ptr; ptr = ptr->next)
+ {
+ /* Allocate new page */
+ ptr->buffer = gistNewBuffer(rel);
+ GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
+ ptr->page = BufferGetPage(ptr->buffer);
+ ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
+ PredicateLockPageSplit(rel,
+ BufferGetBlockNumber(buffer),
+ BufferGetBlockNumber(ptr->buffer));
+ }
+
+ /*
+ * Now that we know which blocks the new pages go to, set up downlink
+ * tuples to point to them.
+ */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno);
+ GistTupleSetValid(ptr->itup);
+ }
+
+ /*
+ * If this is a root split, we construct the new root page with the
+ * downlinks here directly, instead of requiring the caller to insert
+ * them. Add the new root page to the list along with the child pages.
+ */
+ if (is_rootsplit)
+ {
+ IndexTuple *downlinks;
+ int ndownlinks = 0;
+ int i;
+
+ rootpg.buffer = buffer;
+ rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer));
+ GistPageGetOpaque(rootpg.page)->flags = 0;
+
+ /* Prepare a vector of all the downlinks */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ ndownlinks++;
+ downlinks = palloc(sizeof(IndexTuple) * ndownlinks);
+ for (i = 0, ptr = dist; ptr; ptr = ptr->next)
+ downlinks[i++] = ptr->itup;
+
+ rootpg.block.blkno = GIST_ROOT_BLKNO;
+ rootpg.block.num = ndownlinks;
+ rootpg.list = gistfillitupvec(downlinks, ndownlinks,
+ &(rootpg.lenlist));
+ rootpg.itup = NULL;
+
+ rootpg.next = dist;
+ dist = &rootpg;
+ }
+ else
+ {
+ /* Prepare split-info to be returned to caller */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo));
+
+ si->buf = ptr->buffer;
+ si->downlink = ptr->itup;
+ *splitinfo = lappend(*splitinfo, si);
+ }
+ }
+
+ /*
+ * Fill all pages. Each page is either a freshly allocated empty page,
+ * or a temporary copy of the old page.
+ */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ char *data = (char *) (ptr->list);
+
+ for (i = 0; i < ptr->block.num; i++)
+ {
+ IndexTuple thistup = (IndexTuple) data;
+
+ if (PageAddItem(ptr->page, (Item) data, IndexTupleSize(thistup),
+ i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to index page in \"%s\"",
+ RelationGetRelationName(rel));
+
+ /*
+ * If this is the first inserted/updated tuple, let the caller
+ * know which page it landed on.
+ */
+ if (newblkno && ItemPointerEquals(&thistup->t_tid, &(*itup)->t_tid))
+ *newblkno = ptr->block.blkno;
+
+ data += IndexTupleSize(thistup);
+ }
+
+ /* Set up rightlinks */
+ if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO)
+ GistPageGetOpaque(ptr->page)->rightlink =
+ ptr->next->block.blkno;
+ else
+ GistPageGetOpaque(ptr->page)->rightlink = oldrlink;
+
+ /*
+ * Mark all but the right-most page with the follow-right flag. It
+ * will be cleared as soon as the downlink is inserted into the
+ * parent, but this ensures that if we error out before that, the
+ * index is still consistent. (In buffering build mode, any error
+ * will abort the index build anyway, so this is not needed.)
+ */
+ if (ptr->next && !is_rootsplit && markfollowright)
+ GistMarkFollowRight(ptr->page);
+ else
+ GistClearFollowRight(ptr->page);
+
+ /*
+ * Copy the NSN of the original page to all pages. The
+ * F_FOLLOW_RIGHT flags ensure that scans will follow the
+ * rightlinks until the downlinks are inserted.
+ */
+ GistPageSetNSN(ptr->page, oldnsn);
+ }
+
+ /*
+ * gistXLogSplit() needs to WAL log a lot of pages, prepare WAL
+ * insertion for that. NB: The number of pages and data segments
+ * specified here must match the calculations in gistXLogSplit()!
+ */
+ if (!is_build && RelationNeedsWAL(rel))
+ XLogEnsureRecordSpace(npage, 1 + npage * 2);
+
+ START_CRIT_SECTION();
+
+ /*
+ * Must mark buffers dirty before XLogInsert, even though we'll still
+ * be changing their opaque fields below.
+ */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ MarkBufferDirty(ptr->buffer);
+ if (BufferIsValid(leftchildbuf))
+ MarkBufferDirty(leftchildbuf);
+
+ /*
+ * The first page in the chain was a temporary working copy meant to
+ * replace the old page. Copy it over the old page.
+ */
+ PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer));
+ dist->page = BufferGetPage(dist->buffer);
+
+ /*
+ * Write the WAL record.
+ *
+ * If we're building a new index, however, we don't WAL-log changes
+ * yet. The LSN-NSN interlock between parent and child requires that
+ * LSNs never move backwards, so set the LSNs to a value that's
+ * smaller than any real or fake unlogged LSN that might be generated
+ * later. (There can't be any concurrent scans during index build, so
+ * we don't need to be able to detect concurrent splits yet.)
+ */
+ if (is_build)
+ recptr = GistBuildLSN;
+ else
+ {
+ if (RelationNeedsWAL(rel))
+ recptr = gistXLogSplit(is_leaf,
+ dist, oldrlink, oldnsn, leftchildbuf,
+ markfollowright);
+ else
+ recptr = gistGetFakeLSN(rel);
+ }
+
+ for (ptr = dist; ptr; ptr = ptr->next)
+ PageSetLSN(ptr->page, recptr);
+
+ /*
+ * Return the new child buffers to the caller.
+ *
+ * If this was a root split, we've already inserted the downlink
+ * pointers, in the form of a new root page. Therefore we can release
+ * all the new buffers, and keep just the root page locked.
+ */
+ if (is_rootsplit)
+ {
+ for (ptr = dist->next; ptr; ptr = ptr->next)
+ UnlockReleaseBuffer(ptr->buffer);
+ }
+ }
+ else
+ {
+ /*
+ * Enough space. We always get here if ntup==0.
+ */
+ START_CRIT_SECTION();
+
+ /*
+ * Delete old tuple if any, then insert new tuple(s) if any. If
+ * possible, use the fast path of PageIndexTupleOverwrite.
+ */
+ if (OffsetNumberIsValid(oldoffnum))
+ {
+ if (ntup == 1)
+ {
+ /* One-for-one replacement, so use PageIndexTupleOverwrite */
+ if (!PageIndexTupleOverwrite(page, oldoffnum, (Item) *itup,
+ IndexTupleSize(*itup)))
+ elog(ERROR, "failed to add item to index page in \"%s\"",
+ RelationGetRelationName(rel));
+ }
+ else
+ {
+ /* Delete old, then append new tuple(s) to page */
+ PageIndexTupleDelete(page, oldoffnum);
+ gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
+ }
+ }
+ else
+ {
+ /* Just append new tuples at the end of the page */
+ gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
+ }
+
+ MarkBufferDirty(buffer);
+
+ if (BufferIsValid(leftchildbuf))
+ MarkBufferDirty(leftchildbuf);
+
+ if (is_build)
+ recptr = GistBuildLSN;
+ else
+ {
+ if (RelationNeedsWAL(rel))
+ {
+ OffsetNumber ndeloffs = 0,
+ deloffs[1];
+
+ if (OffsetNumberIsValid(oldoffnum))
+ {
+ deloffs[0] = oldoffnum;
+ ndeloffs = 1;
+ }
+
+ recptr = gistXLogUpdate(buffer,
+ deloffs, ndeloffs, itup, ntup,
+ leftchildbuf);
+ }
+ else
+ recptr = gistGetFakeLSN(rel);
+ }
+ PageSetLSN(page, recptr);
+
+ if (newblkno)
+ *newblkno = blkno;
+ }
+
+ /*
+ * If we inserted the downlink for a child page, set NSN and clear
+ * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to
+ * follow the rightlink if and only if they looked at the parent page
+ * before we inserted the downlink.
+ *
+ * Note that we do this *after* writing the WAL record. That means that
+ * the possible full page image in the WAL record does not include these
+ * changes, and they must be replayed even if the page is restored from
+ * the full page image. There's a chicken-and-egg problem: if we updated
+ * the child pages first, we wouldn't know the recptr of the WAL record
+ * we're about to write.
+ */
+ if (BufferIsValid(leftchildbuf))
+ {
+ Page leftpg = BufferGetPage(leftchildbuf);
+
+ GistPageSetNSN(leftpg, recptr);
+ GistClearFollowRight(leftpg);
+
+ PageSetLSN(leftpg, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ return is_split;
+}
+
+/*
+ * Workhorse routine for doing insertion into a GiST index. Note that
+ * this routine assumes it is invoked in a short-lived memory context,
+ * so it does not bother releasing palloc'd allocations.
+ */
+void
+gistdoinsert(Relation r, IndexTuple itup, Size freespace,
+ GISTSTATE *giststate, Relation heapRel, bool is_build)
+{
+ ItemId iid;
+ IndexTuple idxtuple;
+ GISTInsertStack firststack;
+ GISTInsertStack *stack;
+ GISTInsertState state;
+ bool xlocked = false;
+
+ memset(&state, 0, sizeof(GISTInsertState));
+ state.freespace = freespace;
+ state.r = r;
+ state.heapRel = heapRel;
+ state.is_build = is_build;
+
+ /* Start from the root */
+ firststack.blkno = GIST_ROOT_BLKNO;
+ firststack.lsn = 0;
+ firststack.retry_from_parent = false;
+ firststack.parent = NULL;
+ firststack.downlinkoffnum = InvalidOffsetNumber;
+ state.stack = stack = &firststack;
+
+ /*
+ * Walk down along the path of smallest penalty, updating the parent
+ * pointers with the key we're inserting as we go. If we crash in the
+ * middle, the tree is consistent, although the possible parent updates
+ * were a waste.
+ */
+ for (;;)
+ {
+ /*
+ * If we split an internal page while descending the tree, we have to
+ * retry at the parent. (Normally, the LSN-NSN interlock below would
+ * also catch this and cause us to retry. But LSNs are not updated
+ * during index build.)
+ */
+ while (stack->retry_from_parent)
+ {
+ if (xlocked)
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ xlocked = false;
+ ReleaseBuffer(stack->buffer);
+ state.stack = stack = stack->parent;
+ }
+
+ if (XLogRecPtrIsInvalid(stack->lsn))
+ stack->buffer = ReadBuffer(state.r, stack->blkno);
+
+ /*
+ * Be optimistic and grab shared lock first. Swap it for an exclusive
+ * lock later if we need to update the page.
+ */
+ if (!xlocked)
+ {
+ LockBuffer(stack->buffer, GIST_SHARE);
+ gistcheckpage(state.r, stack->buffer);
+ }
+
+ stack->page = (Page) BufferGetPage(stack->buffer);
+ stack->lsn = xlocked ?
+ PageGetLSN(stack->page) : BufferGetLSNAtomic(stack->buffer);
+ Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn));
+
+ /*
+ * If this page was split but the downlink was never inserted into the
+ * parent because the inserting backend crashed before doing that, fix
+ * that now.
+ */
+ if (GistFollowRight(stack->page))
+ {
+ if (!xlocked)
+ {
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ LockBuffer(stack->buffer, GIST_EXCLUSIVE);
+ xlocked = true;
+ /* someone might've completed the split when we unlocked */
+ if (!GistFollowRight(stack->page))
+ continue;
+ }
+ gistfixsplit(&state, giststate);
+
+ UnlockReleaseBuffer(stack->buffer);
+ xlocked = false;
+ state.stack = stack = stack->parent;
+ continue;
+ }
+
+ if ((stack->blkno != GIST_ROOT_BLKNO &&
+ stack->parent->lsn < GistPageGetNSN(stack->page)) ||
+ GistPageIsDeleted(stack->page))
+ {
+ /*
+ * Concurrent split or page deletion detected. There's no
+ * guarantee that the downlink for this page is consistent with
+ * the tuple we're inserting anymore, so go back to parent and
+ * rechoose the best child.
+ */
+ UnlockReleaseBuffer(stack->buffer);
+ xlocked = false;
+ state.stack = stack = stack->parent;
+ continue;
+ }
+
+ if (!GistPageIsLeaf(stack->page))
+ {
+ /*
+ * This is an internal page so continue to walk down the tree.
+ * Find the child node that has the minimum insertion penalty.
+ */
+ BlockNumber childblkno;
+ IndexTuple newtup;
+ GISTInsertStack *item;
+ OffsetNumber downlinkoffnum;
+
+ downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
+ iid = PageGetItemId(stack->page, downlinkoffnum);
+ idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
+ childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+ /*
+ * Check that it's not a leftover invalid tuple from pre-9.1
+ */
+ if (GistTupleIsInvalid(idxtuple))
+ ereport(ERROR,
+ (errmsg("index \"%s\" contains an inner tuple marked as invalid",
+ RelationGetRelationName(r)),
+ errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
+ errhint("Please REINDEX it.")));
+
+ /*
+ * Check that the key representing the target child node is
+ * consistent with the key we're inserting. Update it if it's not.
+ */
+ newtup = gistgetadjusted(state.r, idxtuple, itup, giststate);
+ if (newtup)
+ {
+ /*
+ * Swap shared lock for an exclusive one. Beware, the page may
+ * change while we unlock/lock the page...
+ */
+ if (!xlocked)
+ {
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ LockBuffer(stack->buffer, GIST_EXCLUSIVE);
+ xlocked = true;
+ stack->page = (Page) BufferGetPage(stack->buffer);
+
+ if (PageGetLSN(stack->page) != stack->lsn)
+ {
+ /* the page was changed while we unlocked it, retry */
+ continue;
+ }
+ }
+
+ /*
+ * Update the tuple.
+ *
+ * We still hold the lock after gistinserttuple(), but it
+ * might have to split the page to make the updated tuple fit.
+ * In that case the updated tuple might migrate to the other
+ * half of the split, so we have to go back to the parent and
+ * descend back to the half that's a better fit for the new
+ * tuple.
+ */
+ if (gistinserttuple(&state, stack, giststate, newtup,
+ downlinkoffnum))
+ {
+ /*
+ * If this was a root split, the root page continues to be
+ * the parent and the updated tuple went to one of the
+ * child pages, so we just need to retry from the root
+ * page.
+ */
+ if (stack->blkno != GIST_ROOT_BLKNO)
+ {
+ UnlockReleaseBuffer(stack->buffer);
+ xlocked = false;
+ state.stack = stack = stack->parent;
+ }
+ continue;
+ }
+ }
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ xlocked = false;
+
+ /* descend to the chosen child */
+ item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
+ item->blkno = childblkno;
+ item->parent = stack;
+ item->downlinkoffnum = downlinkoffnum;
+ state.stack = stack = item;
+ }
+ else
+ {
+ /*
+ * Leaf page. Insert the new key. We've already updated all the
+ * parents on the way down, but we might have to split the page if
+ * it doesn't fit. gistinserttuple() will take care of that.
+ */
+
+ /*
+ * Swap shared lock for an exclusive one. Be careful, the page may
+ * change while we unlock/lock the page...
+ */
+ if (!xlocked)
+ {
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ LockBuffer(stack->buffer, GIST_EXCLUSIVE);
+ xlocked = true;
+ stack->page = (Page) BufferGetPage(stack->buffer);
+ stack->lsn = PageGetLSN(stack->page);
+
+ if (stack->blkno == GIST_ROOT_BLKNO)
+ {
+ /*
+ * The only page that can change from leaf to internal is the
+ * root page, so for the root we must recheck that here.
+ */
+ if (!GistPageIsLeaf(stack->page))
+ {
+ /*
+ * Very rare situation: while we were unlocked, the one-page
+ * (root-only) index grew, and the root became an internal page.
+ */
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ xlocked = false;
+ continue;
+ }
+
+ /*
+ * We don't need to check for a root split here, because telling
+ * leaf from internal is enough to recognize a split of the root.
+ */
+ }
+ else if ((GistFollowRight(stack->page) ||
+ stack->parent->lsn < GistPageGetNSN(stack->page)) ||
+ GistPageIsDeleted(stack->page))
+ {
+ /*
+ * The page was split or deleted while we momentarily
+ * unlocked the page. Go back to parent.
+ */
+ UnlockReleaseBuffer(stack->buffer);
+ xlocked = false;
+ state.stack = stack = stack->parent;
+ continue;
+ }
+ }
+
+ /* now state.stack's page, buffer, and blkno point to the leaf page */
+
+ gistinserttuple(&state, stack, giststate, itup,
+ InvalidOffsetNumber);
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+
+ /* Release any pins we might still hold before exiting */
+ for (; stack; stack = stack->parent)
+ ReleaseBuffer(stack->buffer);
+ break;
+ }
+ }
+}
+
+/*
+ * Traverse the tree to find the path from the root page to the specified
+ * "child" block.
+ *
+ * Returns a new insertion stack, starting from the parent of "child", up
+ * to the root. *downlinkoffnum is set to the offset of the downlink in the
+ * direct parent of child.
+ *
+ * To prevent deadlocks, this should lock only one page at a time.
+ */
+static GISTInsertStack *
+gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum)
+{
+ Page page;
+ Buffer buffer;
+ OffsetNumber i,
+ maxoff;
+ ItemId iid;
+ IndexTuple idxtuple;
+ List *fifo;
+ GISTInsertStack *top,
+ *ptr;
+ BlockNumber blkno;
+
+ top = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
+ top->blkno = GIST_ROOT_BLKNO;
+ top->downlinkoffnum = InvalidOffsetNumber;
+
+ fifo = list_make1(top);
+ while (fifo != NIL)
+ {
+ /* Get next page to visit */
+ top = linitial(fifo);
+ fifo = list_delete_first(fifo);
+
+ buffer = ReadBuffer(r, top->blkno);
+ LockBuffer(buffer, GIST_SHARE);
+ gistcheckpage(r, buffer);
+ page = (Page) BufferGetPage(buffer);
+
+ if (GistPageIsLeaf(page))
+ {
+ /*
+ * Because we scan the index top-down, all the rest of the pages
+ * in the queue must be leaf pages as well.
+ */
+ UnlockReleaseBuffer(buffer);
+ break;
+ }
+
+ /* currently, internal pages are never deleted */
+ Assert(!GistPageIsDeleted(page));
+
+ top->lsn = BufferGetLSNAtomic(buffer);
+
+ /*
+ * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a
+ * downlink. This should not normally happen.
+ */
+ if (GistFollowRight(page))
+ elog(ERROR, "concurrent GiST page split was incomplete");
+
+ if (top->parent && top->parent->lsn < GistPageGetNSN(page) &&
+ GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ )
+ {
+ /*
+ * Page was split while we looked elsewhere. We didn't see the
+ * downlink to the right page when we scanned the parent, so add
+ * it to the queue now.
+ *
+ * Put the right page at the head of the queue, so that we visit it
+ * next. That's important, because if this is the lowest internal
+ * level, just above leaves, we might already have queued up some
+ * leaf pages, and we assume that there can't be any non-leaf
+ * pages behind leaf pages.
+ */
+ ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
+ ptr->blkno = GistPageGetOpaque(page)->rightlink;
+ ptr->downlinkoffnum = InvalidOffsetNumber;
+ ptr->parent = top->parent;
+
+ fifo = lcons(ptr, fifo);
+ }
+
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ iid = PageGetItemId(page, i);
+ idxtuple = (IndexTuple) PageGetItem(page, iid);
+ blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+ if (blkno == child)
+ {
+ /* Found it! */
+ UnlockReleaseBuffer(buffer);
+ *downlinkoffnum = i;
+ return top;
+ }
+ else
+ {
+ /* Append this child to the list of pages to visit later */
+ ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
+ ptr->blkno = blkno;
+ ptr->downlinkoffnum = i;
+ ptr->parent = top;
+
+ fifo = lappend(fifo, ptr);
+ }
+ }
+
+ UnlockReleaseBuffer(buffer);
+ }
+
+ elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u",
+ RelationGetRelationName(r), child);
+ return NULL; /* keep compiler quiet */
+}
+
+/*
+ * Updates the stack so that child->parent is the correct parent of the
+ * child. child->parent must be exclusively locked on entry, and will
+ * remain so at exit, but it might not be the same page anymore.
+ */
+static void
+gistFindCorrectParent(Relation r, GISTInsertStack *child, bool is_build)
+{
+ GISTInsertStack *parent = child->parent;
+ ItemId iid;
+ IndexTuple idxtuple;
+ OffsetNumber maxoff;
+ GISTInsertStack *ptr;
+
+ gistcheckpage(r, parent->buffer);
+ parent->page = (Page) BufferGetPage(parent->buffer);
+ maxoff = PageGetMaxOffsetNumber(parent->page);
+
+ /* Check if the downlink is still where it was before */
+ if (child->downlinkoffnum != InvalidOffsetNumber && child->downlinkoffnum <= maxoff)
+ {
+ iid = PageGetItemId(parent->page, child->downlinkoffnum);
+ idxtuple = (IndexTuple) PageGetItem(parent->page, iid);
+ if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
+ return; /* still there */
+ }
+
+ /*
+ * The page has changed since we looked. During normal operation, every
+ * update of a page changes its LSN, so the LSN we memorized should have
+ * changed too. During index build, however, we don't WAL-log the changes
+ * until we have built the index, so the LSN doesn't change. There is no
+ * concurrent activity during index build, but we might have changed the
+ * parent ourselves.
+ */
+ Assert(parent->lsn != PageGetLSN(parent->page) || is_build);
+
+ /*
+ * Scan the page to re-find the downlink. If the page was split, it might
+ * have moved to a different page, so follow the right links until we find
+ * it.
+ */
+ while (true)
+ {
+ OffsetNumber i;
+
+ maxoff = PageGetMaxOffsetNumber(parent->page);
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ iid = PageGetItemId(parent->page, i);
+ idxtuple = (IndexTuple) PageGetItem(parent->page, iid);
+ if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
+ {
+ /* Found it! */
+ child->downlinkoffnum = i;
+ return;
+ }
+ }
+
+ parent->blkno = GistPageGetOpaque(parent->page)->rightlink;
+ parent->downlinkoffnum = InvalidOffsetNumber;
+ UnlockReleaseBuffer(parent->buffer);
+ if (parent->blkno == InvalidBlockNumber)
+ {
+ /*
+ * End of chain and we still didn't find the parent. This is a very
+ * rare situation in which the root was split.
+ */
+ break;
+ }
+ parent->buffer = ReadBuffer(r, parent->blkno);
+ LockBuffer(parent->buffer, GIST_EXCLUSIVE);
+ gistcheckpage(r, parent->buffer);
+ parent->page = (Page) BufferGetPage(parent->buffer);
+ }
+
+ /*
+ * Awful: we need to search the whole tree to re-find the parent. But
+ * before that, we must release all the old parent buffers.
+ */
+
+ ptr = child->parent->parent; /* child->parent already released above */
+ while (ptr)
+ {
+ ReleaseBuffer(ptr->buffer);
+ ptr = ptr->parent;
+ }
+
+ /* ok, find new path */
+ ptr = parent = gistFindPath(r, child->blkno, &child->downlinkoffnum);
+
+ /* read all buffers as expected by caller */
+ /* note we don't lock them or gistcheckpage them here! */
+ while (ptr)
+ {
+ ptr->buffer = ReadBuffer(r, ptr->blkno);
+ ptr->page = (Page) BufferGetPage(ptr->buffer);
+ ptr = ptr->parent;
+ }
+
+ /* install new chain of parents to stack */
+ child->parent = parent;
+
+ /* make recursive call to normal processing */
+ LockBuffer(child->parent->buffer, GIST_EXCLUSIVE);
+ gistFindCorrectParent(r, child, is_build);
+}
+
+/*
+ * Form a downlink pointer for the page in 'buf'.
+ */
+static IndexTuple
+gistformdownlink(Relation rel, Buffer buf, GISTSTATE *giststate,
+ GISTInsertStack *stack, bool is_build)
+{
+ Page page = BufferGetPage(buf);
+ OffsetNumber maxoff;
+ OffsetNumber offset;
+ IndexTuple downlink = NULL;
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset))
+ {
+ IndexTuple ituple = (IndexTuple)
+ PageGetItem(page, PageGetItemId(page, offset));
+
+ if (downlink == NULL)
+ downlink = CopyIndexTuple(ituple);
+ else
+ {
+ IndexTuple newdownlink;
+
+ newdownlink = gistgetadjusted(rel, downlink, ituple,
+ giststate);
+ if (newdownlink)
+ downlink = newdownlink;
+ }
+ }
+
+ /*
+ * If the page is completely empty, we can't form a meaningful downlink
+ * for it. But we have to insert a downlink for the page. Any key will do,
+ * as long as it's consistent with the downlink of the parent page, so
+ * that we can legally insert it into the parent. A minimal one that
+ * matches as few scans as possible would be best, to keep scans from
+ * doing useless work, but we don't know how to construct that. So we
+ * just use the downlink of the original page that was split - that's as
+ * far from optimal as it can get, but it will do.
+ */
+ if (!downlink)
+ {
+ ItemId iid;
+
+ LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE);
+ gistFindCorrectParent(rel, stack, is_build);
+ iid = PageGetItemId(stack->parent->page, stack->downlinkoffnum);
+ downlink = (IndexTuple) PageGetItem(stack->parent->page, iid);
+ downlink = CopyIndexTuple(downlink);
+ LockBuffer(stack->parent->buffer, GIST_UNLOCK);
+ }
+
+ ItemPointerSetBlockNumber(&(downlink->t_tid), BufferGetBlockNumber(buf));
+ GistTupleSetValid(downlink);
+
+ return downlink;
+}
+
+
+/*
+ * Complete the incomplete split of state->stack->page.
+ */
+static void
+gistfixsplit(GISTInsertState *state, GISTSTATE *giststate)
+{
+ GISTInsertStack *stack = state->stack;
+ Buffer buf;
+ Page page;
+ List *splitinfo = NIL;
+
+ ereport(LOG,
+ (errmsg("fixing incomplete split in index \"%s\", block %u",
+ RelationGetRelationName(state->r), stack->blkno)));
+
+ Assert(GistFollowRight(stack->page));
+ Assert(OffsetNumberIsValid(stack->downlinkoffnum));
+
+ buf = stack->buffer;
+
+ /*
+ * Read the chain of split pages, following the rightlinks. Construct a
+ * downlink tuple for each page.
+ */
+ for (;;)
+ {
+ GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo));
+ IndexTuple downlink;
+
+ page = BufferGetPage(buf);
+
+ /* Form the new downlink tuples to insert to parent */
+ downlink = gistformdownlink(state->r, buf, giststate, stack, state->is_build);
+
+ si->buf = buf;
+ si->downlink = downlink;
+
+ splitinfo = lappend(splitinfo, si);
+
+ if (GistFollowRight(page))
+ {
+ /* lock next page */
+ buf = ReadBuffer(state->r, GistPageGetOpaque(page)->rightlink);
+ LockBuffer(buf, GIST_EXCLUSIVE);
+ }
+ else
+ break;
+ }
+
+ /* Insert the downlinks */
+ gistfinishsplit(state, stack, giststate, splitinfo, false);
+}
+
+/*
+ * Insert or replace a tuple in stack->buffer. If 'oldoffnum' is valid, the
+ * tuple at 'oldoffnum' is replaced, otherwise the tuple is inserted as new.
+ * 'stack' represents the path from the root to the page being updated.
+ *
+ * The caller must hold an exclusive lock on stack->buffer. The lock is still
+ * held on return, but the page might not contain the inserted tuple if the
+ * page was split. The function returns true if the page was split, false
+ * otherwise.
+ */
+static bool
+gistinserttuple(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum)
+{
+ return gistinserttuples(state, stack, giststate, &tuple, 1, oldoffnum,
+ InvalidBuffer, InvalidBuffer, false, false);
+}
+
+/* ----------------
+ * An extended workhorse version of gistinserttuple(). This version allows
+ * inserting multiple tuples, or replacing a single tuple with multiple tuples.
+ * This is used to recursively update the downlinks in the parent when a page
+ * is split.
+ *
+ * If leftchild and rightchild are valid, we're inserting/replacing the
+ * downlink for rightchild, and leftchild is its left sibling. We clear the
+ * F_FOLLOW_RIGHT flag and update NSN on leftchild, atomically with the
+ * insertion of the downlink.
+ *
+ * To avoid holding locks for longer than necessary, when recursing up the
+ * tree to update the parents, the locking is a bit peculiar here. On entry,
+ * the caller must hold an exclusive lock on stack->buffer, as well as
+ * leftchild and rightchild if given. On return:
+ *
+ * - Lock on stack->buffer is released, if 'unlockbuf' is true. The page is
+ * always kept pinned, however.
+ * - Lock on 'leftchild' is released, if 'unlockleftchild' is true. The page
+ * is kept pinned.
+ * - Lock and pin on 'rightchild' are always released.
+ *
+ * Returns 'true' if the page had to be split. Note that if the page was
+ * split, the inserted/updated tuples might've been inserted to a right
+ * sibling of stack->buffer instead of stack->buffer itself.
+ */
+static bool
+gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate,
+ IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
+ Buffer leftchild, Buffer rightchild,
+ bool unlockbuf, bool unlockleftchild)
+{
+ List *splitinfo;
+ bool is_split;
+
+ /*
+ * Check for any rw conflicts (in serializable isolation level) just
+ * before we intend to modify the page
+ */
+ CheckForSerializableConflictIn(state->r, NULL, BufferGetBlockNumber(stack->buffer));
+
+ /* Insert the tuple(s) to the page, splitting the page if necessary */
+ is_split = gistplacetopage(state->r, state->freespace, giststate,
+ stack->buffer,
+ tuples, ntup,
+ oldoffnum, NULL,
+ leftchild,
+ &splitinfo,
+ true,
+ state->heapRel,
+ state->is_build);
+
+ /*
+ * Before recursing up in case the page was split, release locks on the
+ * child pages. We don't need to keep them locked when updating the
+ * parent.
+ */
+ if (BufferIsValid(rightchild))
+ UnlockReleaseBuffer(rightchild);
+ if (BufferIsValid(leftchild) && unlockleftchild)
+ LockBuffer(leftchild, GIST_UNLOCK);
+
+ /*
+ * If we had to split, insert/update the downlinks in the parent. If the
+ * caller requested us to release the lock on stack->buffer, tell
+ * gistfinishsplit() to do that as soon as it's safe to do so. If we
+ * didn't have to split, release it ourselves.
+ */
+ if (splitinfo)
+ gistfinishsplit(state, stack, giststate, splitinfo, unlockbuf);
+ else if (unlockbuf)
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+
+ return is_split;
+}
+
+/*
+ * Finish an incomplete split by inserting/updating the downlinks in parent
+ * page. 'splitinfo' contains all the child pages involved in the split,
+ * from left-to-right.
+ *
+ * On entry, the caller must hold a lock on stack->buffer and all the child
+ * pages in 'splitinfo'. If 'unlockbuf' is true, the lock on stack->buffer is
+ * released on return. The child pages are always unlocked and unpinned.
+ */
+static void
+gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate, List *splitinfo, bool unlockbuf)
+{
+ GISTPageSplitInfo *right;
+ GISTPageSplitInfo *left;
+ IndexTuple tuples[2];
+
+ /* A split always contains at least two halves */
+ Assert(list_length(splitinfo) >= 2);
+
+ /*
+ * We need to insert downlinks for each new page, and update the downlink
+ * for the original (leftmost) page in the split. Begin at the rightmost
+ * page, inserting one downlink at a time until there are only two pages
+ * left. Finally, insert the downlink for the last new page and update the
+ * downlink for the original page as one operation.
+ */
+ LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE);
+
+ /*
+ * Insert downlinks for the siblings from right to left, until there are
+ * only two siblings left.
+ */
+ for (int pos = list_length(splitinfo) - 1; pos > 1; pos--)
+ {
+ right = (GISTPageSplitInfo *) list_nth(splitinfo, pos);
+ left = (GISTPageSplitInfo *) list_nth(splitinfo, pos - 1);
+
+ gistFindCorrectParent(state->r, stack, state->is_build);
+ if (gistinserttuples(state, stack->parent, giststate,
+ &right->downlink, 1,
+ InvalidOffsetNumber,
+ left->buf, right->buf, false, false))
+ {
+ /*
+ * If the parent page was split, the existing downlink might have
+ * moved.
+ */
+ stack->downlinkoffnum = InvalidOffsetNumber;
+ }
+ /* gistinserttuples() released the lock on right->buf. */
+ }
+
+ right = (GISTPageSplitInfo *) lsecond(splitinfo);
+ left = (GISTPageSplitInfo *) linitial(splitinfo);
+
+ /*
+ * Finally insert downlink for the remaining right page and update the
+ * downlink for the original page to not contain the tuples that were
+ * moved to the new pages.
+ */
+ tuples[0] = left->downlink;
+ tuples[1] = right->downlink;
+ gistFindCorrectParent(state->r, stack, state->is_build);
+ (void) gistinserttuples(state, stack->parent, giststate,
+ tuples, 2,
+ stack->downlinkoffnum,
+ left->buf, right->buf,
+ true, /* Unlock parent */
+ unlockbuf /* Unlock stack->buffer if caller
+ * wants that */
+ );
+
+ /*
+ * The downlink might have moved when we updated it, even if the page
+ * wasn't split, because gistinserttuples() implements updating the old
+ * tuple by removing it and re-inserting it!
+ */
+ stack->downlinkoffnum = InvalidOffsetNumber;
+
+ Assert(left->buf == stack->buffer);
+
+ /*
+ * If we split the page because we had to adjust the downlink on an
+ * internal page, while descending the tree for inserting a new tuple,
+ * then this might no longer be the correct page for the new tuple. The
+ * downlink to this page might not cover the new tuple anymore, it might
+ * need to go to the newly-created right sibling instead. Tell the caller
+ * to walk back up the stack, to re-check at the parent which page to
+ * insert to.
+ *
+ * Normally, the LSN-NSN interlock during the tree descent would also
+ * detect that a concurrent split happened (by ourselves), and cause us to
+ * retry at the parent. But that mechanism doesn't work during index
+ * build, because we don't do WAL-logging, and don't update LSNs, during
+ * index build.
+ */
+ stack->retry_from_parent = true;
+}
+
+/*
+ * gistSplit -- split a page in the tree and fill the struct used for
+ * XLOG and the actual buffer writes. The function is recursive, i.e.
+ * it will keep splitting pages until the keys fit on every page.
+ */
+SplitedPageLayout *
+gistSplit(Relation r,
+ Page page,
+ IndexTuple *itup, /* contains compressed entry */
+ int len,
+ GISTSTATE *giststate)
+{
+ IndexTuple *lvectup,
+ *rvectup;
+ GistSplitVector v;
+ int i;
+ SplitedPageLayout *res = NULL;
+
+ /* this should never recurse very deeply, but better safe than sorry */
+ check_stack_depth();
+
+ /* there's no point in splitting an empty page */
+ Assert(len > 0);
+
+ /*
+ * If a single tuple doesn't fit on a page, no amount of splitting will
+ * help.
+ */
+ if (len == 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+ IndexTupleSize(itup[0]), GiSTPageSize,
+ RelationGetRelationName(r))));
+
+ memset(v.spl_lisnull, true,
+ sizeof(bool) * giststate->nonLeafTupdesc->natts);
+ memset(v.spl_risnull, true,
+ sizeof(bool) * giststate->nonLeafTupdesc->natts);
+ gistSplitByKey(r, page, itup, len, giststate, &v, 0);
+
+ /* form left and right vector */
+ lvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1));
+ rvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1));
+
+ for (i = 0; i < v.splitVector.spl_nleft; i++)
+ lvectup[i] = itup[v.splitVector.spl_left[i] - 1];
+
+ for (i = 0; i < v.splitVector.spl_nright; i++)
+ rvectup[i] = itup[v.splitVector.spl_right[i] - 1];
+
+ /* finalize splitting (may need another split) */
+ if (!gistfitpage(rvectup, v.splitVector.spl_nright))
+ {
+ res = gistSplit(r, page, rvectup, v.splitVector.spl_nright, giststate);
+ }
+ else
+ {
+ ROTATEDIST(res);
+ res->block.num = v.splitVector.spl_nright;
+ res->list = gistfillitupvec(rvectup, v.splitVector.spl_nright, &(res->lenlist));
+ res->itup = gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false);
+ }
+
+ if (!gistfitpage(lvectup, v.splitVector.spl_nleft))
+ {
+ SplitedPageLayout *resptr,
+ *subres;
+
+ resptr = subres = gistSplit(r, page, lvectup, v.splitVector.spl_nleft, giststate);
+
+ /* install on list's tail */
+ while (resptr->next)
+ resptr = resptr->next;
+
+ resptr->next = res;
+ res = subres;
+ }
+ else
+ {
+ ROTATEDIST(res);
+ res->block.num = v.splitVector.spl_nleft;
+ res->list = gistfillitupvec(lvectup, v.splitVector.spl_nleft, &(res->lenlist));
+ res->itup = gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false);
+ }
+
+ return res;
+}
+
+/*
+ * Create a GISTSTATE and fill it with information about the index
+ */
+GISTSTATE *
+initGISTstate(Relation index)
+{
+ GISTSTATE *giststate;
+ MemoryContext scanCxt;
+ MemoryContext oldCxt;
+ int i;
+
+ /* safety check to protect fixed-size arrays in GISTSTATE */
+ if (index->rd_att->natts > INDEX_MAX_KEYS)
+ elog(ERROR, "numberOfAttributes %d > %d",
+ index->rd_att->natts, INDEX_MAX_KEYS);
+
+ /* Create the memory context that will hold the GISTSTATE */
+ scanCxt = AllocSetContextCreate(CurrentMemoryContext,
+ "GiST scan context",
+ ALLOCSET_DEFAULT_SIZES);
+ oldCxt = MemoryContextSwitchTo(scanCxt);
+
+ /* Create and fill in the GISTSTATE */
+ giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE));
+
+ giststate->scanCxt = scanCxt;
+ giststate->tempCxt = scanCxt; /* caller must change this if needed */
+ giststate->leafTupdesc = index->rd_att;
+
+ /*
+ * The truncated tupdesc for non-leaf index tuples, which doesn't contain
+ * the INCLUDE attributes.
+ *
+ * It is used to form tuples during tuple adjustment and page split.
+ * B-tree creates a shortened tuple descriptor for every truncated tuple,
+ * because it does this less often: it does not have to form truncated
+ * tuples during page splits. Also, B-tree does not adjust tuples on
+ * internal pages the way GiST does.
+ */
+ giststate->nonLeafTupdesc = CreateTupleDescCopyConstr(index->rd_att);
+ giststate->nonLeafTupdesc->natts =
+ IndexRelationGetNumberOfKeyAttributes(index);
+
+ for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(index); i++)
+ {
+ fmgr_info_copy(&(giststate->consistentFn[i]),
+ index_getprocinfo(index, i + 1, GIST_CONSISTENT_PROC),
+ scanCxt);
+ fmgr_info_copy(&(giststate->unionFn[i]),
+ index_getprocinfo(index, i + 1, GIST_UNION_PROC),
+ scanCxt);
+
+ /* opclasses are not required to provide a Compress method */
+ if (OidIsValid(index_getprocid(index, i + 1, GIST_COMPRESS_PROC)))
+ fmgr_info_copy(&(giststate->compressFn[i]),
+ index_getprocinfo(index, i + 1, GIST_COMPRESS_PROC),
+ scanCxt);
+ else
+ giststate->compressFn[i].fn_oid = InvalidOid;
+
+ /* opclasses are not required to provide a Decompress method */
+ if (OidIsValid(index_getprocid(index, i + 1, GIST_DECOMPRESS_PROC)))
+ fmgr_info_copy(&(giststate->decompressFn[i]),
+ index_getprocinfo(index, i + 1, GIST_DECOMPRESS_PROC),
+ scanCxt);
+ else
+ giststate->decompressFn[i].fn_oid = InvalidOid;
+
+ fmgr_info_copy(&(giststate->penaltyFn[i]),
+ index_getprocinfo(index, i + 1, GIST_PENALTY_PROC),
+ scanCxt);
+ fmgr_info_copy(&(giststate->picksplitFn[i]),
+ index_getprocinfo(index, i + 1, GIST_PICKSPLIT_PROC),
+ scanCxt);
+ fmgr_info_copy(&(giststate->equalFn[i]),
+ index_getprocinfo(index, i + 1, GIST_EQUAL_PROC),
+ scanCxt);
+
+ /* opclasses are not required to provide a Distance method */
+ if (OidIsValid(index_getprocid(index, i + 1, GIST_DISTANCE_PROC)))
+ fmgr_info_copy(&(giststate->distanceFn[i]),
+ index_getprocinfo(index, i + 1, GIST_DISTANCE_PROC),
+ scanCxt);
+ else
+ giststate->distanceFn[i].fn_oid = InvalidOid;
+
+ /* opclasses are not required to provide a Fetch method */
+ if (OidIsValid(index_getprocid(index, i + 1, GIST_FETCH_PROC)))
+ fmgr_info_copy(&(giststate->fetchFn[i]),
+ index_getprocinfo(index, i + 1, GIST_FETCH_PROC),
+ scanCxt);
+ else
+ giststate->fetchFn[i].fn_oid = InvalidOid;
+
+ /*
+ * If the index column has a specified collation, we should honor that
+ * while doing comparisons. However, we may have a collatable storage
+ * type for a noncollatable indexed data type. If there's no index
+ * collation then specify default collation in case the support
+ * functions need collation. This is harmless if the support
+ * functions don't care about collation, so we just do it
+ * unconditionally. (We could alternatively call get_typcollation,
+ * but that seems like expensive overkill --- there aren't going to be
+ * any cases where a GiST storage type has a nondefault collation.)
+ */
+ if (OidIsValid(index->rd_indcollation[i]))
+ giststate->supportCollation[i] = index->rd_indcollation[i];
+ else
+ giststate->supportCollation[i] = DEFAULT_COLLATION_OID;
+ }
+
+ /* No opclass information for INCLUDE attributes */
+ for (; i < index->rd_att->natts; i++)
+ {
+ giststate->consistentFn[i].fn_oid = InvalidOid;
+ giststate->unionFn[i].fn_oid = InvalidOid;
+ giststate->compressFn[i].fn_oid = InvalidOid;
+ giststate->decompressFn[i].fn_oid = InvalidOid;
+ giststate->penaltyFn[i].fn_oid = InvalidOid;
+ giststate->picksplitFn[i].fn_oid = InvalidOid;
+ giststate->equalFn[i].fn_oid = InvalidOid;
+ giststate->distanceFn[i].fn_oid = InvalidOid;
+ giststate->fetchFn[i].fn_oid = InvalidOid;
+ giststate->supportCollation[i] = InvalidOid;
+ }
+
+ MemoryContextSwitchTo(oldCxt);
+
+ return giststate;
+}
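+
+/*
+ * A usage sketch: initGISTstate() leaves tempCxt pointing at scanCxt, so
+ * callers that want per-tuple reset semantics install their own temp
+ * context, as gistinsert() above does, and eventually release everything
+ * via freeGISTstate() below:
+ *
+ *   giststate = initGISTstate(rel);
+ *   giststate->tempCxt = createTempGistContext();
+ *   ...use the index, resetting tempCxt between operations...
+ *   freeGISTstate(giststate);
+ */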
+
+void
+freeGISTstate(GISTSTATE *giststate)
+{
+ /* It's sufficient to delete the scanCxt */
+ MemoryContextDelete(giststate->scanCxt);
+}
+
+/*
+ * gistprunepage() -- try to remove LP_DEAD items from the given page.
+ * Function assumes that buffer is exclusively locked.
+ */
+static void
+gistprunepage(Relation rel, Page page, Buffer buffer, Relation heapRel)
+{
+ OffsetNumber deletable[MaxIndexTuplesPerPage];
+ int ndeletable = 0;
+ OffsetNumber offnum,
+ maxoff;
+
+ Assert(GistPageIsLeaf(page));
+
+ /*
+ * Scan over all items to see which ones need to be deleted according to
+ * LP_DEAD flags.
+ */
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = FirstOffsetNumber;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemId = PageGetItemId(page, offnum);
+
+ if (ItemIdIsDead(itemId))
+ deletable[ndeletable++] = offnum;
+ }
+
+ if (ndeletable > 0)
+ {
+ TransactionId latestRemovedXid = InvalidTransactionId;
+
+ if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
+ latestRemovedXid =
+ index_compute_xid_horizon_for_tuples(rel, heapRel, buffer,
+ deletable, ndeletable);
+
+ START_CRIT_SECTION();
+
+ PageIndexMultiDelete(page, deletable, ndeletable);
+
+ /*
+ * Mark the page as not containing any LP_DEAD items. This is not
+ * necessarily true (there might be some that have recently been marked,
+ * but weren't included in our target-item list), but it will almost
+ * always be true and it doesn't seem worth an additional page scan to
+ * check it. Remember that F_HAS_GARBAGE is only a hint anyway.
+ */
+ GistClearPageHasGarbage(page);
+
+ MarkBufferDirty(buffer);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+
+ recptr = gistXLogDelete(buffer,
+ deletable, ndeletable,
+ latestRemovedXid);
+
+ PageSetLSN(page, recptr);
+ }
+ else
+ PageSetLSN(page, gistGetFakeLSN(rel));
+
+ END_CRIT_SECTION();
+ }
+
+ /*
+ * Note: if we didn't find any LP_DEAD items, then the page's
+ * F_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
+ * separate write to clear it, however. We will clear it when we split
+ * the page.
+ */
+}