author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:15:05 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:15:05 +0000
commit    46651ce6fe013220ed397add242004d764fc0153 (patch)
tree      6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/access/gist/gistbuild.c
parent    Initial commit. (diff)
Adding upstream version 14.5.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/access/gist/gistbuild.c')

 -rw-r--r--  src/backend/access/gist/gistbuild.c | 1566
 1 file changed, 1566 insertions(+), 0 deletions(-)
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
new file mode 100644
index 0000000..ec28bfe
--- /dev/null
+++ b/src/backend/access/gist/gistbuild.c
@@ -0,0 +1,1566 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistbuild.c
+ *	  Implementation of the build algorithms for GiST indexes.
+ *
+ * There are two different strategies:
+ *
+ * 1. Sort all input tuples, pack them into GiST leaf pages in the sorted
+ * order, and create downlinks and internal pages as we go. This builds
+ * the index from the bottom up, similar to how B-tree index build
+ * works.
+ *
+ * 2. Start with an empty index, and insert all tuples one by one.
+ *
+ * The sorted method is used if the operator classes for all columns have
+ * a 'sortsupport' defined. Otherwise, we resort to the second strategy.
+ *
+ * The second strategy can optionally use buffers at different levels of
+ * the tree to reduce I/O, see "Buffering build algorithm" in the README
+ * for a more detailed explanation. It initially calls insert over and
+ * over, but switches to the buffered algorithm after a certain number of
+ * tuples (unless buffering mode is disabled).
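+ *
+ * As an illustration (the statement below is an assumption for clarity,
+ * not something this file defines): the choice between these modes is
+ * exposed through the index's "buffering" reloption,
+ *
+ *		CREATE INDEX ON tbl USING gist (col) WITH (buffering = on);
+ *
+ * where the values on/off/auto correspond to the GIST_BUFFERING_* build
+ * modes chosen in gistbuild() below.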
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistbuild.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/genam.h"
+#include "access/gist_private.h"
+#include "access/gistxlog.h"
+#include "access/tableam.h"
+#include "access/xloginsert.h"
+#include "catalog/index.h"
+#include "miscadmin.h"
+#include "optimizer/optimizer.h"
+#include "storage/bufmgr.h"
+#include "storage/smgr.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/tuplesort.h"
+
+/* Interval, in index tuples, between checks whether to switch to buffering build mode */
+#define BUFFERING_MODE_SWITCH_CHECK_STEP 256
+
+/*
+ * Number of tuples to process in the slow way before switching to buffering
+ * mode, when buffering is explicitly turned on. Also, the number of tuples
+ * to process between readjusting the buffer size parameter, while in
+ * buffering mode.
+ */
+#define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
+
+/*
+ * Strategy used to build the index. It can change between the
+ * GIST_BUFFERING_* modes on the fly, but if the Sorted method is used,
+ * that needs to be decided up-front and cannot be changed afterwards.
+ */
+typedef enum
+{
+ GIST_SORTED_BUILD, /* bottom-up build by sorting */
+ GIST_BUFFERING_DISABLED, /* in regular build mode and aren't going to
+ * switch */
+ GIST_BUFFERING_AUTO, /* in regular build mode, but will switch to
+ * buffering build mode if the index grows too
+ * big */
+ GIST_BUFFERING_STATS, /* gathering statistics of index tuple size
+ * before switching to the buffering build
+ * mode */
+ GIST_BUFFERING_ACTIVE /* in buffering build mode */
+} GistBuildMode;
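+
+/*
+ * Mode transitions possible during a build, as implemented below: both
+ * GIST_BUFFERING_AUTO and GIST_BUFFERING_STATS can turn into either
+ * GIST_BUFFERING_ACTIVE or GIST_BUFFERING_DISABLED via gistInitBuffering();
+ * GIST_SORTED_BUILD and GIST_BUFFERING_DISABLED never change once chosen.
+ */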
+
+/* Working state for gistbuild and its callback */
+typedef struct
+{
+ Relation indexrel;
+ Relation heaprel;
+ GISTSTATE *giststate;
+
+ Size freespace; /* amount of free space to leave on pages */
+
+ GistBuildMode buildMode;
+
+ int64 indtuples; /* number of tuples indexed */
+
+ /*
+ * Extra data structures used during a buffering build. 'gfbb' contains
+ * information related to managing the build buffers. 'parentMap' is a
+ * lookup table of the parent of each internal page.
+ */
+ int64 indtuplesSize; /* total size of all indexed tuples */
+ GISTBuildBuffers *gfbb;
+ HTAB *parentMap;
+
+ /*
+ * Extra data structures used during a sorting build.
+ */
+ Tuplesortstate *sortstate; /* state data for tuplesort.c */
+
+ BlockNumber pages_allocated;
+ BlockNumber pages_written;
+
+ int ready_num_pages;
+ BlockNumber ready_blknos[XLR_MAX_BLOCK_ID];
+ Page ready_pages[XLR_MAX_BLOCK_ID];
+} GISTBuildState;
+
+/*
+ * In sorted build, we use a stack of these structs, one for each level,
+ * to hold an in-memory buffer of the rightmost page on that level. When the
+ * page fills up, it is written out and a new page is allocated.
+ */
+typedef struct GistSortedBuildPageState
+{
+ Page page;
+ struct GistSortedBuildPageState *parent; /* Upper level, if any */
+} GistSortedBuildPageState;
+
+/* prototypes for private functions */
+
+static void gistSortedBuildCallback(Relation index, ItemPointer tid,
+ Datum *values, bool *isnull,
+ bool tupleIsAlive, void *state);
+static void gist_indexsortbuild(GISTBuildState *state);
+static void gist_indexsortbuild_pagestate_add(GISTBuildState *state,
+ GistSortedBuildPageState *pagestate,
+ IndexTuple itup);
+static void gist_indexsortbuild_pagestate_flush(GISTBuildState *state,
+ GistSortedBuildPageState *pagestate);
+static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state);
+
+static void gistInitBuffering(GISTBuildState *buildstate);
+static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
+static void gistBuildCallback(Relation index,
+ ItemPointer tid,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state);
+static void gistBufferingBuildInsert(GISTBuildState *buildstate,
+ IndexTuple itup);
+static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
+ BlockNumber startblkno, int startlevel);
+static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
+ Buffer buffer, int level,
+ IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
+ BlockNumber parentblk, OffsetNumber downlinkoffnum);
+static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
+ BlockNumber childblkno, int level,
+ BlockNumber *parentblk,
+ OffsetNumber *downlinkoffnum);
+static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
+static void gistEmptyAllBuffers(GISTBuildState *buildstate);
+static int gistGetMaxLevel(Relation index);
+
+static void gistInitParentMap(GISTBuildState *buildstate);
+static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
+ BlockNumber parent);
+static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
+static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
+
+
+/*
+ * Main entry point to GiST index build.
+ */
+IndexBuildResult *
+gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
+{
+ IndexBuildResult *result;
+ double reltuples;
+ GISTBuildState buildstate;
+ MemoryContext oldcxt = CurrentMemoryContext;
+ int fillfactor;
+ Oid SortSupportFnOids[INDEX_MAX_KEYS];
+ GiSTOptions *options = (GiSTOptions *) index->rd_options;
+
+ /*
+ * We expect to be called exactly once for any index relation. If that's
+ * not the case, big trouble's what we have.
+ */
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "index \"%s\" already contains data",
+ RelationGetRelationName(index));
+
+ buildstate.indexrel = index;
+ buildstate.heaprel = heap;
+ buildstate.sortstate = NULL;
+ buildstate.giststate = initGISTstate(index);
+
+ /*
+ * Create a temporary memory context that is reset once for each tuple
+ * processed. (Note: we don't bother to make this a child of the
+ * giststate's scanCxt, so we have to delete it separately at the end.)
+ */
+ buildstate.giststate->tempCxt = createTempGistContext();
+
+ /*
+	 * Choose the build strategy. First check whether the user explicitly
+	 * requested buffering mode. (The real-world use case for that is perhaps
+	 * questionable, but it's important for testing purposes.)
+ */
+ if (options)
+ {
+ if (options->buffering_mode == GIST_OPTION_BUFFERING_ON)
+ buildstate.buildMode = GIST_BUFFERING_STATS;
+ else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF)
+ buildstate.buildMode = GIST_BUFFERING_DISABLED;
+ else /* must be "auto" */
+ buildstate.buildMode = GIST_BUFFERING_AUTO;
+ }
+ else
+ {
+ buildstate.buildMode = GIST_BUFFERING_AUTO;
+ }
+
+ /*
+ * Unless buffering mode was forced, see if we can use sorting instead.
+ */
+ if (buildstate.buildMode != GIST_BUFFERING_STATS)
+ {
+ bool hasallsortsupports = true;
+ int keyscount = IndexRelationGetNumberOfKeyAttributes(index);
+
+ for (int i = 0; i < keyscount; i++)
+ {
+ SortSupportFnOids[i] = index_getprocid(index, i + 1,
+ GIST_SORTSUPPORT_PROC);
+ if (!OidIsValid(SortSupportFnOids[i]))
+ {
+ hasallsortsupports = false;
+ break;
+ }
+ }
+ if (hasallsortsupports)
+ buildstate.buildMode = GIST_SORTED_BUILD;
+ }
+
+ /*
+ * Calculate target amount of free space to leave on pages.
+ */
+ fillfactor = options ? options->fillfactor : GIST_DEFAULT_FILLFACTOR;
+ buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
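+
+	/*
+	 * For illustration (these numbers are an assumption, not taken from this
+	 * file): with BLCKSZ = 8192 and the default GiST fillfactor of 90, this
+	 * works out to freespace = 8192 * (100 - 90) / 100 = 819 bytes reserved
+	 * on each page.
+	 */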
+
+ /*
+ * Build the index using the chosen strategy.
+ */
+ buildstate.indtuples = 0;
+ buildstate.indtuplesSize = 0;
+
+ if (buildstate.buildMode == GIST_SORTED_BUILD)
+ {
+ /*
+ * Sort all data, build the index from bottom up.
+ */
+ buildstate.sortstate = tuplesort_begin_index_gist(heap,
+ index,
+ maintenance_work_mem,
+ NULL,
+ false);
+
+ /* Scan the table, adding all tuples to the tuplesort */
+ reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
+ gistSortedBuildCallback,
+ (void *) &buildstate, NULL);
+
+ /*
+ * Perform the sort and build index pages.
+ */
+ tuplesort_performsort(buildstate.sortstate);
+
+ gist_indexsortbuild(&buildstate);
+
+ tuplesort_end(buildstate.sortstate);
+ }
+ else
+ {
+ /*
+ * Initialize an empty index and insert all tuples, possibly using
+ * buffers on intermediate levels.
+ */
+ Buffer buffer;
+ Page page;
+
+ /* initialize the root page */
+ buffer = gistNewBuffer(index);
+ Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
+ page = BufferGetPage(buffer);
+
+ START_CRIT_SECTION();
+
+ GISTInitBuffer(buffer, F_LEAF);
+
+ MarkBufferDirty(buffer);
+ PageSetLSN(page, GistBuildLSN);
+
+ UnlockReleaseBuffer(buffer);
+
+ END_CRIT_SECTION();
+
+ /* Scan the table, inserting all the tuples to the index. */
+ reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
+ gistBuildCallback,
+ (void *) &buildstate, NULL);
+
+ /*
+ * If buffering was used, flush out all the tuples that are still in
+ * the buffers.
+ */
+ if (buildstate.buildMode == GIST_BUFFERING_ACTIVE)
+ {
+ elog(DEBUG1, "all tuples processed, emptying buffers");
+ gistEmptyAllBuffers(&buildstate);
+ gistFreeBuildBuffers(buildstate.gfbb);
+ }
+
+ /*
+ * We didn't write WAL records as we built the index, so if
+ * WAL-logging is required, write all pages to the WAL now.
+ */
+ if (RelationNeedsWAL(index))
+ {
+ log_newpage_range(index, MAIN_FORKNUM,
+ 0, RelationGetNumberOfBlocks(index),
+ true);
+ }
+ }
+
+ /* okay, all heap tuples are indexed */
+ MemoryContextSwitchTo(oldcxt);
+ MemoryContextDelete(buildstate.giststate->tempCxt);
+
+ freeGISTstate(buildstate.giststate);
+
+ /*
+ * Return statistics
+ */
+ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+
+ result->heap_tuples = reltuples;
+ result->index_tuples = (double) buildstate.indtuples;
+
+ return result;
+}
+
+/*-------------------------------------------------------------------------
+ * Routines for sorted build
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Per-tuple callback for table_index_build_scan.
+ */
+static void
+gistSortedBuildCallback(Relation index,
+ ItemPointer tid,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state)
+{
+ GISTBuildState *buildstate = (GISTBuildState *) state;
+ MemoryContext oldCtx;
+ Datum compressed_values[INDEX_MAX_KEYS];
+
+ oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
+
+ /* Form an index tuple and point it at the heap tuple */
+ gistCompressValues(buildstate->giststate, index,
+ values, isnull,
+ true, compressed_values);
+
+ tuplesort_putindextuplevalues(buildstate->sortstate,
+ buildstate->indexrel,
+ tid,
+ compressed_values, isnull);
+
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(buildstate->giststate->tempCxt);
+
+ /* Update tuple count. */
+ buildstate->indtuples += 1;
+}
+
+/*
+ * Build GiST index from bottom up from pre-sorted tuples.
+ */
+static void
+gist_indexsortbuild(GISTBuildState *state)
+{
+ IndexTuple itup;
+ GistSortedBuildPageState *leafstate;
+ GistSortedBuildPageState *pagestate;
+ Page page;
+
+ state->pages_allocated = 0;
+ state->pages_written = 0;
+ state->ready_num_pages = 0;
+
+ /*
+ * Write an empty page as a placeholder for the root page. It will be
+ * replaced with the real root page at the end.
+ */
+ page = palloc0(BLCKSZ);
+ RelationOpenSmgr(state->indexrel);
+ smgrextend(state->indexrel->rd_smgr, MAIN_FORKNUM, GIST_ROOT_BLKNO,
+ page, true);
+ state->pages_allocated++;
+ state->pages_written++;
+
+ /* Allocate a temporary buffer for the first leaf page. */
+ leafstate = palloc(sizeof(GistSortedBuildPageState));
+ leafstate->page = page;
+ leafstate->parent = NULL;
+ gistinitpage(page, F_LEAF);
+
+ /*
+ * Fill index pages with tuples in the sorted order.
+ */
+ while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL)
+ {
+ gist_indexsortbuild_pagestate_add(state, leafstate, itup);
+ MemoryContextReset(state->giststate->tempCxt);
+ }
+
+ /*
+ * Write out the partially full non-root pages.
+ *
+ * Keep in mind that flush can build a new root.
+ */
+ pagestate = leafstate;
+ while (pagestate->parent != NULL)
+ {
+ GistSortedBuildPageState *parent;
+
+ gist_indexsortbuild_pagestate_flush(state, pagestate);
+ parent = pagestate->parent;
+ pfree(pagestate->page);
+ pfree(pagestate);
+ pagestate = parent;
+ }
+
+ gist_indexsortbuild_flush_ready_pages(state);
+
+ /* Write out the root */
+ RelationOpenSmgr(state->indexrel);
+ PageSetLSN(pagestate->page, GistBuildLSN);
+ PageSetChecksumInplace(pagestate->page, GIST_ROOT_BLKNO);
+ smgrwrite(state->indexrel->rd_smgr, MAIN_FORKNUM, GIST_ROOT_BLKNO,
+ pagestate->page, true);
+ if (RelationNeedsWAL(state->indexrel))
+ log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO,
+ pagestate->page, true);
+
+ pfree(pagestate->page);
+ pfree(pagestate);
+
+ /*
+ * When we WAL-logged index pages, we must nonetheless fsync index files.
+ * Since we're building outside shared buffers, a CHECKPOINT occurring
+ * during the build has no way to flush the previously written data to
+ * disk (indeed it won't know the index even exists). A crash later on
+ * would replay WAL from the checkpoint, therefore it wouldn't replay our
+ * earlier WAL entries. If we do not fsync those pages here, they might
+ * still not be on disk when the crash occurs.
+ */
+ if (RelationNeedsWAL(state->indexrel))
+ {
+ RelationOpenSmgr(state->indexrel);
+ smgrimmedsync(state->indexrel->rd_smgr, MAIN_FORKNUM);
+ }
+}
+
+/*
+ * Add a tuple to a page. If the page is full, write it out and re-initialize
+ * a new page first.
+ */
+static void
+gist_indexsortbuild_pagestate_add(GISTBuildState *state,
+ GistSortedBuildPageState *pagestate,
+ IndexTuple itup)
+{
+ Size sizeNeeded;
+
+ /* Does the tuple fit? If not, flush */
+ sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData) + state->freespace;
+ if (PageGetFreeSpace(pagestate->page) < sizeNeeded)
+ gist_indexsortbuild_pagestate_flush(state, pagestate);
+
+ gistfillbuffer(pagestate->page, &itup, 1, InvalidOffsetNumber);
+}
+
+static void
+gist_indexsortbuild_pagestate_flush(GISTBuildState *state,
+ GistSortedBuildPageState *pagestate)
+{
+ GistSortedBuildPageState *parent;
+ IndexTuple *itvec;
+ IndexTuple union_tuple;
+ int vect_len;
+ bool isleaf;
+ BlockNumber blkno;
+ MemoryContext oldCtx;
+
+ /* check once per page */
+ CHECK_FOR_INTERRUPTS();
+
+ if (state->ready_num_pages == XLR_MAX_BLOCK_ID)
+ gist_indexsortbuild_flush_ready_pages(state);
+
+ /*
+ * The page is now complete. Assign a block number to it, and add it to
+ * the list of finished pages. (We don't write it out immediately, because
+ * we want to WAL-log the pages in batches.)
+ */
+ blkno = state->pages_allocated++;
+ state->ready_blknos[state->ready_num_pages] = blkno;
+ state->ready_pages[state->ready_num_pages] = pagestate->page;
+ state->ready_num_pages++;
+
+ isleaf = GistPageIsLeaf(pagestate->page);
+
+ /*
+ * Form a downlink tuple to represent all the tuples on the page.
+ */
+ oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt);
+ itvec = gistextractpage(pagestate->page, &vect_len);
+ union_tuple = gistunion(state->indexrel, itvec, vect_len,
+ state->giststate);
+ ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
+ MemoryContextSwitchTo(oldCtx);
+
+ /*
+ * Insert the downlink to the parent page. If this was the root, create a
+ * new page as the parent, which becomes the new root.
+ */
+ parent = pagestate->parent;
+ if (parent == NULL)
+ {
+ parent = palloc(sizeof(GistSortedBuildPageState));
+ parent->page = (Page) palloc(BLCKSZ);
+ parent->parent = NULL;
+ gistinitpage(parent->page, 0);
+
+ pagestate->parent = parent;
+ }
+ gist_indexsortbuild_pagestate_add(state, parent, union_tuple);
+
+ /* Re-initialize the page buffer for next page on this level. */
+ pagestate->page = palloc(BLCKSZ);
+ gistinitpage(pagestate->page, isleaf ? F_LEAF : 0);
+
+ /*
+ * Set the right link to point to the previous page. This is just for
+ * debugging purposes: GiST only follows the right link if a page is split
+	 * concurrently with a scan, and that cannot happen during index build.
+ *
+ * It's a bit counterintuitive that we set the right link on the new page
+ * to point to the previous page, and not the other way round. But GiST
+ * pages are not ordered like B-tree pages are, so as long as the
+ * right-links form a chain through all the pages in the same level, the
+ * order doesn't matter.
+ */
+ GistPageGetOpaque(pagestate->page)->rightlink = blkno;
+}
+
+static void
+gist_indexsortbuild_flush_ready_pages(GISTBuildState *state)
+{
+ if (state->ready_num_pages == 0)
+ return;
+
+ RelationOpenSmgr(state->indexrel);
+
+ for (int i = 0; i < state->ready_num_pages; i++)
+ {
+ Page page = state->ready_pages[i];
+ BlockNumber blkno = state->ready_blknos[i];
+
+ /* Currently, the blocks must be buffered in order. */
+ if (blkno != state->pages_written)
+ elog(ERROR, "unexpected block number to flush GiST sorting build");
+
+ PageSetLSN(page, GistBuildLSN);
+ PageSetChecksumInplace(page, blkno);
+ smgrextend(state->indexrel->rd_smgr, MAIN_FORKNUM, blkno, page, true);
+
+ state->pages_written++;
+ }
+
+ if (RelationNeedsWAL(state->indexrel))
+ log_newpages(&state->indexrel->rd_node, MAIN_FORKNUM, state->ready_num_pages,
+ state->ready_blknos, state->ready_pages, true);
+
+ for (int i = 0; i < state->ready_num_pages; i++)
+ pfree(state->ready_pages[i]);
+
+ state->ready_num_pages = 0;
+}
+
+
+/*-------------------------------------------------------------------------
+ * Routines for non-sorted build
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Attempt to switch to buffering mode.
+ *
+ * If there is not enough memory for buffering build, sets bufferingMode
+ * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
+ * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
+ * GIST_BUFFERING_ACTIVE.
+ */
+static void
+gistInitBuffering(GISTBuildState *buildstate)
+{
+ Relation index = buildstate->indexrel;
+ int pagesPerBuffer;
+ Size pageFreeSpace;
+ Size itupAvgSize,
+ itupMinSize;
+ double avgIndexTuplesPerPage,
+ maxIndexTuplesPerPage;
+ int i;
+ int levelStep;
+
+	/* Calculate the space on an index page that is available for index tuples */
+ pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
+ - sizeof(ItemIdData)
+ - buildstate->freespace;
+
+ /*
+ * Calculate average size of already inserted index tuples using gathered
+ * statistics.
+ */
+ itupAvgSize = (double) buildstate->indtuplesSize /
+ (double) buildstate->indtuples;
+
+ /*
+ * Calculate minimal possible size of index tuple by index metadata.
+ * Minimal possible size of varlena is VARHDRSZ.
+ *
+ * XXX: that's not actually true, as a short varlen can be just 2 bytes.
+ * And we should take padding into account here.
+ */
+ itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
+ for (i = 0; i < index->rd_att->natts; i++)
+ {
+ if (TupleDescAttr(index->rd_att, i)->attlen < 0)
+ itupMinSize += VARHDRSZ;
+ else
+ itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
+ }
+
+	/* Calculate the average and maximal number of index tuples that fit on a page */
+ avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
+ maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
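+
+	/*
+	 * Illustration with made-up numbers: pageFreeSpace ~ 8000 bytes,
+	 * itupAvgSize = 80 and itupMinSize = 40 would give
+	 * avgIndexTuplesPerPage = 100 and maxIndexTuplesPerPage = 200.
+	 */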
+
+ /*
+ * We need to calculate two parameters for the buffering algorithm:
+ * levelStep and pagesPerBuffer.
+ *
+ * levelStep determines the size of subtree that we operate on, while
+ * emptying a buffer. A higher value is better, as you need fewer buffer
+ * emptying steps to build the index. However, if you set it too high, the
+ * subtree doesn't fit in cache anymore, and you quickly lose the benefit
+ * of the buffers.
+ *
+ * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
+ * the number of tuples on page (ie. fanout), and M is the amount of
+	 * internal memory available. Curiously, they don't explain *why* that
+ * setting is optimal. We calculate it by taking the highest levelStep so
+ * that a subtree still fits in cache. For a small B, our way of
+ * calculating levelStep is very close to Arge et al's formula. For a
+ * large B, our formula gives a value that is 2x higher.
+ *
+ * The average size (in pages) of a subtree of depth n can be calculated
+ * as a geometric series:
+ *
+ * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
+ *
+ * where B is the average number of index tuples on page. The subtree is
+ * cached in the shared buffer cache and the OS cache, so we choose
+ * levelStep so that the subtree size is comfortably smaller than
+ * effective_cache_size, with a safety factor of 4.
+ *
+ * The estimate on the average number of index tuples on page is based on
+ * average tuple sizes observed before switching to buffered build, so the
+	 * real subtree size can be somewhat larger. Also, it would be selfish to
+ * gobble the whole cache for our index build. The safety factor of 4
+ * should account for those effects.
+ *
+ * The other limiting factor for setting levelStep is that while
+ * processing a subtree, we need to hold one page for each buffer at the
+ * next lower buffered level. The max. number of buffers needed for that
+ * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
+ * hopefully maintenance_work_mem is set high enough that you're
+ * constrained by effective_cache_size rather than maintenance_work_mem.
+ *
+ * XXX: the buffer hash table consumes a fair amount of memory too per
+ * buffer, but that is not currently taken into account. That scales on
+ * the total number of buffers used, ie. the index size and on levelStep.
+ * Note that a higher levelStep *reduces* the amount of memory needed for
+ * the hash table.
+ */
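+	/*
+	 * Worked example with made-up numbers: if avgIndexTuplesPerPage = 100,
+	 * then at levelStep = 2 a subtree spans about
+	 * (1 - 100^3) / (1 - 100) = 10101 pages, i.e. ~79 MB with 8 kB blocks,
+	 * so effective_cache_size must exceed 4 * 10101 pages (~316 MB) for the
+	 * loop below to accept levelStep = 2.
+	 */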
+ levelStep = 1;
+ for (;;)
+ {
+ double subtreesize;
+ double maxlowestlevelpages;
+
+ /* size of an average subtree at this levelStep (in pages). */
+ subtreesize =
+ (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
+ (1 - avgIndexTuplesPerPage);
+
+ /* max number of pages at the lowest level of a subtree */
+ maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);
+
+ /* subtree must fit in cache (with safety factor of 4) */
+ if (subtreesize > effective_cache_size / 4)
+ break;
+
+ /* each node in the lowest level of a subtree has one page in memory */
+ if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
+ break;
+
+ /* Good, we can handle this levelStep. See if we can go one higher. */
+ levelStep++;
+ }
+
+ /*
+	 * The loop above exited on the first unacceptable value of levelStep, so
+	 * decrease levelStep by one to get back to the last acceptable value.
+ */
+ levelStep--;
+
+ /*
+ * If there's not enough cache or maintenance_work_mem, fall back to plain
+ * inserts.
+ */
+ if (levelStep <= 0)
+ {
+ elog(DEBUG1, "failed to switch to buffered GiST build");
+ buildstate->buildMode = GIST_BUFFERING_DISABLED;
+ return;
+ }
+
+ /*
+ * The second parameter to set is pagesPerBuffer, which determines the
+ * size of each buffer. We adjust pagesPerBuffer also during the build,
+ * which is why this calculation is in a separate function.
+ */
+ pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);
+
+ /* Initialize GISTBuildBuffers with these parameters */
+ buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
+ gistGetMaxLevel(index));
+
+ gistInitParentMap(buildstate);
+
+ buildstate->buildMode = GIST_BUFFERING_ACTIVE;
+
+ elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
+ levelStep, pagesPerBuffer);
+}
+
+/*
+ * Calculate pagesPerBuffer parameter for the buffering algorithm.
+ *
+ * Buffer size is chosen so that assuming that tuples are distributed
+ * randomly, emptying half a buffer fills on average one page in every buffer
+ * at the next lower level.
+ */
+static int
+calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
+{
+ double pagesPerBuffer;
+ double avgIndexTuplesPerPage;
+ double itupAvgSize;
+ Size pageFreeSpace;
+
+	/* Calculate the space on an index page that is available for index tuples */
+ pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
+ - sizeof(ItemIdData)
+ - buildstate->freespace;
+
+ /*
+ * Calculate average size of already inserted index tuples using gathered
+ * statistics.
+ */
+ itupAvgSize = (double) buildstate->indtuplesSize /
+ (double) buildstate->indtuples;
+
+ avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
+
+ /*
+ * Recalculate required size of buffers.
+ */
+ pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
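+
+	/*
+	 * Example with made-up numbers: avgIndexTuplesPerPage = 100 and
+	 * levelStep = 2 give pagesPerBuffer = 2 * 100^2 = 20000 pages, so
+	 * emptying half a buffer pushes ~100^2 pages' worth of tuples down to
+	 * the ~100^2 buffers at the next buffered level: one page each, on
+	 * average.
+	 */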
+
+ return (int) rint(pagesPerBuffer);
+}
+
+/*
+ * Per-tuple callback for table_index_build_scan.
+ */
+static void
+gistBuildCallback(Relation index,
+ ItemPointer tid,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state)
+{
+ GISTBuildState *buildstate = (GISTBuildState *) state;
+ IndexTuple itup;
+ MemoryContext oldCtx;
+
+ oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = gistFormTuple(buildstate->giststate, index,
+ values, isnull,
+ true);
+ itup->t_tid = *tid;
+
+ if (buildstate->buildMode == GIST_BUFFERING_ACTIVE)
+ {
+ /* We have buffers, so use them. */
+ gistBufferingBuildInsert(buildstate, itup);
+ }
+ else
+ {
+		 * There are no buffers (yet). Since we already have the index
+		 * relation locked, we call gistdoinsert directly.
+ * locked, we call gistdoinsert directly.
+ */
+ gistdoinsert(index, itup, buildstate->freespace,
+ buildstate->giststate, buildstate->heaprel, true);
+ }
+
+ /* Update tuple count and total size. */
+ buildstate->indtuples += 1;
+ buildstate->indtuplesSize += IndexTupleSize(itup);
+
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(buildstate->giststate->tempCxt);
+
+ if (buildstate->buildMode == GIST_BUFFERING_ACTIVE &&
+ buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
+ {
+ /* Adjust the target buffer size now */
+ buildstate->gfbb->pagesPerBuffer =
+ calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
+ }
+
+ /*
+ * In 'auto' mode, check if the index has grown too large to fit in cache,
+ * and switch to buffering mode if it has.
+ *
+ * To avoid excessive calls to smgrnblocks(), only check this every
+ * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
+ *
+ * In 'stats' state, switch as soon as we have seen enough tuples to have
+ * some idea of the average tuple size.
+ */
+ if ((buildstate->buildMode == GIST_BUFFERING_AUTO &&
+ buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
+ effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
+ (buildstate->buildMode == GIST_BUFFERING_STATS &&
+ buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
+ {
+ /*
+ * Index doesn't fit in effective cache anymore. Try to switch to
+ * buffering build mode.
+ */
+ gistInitBuffering(buildstate);
+ }
+}
+
+/*
+ * Insert function for buffering index build.
+ */
+static void
+gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
+{
+ /* Insert the tuple to buffers. */
+ gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);
+
+ /* If we filled up (half of a) buffer, process buffer emptying. */
+ gistProcessEmptyingQueue(buildstate);
+}
+
+/*
+ * Process an index tuple. Runs the tuple down the tree until we reach a leaf
+ * page or node buffer, and inserts the tuple there. Returns true if we have
+ * to stop the buffer emptying process (because one of the child buffers
+ * can't accept any more index tuples).
+ */
+static bool
+gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
+ BlockNumber startblkno, int startlevel)
+{
+ GISTSTATE *giststate = buildstate->giststate;
+ GISTBuildBuffers *gfbb = buildstate->gfbb;
+ Relation indexrel = buildstate->indexrel;
+ BlockNumber childblkno;
+ Buffer buffer;
+ bool result = false;
+ BlockNumber blkno;
+ int level;
+ OffsetNumber downlinkoffnum = InvalidOffsetNumber;
+ BlockNumber parentblkno = InvalidBlockNumber;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Loop until we reach a leaf page (level == 0) or a level with buffers
+ * (not including the level we start at, because we would otherwise make
+ * no progress).
+ */
+ blkno = startblkno;
+ level = startlevel;
+ for (;;)
+ {
+ ItemId iid;
+ IndexTuple idxtuple,
+ newtup;
+ Page page;
+ OffsetNumber childoffnum;
+
+ /* Have we reached a level with buffers? */
+ if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
+ break;
+
+ /* Have we reached a leaf page? */
+ if (level == 0)
+ break;
+
+ /*
+ * Nope. Descend down to the next level then. Choose a child to
+ * descend down to.
+ */
+
+ buffer = ReadBuffer(indexrel, blkno);
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+
+ page = (Page) BufferGetPage(buffer);
+ childoffnum = gistchoose(indexrel, page, itup, giststate);
+ iid = PageGetItemId(page, childoffnum);
+ idxtuple = (IndexTuple) PageGetItem(page, iid);
+ childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+ if (level > 1)
+ gistMemorizeParent(buildstate, childblkno, blkno);
+
+ /*
+ * Check that the key representing the target child node is consistent
+ * with the key we're inserting. Update it if it's not.
+ */
+ newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
+ if (newtup)
+ {
+ blkno = gistbufferinginserttuples(buildstate,
+ buffer,
+ level,
+ &newtup,
+ 1,
+ childoffnum,
+ InvalidBlockNumber,
+ InvalidOffsetNumber);
+ /* gistbufferinginserttuples() released the buffer */
+ }
+ else
+ UnlockReleaseBuffer(buffer);
+
+ /* Descend to the child */
+ parentblkno = blkno;
+ blkno = childblkno;
+ downlinkoffnum = childoffnum;
+ Assert(level > 0);
+ level--;
+ }
+
+ if (LEVEL_HAS_BUFFERS(level, gfbb))
+ {
+ /*
+		 * We've reached a level with buffers. Place the index tuple in the
+		 * buffer, and add the buffer to the emptying queue if it overflows.
+ */
+ GISTNodeBuffer *childNodeBuffer;
+
+ /* Find the buffer or create a new one */
+ childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);
+
+ /* Add index tuple to it */
+ gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
+
+ if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
+ result = true;
+ }
+ else
+ {
+ /*
+ * We've reached a leaf page. Place the tuple here.
+ */
+ Assert(level == 0);
+ buffer = ReadBuffer(indexrel, blkno);
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ gistbufferinginserttuples(buildstate, buffer, level,
+ &itup, 1, InvalidOffsetNumber,
+ parentblkno, downlinkoffnum);
+ /* gistbufferinginserttuples() released the buffer */
+ }
+
+ return result;
+}
+
+/*
+ * Insert tuples to a given page.
+ *
+ * This is analogous to gistinserttuples() in the regular insertion code.
+ *
+ * Returns the block number of the page where the (first) new or updated tuple
+ * was inserted. Usually that's the original page, but might be a sibling page
+ * if the original page was split.
+ *
+ * Caller should hold a lock on 'buffer' on entry. This function will unlock
+ * and unpin it.
+ */
+static BlockNumber
+gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
+ IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
+ BlockNumber parentblk, OffsetNumber downlinkoffnum)
+{
+ GISTBuildBuffers *gfbb = buildstate->gfbb;
+ List *splitinfo;
+ bool is_split;
+ BlockNumber placed_to_blk = InvalidBlockNumber;
+
+ is_split = gistplacetopage(buildstate->indexrel,
+ buildstate->freespace,
+ buildstate->giststate,
+ buffer,
+ itup, ntup, oldoffnum, &placed_to_blk,
+ InvalidBuffer,
+ &splitinfo,
+ false,
+ buildstate->heaprel, true);
+
+ /*
+ * If this is a root split, update the root path item kept in memory. This
+ * ensures that all path stacks are always complete, including all parent
+	 * nodes up to the root. That simplifies the algorithm for re-finding the
+	 * correct parent.
+ */
+ if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
+ {
+ Page page = BufferGetPage(buffer);
+ OffsetNumber off;
+ OffsetNumber maxoff;
+
+ Assert(level == gfbb->rootlevel);
+ gfbb->rootlevel++;
+
+ elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
+
+ /*
+ * All the downlinks on the old root page are now on one of the child
+ * pages. Visit all the new child pages to memorize the parents of the
+ * grandchildren.
+ */
+ if (gfbb->rootlevel > 1)
+ {
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (off = FirstOffsetNumber; off <= maxoff; off++)
+ {
+ ItemId iid = PageGetItemId(page, off);
+ IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
+ BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+ Buffer childbuf = ReadBuffer(buildstate->indexrel, childblkno);
+
+ LockBuffer(childbuf, GIST_SHARE);
+ gistMemorizeAllDownlinks(buildstate, childbuf);
+ UnlockReleaseBuffer(childbuf);
+
+ /*
+ * Also remember that the parent of the new child page is the
+ * root block.
+ */
+ gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
+ }
+ }
+ }
+
+ if (splitinfo)
+ {
+ /*
+		 * Insert the downlinks to the parent. This is analogous to
+ * gistfinishsplit() in the regular insertion code, but the locking is
+ * simpler, and we have to maintain the buffers on internal nodes and
+ * the parent map.
+ */
+ IndexTuple *downlinks;
+ int ndownlinks,
+ i;
+ Buffer parentBuffer;
+ ListCell *lc;
+
+ /* Parent may have changed since we memorized this path. */
+ parentBuffer =
+ gistBufferingFindCorrectParent(buildstate,
+ BufferGetBlockNumber(buffer),
+ level,
+ &parentblk,
+ &downlinkoffnum);
+
+ /*
+ * If there's a buffer associated with this page, that needs to be
+ * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
+ * downlinks in 'splitinfo', to make sure they're consistent not only
+ * with the tuples already on the pages, but also the tuples in the
+ * buffers that will eventually be inserted to them.
+ */
+ gistRelocateBuildBuffersOnSplit(gfbb,
+ buildstate->giststate,
+ buildstate->indexrel,
+ level,
+ buffer, splitinfo);
+
+ /* Create an array of all the downlink tuples */
+ ndownlinks = list_length(splitinfo);
+ downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
+ i = 0;
+ foreach(lc, splitinfo)
+ {
+ GISTPageSplitInfo *splitinfo = lfirst(lc);
+
+ /*
+ * Remember the parent of each new child page in our parent map.
+ * This assumes that the downlinks fit on the parent page. If the
+ * parent page is split, too, when we recurse up to insert the
+ * downlinks, the recursive gistbufferinginserttuples() call will
+ * update the map again.
+ */
+ if (level > 0)
+ gistMemorizeParent(buildstate,
+ BufferGetBlockNumber(splitinfo->buf),
+ BufferGetBlockNumber(parentBuffer));
+
+ /*
+ * Also update the parent map for all the downlinks that got moved
+			 * to a different page. (Actually, this also loops through the
+			 * downlinks that stayed on the original page, but it does no
+			 * harm.)
+ */
+ if (level > 1)
+ gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
+
+ /*
+ * Since there's no concurrent access, we can release the lower
+ * level buffers immediately. This includes the original page.
+ */
+ UnlockReleaseBuffer(splitinfo->buf);
+ downlinks[i++] = splitinfo->downlink;
+ }
+
+ /* Insert them into parent. */
+ gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
+ downlinks, ndownlinks, downlinkoffnum,
+ InvalidBlockNumber, InvalidOffsetNumber);
+
+ list_free_deep(splitinfo); /* we don't need this anymore */
+ }
+ else
+ UnlockReleaseBuffer(buffer);
+
+ return placed_to_blk;
+}
+
+/*
+ * Find the downlink pointing to a child page.
+ *
+ * 'childblkno' indicates the child page to find the parent for. 'level' is
+ * the level of the child. On entry, *parentblkno and *downlinkoffnum can
+ * point to a location where the downlink used to be - we will check that
+ * location first, and save some cycles if it hasn't moved. The function
+ * returns a buffer containing the downlink, exclusively-locked, and
+ * *parentblkno and *downlinkoffnum are set to the real location of the
+ * downlink.
+ *
+ * If the child page is a leaf (level == 0), the caller must supply a correct
+ * parentblkno. Otherwise we use the parent map hash table to find the parent
+ * block.
+ *
+ * This function serves the same purpose as gistFindCorrectParent() during
+ * normal index inserts, but this is simpler because we don't need to deal
+ * with concurrent inserts.
+ */
+static Buffer
+gistBufferingFindCorrectParent(GISTBuildState *buildstate,
+ BlockNumber childblkno, int level,
+ BlockNumber *parentblkno,
+ OffsetNumber *downlinkoffnum)
+{
+ BlockNumber parent;
+ Buffer buffer;
+ Page page;
+ OffsetNumber maxoff;
+ OffsetNumber off;
+
+ if (level > 0)
+ parent = gistGetParent(buildstate, childblkno);
+ else
+ {
+ /*
+ * For a leaf page, the caller must supply a correct parent block
+ * number.
+ */
+ if (*parentblkno == InvalidBlockNumber)
+ elog(ERROR, "no parent buffer provided of child %u", childblkno);
+ parent = *parentblkno;
+ }
+
+ buffer = ReadBuffer(buildstate->indexrel, parent);
+ page = BufferGetPage(buffer);
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ gistcheckpage(buildstate->indexrel, buffer);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /* Check if it was not moved */
+ if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
+ *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
+ {
+ ItemId iid = PageGetItemId(page, *downlinkoffnum);
+ IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+ if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
+ {
+ /* Still there */
+ return buffer;
+ }
+ }
+
+ /*
+ * Downlink was not at the offset where it used to be. Scan the page to
+ * find it. During normal gist insertions, it might've moved to another
+ * page, to the right, but during a buffering build, we keep track of the
+ * parent of each page in the lookup table so we should always know what
+ * page it's on.
+ */
+ for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
+ {
+ ItemId iid = PageGetItemId(page, off);
+ IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+ if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
+ {
+ /* yes!!, found it */
+ *downlinkoffnum = off;
+ return buffer;
+ }
+ }
+
+ elog(ERROR, "failed to re-find parent for block %u", childblkno);
+ return InvalidBuffer; /* keep compiler quiet */
+}
+
+/*
+ * Process the buffer emptying queue. Emptying one buffer can cause other
+ * buffers to be queued for emptying. This function iterates until this
+ * cascading emptying process is finished, i.e. until the emptying queue is
+ * empty.
+ */
+static void
+gistProcessEmptyingQueue(GISTBuildState *buildstate)
+{
+ GISTBuildBuffers *gfbb = buildstate->gfbb;
+
+	/* Iterate while there are elements in the buffer emptying queue. */
+ while (gfbb->bufferEmptyingQueue != NIL)
+ {
+ GISTNodeBuffer *emptyingNodeBuffer;
+
+		/* Get the next node buffer from the emptying queue. */
+ emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
+ gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
+ emptyingNodeBuffer->queuedForEmptying = false;
+
+ /*
+		 * We are going to load the last pages of the buffers that the
+		 * emptying will write into, so unload any previously loaded buffers
+		 * first.
+ */
+ gistUnloadNodeBuffers(gfbb);
+
+ /*
+ * Pop tuples from the buffer and run them down to the buffers at
+ * lower level, or leaf pages. We continue until one of the lower
+ * level buffers fills up, or this buffer runs empty.
+ *
+ * In Arge et al's paper, the buffer emptying is stopped after
+ * processing 1/2 node buffer worth of tuples, to avoid overfilling
+ * any of the lower level buffers. However, it's more efficient to
+ * keep going until one of the lower level buffers actually fills up,
+		 * so that's what we do. This doesn't need to be exact; if a buffer
+		 * overfills by a few tuples, there's no harm done.
+ */
+ while (true)
+ {
+ IndexTuple itup;
+
+ /* Get next index tuple from the buffer */
+ if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
+ break;
+
+ /*
+ * Run it down to the underlying node buffer or leaf page.
+ *
+ * Note: it's possible that the buffer we're emptying splits as a
+ * result of this call. If that happens, our emptyingNodeBuffer
+ * points to the left half of the split. After split, it's very
+ * likely that the new left buffer is no longer over the half-full
+ * threshold, but we might as well keep flushing tuples from it
+ * until we fill a lower-level buffer.
+ */
+ if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
+ {
+ /*
+ * A lower level buffer filled up. Stop emptying this buffer,
+ * to avoid overflowing the lower level buffer.
+ */
+ break;
+ }
+
+ /* Free all the memory allocated during index tuple processing */
+ MemoryContextReset(buildstate->giststate->tempCxt);
+ }
+ }
+}
+
+/*
+ * Empty all node buffers, from top to bottom. This is done at the end of
+ * index build to flush all remaining tuples to the index.
+ *
+ * Note: this destroys the buffersOnLevels lists, so the buffers should not
+ * be inserted into after this call.
+ */
+static void
+gistEmptyAllBuffers(GISTBuildState *buildstate)
+{
+ GISTBuildBuffers *gfbb = buildstate->gfbb;
+ MemoryContext oldCtx;
+ int i;
+
+ oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
+
+ /*
+ * Iterate through the levels from top to bottom.
+ */
+ for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
+ {
+ /*
+ * Empty all buffers on this level. Note that new buffers can pop up
+ * in the list during the processing, as a result of page splits, so a
+ * simple walk through the list won't work. We remove buffers from the
+ * list when we see them empty; a buffer can't become non-empty once
+ * it's been fully emptied.
+ */
+ while (gfbb->buffersOnLevels[i] != NIL)
+ {
+ GISTNodeBuffer *nodeBuffer;
+
+ nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
+
+ if (nodeBuffer->blocksCount != 0)
+ {
+ /*
+ * Add this buffer to the emptying queue, and proceed to empty
+ * the queue.
+ */
+ if (!nodeBuffer->queuedForEmptying)
+ {
+ MemoryContextSwitchTo(gfbb->context);
+ nodeBuffer->queuedForEmptying = true;
+ gfbb->bufferEmptyingQueue =
+ lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
+ MemoryContextSwitchTo(buildstate->giststate->tempCxt);
+ }
+ gistProcessEmptyingQueue(buildstate);
+ }
+ else
+ gfbb->buffersOnLevels[i] =
+ list_delete_first(gfbb->buffersOnLevels[i]);
+ }
+ elog(DEBUG2, "emptied all buffers at level %d", i);
+ }
+ MemoryContextSwitchTo(oldCtx);
+}
+
+/*
+ * Get the depth of the GiST index.
+ */
+static int
+gistGetMaxLevel(Relation index)
+{
+ int maxLevel;
+ BlockNumber blkno;
+
+ /*
+ * Traverse down the tree, starting from the root, until we hit the leaf
+ * level.
+ */
+ maxLevel = 0;
+ blkno = GIST_ROOT_BLKNO;
+ while (true)
+ {
+ Buffer buffer;
+ Page page;
+ IndexTuple itup;
+
+ buffer = ReadBuffer(index, blkno);
+
+ /*
+ * There's no concurrent access during index build, so locking is just
+ * pro forma.
+ */
+ LockBuffer(buffer, GIST_SHARE);
+ page = (Page) BufferGetPage(buffer);
+
+ if (GistPageIsLeaf(page))
+ {
+ /* We hit the bottom, so we're done. */
+ UnlockReleaseBuffer(buffer);
+ break;
+ }
+
+ /*
+		 * Pick the first downlink on the page, and follow it. It doesn't
+		 * matter which downlink we choose; the tree has the same depth
+ * everywhere, so we just pick the first one.
+ */
+ itup = (IndexTuple) PageGetItem(page,
+ PageGetItemId(page, FirstOffsetNumber));
+ blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+ UnlockReleaseBuffer(buffer);
+
+ /*
+		 * We're descending to the next level, so the tree has at least one
+		 * more level below this page.
+ */
+ maxLevel++;
+ }
+ return maxLevel;
+}
+
+
+/*
+ * Routines for managing the parent map.
+ *
+ * Whenever a page is split, we need to insert the downlinks into the parent.
+ * We need to somehow find the parent page to do that. In normal insertions,
+ * we keep a stack of nodes visited when we descend the tree. However, in
+ * buffering build, we can start descending the tree from any internal node,
+ * when we empty a buffer by cascading tuples to its children. So we don't
+ * have a full stack up to the root available at that time.
+ *
+ * So instead, we maintain a hash table to track the parent of every internal
+ * page. We don't need to track the parents of leaf nodes, however. Whenever
+ * we insert to a leaf, we've just descended down from its parent, so we know
+ * its immediate parent already. This helps a lot to limit the memory used
+ * by this hash table.
+ *
+ * Whenever an internal node is split, the parent map needs to be updated:
+ * the parent of the new child page needs to be recorded, and the entries
+ * for all pages whose downlinks are moved to a new page by the split need
+ * to be updated as well.
+ *
+ * We also update the parent map whenever we descend the tree. That might seem
+ * unnecessary, because we maintain the map whenever a downlink is moved or
+ * created, but it is needed because we switch to buffering mode after
+ * creating a tree with regular index inserts. Any pages created before
+ * switching to buffering mode will not be present in the parent map initially,
+ * but will be added there the first time we visit them.
+ */
+
+typedef struct
+{
+ BlockNumber childblkno; /* hash key */
+ BlockNumber parentblkno;
+} ParentMapEntry;
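+
+/*
+ * Minimal usage sketch (assumed, mirroring the callers in this file):
+ *
+ *		gistInitParentMap(buildstate);
+ *		gistMemorizeParent(buildstate, childblkno, parentblkno);
+ *		...
+ *		parentblkno = gistGetParent(buildstate, childblkno);
+ *
+ * gistMemorizeParent() uses HASH_ENTER and then stores the parent
+ * unconditionally, so memorizing an already-known child simply overwrites
+ * its entry in place.
+ */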
+
+static void
+gistInitParentMap(GISTBuildState *buildstate)
+{
+ HASHCTL hashCtl;
+
+ hashCtl.keysize = sizeof(BlockNumber);
+ hashCtl.entrysize = sizeof(ParentMapEntry);
+ hashCtl.hcxt = CurrentMemoryContext;
+ buildstate->parentMap = hash_create("gistbuild parent map",
+ 1024,
+ &hashCtl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+}
+
+static void
+gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
+{
+ ParentMapEntry *entry;
+ bool found;
+
+ entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
+ (const void *) &child,
+ HASH_ENTER,
+ &found);
+ entry->parentblkno = parent;
+}
+
+/*
+ * Scan all downlinks on a page, and memorize their parent.
+ */
+static void
+gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
+{
+ OffsetNumber maxoff;
+ OffsetNumber off;
+ BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
+ Page page = BufferGetPage(parentbuf);
+
+ Assert(!GistPageIsLeaf(page));
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (off = FirstOffsetNumber; off <= maxoff; off++)
+ {
+ ItemId iid = PageGetItemId(page, off);
+ IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
+ BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+ gistMemorizeParent(buildstate, childblkno, parentblkno);
+ }
+}
+
+static BlockNumber
+gistGetParent(GISTBuildState *buildstate, BlockNumber child)
+{
+ ParentMapEntry *entry;
+ bool found;
+
+ /* Find node buffer in hash table */
+ entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
+ (const void *) &child,
+ HASH_FIND,
+ &found);
+ if (!found)
+ elog(ERROR, "could not find parent of block %u in lookup table", child);
+
+ return entry->parentblkno;
+}