Diffstat (limited to 'src/backend/access/gist')
-rw-r--r--   src/backend/access/gist/Makefile                28
-rw-r--r--   src/backend/access/gist/README                 466
-rw-r--r--   src/backend/access/gist/gist.c                1709
-rw-r--r--   src/backend/access/gist/gistbuild.c           1186
-rw-r--r--   src/backend/access/gist/gistbuildbuffers.c     776
-rw-r--r--   src/backend/access/gist/gistget.c              803
-rw-r--r--   src/backend/access/gist/gistproc.c            1542
-rw-r--r--   src/backend/access/gist/gistscan.c             358
-rw-r--r--   src/backend/access/gist/gistsplit.c            779
-rw-r--r--   src/backend/access/gist/gistutil.c            1054
-rw-r--r--   src/backend/access/gist/gistvacuum.c           658
-rw-r--r--   src/backend/access/gist/gistvalidate.c         281
-rw-r--r--   src/backend/access/gist/gistxlog.c             717
13 files changed, 10357 insertions, 0 deletions
diff --git a/src/backend/access/gist/Makefile b/src/backend/access/gist/Makefile
new file mode 100644
index 0000000..1aca8bc
--- /dev/null
+++ b/src/backend/access/gist/Makefile
@@ -0,0 +1,28 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for access/gist
+#
+# IDENTIFICATION
+# src/backend/access/gist/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/gist
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ gist.o \
+ gistbuild.o \
+ gistbuildbuffers.o \
+ gistget.o \
+ gistproc.o \
+ gistscan.o \
+ gistsplit.o \
+ gistutil.o \
+ gistvacuum.o \
+ gistvalidate.o \
+ gistxlog.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
new file mode 100644
index 0000000..fffdfff
--- /dev/null
+++ b/src/backend/access/gist/README
@@ -0,0 +1,466 @@
+src/backend/access/gist/README
+
+GiST Indexing
+=============
+
+This directory contains an implementation of GiST indexing for Postgres.
+
+GiST stands for Generalized Search Tree. It was introduced in the seminal paper
+"Generalized Search Trees for Database Systems", 1995, Joseph M. Hellerstein,
+Jeffrey F. Naughton, Avi Pfeffer:
+
+ http://www.sai.msu.su/~megera/postgres/gist/papers/gist.ps
+
+and implemented by J. Hellerstein and P. Aoki in an early version of
+PostgreSQL (more details are available from The GiST Indexing Project
+at Berkeley at http://gist.cs.berkeley.edu/). As a "university"
+project it had a limited feature set and saw little use.
+
+The current implementation of GiST supports:
+
+ * Variable length keys
+ * Composite keys (multi-key)
+ * Ordered search (nearest-neighbor search)
+ * NULL-safe interface to the GiST core
+ * Concurrency
+ * Recovery support via WAL logging
+ * Buffering build algorithm
+
+The support for concurrency implemented in PostgreSQL was developed based on
+the paper "Access Methods for Next-Generation Database Systems" by
+Marcel Kornacker:
+
+ http://www.sai.msu.su/~megera/postgres/gist/papers/concurrency/access-methods-for-next-generation.pdf.gz
+
+Buffering build algorithm for GiST was developed based on the paper "Efficient
+Bulk Operations on Dynamic R-trees" by Lars Arge, Klaus Hinrichs, Jan Vahrenhold
+and Jeffrey Scott Vitter.
+
+ http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.135.9894&rep=rep1&type=pdf
+
+The original algorithms were modified in several ways:
+
+* They had to be adapted to PostgreSQL conventions. For example, the SEARCH
+  algorithm was changed considerably, because in PostgreSQL the search function
+  returns one tuple at a time rather than all tuples at once, and it must
+  release page locks between calls.
+* Since we added support for variable-length keys, it is not possible to
+  guarantee enough free space for all keys on the pages after a split. The
+  user-defined picksplit function knows nothing about the sizes of the pages
+  or of the tuples (each tuple may contain several keys, as in a multicolumn
+  index, while picksplit works with only one key).
+* We modified the original INSERT algorithm for performance reasons. In
+  particular, it is now a single-pass algorithm.
+* Since the papers were theoretical, some details were omitted and we had to
+  work out ourselves how to solve some specific problems.
+
+For these reasons, we revised the interaction between the GiST core and the
+PostgreSQL WAL system. We also encountered (and solved) a problem of
+incomplete insertions during crash recovery, which the papers do not address.
+
+Search Algorithm
+----------------
+
+The search code maintains a queue of unvisited items, where an "item" is
+either a heap tuple known to satisfy the search conditions, or an index
+page that is consistent with the search conditions according to inspection
+of its parent page's downlink item. Initially the root page is searched
+to find unvisited items in it. Then we pull items from the queue. A
+heap tuple pointer is just returned immediately; an index page entry
+causes that page to be searched, generating more queue entries.
+
+The queue is kept ordered with heap tuple items at the front, then
+index page entries, with any newly-added index page entry inserted
+before existing index page entries. This ensures depth-first traversal
+of the index, and in particular causes the first few heap tuples to be
+returned as soon as possible. That is helpful in case there is a LIMIT
+that requires only a few tuples to be produced.
+
+To implement nearest-neighbor search, the queue entries are augmented
+with distance data: heap tuple entries are labeled with exact distance
+from the search argument, while index-page entries must be labeled with
+the minimum distance that any of their children could have. Then,
+queue entries are retrieved in smallest-distance-first order, with
+entries having identical distances managed as stated in the previous
+paragraph.
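+
+The ordering rule can be sketched as a comparator (a sketch only: the item
+layout and field names below are assumptions, not the actual scan code, which
+keeps the queue in a pairing heap keyed on an array of distances):
+
+    /* Illustrative item layout and comparator; not the real data structures. */
+    typedef struct QueuedItem
+    {
+        double distance;     /* exact for heap tuples, lower bound for pages */
+        bool   isHeapTuple;  /* true = heap tuple, false = index page */
+        uint64 insertSeq;    /* insertion counter; higher = added later */
+    } QueuedItem;
+
+    /* Returns < 0 if 'a' should be visited before 'b'. */
+    static int
+    queue_item_cmp(const QueuedItem *a, const QueuedItem *b)
+    {
+        if (a->distance != b->distance)
+            return (a->distance < b->distance) ? -1 : 1;
+        if (a->isHeapTuple != b->isHeapTuple)
+            return a->isHeapTuple ? -1 : 1;             /* heap tuples first */
+        return (a->insertSeq > b->insertSeq) ? -1 : 1;  /* newer pages first */
+    }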
+
+The search algorithm keeps an index page locked only long enough to scan
+its entries and queue those that satisfy the search conditions. Since
+insertions can occur concurrently with searches, it is possible for an
+index child page to be split between the time we make a queue entry for it
+(while visiting its parent page) and the time we actually reach and scan
+the child page. To avoid missing the entries that were moved to the right
+sibling, we compare the child page's NSN to the LSN that the parent page had
+when we visited it. If the NSN is newer, a split has occurred, and the right
+sibling page is immediately added to the front of the queue, ensuring that
+its items will be scanned in the same order as if they were still on the
+original child page.
+
+As is usual in Postgres, the search algorithm only guarantees to find index
+entries that existed before the scan started; index entries added during
+the scan might or might not be visited. This is okay as long as all
+searches use MVCC snapshot rules to reject heap tuples newer than the time
+of scan start. In particular, this means that we need not worry about
+cases where a parent page's downlink key is "enlarged" after we look at it.
+Any such enlargement would be to add child items that we aren't interested
+in returning anyway.
+
+
+Insert Algorithm
+----------------
+
+INSERT guarantees that the GiST tree remains balanced. The user-defined key
+method Penalty is used to choose the subtree to insert into; method PickSplit
+drives the node-splitting algorithm; and method Union propagates changes
+upward to maintain the tree properties.
+
+To insert a tuple, we first have to find a suitable leaf page to insert to.
+The algorithm walks down the tree, starting from the root, along the path
+of smallest Penalty. At each step:
+
+1. Has this page been split since we looked at the parent? If so, it's
+possible that we should be inserting to the other half instead, so retreat
+back to the parent.
+2. If this is a leaf node, we've found our target node.
+3. Otherwise use Penalty to pick a new target subtree.
+4. Check the key representing the target subtree. If it doesn't already cover
+the key we're inserting, replace it with the Union of the old downlink key
+and the key being inserted. (Actually, we always call Union, and just skip
+the replacement if the Unioned key is the same as the existing key)
+5. Replacing the key in step 4 might cause the page to be split. In that case,
+propagate the change upwards and restart the algorithm from the first parent
+that didn't need to be split.
+6. Walk down to the target subtree, and goto 1.
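+
+The loop above can be sketched as follows (a sketch only: the helper names are
+illustrative, while the real logic lives in gistdoinsert() and also has to
+deal with locking and concurrent splits):
+
+    /* sketch of the descent, mirroring steps 1-6 above */
+    stack = root;
+    for (;;)
+    {
+        if (page_split_since_parent_was_read(stack))          /* step 1 */
+        {
+            stack = stack->parent;
+            continue;
+        }
+        if (is_leaf(stack->page))                             /* step 2 */
+            break;                                            /* target found */
+        off = choose_subtree_by_penalty(stack->page, newkey); /* step 3 */
+        adjusted = union_keys(downlink(stack->page, off), newkey);
+        if (!keys_equal(adjusted, downlink(stack->page, off)))
+            replace_downlink(stack, off, adjusted);           /* steps 4-5, may split */
+        stack = descend(stack, off);                          /* step 6 */
+    }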
+
+This differs from the insertion algorithm in the original paper. In the
+original paper, you first walk down the tree until you reach a leaf page, and
+then you adjust the downlink in the parent, and propagate the adjustment up,
+all the way up to the root in the worst case. But we adjust the downlinks to
+cover the new key already when we walk down, so that when we reach the leaf
+page, we don't need to update the parents anymore, except to insert the
+downlinks if we have to split the page. This makes crash recovery simpler:
+after inserting a key to the page, the tree is immediately self-consistent
+without having to update the parents. Even if we split a page and crash before
+inserting the downlink to the parent, the tree is self-consistent because the
+right half of the split is accessible via the rightlink of the left page
+(which replaced the original page).
+
+Note that the algorithm can walk up and down the tree before reaching a leaf
+page, if internal pages need to split while adjusting the downlinks for the
+new key. Eventually, you should reach the bottom, and proceed with the
+insertion of the new tuple.
+
+Once we've found the target page to insert to, we check if there's room
+for the new tuple. If there is, the tuple is inserted, and we're done.
+If it doesn't fit, however, the page needs to be split. Note that it is
+possible that a page needs to be split into more than two pages, if keys have
+different lengths or more than one key is being inserted at a time (which can
+happen when inserting downlinks for a page split that resulted in more than
+two pages at the lower level). After splitting a page, the parent page needs
+to be updated. The downlink for the new page needs to be inserted, and the
+downlink for the old page, which became the left half of the split, needs to
+be updated to only cover those tuples that stayed on the left page. Inserting
+the downlink in the parent can again lead to a page split, recursing up to the
+root page in the worst case.
+
+gistplacetopage is the workhorse function that performs one step of the
+insertion. If the tuple fits, it inserts it to the given page, otherwise
+it splits the page, and constructs the new downlink tuples for the split
+pages. The caller must then call gistplacetopage() on the parent page to
+insert the downlink tuples. The parent page that holds the downlink to
+the child might have migrated as a result of concurrent splits of the
+parent; gistFindCorrectParent() is used to re-find the parent page.
+
+Splitting the root page works slightly differently. At root split,
+gistplacetopage() allocates the new child pages and replaces the old root
+page with the new root containing downlinks to the new children, all in one
+operation.
+
+
+findPath is a subroutine of findParent, used when the correct parent page
+can't be found by following the rightlinks at the parent level:
+
+findPath( stack item )
+ push stack, [root, 0, 0] // page, LSN, parent
+ while( stack )
+ ptr = top of stack
+ latch( ptr->page, S-mode )
+ if ( ptr->parent->page->lsn < ptr->page->nsn )
+ push stack, [ ptr->page->rightlink, 0, ptr->parent ]
+ end
+ for( each tuple on page )
+ if ( tuple->pagepointer == item->page )
+ return stack
+ else
+ add to stack at the end [tuple->pagepointer,0, ptr]
+ end
+ end
+ unlatch( ptr->page )
+ pop stack
+ end
+
+
+gistFindCorrectParent is used to re-find the parent of a page during
+insertion. The parent might have migrated to the right since we traversed down
+the tree, because of page splits.
+
+findParent( stack item )
+ parent = item->parent
+ if ( parent->page->lsn != parent->lsn )
+ while(true)
+ search for the downlink tuple on parent->page; if found then return
+ rightlink = parent->page->rightlink
+ unlatch( parent->page )
+ if ( rightlink is incorrect )
+ break loop
+ end
+ parent->page = rightlink
+ latch( parent->page, X-mode )
+ end
+ newstack = findPath( item->parent )
+ replace part of stack to new one
+ latch( parent->page, X-mode )
+ return findParent( item )
+ end
+
+The pageSplit function decides how to distribute keys to the new pages after a
+page split:
+
+pageSplit(page, allkeys)
+ (lkeys, rkeys) = pickSplit( allkeys )
+ if ( page is root )
+ lpage = new page
+ else
+ lpage = page
+ rpage = new page
+ if ( no space left on rpage )
+ push newkeys, pageSplit( rpage, rkeys )
+ else
+ push newkeys, union(rkeys)
+ end
+ if ( no space left on lpage )
+ push newkeys, pageSplit( lpage, lkeys )
+ else
+ push newkeys, union(lkeys)
+ end
+ return newkeys
+
+
+
+Concurrency control
+-------------------
+As a rule of thumb, if you need to hold a lock on multiple pages at the
+same time, the locks should be acquired in the following order: child page
+before parent, and left-to-right at the same level. Always acquiring the
+locks in the same order avoids deadlocks.
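+
+For example, when a split installs a downlink, the child buffer is locked
+before its parent (a sketch; the buffer variables are illustrative, while
+LockBuffer() and GIST_EXCLUSIVE are the primitives the code actually uses):
+
+    LockBuffer(childbuf, GIST_EXCLUSIVE);   /* child page first */
+    LockBuffer(parentbuf, GIST_EXCLUSIVE);  /* then its parent */
+    /* never in the opposite order, or two backends could deadlock */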
+
+The search algorithm only looks at and locks one page at a time. Consequently
+there's a race condition between a search and a page split. A page split
+happens in two phases: (1) the page is split; (2) the downlink is inserted into
+the parent. If a search looks at the parent page between those steps, before the
+downlink is inserted, it will still find the new right half by following the
+rightlink on the left half. But it must not follow the rightlink if it saw the
+downlink in the parent, or the page will be visited twice!
+
+A split initially marks the left page with the F_FOLLOW_RIGHT flag. If a scan
+sees that flag set, it knows that the right page is missing the downlink, and
+should be visited too. When the split inserts the downlink into the parent, it
+clears the F_FOLLOW_RIGHT flag in the child, and sets the NSN field in the
+child page header to match the LSN of the insertion on the parent. If the
+F_FOLLOW_RIGHT flag is not set, a scan compares the NSN on the child and the
+LSN it saw in the parent. If NSN < LSN, the scan looked at the parent page
+before the downlink was inserted, so it should follow the rightlink. Otherwise
+the scan saw the downlink in the parent page, and will/did follow that as
+usual.
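+
+In code form, the check a scan performs when it arrives at a child page looks
+roughly like this (a sketch: GistFollowRight and GistPageGetNSN are the real
+page macros, parentlsn is the LSN remembered from the parent, and the queueing
+helper is illustrative):
+
+    if (GistFollowRight(page) || parentlsn < GistPageGetNSN(page))
+    {
+        /*
+         * The page was split after we read the parent's downlink; also
+         * queue the right sibling so that moved tuples are not missed.
+         */
+        queue_page(GistPageGetOpaque(page)->rightlink);  /* illustrative */
+    }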
+
+A scan can't normally see a page with the F_FOLLOW_RIGHT flag set, because
+a page split keeps the child pages locked until the downlink has been inserted
+to the parent and the flag cleared again. But if a crash happens in the middle
+of a page split, before the downlinks are inserted into the parent, that will
+leave a page with F_FOLLOW_RIGHT in the tree. Scans handle that just fine,
+but we'll eventually want to fix that for performance reasons. And more
+importantly, dealing with pages with missing downlink pointers in the parent
+would complicate the insertion algorithm. So when an insertion sees a page
+with F_FOLLOW_RIGHT set, it immediately tries to bring the split that
+crashed in the middle to completion by adding the downlink in the parent.
+
+Buffering build algorithm
+-------------------------
+
+In the buffering index build algorithm, some or all internal nodes have a
+buffer attached to them. When a tuple is inserted at the top, the descent down
+the tree stops as soon as a buffer is reached, and the tuple is pushed into
+the buffer. When a buffer gets too full, all the tuples in it are flushed to
+the lower level, where they again hit lower level buffers or leaf pages. This
+makes the insertions happen in more of a breadth-first than depth-first order,
+which greatly reduces the amount of random I/O required.
+
+In the algorithm, levels are numbered so that leaf pages have level zero,
+and internal node levels count up from 1. This numbering ensures that a page's
+level number never changes, even when the root page is split.
+
+Level Tree
+
+3 *
+ / \
+2 * *
+ / | \ / | \
+1 * * * * * *
+ / \ / \ / \ / \ / \ / \
+0 o o o o o o o o o o o o
+
+* - internal page
+o - leaf page
+
+Internal pages that belong to certain levels have buffers associated with
+them. Leaf pages never have buffers. Which levels have buffers is controlled
+by "level step" parameter: level numbers that are multiples of level_step
+have buffers, while others do not. For example, if level_step = 2, then
+pages on levels 2, 4, 6, ... have buffers. If level_step = 1 then every
+internal page has a buffer.
+
+Level Tree (level_step = 1) Tree (level_step = 2)
+
+3 * *
+ / \ / \
+2 *(b) *(b) *(b) *(b)
+ / | \ / | \ / | \ / | \
+1 *(b) *(b) *(b) *(b) *(b) *(b) * * * * * *
+ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \
+0 o o o o o o o o o o o o o o o o o o o o o o o o
+
+(b) - buffer
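+
+A minimal sketch of the rule stated above (the function is illustrative, not
+the actual build code):
+
+    /* Does a page at 'level' get a buffer? Level 0 = leaf, which never does. */
+    static bool
+    level_has_buffers(int level, int level_step)
+    {
+        return level > 0 && level % level_step == 0;
+    }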
+
+Logically, a buffer is just a bunch of tuples. Physically, it is divided into
+pages, backed by a temporary file. Each buffer can be in one of two states:
+a) The last page of the buffer is kept in main memory. A node buffer is
+automatically switched to this state when a new index tuple is added to it,
+or a tuple is removed from it.
+b) All pages of the buffer are swapped out to disk. When a buffer becomes too
+full and we start to flush it, all other buffers are switched to this state.
+
+When an index tuple is inserted, its initial processing can end at one of the
+following points:
+1) A leaf page, if the depth of the index is <= level_step, meaning that
+   none of the internal pages have buffers associated with them.
+2) The buffer of the topmost level that has buffers.
+
+New index tuples are processed until one of the buffers in the topmost
+buffered level becomes half-full. When a buffer becomes half-full, it's added
+to the emptying queue, and will be emptied before a new tuple is processed.
+
+Emptying a buffer means that its index tuples are moved into buffers at a
+lower level, or into leaf pages. First, all the other buffers are swapped to
+disk to free up memory. Then tuples are popped from the buffer one by one and
+cascaded down the tree to the next buffer or leaf page below the buffered
+node.
+
+Emptying a buffer has the interesting dynamic property that any intermediate
+pages between the buffer being emptied and the next buffered or leaf level
+below it become cached. If there are no more buffers below the node, the leaf
+pages where the tuples finally land get cached too. If there are, the last
+buffer page of each buffer below is kept in memory. This is illustrated in
+the figures below:
+
+ Buffer being emptied to
+ lower-level buffers Buffer being emptied to leaf pages
+
+ +(fb) +(fb)
+ / \ / \
+ + + + +
+ / \ / \ / \ / \
+ *(ab) *(ab) *(ab) *(ab) x x x x
+
++ - cached internal page
+x - cached leaf page
+* - non-cached internal page
+(fb) - buffer being emptied
+(ab) - buffers being appended to, with last page in memory
+
+At the beginning of the index build, level_step is chosen so that all the
+pages involved in emptying one buffer fit in cache; after each of those
+pages has been accessed once and cached, emptying a buffer doesn't involve
+any more I/O. This locality is where the speedup of the buffering algorithm
+comes from.
+
+Emptying one buffer can fill up one or more of the lower-level buffers,
+triggering emptying of them as well. Whenever a buffer becomes too full, it's
+added to the emptying queue, and will be emptied after the current buffer has
+been processed.
+
+To keep the size of each buffer limited even in the worst case, buffer emptying
+is scheduled as soon as a buffer becomes half-full, and emptying it continues
+until 1/2 of the nominal buffer size worth of tuples has been emptied. This
+guarantees that when buffer emptying begins, all the lower-level buffers
+are at most half-full. Even in the worst case, where all the tuples are
+cascaded down to the same lower-level buffer, that buffer therefore has enough
+space to accommodate all the tuples emptied from the upper-level buffer. There is no
+hard size limit in any of the data structures used, though, so this only needs
+to be approximate; small overfilling of some buffers doesn't matter.
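+
+In other words, if a buffer nominally holds B tuples, emptying is scheduled
+once it holds B/2 tuples and moves roughly B/2 tuples downwards. Since every
+lower-level buffer held at most B/2 tuples when the emptying began, it ends up
+with at most about B/2 + B/2 = B tuples even if all the moved tuples land in
+it.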
+
+If an internal page that has a buffer associated with it is split, the buffer
+needs to be split too. All tuples in the buffer are scanned through and
+relocated to the correct sibling buffers, using the penalty function to decide
+which buffer each tuple should go to.
+
+After all tuples from the heap have been processed, there are still some index
+tuples in the buffers. At this point, final buffer emptying starts. All buffers
+are emptied in top-down order. This is slightly complicated by the fact that
+new buffers can be allocated during the emptying, due to page splits. However,
+the new buffers will always be siblings of buffers that haven't been fully
+emptied yet; tuples never move upwards in the tree. The final emptying loops
+through buffers at a given level until all buffers at that level have been
+emptied, and then moves down to the next level.
+
+Bulk delete algorithm (VACUUM)
+------------------------------
+
+VACUUM works in two stages:
+
+In the first stage, we scan the whole index in physical order. To make sure
+that we don't miss any dead tuples because a concurrent page split moved them,
+we check the F_FOLLOW_RIGHT flags and NSN on each page, to detect if the
+page has been concurrently split. If a concurrent page split is detected, and
+one half of the page was moved to a position that we already scanned, we
+"jump backwards" to scan the page again. This is the same mechanism that
+B-tree VACUUM uses, but because pages already carry NSNs to detect page
+splits during searches, we don't need a "vacuum cycle ID" concept for that
+like B-tree does.
+
+While we scan all the pages, we also make note of any completely empty leaf
+pages. We will try to unlink them from the tree after the scan. We also record
+the block numbers of all internal pages; they are needed to locate parents of
+the empty pages while unlinking them.
+
+We try to unlink any empty leaf pages from the tree, so that their space can
+be reused. In order to delete an empty page, its downlink must be removed from
+the parent. We scan all the internal pages, whose block numbers we memorized
+in the first stage, and look for downlinks to pages that we have memorized as
+being empty. Whenever we find one, we acquire locks on the parent and child
+pages and re-check that the child page is still empty. Then we remove the
+downlink, mark the child as deleted, and release the locks.
+
+The insertion algorithm would get confused if an internal page were completely
+empty, so we never delete the last child of an internal page, even if it's
+empty. Currently, only leaf pages can be deleted.
+
+This page deletion algorithm works on a best-effort basis. It might fail to
+find a downlink if a concurrent page split moved it after the first stage.
+In that case we won't be able to remove all empty pages. That's OK; it's
+not expected to happen very often, and hopefully the next VACUUM will clean
+it up.
+
+When we have deleted a page, it's possible that an in-progress search will
+still descend to the page if it saw the downlink before we removed it. The
+search will see that the page is deleted and ignore it, but as long as that
+can happen, we cannot reuse the page. To "wait out" any in-progress searches,
+a deleted page is labeled with the current next-transaction counter value.
+The page is not recycled until that XID is no longer visible to
+anyone. That's much more conservative than necessary, but let's keep it
+simple.
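+
+A sketch of that rule (all names here are illustrative, not the actual
+functions or fields):
+
+    /* at deletion time: stamp the page with the next transaction id */
+    page->delete_xid = next_transaction_id();
+
+    /* before reuse: recycle only when no snapshot can still see that XID */
+    if (xid_older_than_all_active_snapshots(page->delete_xid))
+        recycle_page(page);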
+
+
+Authors:
+ Teodor Sigaev <teodor@sigaev.ru>
+ Oleg Bartunov <oleg@sai.msu.su>
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
new file mode 100644
index 0000000..1ea38d9
--- /dev/null
+++ b/src/backend/access/gist/gist.c
@@ -0,0 +1,1709 @@
+/*-------------------------------------------------------------------------
+ *
+ * gist.c
+ * interface routines for the postgres GiST index access method.
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gist.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/gist_private.h"
+#include "access/gistscan.h"
+#include "catalog/pg_collation.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "nodes/execnodes.h"
+#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "utils/builtins.h"
+#include "utils/index_selfuncs.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+/* non-export function prototypes */
+static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate);
+static bool gistinserttuple(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum);
+static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate,
+ IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
+ Buffer leftchild, Buffer rightchild,
+ bool unlockbuf, bool unlockleftchild);
+static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate, List *splitinfo, bool unlockbuf);
+static void gistprunepage(Relation rel, Page page, Buffer buffer,
+ Relation heapRel);
+
+
+#define ROTATEDIST(d) do { \
+ SplitedPageLayout *tmp=(SplitedPageLayout*)palloc0(sizeof(SplitedPageLayout)); \
+ tmp->block.blkno = InvalidBlockNumber; \
+ tmp->buffer = InvalidBuffer; \
+ tmp->next = (d); \
+ (d)=tmp; \
+} while(0)
+
+
+/*
+ * GiST handler function: return IndexAmRoutine with access method parameters
+ * and callbacks.
+ */
+Datum
+gisthandler(PG_FUNCTION_ARGS)
+{
+ IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
+
+ amroutine->amstrategies = 0;
+ amroutine->amsupport = GISTNProcs;
+ amroutine->amoptsprocnum = GIST_OPTIONS_PROC;
+ amroutine->amcanorder = false;
+ amroutine->amcanorderbyop = true;
+ amroutine->amcanbackward = false;
+ amroutine->amcanunique = false;
+ amroutine->amcanmulticol = true;
+ amroutine->amoptionalkey = true;
+ amroutine->amsearcharray = false;
+ amroutine->amsearchnulls = true;
+ amroutine->amstorage = true;
+ amroutine->amclusterable = true;
+ amroutine->ampredlocks = true;
+ amroutine->amcanparallel = false;
+ amroutine->amcaninclude = true;
+ amroutine->amusemaintenanceworkmem = false;
+ amroutine->amparallelvacuumoptions =
+ VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
+ amroutine->amkeytype = InvalidOid;
+
+ amroutine->ambuild = gistbuild;
+ amroutine->ambuildempty = gistbuildempty;
+ amroutine->aminsert = gistinsert;
+ amroutine->ambulkdelete = gistbulkdelete;
+ amroutine->amvacuumcleanup = gistvacuumcleanup;
+ amroutine->amcanreturn = gistcanreturn;
+ amroutine->amcostestimate = gistcostestimate;
+ amroutine->amoptions = gistoptions;
+ amroutine->amproperty = gistproperty;
+ amroutine->ambuildphasename = NULL;
+ amroutine->amvalidate = gistvalidate;
+ amroutine->ambeginscan = gistbeginscan;
+ amroutine->amrescan = gistrescan;
+ amroutine->amgettuple = gistgettuple;
+ amroutine->amgetbitmap = gistgetbitmap;
+ amroutine->amendscan = gistendscan;
+ amroutine->ammarkpos = NULL;
+ amroutine->amrestrpos = NULL;
+ amroutine->amestimateparallelscan = NULL;
+ amroutine->aminitparallelscan = NULL;
+ amroutine->amparallelrescan = NULL;
+
+ PG_RETURN_POINTER(amroutine);
+}
+
+/*
+ * Create and return a temporary memory context for use by GiST. We
+ * _always_ invoke user-provided methods in a temporary memory
+ * context, so that memory leaks in those functions cannot cause
+ * problems. Also, we use some additional temporary contexts in the
+ * GiST code itself, to avoid the need to do some awkward manual
+ * memory management.
+ */
+MemoryContext
+createTempGistContext(void)
+{
+ return AllocSetContextCreate(CurrentMemoryContext,
+ "GiST temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+}
+
+/*
+ * gistbuildempty() -- build an empty gist index in the initialization fork
+ */
+void
+gistbuildempty(Relation index)
+{
+ Buffer buffer;
+
+ /* Initialize the root page */
+ buffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /* Initialize and xlog buffer */
+ START_CRIT_SECTION();
+ GISTInitBuffer(buffer, F_LEAF);
+ MarkBufferDirty(buffer);
+ log_newpage_buffer(buffer, true);
+ END_CRIT_SECTION();
+
+ /* Unlock and release the buffer */
+ UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * gistinsert -- wrapper for GiST tuple insertion.
+ *
+ * This is the public interface routine for tuple insertion in GiSTs.
+ * It doesn't do any work; just locks the relation and passes the buck.
+ */
+bool
+gistinsert(Relation r, Datum *values, bool *isnull,
+ ItemPointer ht_ctid, Relation heapRel,
+ IndexUniqueCheck checkUnique,
+ IndexInfo *indexInfo)
+{
+ GISTSTATE *giststate = (GISTSTATE *) indexInfo->ii_AmCache;
+ IndexTuple itup;
+ MemoryContext oldCxt;
+
+ /* Initialize GISTSTATE cache if first call in this statement */
+ if (giststate == NULL)
+ {
+ oldCxt = MemoryContextSwitchTo(indexInfo->ii_Context);
+ giststate = initGISTstate(r);
+ giststate->tempCxt = createTempGistContext();
+ indexInfo->ii_AmCache = (void *) giststate;
+ MemoryContextSwitchTo(oldCxt);
+ }
+
+ oldCxt = MemoryContextSwitchTo(giststate->tempCxt);
+
+ itup = gistFormTuple(giststate, r,
+ values, isnull, true /* size is currently bogus */ );
+ itup->t_tid = *ht_ctid;
+
+ gistdoinsert(r, itup, 0, giststate, heapRel, false);
+
+ /* cleanup */
+ MemoryContextSwitchTo(oldCxt);
+ MemoryContextReset(giststate->tempCxt);
+
+ return false;
+}
+
+
+/*
+ * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple
+ * at that offset is atomically removed along with inserting the new tuples.
+ * This is used to replace a tuple with a new one.
+ *
+ * If 'leftchildbuf' is valid, we're inserting the downlink for the page
+ * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'.
+ * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set.
+ *
+ * If 'markfollowright' is true and the page is split, the left child is
+ * marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered
+ * index build, however, there is no concurrent access and the page splitting
+ * is done in a slightly simpler fashion, and false is passed.
+ *
+ * If there is not enough room on the page, it is split. All the split
+ * pages are kept pinned and locked and returned in *splitinfo, the caller
+ * is responsible for inserting the downlinks for them. However, if
+ * 'buffer' is the root page and it needs to be split, gistplacetopage()
+ * performs the split as one atomic operation, and *splitinfo is set to NIL.
+ * In that case, we continue to hold the root page locked, and the child
+ * pages are released; note that new tuple(s) are *not* on the root page
+ * but in one of the new child pages.
+ *
+ * If 'newblkno' is not NULL, returns the block number of page the first
+ * new/updated tuple was inserted to. Usually it's the given page, but could
+ * be its right sibling if the page was split.
+ *
+ * Returns 'true' if the page was split, 'false' otherwise.
+ */
+bool
+gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
+ Buffer buffer,
+ IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
+ BlockNumber *newblkno,
+ Buffer leftchildbuf,
+ List **splitinfo,
+ bool markfollowright,
+ Relation heapRel,
+ bool is_build)
+{
+ BlockNumber blkno = BufferGetBlockNumber(buffer);
+ Page page = BufferGetPage(buffer);
+ bool is_leaf = (GistPageIsLeaf(page)) ? true : false;
+ XLogRecPtr recptr;
+ int i;
+ bool is_split;
+
+ /*
+ * Refuse to modify a page that's incompletely split. This should not
+ * happen because we finish any incomplete splits while we walk down the
+ * tree. However, it's remotely possible that another concurrent inserter
+ * splits a parent page, and errors out before completing the split. We
+ * will just throw an error in that case, and leave any split we had in
+ * progress unfinished too. The next insert that comes along will clean up
+ * the mess.
+ */
+ if (GistFollowRight(page))
+ elog(ERROR, "concurrent GiST page split was incomplete");
+
+ /* should never try to insert to a deleted page */
+ Assert(!GistPageIsDeleted(page));
+
+ *splitinfo = NIL;
+
+ /*
+ * if isupdate, remove old key: This node's key has been modified, either
+ * because a child split occurred or because we needed to adjust our key
+ * for an insert in a child node. Therefore, remove the old version of
+ * this node's key.
+ *
+ * for WAL replay, in the non-split case we handle this by setting up a
+ * one-element todelete array; in the split case, it's handled implicitly
+ * because the tuple vector passed to gistSplit won't include this tuple.
+ */
+ is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
+
+ /*
+ * If leaf page is full, try at first to delete dead tuples. And then
+ * check again.
+ */
+ if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
+ {
+ gistprunepage(rel, page, buffer, heapRel);
+ is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
+ }
+
+ if (is_split)
+ {
+ /* no space for insertion */
+ IndexTuple *itvec;
+ int tlen;
+ SplitedPageLayout *dist = NULL,
+ *ptr;
+ BlockNumber oldrlink = InvalidBlockNumber;
+ GistNSN oldnsn = 0;
+ SplitedPageLayout rootpg;
+ bool is_rootsplit;
+ int npage;
+
+ is_rootsplit = (blkno == GIST_ROOT_BLKNO);
+
+ /*
+ * Form index tuples vector to split. If we're replacing an old tuple,
+ * remove the old version from the vector.
+ */
+ itvec = gistextractpage(page, &tlen);
+ if (OffsetNumberIsValid(oldoffnum))
+ {
+ /* on inner page we should remove old tuple */
+ int pos = oldoffnum - FirstOffsetNumber;
+
+ tlen--;
+ if (pos != tlen)
+ memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos));
+ }
+ itvec = gistjoinvector(itvec, &tlen, itup, ntup);
+ dist = gistSplit(rel, page, itvec, tlen, giststate);
+
+ /*
+ * Check that split didn't produce too many pages.
+ */
+ npage = 0;
+ for (ptr = dist; ptr; ptr = ptr->next)
+ npage++;
+ /* in a root split, we'll add one more page to the list below */
+ if (is_rootsplit)
+ npage++;
+ if (npage > GIST_MAX_SPLIT_PAGES)
+ elog(ERROR, "GiST page split into too many halves (%d, maximum %d)",
+ npage, GIST_MAX_SPLIT_PAGES);
+
+ /*
+ * Set up pages to work with. Allocate new buffers for all but the
+ * leftmost page. The original page becomes the new leftmost page, and
+ * is just replaced with the new contents.
+ *
+ * For a root-split, allocate new buffers for all child pages, the
+ * original page is overwritten with new root page containing
+ * downlinks to the new child pages.
+ */
+ ptr = dist;
+ if (!is_rootsplit)
+ {
+ /* save old rightlink and NSN */
+ oldrlink = GistPageGetOpaque(page)->rightlink;
+ oldnsn = GistPageGetNSN(page);
+
+ dist->buffer = buffer;
+ dist->block.blkno = BufferGetBlockNumber(buffer);
+ dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer));
+
+ /* clean all flags except F_LEAF */
+ GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0;
+
+ ptr = ptr->next;
+ }
+ for (; ptr; ptr = ptr->next)
+ {
+ /* Allocate new page */
+ ptr->buffer = gistNewBuffer(rel);
+ GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
+ ptr->page = BufferGetPage(ptr->buffer);
+ ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
+ PredicateLockPageSplit(rel,
+ BufferGetBlockNumber(buffer),
+ BufferGetBlockNumber(ptr->buffer));
+ }
+
+ /*
+ * Now that we know which blocks the new pages go to, set up downlink
+ * tuples to point to them.
+ */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno);
+ GistTupleSetValid(ptr->itup);
+ }
+
+ /*
+ * If this is a root split, we construct the new root page with the
+ * downlinks here directly, instead of requiring the caller to insert
+ * them. Add the new root page to the list along with the child pages.
+ */
+ if (is_rootsplit)
+ {
+ IndexTuple *downlinks;
+ int ndownlinks = 0;
+ int i;
+
+ rootpg.buffer = buffer;
+ rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer));
+ GistPageGetOpaque(rootpg.page)->flags = 0;
+
+ /* Prepare a vector of all the downlinks */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ ndownlinks++;
+ downlinks = palloc(sizeof(IndexTuple) * ndownlinks);
+ for (i = 0, ptr = dist; ptr; ptr = ptr->next)
+ downlinks[i++] = ptr->itup;
+
+ rootpg.block.blkno = GIST_ROOT_BLKNO;
+ rootpg.block.num = ndownlinks;
+ rootpg.list = gistfillitupvec(downlinks, ndownlinks,
+ &(rootpg.lenlist));
+ rootpg.itup = NULL;
+
+ rootpg.next = dist;
+ dist = &rootpg;
+ }
+ else
+ {
+ /* Prepare split-info to be returned to caller */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo));
+
+ si->buf = ptr->buffer;
+ si->downlink = ptr->itup;
+ *splitinfo = lappend(*splitinfo, si);
+ }
+ }
+
+ /*
+ * Fill all pages. All the pages are new, ie. freshly allocated empty
+ * pages, or a temporary copy of the old page.
+ */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ char *data = (char *) (ptr->list);
+
+ for (i = 0; i < ptr->block.num; i++)
+ {
+ IndexTuple thistup = (IndexTuple) data;
+
+ if (PageAddItem(ptr->page, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel));
+
+ /*
+ * If this is the first inserted/updated tuple, let the caller
+ * know which page it landed on.
+ */
+ if (newblkno && ItemPointerEquals(&thistup->t_tid, &(*itup)->t_tid))
+ *newblkno = ptr->block.blkno;
+
+ data += IndexTupleSize(thistup);
+ }
+
+ /* Set up rightlinks */
+ if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO)
+ GistPageGetOpaque(ptr->page)->rightlink =
+ ptr->next->block.blkno;
+ else
+ GistPageGetOpaque(ptr->page)->rightlink = oldrlink;
+
+ /*
+ * Mark all but the right-most page with the follow-right
+ * flag. It will be cleared as soon as the downlink is inserted
+ * into the parent, but this ensures that if we error out before
+ * that, the index is still consistent. (in buffering build mode,
+ * any error will abort the index build anyway, so this is not
+ * needed.)
+ */
+ if (ptr->next && !is_rootsplit && markfollowright)
+ GistMarkFollowRight(ptr->page);
+ else
+ GistClearFollowRight(ptr->page);
+
+ /*
+ * Copy the NSN of the original page to all pages. The
+ * F_FOLLOW_RIGHT flags ensure that scans will follow the
+ * rightlinks until the downlinks are inserted.
+ */
+ GistPageSetNSN(ptr->page, oldnsn);
+ }
+
+ /*
+ * gistXLogSplit() needs to WAL log a lot of pages, prepare WAL
+ * insertion for that. NB: The number of pages and data segments
+ * specified here must match the calculations in gistXLogSplit()!
+ */
+ if (!is_build && RelationNeedsWAL(rel))
+ XLogEnsureRecordSpace(npage, 1 + npage * 2);
+
+ START_CRIT_SECTION();
+
+ /*
+ * Must mark buffers dirty before XLogInsert, even though we'll still
+ * be changing their opaque fields below.
+ */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ MarkBufferDirty(ptr->buffer);
+ if (BufferIsValid(leftchildbuf))
+ MarkBufferDirty(leftchildbuf);
+
+ /*
+ * The first page in the chain was a temporary working copy meant to
+ * replace the old page. Copy it over the old page.
+ */
+ PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer));
+ dist->page = BufferGetPage(dist->buffer);
+
+ /*
+ * Write the WAL record.
+ *
+ * If we're building a new index, however, we don't WAL-log changes
+ * yet. The LSN-NSN interlock between parent and child requires that
+ * LSNs never move backwards, so set the LSNs to a value that's
+ * smaller than any real or fake unlogged LSN that might be generated
+ * later. (There can't be any concurrent scans during index build, so
+ * we don't need to be able to detect concurrent splits yet.)
+ */
+ if (is_build)
+ recptr = GistBuildLSN;
+ else
+ {
+ if (RelationNeedsWAL(rel))
+ recptr = gistXLogSplit(is_leaf,
+ dist, oldrlink, oldnsn, leftchildbuf,
+ markfollowright);
+ else
+ recptr = gistGetFakeLSN(rel);
+ }
+
+ for (ptr = dist; ptr; ptr = ptr->next)
+ PageSetLSN(ptr->page, recptr);
+
+ /*
+ * Return the new child buffers to the caller.
+ *
+ * If this was a root split, we've already inserted the downlink
+ * pointers, in the form of a new root page. Therefore we can release
+ * all the new buffers, and keep just the root page locked.
+ */
+ if (is_rootsplit)
+ {
+ for (ptr = dist->next; ptr; ptr = ptr->next)
+ UnlockReleaseBuffer(ptr->buffer);
+ }
+ }
+ else
+ {
+ /*
+ * Enough space. We always get here if ntup==0.
+ */
+ START_CRIT_SECTION();
+
+ /*
+ * Delete old tuple if any, then insert new tuple(s) if any. If
+ * possible, use the fast path of PageIndexTupleOverwrite.
+ */
+ if (OffsetNumberIsValid(oldoffnum))
+ {
+ if (ntup == 1)
+ {
+ /* One-for-one replacement, so use PageIndexTupleOverwrite */
+ if (!PageIndexTupleOverwrite(page, oldoffnum, (Item) *itup,
+ IndexTupleSize(*itup)))
+ elog(ERROR, "failed to add item to index page in \"%s\"",
+ RelationGetRelationName(rel));
+ }
+ else
+ {
+ /* Delete old, then append new tuple(s) to page */
+ PageIndexTupleDelete(page, oldoffnum);
+ gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
+ }
+ }
+ else
+ {
+ /* Just append new tuples at the end of the page */
+ gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
+ }
+
+ MarkBufferDirty(buffer);
+
+ if (BufferIsValid(leftchildbuf))
+ MarkBufferDirty(leftchildbuf);
+
+ if (is_build)
+ recptr = GistBuildLSN;
+ else
+ {
+ if (RelationNeedsWAL(rel))
+ {
+ OffsetNumber ndeloffs = 0,
+ deloffs[1];
+
+ if (OffsetNumberIsValid(oldoffnum))
+ {
+ deloffs[0] = oldoffnum;
+ ndeloffs = 1;
+ }
+
+ recptr = gistXLogUpdate(buffer,
+ deloffs, ndeloffs, itup, ntup,
+ leftchildbuf);
+ }
+ else
+ recptr = gistGetFakeLSN(rel);
+ }
+ PageSetLSN(page, recptr);
+
+ if (newblkno)
+ *newblkno = blkno;
+ }
+
+ /*
+ * If we inserted the downlink for a child page, set NSN and clear
+ * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to
+ * follow the rightlink if and only if they looked at the parent page
+ * before we inserted the downlink.
+ *
+ * Note that we do this *after* writing the WAL record. That means that
+ * the possible full page image in the WAL record does not include these
+ * changes, and they must be replayed even if the page is restored from
+ * the full page image. There's a chicken-and-egg problem: if we updated
+ * the child pages first, we wouldn't know the recptr of the WAL record
+ * we're about to write.
+ */
+ if (BufferIsValid(leftchildbuf))
+ {
+ Page leftpg = BufferGetPage(leftchildbuf);
+
+ GistPageSetNSN(leftpg, recptr);
+ GistClearFollowRight(leftpg);
+
+ PageSetLSN(leftpg, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ return is_split;
+}
+
+/*
+ * Workhorse routine for doing insertion into a GiST index. Note that
+ * this routine assumes it is invoked in a short-lived memory context,
+ * so it does not bother releasing palloc'd allocations.
+ */
+void
+gistdoinsert(Relation r, IndexTuple itup, Size freespace,
+ GISTSTATE *giststate, Relation heapRel, bool is_build)
+{
+ ItemId iid;
+ IndexTuple idxtuple;
+ GISTInsertStack firststack;
+ GISTInsertStack *stack;
+ GISTInsertState state;
+ bool xlocked = false;
+
+ memset(&state, 0, sizeof(GISTInsertState));
+ state.freespace = freespace;
+ state.r = r;
+ state.heapRel = heapRel;
+ state.is_build = is_build;
+
+ /* Start from the root */
+ firststack.blkno = GIST_ROOT_BLKNO;
+ firststack.lsn = 0;
+ firststack.retry_from_parent = false;
+ firststack.parent = NULL;
+ firststack.downlinkoffnum = InvalidOffsetNumber;
+ state.stack = stack = &firststack;
+
+ /*
+ * Walk down along the path of smallest penalty, updating the parent
+ * pointers with the key we're inserting as we go. If we crash in the
+ * middle, the tree is consistent, although the possible parent updates
+ * were a waste.
+ */
+ for (;;)
+ {
+ /*
+ * If we split an internal page while descending the tree, we have to
+ * retry at the parent. (Normally, the LSN-NSN interlock below would
+ * also catch this and cause us to retry. But LSNs are not updated
+ * during index build.)
+ */
+ while (stack->retry_from_parent)
+ {
+ if (xlocked)
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ xlocked = false;
+ ReleaseBuffer(stack->buffer);
+ state.stack = stack = stack->parent;
+ }
+
+ if (XLogRecPtrIsInvalid(stack->lsn))
+ stack->buffer = ReadBuffer(state.r, stack->blkno);
+
+ /*
+ * Be optimistic and grab shared lock first. Swap it for an exclusive
+ * lock later if we need to update the page.
+ */
+ if (!xlocked)
+ {
+ LockBuffer(stack->buffer, GIST_SHARE);
+ gistcheckpage(state.r, stack->buffer);
+ }
+
+ stack->page = (Page) BufferGetPage(stack->buffer);
+ stack->lsn = xlocked ?
+ PageGetLSN(stack->page) : BufferGetLSNAtomic(stack->buffer);
+ Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn));
+
+ /*
+ * If this page was split but the downlink was never inserted to the
+ * parent because the inserting backend crashed before doing that, fix
+ * that now.
+ */
+ if (GistFollowRight(stack->page))
+ {
+ if (!xlocked)
+ {
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ LockBuffer(stack->buffer, GIST_EXCLUSIVE);
+ xlocked = true;
+ /* someone might've completed the split when we unlocked */
+ if (!GistFollowRight(stack->page))
+ continue;
+ }
+ gistfixsplit(&state, giststate);
+
+ UnlockReleaseBuffer(stack->buffer);
+ xlocked = false;
+ state.stack = stack = stack->parent;
+ continue;
+ }
+
+ if ((stack->blkno != GIST_ROOT_BLKNO &&
+ stack->parent->lsn < GistPageGetNSN(stack->page)) ||
+ GistPageIsDeleted(stack->page))
+ {
+ /*
+ * Concurrent split or page deletion detected. There's no
+ * guarantee that the downlink for this page is consistent with
+ * the tuple we're inserting anymore, so go back to parent and
+ * rechoose the best child.
+ */
+ UnlockReleaseBuffer(stack->buffer);
+ xlocked = false;
+ state.stack = stack = stack->parent;
+ continue;
+ }
+
+ if (!GistPageIsLeaf(stack->page))
+ {
+ /*
+ * This is an internal page so continue to walk down the tree.
+ * Find the child node that has the minimum insertion penalty.
+ */
+ BlockNumber childblkno;
+ IndexTuple newtup;
+ GISTInsertStack *item;
+ OffsetNumber downlinkoffnum;
+
+ downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
+ iid = PageGetItemId(stack->page, downlinkoffnum);
+ idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
+ childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+ /*
+ * Check that it's not a leftover invalid tuple from pre-9.1
+ */
+ if (GistTupleIsInvalid(idxtuple))
+ ereport(ERROR,
+ (errmsg("index \"%s\" contains an inner tuple marked as invalid",
+ RelationGetRelationName(r)),
+ errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
+ errhint("Please REINDEX it.")));
+
+ /*
+ * Check that the key representing the target child node is
+ * consistent with the key we're inserting. Update it if it's not.
+ */
+ newtup = gistgetadjusted(state.r, idxtuple, itup, giststate);
+ if (newtup)
+ {
+ /*
+ * Swap shared lock for an exclusive one. Beware, the page may
+ * change while we unlock/lock the page...
+ */
+ if (!xlocked)
+ {
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ LockBuffer(stack->buffer, GIST_EXCLUSIVE);
+ xlocked = true;
+ stack->page = (Page) BufferGetPage(stack->buffer);
+
+ if (PageGetLSN(stack->page) != stack->lsn)
+ {
+ /* the page was changed while we unlocked it, retry */
+ continue;
+ }
+ }
+
+ /*
+ * Update the tuple.
+ *
+ * We still hold the lock after gistinserttuple(), but it
+ * might have to split the page to make the updated tuple fit.
+ * In that case the updated tuple might migrate to the other
+ * half of the split, so we have to go back to the parent and
+ * descend back to the half that's a better fit for the new
+ * tuple.
+ */
+ if (gistinserttuple(&state, stack, giststate, newtup,
+ downlinkoffnum))
+ {
+ /*
+ * If this was a root split, the root page continues to be
+ * the parent and the updated tuple went to one of the
+ * child pages, so we just need to retry from the root
+ * page.
+ */
+ if (stack->blkno != GIST_ROOT_BLKNO)
+ {
+ UnlockReleaseBuffer(stack->buffer);
+ xlocked = false;
+ state.stack = stack = stack->parent;
+ }
+ continue;
+ }
+ }
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ xlocked = false;
+
+ /* descend to the chosen child */
+ item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
+ item->blkno = childblkno;
+ item->parent = stack;
+ item->downlinkoffnum = downlinkoffnum;
+ state.stack = stack = item;
+ }
+ else
+ {
+ /*
+ * Leaf page. Insert the new key. We've already updated all the
+ * parents on the way down, but we might have to split the page if
+ * it doesn't fit. gistinserttuple() will take care of that.
+ */
+
+ /*
+ * Swap shared lock for an exclusive one. Be careful, the page may
+ * change while we unlock/lock the page...
+ */
+ if (!xlocked)
+ {
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ LockBuffer(stack->buffer, GIST_EXCLUSIVE);
+ xlocked = true;
+ stack->page = (Page) BufferGetPage(stack->buffer);
+ stack->lsn = PageGetLSN(stack->page);
+
+ if (stack->blkno == GIST_ROOT_BLKNO)
+ {
+ /*
+ * The only page that can change from leaf to inner is the
+ * root page, so we must recheck the root here.
+ */
+ if (!GistPageIsLeaf(stack->page))
+ {
+ /*
+ * Very rare situation: while the page was unlocked, the
+ * single-page index grew and the root is no longer a leaf.
+ */
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ xlocked = false;
+ continue;
+ }
+
+ /*
+ * We don't need to check for a root split explicitly, because
+ * checking leaf vs. inner is enough to recognize a root split.
+ */
+ }
+ else if ((GistFollowRight(stack->page) ||
+ stack->parent->lsn < GistPageGetNSN(stack->page)) ||
+ GistPageIsDeleted(stack->page))
+ {
+ /*
+ * The page was split or deleted while we momentarily
+ * unlocked the page. Go back to parent.
+ */
+ UnlockReleaseBuffer(stack->buffer);
+ xlocked = false;
+ state.stack = stack = stack->parent;
+ continue;
+ }
+ }
+
+ /* now state.stack->(page, buffer and blkno) points to leaf page */
+
+ gistinserttuple(&state, stack, giststate, itup,
+ InvalidOffsetNumber);
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+
+ /* Release any pins we might still hold before exiting */
+ for (; stack; stack = stack->parent)
+ ReleaseBuffer(stack->buffer);
+ break;
+ }
+ }
+}
+
+/*
+ * Traverse the tree to find path from root page to specified "child" block.
+ *
+ * returns a new insertion stack, starting from the parent of "child", up
+ * to the root. *downlinkoffnum is set to the offset of the downlink in the
+ * direct parent of child.
+ *
+ * To prevent deadlocks, this should lock only one page at a time.
+ */
+static GISTInsertStack *
+gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum)
+{
+ Page page;
+ Buffer buffer;
+ OffsetNumber i,
+ maxoff;
+ ItemId iid;
+ IndexTuple idxtuple;
+ List *fifo;
+ GISTInsertStack *top,
+ *ptr;
+ BlockNumber blkno;
+
+ top = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
+ top->blkno = GIST_ROOT_BLKNO;
+ top->downlinkoffnum = InvalidOffsetNumber;
+
+ fifo = list_make1(top);
+ while (fifo != NIL)
+ {
+ /* Get next page to visit */
+ top = linitial(fifo);
+ fifo = list_delete_first(fifo);
+
+ buffer = ReadBuffer(r, top->blkno);
+ LockBuffer(buffer, GIST_SHARE);
+ gistcheckpage(r, buffer);
+ page = (Page) BufferGetPage(buffer);
+
+ if (GistPageIsLeaf(page))
+ {
+ /*
+ * Because we scan the index top-down, all the rest of the pages
+ * in the queue must be leaf pages as well.
+ */
+ UnlockReleaseBuffer(buffer);
+ break;
+ }
+
+ /* currently, internal pages are never deleted */
+ Assert(!GistPageIsDeleted(page));
+
+ top->lsn = BufferGetLSNAtomic(buffer);
+
+ /*
+ * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a
+ * downlink. This should not normally happen.
+ */
+ if (GistFollowRight(page))
+ elog(ERROR, "concurrent GiST page split was incomplete");
+
+ if (top->parent && top->parent->lsn < GistPageGetNSN(page) &&
+ GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ )
+ {
+ /*
+ * Page was split while we looked elsewhere. We didn't see the
+ * downlink to the right page when we scanned the parent, so add
+ * it to the queue now.
+ *
+ * Put the right page ahead of the queue, so that we visit it
+ * next. That's important, because if this is the lowest internal
+ * level, just above leaves, we might already have queued up some
+ * leaf pages, and we assume that there can't be any non-leaf
+ * pages behind leaf pages.
+ */
+ ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
+ ptr->blkno = GistPageGetOpaque(page)->rightlink;
+ ptr->downlinkoffnum = InvalidOffsetNumber;
+ ptr->parent = top->parent;
+
+ fifo = lcons(ptr, fifo);
+ }
+
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ iid = PageGetItemId(page, i);
+ idxtuple = (IndexTuple) PageGetItem(page, iid);
+ blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+ if (blkno == child)
+ {
+ /* Found it! */
+ UnlockReleaseBuffer(buffer);
+ *downlinkoffnum = i;
+ return top;
+ }
+ else
+ {
+ /* Append this child to the list of pages to visit later */
+ ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
+ ptr->blkno = blkno;
+ ptr->downlinkoffnum = i;
+ ptr->parent = top;
+
+ fifo = lappend(fifo, ptr);
+ }
+ }
+
+ UnlockReleaseBuffer(buffer);
+ }
+
+ elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u",
+ RelationGetRelationName(r), child);
+ return NULL; /* keep compiler quiet */
+}
+
+/*
+ * Updates the stack so that child->parent is the correct parent of the
+ * child. child->parent must be exclusively locked on entry, and will
+ * remain so at exit, but it might not be the same page anymore.
+ */
+static void
+gistFindCorrectParent(Relation r, GISTInsertStack *child)
+{
+ GISTInsertStack *parent = child->parent;
+
+ gistcheckpage(r, parent->buffer);
+ parent->page = (Page) BufferGetPage(parent->buffer);
+
+ /* here we don't need to distinguish between split and page update */
+ if (child->downlinkoffnum == InvalidOffsetNumber ||
+ parent->lsn != PageGetLSN(parent->page))
+ {
+ /* parent has changed; follow the rightlinks until the child's downlink is found */
+ OffsetNumber i,
+ maxoff;
+ ItemId iid;
+ IndexTuple idxtuple;
+ GISTInsertStack *ptr;
+
+ while (true)
+ {
+ maxoff = PageGetMaxOffsetNumber(parent->page);
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ iid = PageGetItemId(parent->page, i);
+ idxtuple = (IndexTuple) PageGetItem(parent->page, iid);
+ if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
+ {
+ /* yes!!, found */
+ child->downlinkoffnum = i;
+ return;
+ }
+ }
+
+ parent->blkno = GistPageGetOpaque(parent->page)->rightlink;
+ UnlockReleaseBuffer(parent->buffer);
+ if (parent->blkno == InvalidBlockNumber)
+ {
+ /*
+ * End of chain and still didn't find the parent. This is a
+ * very rare situation in which the root was split.
+ */
+ break;
+ }
+ parent->buffer = ReadBuffer(r, parent->blkno);
+ LockBuffer(parent->buffer, GIST_EXCLUSIVE);
+ gistcheckpage(r, parent->buffer);
+ parent->page = (Page) BufferGetPage(parent->buffer);
+ }
+
+ /*
+ * Awful: we need to search the tree to find the parent. But
+ * before that, release all the old parent buffers.
+ */
+
+ ptr = child->parent->parent; /* child->parent already released
+ * above */
+ while (ptr)
+ {
+ ReleaseBuffer(ptr->buffer);
+ ptr = ptr->parent;
+ }
+
+ /* ok, find new path */
+ ptr = parent = gistFindPath(r, child->blkno, &child->downlinkoffnum);
+
+ /* read all buffers as expected by caller */
+ /* note we don't lock them or gistcheckpage them here! */
+ while (ptr)
+ {
+ ptr->buffer = ReadBuffer(r, ptr->blkno);
+ ptr->page = (Page) BufferGetPage(ptr->buffer);
+ ptr = ptr->parent;
+ }
+
+ /* install new chain of parents to stack */
+ child->parent = parent;
+
+ /* make recursive call to normal processing */
+ LockBuffer(child->parent->buffer, GIST_EXCLUSIVE);
+ gistFindCorrectParent(r, child);
+ }
+}
+
+/*
+ * Form a downlink pointer for the page in 'buf'.
+ */
+static IndexTuple
+gistformdownlink(Relation rel, Buffer buf, GISTSTATE *giststate,
+ GISTInsertStack *stack)
+{
+ Page page = BufferGetPage(buf);
+ OffsetNumber maxoff;
+ OffsetNumber offset;
+ IndexTuple downlink = NULL;
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset))
+ {
+ IndexTuple ituple = (IndexTuple)
+ PageGetItem(page, PageGetItemId(page, offset));
+
+ if (downlink == NULL)
+ downlink = CopyIndexTuple(ituple);
+ else
+ {
+ IndexTuple newdownlink;
+
+ newdownlink = gistgetadjusted(rel, downlink, ituple,
+ giststate);
+ if (newdownlink)
+ downlink = newdownlink;
+ }
+ }
+
+ /*
+ * If the page is completely empty, we can't form a meaningful downlink
+ * for it. But we have to insert a downlink for the page. Any key will do,
+	 * as long as it's consistent with the downlink of the parent page, so that
+	 * we can legally insert it into the parent. A minimal one that matches as
+	 * few scans as possible would be best, to keep scans from doing useless
+	 * work, but we don't know how to construct that. So we just use the
+	 * downlink of the original page that was split; that's as far from optimal
+	 * as it can get, but it will do.
+ */
+ if (!downlink)
+ {
+ ItemId iid;
+
+ LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE);
+ gistFindCorrectParent(rel, stack);
+ iid = PageGetItemId(stack->parent->page, stack->downlinkoffnum);
+ downlink = (IndexTuple) PageGetItem(stack->parent->page, iid);
+ downlink = CopyIndexTuple(downlink);
+ LockBuffer(stack->parent->buffer, GIST_UNLOCK);
+ }
+
+ ItemPointerSetBlockNumber(&(downlink->t_tid), BufferGetBlockNumber(buf));
+ GistTupleSetValid(downlink);
+
+ return downlink;
+}
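The downlink formed above is, in effect, the union of every key left on the page: start from the first tuple and keep widening the downlink with gistgetadjusted() until it covers them all. The following standalone sketch illustrates the same idea in isolation, assuming simple 2D box keys with made-up coordinates; the real code delegates the widening to the opclass union/penalty support functions rather than hard-coding box arithmetic.

/*
 * Standalone sketch (not part of the patch): forming a "downlink" as the
 * union of all keys on a page, illustrated with 2D bounding boxes. The
 * real code delegates this to the opclass union/adjust support functions.
 */
#include <stdio.h>

typedef struct BoxKey
{
    double xlo, ylo, xhi, yhi;
} BoxKey;

/* Enlarge *acc so that it also covers *key (analogous to gistgetadjusted). */
static void
box_union(BoxKey *acc, const BoxKey *key)
{
    if (key->xlo < acc->xlo) acc->xlo = key->xlo;
    if (key->ylo < acc->ylo) acc->ylo = key->ylo;
    if (key->xhi > acc->xhi) acc->xhi = key->xhi;
    if (key->yhi > acc->yhi) acc->yhi = key->yhi;
}

int
main(void)
{
    BoxKey keys[] = {
        {0, 0, 2, 2},
        {1, 1, 5, 3},
        {-1, 2, 0, 6}
    };
    int nkeys = (int) (sizeof(keys) / sizeof(keys[0]));
    BoxKey downlink = keys[0];  /* start from the first key, as above */

    for (int i = 1; i < nkeys; i++)
        box_union(&downlink, &keys[i]);

    printf("downlink box: (%.1f,%.1f) - (%.1f,%.1f)\n",
           downlink.xlo, downlink.ylo, downlink.xhi, downlink.yhi);
    return 0;
}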
+
+
+/*
+ * Complete the incomplete split of state->stack->page.
+ */
+static void
+gistfixsplit(GISTInsertState *state, GISTSTATE *giststate)
+{
+ GISTInsertStack *stack = state->stack;
+ Buffer buf;
+ Page page;
+ List *splitinfo = NIL;
+
+ elog(LOG, "fixing incomplete split in index \"%s\", block %u",
+ RelationGetRelationName(state->r), stack->blkno);
+
+ Assert(GistFollowRight(stack->page));
+ Assert(OffsetNumberIsValid(stack->downlinkoffnum));
+
+ buf = stack->buffer;
+
+ /*
+ * Read the chain of split pages, following the rightlinks. Construct a
+ * downlink tuple for each page.
+ */
+ for (;;)
+ {
+ GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo));
+ IndexTuple downlink;
+
+ page = BufferGetPage(buf);
+
+ /* Form the new downlink tuples to insert to parent */
+ downlink = gistformdownlink(state->r, buf, giststate, stack);
+
+ si->buf = buf;
+ si->downlink = downlink;
+
+ splitinfo = lappend(splitinfo, si);
+
+ if (GistFollowRight(page))
+ {
+ /* lock next page */
+ buf = ReadBuffer(state->r, GistPageGetOpaque(page)->rightlink);
+ LockBuffer(buf, GIST_EXCLUSIVE);
+ }
+ else
+ break;
+ }
+
+ /* Insert the downlinks */
+ gistfinishsplit(state, stack, giststate, splitinfo, false);
+}
+
+/*
+ * Insert or replace a tuple in stack->buffer. If 'oldoffnum' is valid, the
+ * tuple at 'oldoffnum' is replaced, otherwise the tuple is inserted as new.
+ * 'stack' represents the path from the root to the page being updated.
+ *
+ * The caller must hold an exclusive lock on stack->buffer. The lock is still
+ * held on return, but the page might not contain the inserted tuple if the
+ * page was split. The function returns true if the page was split, false
+ * otherwise.
+ */
+static bool
+gistinserttuple(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum)
+{
+ return gistinserttuples(state, stack, giststate, &tuple, 1, oldoffnum,
+ InvalidBuffer, InvalidBuffer, false, false);
+}
+
+/* ----------------
+ * An extended workhorse version of gistinserttuple(). This version allows
+ * inserting multiple tuples, or replacing a single tuple with multiple tuples.
+ * This is used to recursively update the downlinks in the parent when a page
+ * is split.
+ *
+ * If leftchild and rightchild are valid, we're inserting/replacing the
+ * downlink for rightchild, and leftchild is its left sibling. We clear the
+ * F_FOLLOW_RIGHT flag and update NSN on leftchild, atomically with the
+ * insertion of the downlink.
+ *
+ * To avoid holding locks for longer than necessary, when recursing up the
+ * tree to update the parents, the locking is a bit peculiar here. On entry,
+ * the caller must hold an exclusive lock on stack->buffer, as well as
+ * leftchild and rightchild if given. On return:
+ *
+ * - Lock on stack->buffer is released, if 'unlockbuf' is true. The page is
+ * always kept pinned, however.
+ * - Lock on 'leftchild' is released, if 'unlockleftchild' is true. The page
+ * is kept pinned.
+ * - Lock and pin on 'rightchild' are always released.
+ *
+ * Returns 'true' if the page had to be split. Note that if the page was
+ * split, the inserted/updated tuples might've been inserted to a right
+ * sibling of stack->buffer instead of stack->buffer itself.
+ */
+static bool
+gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate,
+ IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
+ Buffer leftchild, Buffer rightchild,
+ bool unlockbuf, bool unlockleftchild)
+{
+ List *splitinfo;
+ bool is_split;
+
+ /*
+ * Check for any rw conflicts (in serializable isolation level) just
+ * before we intend to modify the page
+ */
+ CheckForSerializableConflictIn(state->r, NULL, BufferGetBlockNumber(stack->buffer));
+
+ /* Insert the tuple(s) to the page, splitting the page if necessary */
+ is_split = gistplacetopage(state->r, state->freespace, giststate,
+ stack->buffer,
+ tuples, ntup,
+ oldoffnum, NULL,
+ leftchild,
+ &splitinfo,
+ true,
+ state->heapRel,
+ state->is_build);
+
+ /*
+ * Before recursing up in case the page was split, release locks on the
+ * child pages. We don't need to keep them locked when updating the
+ * parent.
+ */
+ if (BufferIsValid(rightchild))
+ UnlockReleaseBuffer(rightchild);
+ if (BufferIsValid(leftchild) && unlockleftchild)
+ LockBuffer(leftchild, GIST_UNLOCK);
+
+ /*
+ * If we had to split, insert/update the downlinks in the parent. If the
+ * caller requested us to release the lock on stack->buffer, tell
+ * gistfinishsplit() to do that as soon as it's safe to do so. If we
+ * didn't have to split, release it ourselves.
+ */
+ if (splitinfo)
+ gistfinishsplit(state, stack, giststate, splitinfo, unlockbuf);
+ else if (unlockbuf)
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+
+ return is_split;
+}
+
+/*
+ * Finish an incomplete split by inserting/updating the downlinks in parent
+ * page. 'splitinfo' contains all the child pages involved in the split,
+ * from left-to-right.
+ *
+ * On entry, the caller must hold a lock on stack->buffer and all the child
+ * pages in 'splitinfo'. If 'unlockbuf' is true, the lock on stack->buffer is
+ * released on return. The child pages are always unlocked and unpinned.
+ */
+static void
+gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate, List *splitinfo, bool unlockbuf)
+{
+ GISTPageSplitInfo *right;
+ GISTPageSplitInfo *left;
+ IndexTuple tuples[2];
+
+ /* A split always contains at least two halves */
+ Assert(list_length(splitinfo) >= 2);
+
+ /*
+ * We need to insert downlinks for each new page, and update the downlink
+ * for the original (leftmost) page in the split. Begin at the rightmost
+ * page, inserting one downlink at a time until there's only two pages
+ * left. Finally insert the downlink for the last new page and update the
+ * downlink for the original page as one operation.
+ */
+ LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE);
+
+ /*
+ * Insert downlinks for the siblings from right to left, until there are
+ * only two siblings left.
+ */
+ for (int pos = list_length(splitinfo) - 1; pos > 1; pos--)
+ {
+ right = (GISTPageSplitInfo *) list_nth(splitinfo, pos);
+ left = (GISTPageSplitInfo *) list_nth(splitinfo, pos - 1);
+
+ gistFindCorrectParent(state->r, stack);
+ if (gistinserttuples(state, stack->parent, giststate,
+ &right->downlink, 1,
+ InvalidOffsetNumber,
+ left->buf, right->buf, false, false))
+ {
+ /*
+ * If the parent page was split, the existing downlink might have
+ * moved.
+ */
+ stack->downlinkoffnum = InvalidOffsetNumber;
+ }
+ /* gistinserttuples() released the lock on right->buf. */
+ }
+
+ right = (GISTPageSplitInfo *) lsecond(splitinfo);
+ left = (GISTPageSplitInfo *) linitial(splitinfo);
+
+ /*
+ * Finally insert downlink for the remaining right page and update the
+ * downlink for the original page to not contain the tuples that were
+ * moved to the new pages.
+ */
+ tuples[0] = left->downlink;
+ tuples[1] = right->downlink;
+ gistFindCorrectParent(state->r, stack);
+ if (gistinserttuples(state, stack->parent, giststate,
+ tuples, 2,
+ stack->downlinkoffnum,
+ left->buf, right->buf,
+ true, /* Unlock parent */
+ unlockbuf /* Unlock stack->buffer if caller wants
+ * that */
+ ))
+ {
+ /*
+ * If the parent page was split, the downlink might have moved.
+ */
+ stack->downlinkoffnum = InvalidOffsetNumber;
+ }
+
+ Assert(left->buf == stack->buffer);
+
+ /*
+ * If we split the page because we had to adjust the downlink on an
+ * internal page, while descending the tree for inserting a new tuple,
+ * then this might no longer be the correct page for the new tuple. The
+ * downlink to this page might not cover the new tuple anymore, it might
+ * need to go to the newly-created right sibling instead. Tell the caller
+ * to walk back up the stack, to re-check at the parent which page to
+ * insert to.
+ *
+ * Normally, the LSN-NSN interlock during the tree descend would also
+ * detect that a concurrent split happened (by ourselves), and cause us to
+ * retry at the parent. But that mechanism doesn't work during index
+ * build, because we don't do WAL-logging, and don't update LSNs, during
+ * index build.
+ */
+ stack->retry_from_parent = true;
+}
+
+/*
+ * gistSplit -- split a page in the tree and fill the struct
+ * used for XLOG and the real writes to buffers. The function is recursive,
+ * i.e. it keeps splitting until the keys fit on every page.
+ */
+SplitedPageLayout *
+gistSplit(Relation r,
+ Page page,
+ IndexTuple *itup, /* contains compressed entry */
+ int len,
+ GISTSTATE *giststate)
+{
+ IndexTuple *lvectup,
+ *rvectup;
+ GistSplitVector v;
+ int i;
+ SplitedPageLayout *res = NULL;
+
+ /* this should never recurse very deeply, but better safe than sorry */
+ check_stack_depth();
+
+ /* there's no point in splitting an empty page */
+ Assert(len > 0);
+
+ /*
+ * If a single tuple doesn't fit on a page, no amount of splitting will
+ * help.
+ */
+ if (len == 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+ IndexTupleSize(itup[0]), GiSTPageSize,
+ RelationGetRelationName(r))));
+
+ memset(v.spl_lisnull, true,
+ sizeof(bool) * giststate->nonLeafTupdesc->natts);
+ memset(v.spl_risnull, true,
+ sizeof(bool) * giststate->nonLeafTupdesc->natts);
+ gistSplitByKey(r, page, itup, len, giststate, &v, 0);
+
+ /* form left and right vector */
+ lvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1));
+ rvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1));
+
+ for (i = 0; i < v.splitVector.spl_nleft; i++)
+ lvectup[i] = itup[v.splitVector.spl_left[i] - 1];
+
+ for (i = 0; i < v.splitVector.spl_nright; i++)
+ rvectup[i] = itup[v.splitVector.spl_right[i] - 1];
+
+ /* finalize splitting (may need another split) */
+ if (!gistfitpage(rvectup, v.splitVector.spl_nright))
+ {
+ res = gistSplit(r, page, rvectup, v.splitVector.spl_nright, giststate);
+ }
+ else
+ {
+ ROTATEDIST(res);
+ res->block.num = v.splitVector.spl_nright;
+ res->list = gistfillitupvec(rvectup, v.splitVector.spl_nright, &(res->lenlist));
+ res->itup = gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false);
+ }
+
+ if (!gistfitpage(lvectup, v.splitVector.spl_nleft))
+ {
+ SplitedPageLayout *resptr,
+ *subres;
+
+ resptr = subres = gistSplit(r, page, lvectup, v.splitVector.spl_nleft, giststate);
+
+ /* install on list's tail */
+ while (resptr->next)
+ resptr = resptr->next;
+
+ resptr->next = res;
+ res = subres;
+ }
+ else
+ {
+ ROTATEDIST(res);
+ res->block.num = v.splitVector.spl_nleft;
+ res->list = gistfillitupvec(lvectup, v.splitVector.spl_nleft, &(res->lenlist));
+ res->itup = gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false);
+ }
+
+ return res;
+}
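gistSplit() above keeps recursing until each half of the split fits on a page. The standalone sketch below mirrors that "split until it fits" recursion with plain item sizes and a hypothetical page capacity; unlike the real code, it splits groups down the middle instead of consulting the opclass picksplit function.

/*
 * Standalone sketch (not part of the patch): recursively split a group of
 * items until every group fits within a page-size budget. The real code
 * uses the opclass picksplit function instead of splitting down the middle.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_CAPACITY 100       /* hypothetical free space per page */

static int
total_size(const int *sizes, int len)
{
    int sum = 0;

    for (int i = 0; i < len; i++)
        sum += sizes[i];
    return sum;
}

/* Print the resulting groups; recurse on any group that still doesn't fit. */
static void
split_until_fits(const int *sizes, int len)
{
    if (len == 0)
        return;
    if (len == 1 && sizes[0] > PAGE_CAPACITY)
    {
        /* mirrors the "index row size exceeds maximum" error above */
        fprintf(stderr, "single item larger than a page\n");
        exit(1);
    }
    if (total_size(sizes, len) <= PAGE_CAPACITY)
    {
        printf("page with %d items (total %d)\n", len, total_size(sizes, len));
        return;
    }

    /* naive split down the middle; GiST asks picksplit for a better one */
    split_until_fits(sizes, len / 2);
    split_until_fits(sizes + len / 2, len - len / 2);
}

int
main(void)
{
    int sizes[] = {40, 35, 30, 25, 20, 45, 10, 60};

    split_until_fits(sizes, (int) (sizeof(sizes) / sizeof(sizes[0])));
    return 0;
}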
+
+/*
+ * Create a GISTSTATE and fill it with information about the index
+ */
+GISTSTATE *
+initGISTstate(Relation index)
+{
+ GISTSTATE *giststate;
+ MemoryContext scanCxt;
+ MemoryContext oldCxt;
+ int i;
+
+ /* safety check to protect fixed-size arrays in GISTSTATE */
+ if (index->rd_att->natts > INDEX_MAX_KEYS)
+ elog(ERROR, "numberOfAttributes %d > %d",
+ index->rd_att->natts, INDEX_MAX_KEYS);
+
+ /* Create the memory context that will hold the GISTSTATE */
+ scanCxt = AllocSetContextCreate(CurrentMemoryContext,
+ "GiST scan context",
+ ALLOCSET_DEFAULT_SIZES);
+ oldCxt = MemoryContextSwitchTo(scanCxt);
+
+ /* Create and fill in the GISTSTATE */
+ giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE));
+
+ giststate->scanCxt = scanCxt;
+ giststate->tempCxt = scanCxt; /* caller must change this if needed */
+ giststate->leafTupdesc = index->rd_att;
+
+ /*
+ * The truncated tupdesc for non-leaf index tuples, which doesn't contain
+ * the INCLUDE attributes.
+ *
+ * It is used to form tuples during tuple adjustment and page split.
+	 * B-tree creates a shortened tuple descriptor for every truncated tuple,
+	 * because it does this less often: it does not have to form truncated
+	 * tuples during page splits. Also, B-tree does not adjust tuples on
+	 * internal pages the way GiST does.
+ */
+ giststate->nonLeafTupdesc = CreateTupleDescCopyConstr(index->rd_att);
+ giststate->nonLeafTupdesc->natts =
+ IndexRelationGetNumberOfKeyAttributes(index);
+
+ for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(index); i++)
+ {
+ fmgr_info_copy(&(giststate->consistentFn[i]),
+ index_getprocinfo(index, i + 1, GIST_CONSISTENT_PROC),
+ scanCxt);
+ fmgr_info_copy(&(giststate->unionFn[i]),
+ index_getprocinfo(index, i + 1, GIST_UNION_PROC),
+ scanCxt);
+
+ /* opclasses are not required to provide a Compress method */
+ if (OidIsValid(index_getprocid(index, i + 1, GIST_COMPRESS_PROC)))
+ fmgr_info_copy(&(giststate->compressFn[i]),
+ index_getprocinfo(index, i + 1, GIST_COMPRESS_PROC),
+ scanCxt);
+ else
+ giststate->compressFn[i].fn_oid = InvalidOid;
+
+ /* opclasses are not required to provide a Decompress method */
+ if (OidIsValid(index_getprocid(index, i + 1, GIST_DECOMPRESS_PROC)))
+ fmgr_info_copy(&(giststate->decompressFn[i]),
+ index_getprocinfo(index, i + 1, GIST_DECOMPRESS_PROC),
+ scanCxt);
+ else
+ giststate->decompressFn[i].fn_oid = InvalidOid;
+
+ fmgr_info_copy(&(giststate->penaltyFn[i]),
+ index_getprocinfo(index, i + 1, GIST_PENALTY_PROC),
+ scanCxt);
+ fmgr_info_copy(&(giststate->picksplitFn[i]),
+ index_getprocinfo(index, i + 1, GIST_PICKSPLIT_PROC),
+ scanCxt);
+ fmgr_info_copy(&(giststate->equalFn[i]),
+ index_getprocinfo(index, i + 1, GIST_EQUAL_PROC),
+ scanCxt);
+
+ /* opclasses are not required to provide a Distance method */
+ if (OidIsValid(index_getprocid(index, i + 1, GIST_DISTANCE_PROC)))
+ fmgr_info_copy(&(giststate->distanceFn[i]),
+ index_getprocinfo(index, i + 1, GIST_DISTANCE_PROC),
+ scanCxt);
+ else
+ giststate->distanceFn[i].fn_oid = InvalidOid;
+
+ /* opclasses are not required to provide a Fetch method */
+ if (OidIsValid(index_getprocid(index, i + 1, GIST_FETCH_PROC)))
+ fmgr_info_copy(&(giststate->fetchFn[i]),
+ index_getprocinfo(index, i + 1, GIST_FETCH_PROC),
+ scanCxt);
+ else
+ giststate->fetchFn[i].fn_oid = InvalidOid;
+
+ /*
+ * If the index column has a specified collation, we should honor that
+ * while doing comparisons. However, we may have a collatable storage
+ * type for a noncollatable indexed data type. If there's no index
+ * collation then specify default collation in case the support
+ * functions need collation. This is harmless if the support
+ * functions don't care about collation, so we just do it
+ * unconditionally. (We could alternatively call get_typcollation,
+ * but that seems like expensive overkill --- there aren't going to be
+ * any cases where a GiST storage type has a nondefault collation.)
+ */
+ if (OidIsValid(index->rd_indcollation[i]))
+ giststate->supportCollation[i] = index->rd_indcollation[i];
+ else
+ giststate->supportCollation[i] = DEFAULT_COLLATION_OID;
+ }
+
+ /* No opclass information for INCLUDE attributes */
+ for (; i < index->rd_att->natts; i++)
+ {
+ giststate->consistentFn[i].fn_oid = InvalidOid;
+ giststate->unionFn[i].fn_oid = InvalidOid;
+ giststate->compressFn[i].fn_oid = InvalidOid;
+ giststate->decompressFn[i].fn_oid = InvalidOid;
+ giststate->penaltyFn[i].fn_oid = InvalidOid;
+ giststate->picksplitFn[i].fn_oid = InvalidOid;
+ giststate->equalFn[i].fn_oid = InvalidOid;
+ giststate->distanceFn[i].fn_oid = InvalidOid;
+ giststate->fetchFn[i].fn_oid = InvalidOid;
+ giststate->supportCollation[i] = InvalidOid;
+ }
+
+ MemoryContextSwitchTo(oldCxt);
+
+ return giststate;
+}
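Optional support functions (compress, decompress, distance, fetch) are recorded with fn_oid = InvalidOid so later code can test whether the opclass provides them. The sketch below shows the same "optional slot with a sentinel" pattern in isolation, using a NULL function pointer in place of InvalidOid; the struct and function names are invented for the example.

/*
 * Standalone sketch (not part of the patch): a per-column table of optional
 * function pointers, with NULL playing the role of InvalidOid for support
 * functions an opclass chose not to provide.
 */
#include <stdio.h>

typedef double (*DistanceFn) (double a, double b);

static double
absolute_distance(double a, double b)
{
    return a > b ? a - b : b - a;
}

typedef struct ColumnSupport
{
    DistanceFn  distanceFn;     /* NULL if the opclass has no distance proc */
} ColumnSupport;

int
main(void)
{
    ColumnSupport cols[2];

    cols[0].distanceFn = absolute_distance; /* column 0 supports ordered scans */
    cols[1].distanceFn = NULL;              /* column 1 does not */

    for (int i = 0; i < 2; i++)
    {
        if (cols[i].distanceFn != NULL)
            printf("column %d: distance(1.0, 4.5) = %.1f\n",
                   i, cols[i].distanceFn(1.0, 4.5));
        else
            printf("column %d: no distance support, ordered scans disabled\n", i);
    }
    return 0;
}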
+
+void
+freeGISTstate(GISTSTATE *giststate)
+{
+ /* It's sufficient to delete the scanCxt */
+ MemoryContextDelete(giststate->scanCxt);
+}
+
+/*
+ * gistprunepage() -- try to remove LP_DEAD items from the given page.
+ * Function assumes that buffer is exclusively locked.
+ */
+static void
+gistprunepage(Relation rel, Page page, Buffer buffer, Relation heapRel)
+{
+ OffsetNumber deletable[MaxIndexTuplesPerPage];
+ int ndeletable = 0;
+ OffsetNumber offnum,
+ maxoff;
+ TransactionId latestRemovedXid = InvalidTransactionId;
+
+ Assert(GistPageIsLeaf(page));
+
+ /*
+ * Scan over all items to see which ones need to be deleted according to
+ * LP_DEAD flags.
+ */
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = FirstOffsetNumber;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemId = PageGetItemId(page, offnum);
+
+ if (ItemIdIsDead(itemId))
+ deletable[ndeletable++] = offnum;
+ }
+
+ if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
+ latestRemovedXid =
+ index_compute_xid_horizon_for_tuples(rel, heapRel, buffer,
+ deletable, ndeletable);
+
+ if (ndeletable > 0)
+ {
+ START_CRIT_SECTION();
+
+ PageIndexMultiDelete(page, deletable, ndeletable);
+
+ /*
+		 * Mark the page as not containing any LP_DEAD items. This is not
+		 * necessarily true (there might be some that have recently been marked,
+ * but weren't included in our target-item list), but it will almost
+ * always be true and it doesn't seem worth an additional page scan to
+ * check it. Remember that F_HAS_GARBAGE is only a hint anyway.
+ */
+ GistClearPageHasGarbage(page);
+
+ MarkBufferDirty(buffer);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+
+ recptr = gistXLogDelete(buffer,
+ deletable, ndeletable,
+ latestRemovedXid);
+
+ PageSetLSN(page, recptr);
+ }
+ else
+ PageSetLSN(page, gistGetFakeLSN(rel));
+
+ END_CRIT_SECTION();
+ }
+
+ /*
+ * Note: if we didn't find any LP_DEAD items, then the page's
+ * F_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
+ * separate write to clear it, however. We will clear it when we split
+ * the page.
+ */
+}
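gistprunepage() works in two steps: one scan collects the offsets of LP_DEAD items, and the whole batch is then removed at once with PageIndexMultiDelete(). The standalone sketch below shows the same collect-then-delete pattern on a plain array, with a made-up Item struct standing in for line pointers.

/*
 * Standalone sketch (not part of the patch): collect the positions of items
 * flagged as dead, then remove them from the array in one batch, roughly
 * what the LP_DEAD scan plus PageIndexMultiDelete() accomplish on a page.
 */
#include <stdbool.h>
#include <stdio.h>

typedef struct Item
{
    int         value;
    bool        dead;           /* stands in for the LP_DEAD hint flag */
} Item;

/* Remove all dead items, compacting the array; returns the new length. */
static int
prune_items(Item *items, int nitems)
{
    int         deletable[16];
    int         ndeletable = 0;
    int         src;
    int         dst = 0;
    int         d = 0;

    /* first pass: remember which offsets hold dead items */
    for (src = 0; src < nitems; src++)
    {
        if (items[src].dead)
            deletable[ndeletable++] = src;
    }

    if (ndeletable == 0)
        return nitems;          /* nothing to prune */

    /* second pass: delete the collected offsets in one batch, compacting */
    for (src = 0; src < nitems; src++)
    {
        if (d < ndeletable && src == deletable[d])
        {
            d++;                /* skip this dead item */
            continue;
        }
        items[dst++] = items[src];
    }
    return dst;
}

int
main(void)
{
    Item        items[] = {{1, false}, {2, true}, {3, false}, {4, true}, {5, false}};
    int         n = prune_items(items, 5);

    for (int i = 0; i < n; i++)
        printf("%d ", items[i].value);
    printf("\n");               /* prints: 1 3 5 */
    return 0;
}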
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
new file mode 100644
index 0000000..671b5e9
--- /dev/null
+++ b/src/backend/access/gist/gistbuild.c
@@ -0,0 +1,1186 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistbuild.c
+ * build algorithm for GiST indexes implementation.
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistbuild.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/genam.h"
+#include "access/gist_private.h"
+#include "access/gistxlog.h"
+#include "access/tableam.h"
+#include "access/xloginsert.h"
+#include "catalog/index.h"
+#include "miscadmin.h"
+#include "optimizer/optimizer.h"
+#include "storage/bufmgr.h"
+#include "storage/smgr.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+/* Interval, in index tuples, between checks whether to switch to buffering build mode */
+#define BUFFERING_MODE_SWITCH_CHECK_STEP 256
+
+/*
+ * Number of tuples to process in the slow way before switching to buffering
+ * mode, when buffering is explicitly turned on. Also, the number of tuples
+ * to process between readjusting the buffer size parameter, while in
+ * buffering mode.
+ */
+#define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
+
+typedef enum
+{
+ GIST_BUFFERING_DISABLED, /* in regular build mode and aren't going to
+ * switch */
+ GIST_BUFFERING_AUTO, /* in regular build mode, but will switch to
+ * buffering build mode if the index grows too
+ * big */
+ GIST_BUFFERING_STATS, /* gathering statistics of index tuple size
+ * before switching to the buffering build
+ * mode */
+ GIST_BUFFERING_ACTIVE /* in buffering build mode */
+} GistBufferingMode;
+
+/* Working state for gistbuild and its callback */
+typedef struct
+{
+ Relation indexrel;
+ Relation heaprel;
+ GISTSTATE *giststate;
+
+ int64 indtuples; /* number of tuples indexed */
+ int64 indtuplesSize; /* total size of all indexed tuples */
+
+ Size freespace; /* amount of free space to leave on pages */
+
+ /*
+ * Extra data structures used during a buffering build. 'gfbb' contains
+ * information related to managing the build buffers. 'parentMap' is a
+ * lookup table of the parent of each internal page.
+ */
+ GISTBuildBuffers *gfbb;
+ HTAB *parentMap;
+
+ GistBufferingMode bufferingMode;
+} GISTBuildState;
+
+/* prototypes for private functions */
+static void gistInitBuffering(GISTBuildState *buildstate);
+static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
+static void gistBuildCallback(Relation index,
+ ItemPointer tid,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state);
+static void gistBufferingBuildInsert(GISTBuildState *buildstate,
+ IndexTuple itup);
+static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
+ BlockNumber startblkno, int startlevel);
+static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
+ Buffer buffer, int level,
+ IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
+ BlockNumber parentblk, OffsetNumber downlinkoffnum);
+static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
+ BlockNumber childblkno, int level,
+ BlockNumber *parentblk,
+ OffsetNumber *downlinkoffnum);
+static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
+static void gistEmptyAllBuffers(GISTBuildState *buildstate);
+static int gistGetMaxLevel(Relation index);
+
+static void gistInitParentMap(GISTBuildState *buildstate);
+static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
+ BlockNumber parent);
+static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
+static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
+
+/*
+ * Main entry point to GiST index build. Initially it inserts tuples one by
+ * one, but switches to the more efficient buffering build algorithm after a
+ * certain number of tuples (unless buffering mode is disabled).
+ */
+IndexBuildResult *
+gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
+{
+ IndexBuildResult *result;
+ double reltuples;
+ GISTBuildState buildstate;
+ Buffer buffer;
+ Page page;
+ MemoryContext oldcxt = CurrentMemoryContext;
+ int fillfactor;
+
+ buildstate.indexrel = index;
+ buildstate.heaprel = heap;
+
+ if (index->rd_options)
+ {
+ /* Get buffering mode from the options string */
+ GiSTOptions *options = (GiSTOptions *) index->rd_options;
+
+ if (options->buffering_mode == GIST_OPTION_BUFFERING_ON)
+ buildstate.bufferingMode = GIST_BUFFERING_STATS;
+ else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF)
+ buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
+ else
+ buildstate.bufferingMode = GIST_BUFFERING_AUTO;
+
+ fillfactor = options->fillfactor;
+ }
+ else
+ {
+ /*
+ * By default, switch to buffering mode when the index grows too large
+ * to fit in cache.
+ */
+ buildstate.bufferingMode = GIST_BUFFERING_AUTO;
+ fillfactor = GIST_DEFAULT_FILLFACTOR;
+ }
+ /* Calculate target amount of free space to leave on pages */
+ buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
+
+ /*
+ * We expect to be called exactly once for any index relation. If that's
+ * not the case, big trouble's what we have.
+ */
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "index \"%s\" already contains data",
+ RelationGetRelationName(index));
+
+ /* no locking is needed */
+ buildstate.giststate = initGISTstate(index);
+
+ /*
+ * Create a temporary memory context that is reset once for each tuple
+ * processed. (Note: we don't bother to make this a child of the
+ * giststate's scanCxt, so we have to delete it separately at the end.)
+ */
+ buildstate.giststate->tempCxt = createTempGistContext();
+
+ /* initialize the root page */
+ buffer = gistNewBuffer(index);
+ Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
+ page = BufferGetPage(buffer);
+
+ START_CRIT_SECTION();
+
+ GISTInitBuffer(buffer, F_LEAF);
+
+ MarkBufferDirty(buffer);
+ PageSetLSN(page, GistBuildLSN);
+
+ UnlockReleaseBuffer(buffer);
+
+ END_CRIT_SECTION();
+
+ /* build the index */
+ buildstate.indtuples = 0;
+ buildstate.indtuplesSize = 0;
+
+ /*
+ * Do the heap scan.
+ */
+ reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
+ gistBuildCallback,
+ (void *) &buildstate, NULL);
+
+ /*
+ * If buffering was used, flush out all the tuples that are still in the
+ * buffers.
+ */
+ if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
+ {
+ elog(DEBUG1, "all tuples processed, emptying buffers");
+ gistEmptyAllBuffers(&buildstate);
+ gistFreeBuildBuffers(buildstate.gfbb);
+ }
+
+ /* okay, all heap tuples are indexed */
+ MemoryContextSwitchTo(oldcxt);
+ MemoryContextDelete(buildstate.giststate->tempCxt);
+
+ freeGISTstate(buildstate.giststate);
+
+ /*
+ * We didn't write WAL records as we built the index, so if WAL-logging is
+ * required, write all pages to the WAL now.
+ */
+ if (RelationNeedsWAL(index))
+ {
+ log_newpage_range(index, MAIN_FORKNUM,
+ 0, RelationGetNumberOfBlocks(index),
+ true);
+ }
+
+ /*
+ * Return statistics
+ */
+ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+
+ result->heap_tuples = reltuples;
+ result->index_tuples = (double) buildstate.indtuples;
+
+ return result;
+}
+
+/*
+ * Attempt to switch to buffering mode.
+ *
+ * If there is not enough memory for buffering build, sets bufferingMode
+ * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
+ * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
+ * GIST_BUFFERING_ACTIVE.
+ */
+static void
+gistInitBuffering(GISTBuildState *buildstate)
+{
+ Relation index = buildstate->indexrel;
+ int pagesPerBuffer;
+ Size pageFreeSpace;
+ Size itupAvgSize,
+ itupMinSize;
+ double avgIndexTuplesPerPage,
+ maxIndexTuplesPerPage;
+ int i;
+ int levelStep;
+
+	/* Calculate the space on an index page that is available for index tuples */
+ pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
+ - sizeof(ItemIdData)
+ - buildstate->freespace;
+
+ /*
+ * Calculate average size of already inserted index tuples using gathered
+ * statistics.
+ */
+ itupAvgSize = (double) buildstate->indtuplesSize /
+ (double) buildstate->indtuples;
+
+ /*
+	 * Calculate the minimal possible size of an index tuple from the index
+	 * metadata. The minimal possible size of a varlena is VARHDRSZ.
+ *
+ * XXX: that's not actually true, as a short varlen can be just 2 bytes.
+ * And we should take padding into account here.
+ */
+ itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
+ for (i = 0; i < index->rd_att->natts; i++)
+ {
+ if (TupleDescAttr(index->rd_att, i)->attlen < 0)
+ itupMinSize += VARHDRSZ;
+ else
+ itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
+ }
+
+	/* Calculate the average and maximal number of index tuples that fit on a page */
+ avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
+ maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
+
+ /*
+ * We need to calculate two parameters for the buffering algorithm:
+ * levelStep and pagesPerBuffer.
+ *
+ * levelStep determines the size of subtree that we operate on, while
+ * emptying a buffer. A higher value is better, as you need fewer buffer
+ * emptying steps to build the index. However, if you set it too high, the
+ * subtree doesn't fit in cache anymore, and you quickly lose the benefit
+ * of the buffers.
+ *
+ * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
+ * the number of tuples on page (ie. fanout), and M is the amount of
+	 * internal memory available. Curiously, they don't explain *why* that
+ * setting is optimal. We calculate it by taking the highest levelStep so
+ * that a subtree still fits in cache. For a small B, our way of
+ * calculating levelStep is very close to Arge et al's formula. For a
+ * large B, our formula gives a value that is 2x higher.
+ *
+ * The average size (in pages) of a subtree of depth n can be calculated
+ * as a geometric series:
+ *
+ * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
+ *
+ * where B is the average number of index tuples on page. The subtree is
+ * cached in the shared buffer cache and the OS cache, so we choose
+ * levelStep so that the subtree size is comfortably smaller than
+ * effective_cache_size, with a safety factor of 4.
+ *
+ * The estimate on the average number of index tuples on page is based on
+ * average tuple sizes observed before switching to buffered build, so the
+	 * real subtree size can be somewhat larger. Also, it would be selfish to
+ * gobble the whole cache for our index build. The safety factor of 4
+ * should account for those effects.
+ *
+ * The other limiting factor for setting levelStep is that while
+ * processing a subtree, we need to hold one page for each buffer at the
+ * next lower buffered level. The max. number of buffers needed for that
+ * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
+ * hopefully maintenance_work_mem is set high enough that you're
+ * constrained by effective_cache_size rather than maintenance_work_mem.
+ *
+ * XXX: the buffer hash table consumes a fair amount of memory too per
+ * buffer, but that is not currently taken into account. That scales on
+ * the total number of buffers used, ie. the index size and on levelStep.
+ * Note that a higher levelStep *reduces* the amount of memory needed for
+ * the hash table.
+ */
+ levelStep = 1;
+ for (;;)
+ {
+ double subtreesize;
+ double maxlowestlevelpages;
+
+ /* size of an average subtree at this levelStep (in pages). */
+ subtreesize =
+ (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
+ (1 - avgIndexTuplesPerPage);
+
+ /* max number of pages at the lowest level of a subtree */
+ maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);
+
+ /* subtree must fit in cache (with safety factor of 4) */
+ if (subtreesize > effective_cache_size / 4)
+ break;
+
+ /* each node in the lowest level of a subtree has one page in memory */
+ if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
+ break;
+
+ /* Good, we can handle this levelStep. See if we can go one higher. */
+ levelStep++;
+ }
+
+ /*
+	 * The loop above exited at the first unacceptable value of levelStep, so
+	 * decrease it to get back to the last acceptable value.
+ */
+ levelStep--;
+
+ /*
+ * If there's not enough cache or maintenance_work_mem, fall back to plain
+ * inserts.
+ */
+ if (levelStep <= 0)
+ {
+ elog(DEBUG1, "failed to switch to buffered GiST build");
+ buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
+ return;
+ }
+
+ /*
+ * The second parameter to set is pagesPerBuffer, which determines the
+ * size of each buffer. We adjust pagesPerBuffer also during the build,
+ * which is why this calculation is in a separate function.
+ */
+ pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);
+
+ /* Initialize GISTBuildBuffers with these parameters */
+ buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
+ gistGetMaxLevel(index));
+
+ gistInitParentMap(buildstate);
+
+ buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;
+
+ elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
+ levelStep, pagesPerBuffer);
+}
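To make the levelStep search above concrete, the standalone sketch below runs the same loop with made-up figures for the average and maximal fanout, effective_cache_size and maintenance_work_mem (none of them taken from a real build); it picks the largest levelStep whose average subtree still fits in a quarter of the cache and whose lowest-level page count fits in working memory.

/*
 * Standalone sketch (not part of the patch): pick the largest levelStep for
 * which an average subtree still fits in a quarter of the cache, and the
 * page pins needed at the lowest buffered level fit in maintenance_work_mem.
 * All input figures below are made-up assumptions for illustration.
 */
#include <math.h>
#include <stdio.h>

int
main(void)
{
    double      avgTuplesPerPage = 100.0;   /* assumed average fanout */
    double      maxTuplesPerPage = 250.0;   /* assumed maximal fanout */
    double      cachePages = 524288.0;      /* assumed effective_cache_size: 4 GB / 8 kB */
    double      workMemPages = 65536.0;     /* assumed maintenance_work_mem: 512 MB / 8 kB */
    int         levelStep = 1;

    for (;;)
    {
        /* pages in an average subtree of depth levelStep (geometric series) */
        double subtreesize =
            (1 - pow(avgTuplesPerPage, (double) (levelStep + 1))) /
            (1 - avgTuplesPerPage);

        /* worst-case number of pages at the subtree's lowest level */
        double maxlowestlevelpages = pow(maxTuplesPerPage, (double) levelStep);

        if (subtreesize > cachePages / 4)
            break;
        if (maxlowestlevelpages > workMemPages)
            break;
        levelStep++;
    }
    levelStep--;                /* back off to the last acceptable value */

    printf("chosen levelStep = %d\n", levelStep);   /* prints 2 for these inputs */
    return 0;
}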
+
+/*
+ * Calculate pagesPerBuffer parameter for the buffering algorithm.
+ *
+ * The buffer size is chosen so that, assuming tuples are distributed
+ * randomly, emptying half a buffer fills on average one page in every buffer
+ * at the next lower level.
+ */
+static int
+calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
+{
+ double pagesPerBuffer;
+ double avgIndexTuplesPerPage;
+ double itupAvgSize;
+ Size pageFreeSpace;
+
+	/* Calculate the space on an index page that is available for index tuples */
+ pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
+ - sizeof(ItemIdData)
+ - buildstate->freespace;
+
+ /*
+ * Calculate average size of already inserted index tuples using gathered
+ * statistics.
+ */
+ itupAvgSize = (double) buildstate->indtuplesSize /
+ (double) buildstate->indtuples;
+
+ avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
+
+ /*
+ * Recalculate required size of buffers.
+ */
+ pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
+
+ return (int) rint(pagesPerBuffer);
+}
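As a worked example with assumed numbers (continuing the hypothetical fanout of about 100 tuples per page and levelStep = 2 from the sketch above): pagesPerBuffer comes out as 2 * 100^2 = 20,000 temp-file pages per buffer, so emptying half a buffer pushes roughly 10,000 pages' worth of tuples down the tree; spread randomly over the roughly 100^2 buffers two levels below, that fills each of them by about one page on average, which is exactly the property the formula aims for.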
+
+/*
+ * Per-tuple callback for table_index_build_scan.
+ */
+static void
+gistBuildCallback(Relation index,
+ ItemPointer tid,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state)
+{
+ GISTBuildState *buildstate = (GISTBuildState *) state;
+ IndexTuple itup;
+ MemoryContext oldCtx;
+
+ oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
+ itup->t_tid = *tid;
+
+ if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
+ {
+ /* We have buffers, so use them. */
+ gistBufferingBuildInsert(buildstate, itup);
+ }
+ else
+ {
+ /*
+		 * There are no buffers (yet). Since we already have the index relation
+ * locked, we call gistdoinsert directly.
+ */
+ gistdoinsert(index, itup, buildstate->freespace,
+ buildstate->giststate, buildstate->heaprel, true);
+ }
+
+ /* Update tuple count and total size. */
+ buildstate->indtuples += 1;
+ buildstate->indtuplesSize += IndexTupleSize(itup);
+
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(buildstate->giststate->tempCxt);
+
+ if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
+ buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
+ {
+ /* Adjust the target buffer size now */
+ buildstate->gfbb->pagesPerBuffer =
+ calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
+ }
+
+ /*
+ * In 'auto' mode, check if the index has grown too large to fit in cache,
+ * and switch to buffering mode if it has.
+ *
+ * To avoid excessive calls to smgrnblocks(), only check this every
+	 * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
+ */
+ if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
+ buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
+ effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
+ (buildstate->bufferingMode == GIST_BUFFERING_STATS &&
+ buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
+ {
+ /*
+ * Index doesn't fit in effective cache anymore. Try to switch to
+ * buffering build mode.
+ */
+ gistInitBuffering(buildstate);
+ }
+}
+
+/*
+ * Insert function for buffering index build.
+ */
+static void
+gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
+{
+ /* Insert the tuple to buffers. */
+ gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);
+
+ /* If we filled up (half of a) buffer, process buffer emptying. */
+ gistProcessEmptyingQueue(buildstate);
+}
+
+/*
+ * Process an index tuple. Runs the tuple down the tree until we reach a leaf
+ * page or node buffer, and inserts the tuple there. Returns true if we have
+ * to stop the buffer emptying process (because one of the child buffers
+ * can't accept any more index tuples).
+ */
+static bool
+gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
+ BlockNumber startblkno, int startlevel)
+{
+ GISTSTATE *giststate = buildstate->giststate;
+ GISTBuildBuffers *gfbb = buildstate->gfbb;
+ Relation indexrel = buildstate->indexrel;
+ BlockNumber childblkno;
+ Buffer buffer;
+ bool result = false;
+ BlockNumber blkno;
+ int level;
+ OffsetNumber downlinkoffnum = InvalidOffsetNumber;
+ BlockNumber parentblkno = InvalidBlockNumber;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Loop until we reach a leaf page (level == 0) or a level with buffers
+ * (not including the level we start at, because we would otherwise make
+ * no progress).
+ */
+ blkno = startblkno;
+ level = startlevel;
+ for (;;)
+ {
+ ItemId iid;
+ IndexTuple idxtuple,
+ newtup;
+ Page page;
+ OffsetNumber childoffnum;
+
+ /* Have we reached a level with buffers? */
+ if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
+ break;
+
+ /* Have we reached a leaf page? */
+ if (level == 0)
+ break;
+
+ /*
+ * Nope. Descend down to the next level then. Choose a child to
+ * descend down to.
+ */
+
+ buffer = ReadBuffer(indexrel, blkno);
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+
+ page = (Page) BufferGetPage(buffer);
+ childoffnum = gistchoose(indexrel, page, itup, giststate);
+ iid = PageGetItemId(page, childoffnum);
+ idxtuple = (IndexTuple) PageGetItem(page, iid);
+ childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+ if (level > 1)
+ gistMemorizeParent(buildstate, childblkno, blkno);
+
+ /*
+ * Check that the key representing the target child node is consistent
+ * with the key we're inserting. Update it if it's not.
+ */
+ newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
+ if (newtup)
+ {
+ blkno = gistbufferinginserttuples(buildstate,
+ buffer,
+ level,
+ &newtup,
+ 1,
+ childoffnum,
+ InvalidBlockNumber,
+ InvalidOffsetNumber);
+ /* gistbufferinginserttuples() released the buffer */
+ }
+ else
+ UnlockReleaseBuffer(buffer);
+
+ /* Descend to the child */
+ parentblkno = blkno;
+ blkno = childblkno;
+ downlinkoffnum = childoffnum;
+ Assert(level > 0);
+ level--;
+ }
+
+ if (LEVEL_HAS_BUFFERS(level, gfbb))
+ {
+ /*
+ * We've reached level with buffers. Place the index tuple to the
+ * buffer, and add the buffer to the emptying queue if it overflows.
+ */
+ GISTNodeBuffer *childNodeBuffer;
+
+ /* Find the buffer or create a new one */
+ childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);
+
+ /* Add index tuple to it */
+ gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
+
+ if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
+ result = true;
+ }
+ else
+ {
+ /*
+ * We've reached a leaf page. Place the tuple here.
+ */
+ Assert(level == 0);
+ buffer = ReadBuffer(indexrel, blkno);
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ gistbufferinginserttuples(buildstate, buffer, level,
+ &itup, 1, InvalidOffsetNumber,
+ parentblkno, downlinkoffnum);
+ /* gistbufferinginserttuples() released the buffer */
+ }
+
+ return result;
+}
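While descending, gistchoose() picks the downlink whose key needs the least penalty to absorb the new tuple, and gistgetadjusted() widens that key if necessary. The standalone sketch below shows the penalty idea with 2D boxes and a made-up set of children: descend into the child whose bounding box grows least when the new point is added. The real penalty is computed by the opclass penalty support function, not by this box arithmetic.

/*
 * Standalone sketch (not part of the patch): choose the child whose bounding
 * box needs the smallest enlargement to cover a new point, which is the idea
 * behind the penalty-based child choice in gistchoose().
 */
#include <stdio.h>

typedef struct BoxKey
{
    double xlo, ylo, xhi, yhi;
} BoxKey;

static double
box_area(const BoxKey *b)
{
    return (b->xhi - b->xlo) * (b->yhi - b->ylo);
}

/* Area increase if the box were stretched to also cover point (x, y). */
static double
penalty(const BoxKey *b, double x, double y)
{
    BoxKey enlarged = *b;

    if (x < enlarged.xlo) enlarged.xlo = x;
    if (x > enlarged.xhi) enlarged.xhi = x;
    if (y < enlarged.ylo) enlarged.ylo = y;
    if (y > enlarged.yhi) enlarged.yhi = y;

    return box_area(&enlarged) - box_area(b);
}

int
main(void)
{
    BoxKey children[] = {
        {0, 0, 4, 4},           /* child 0 */
        {10, 10, 12, 12},       /* child 1 */
        {3, 3, 6, 8}            /* child 2: already covers the new point */
    };
    double newx = 5.0, newy = 5.0;
    int best = 0;

    for (int i = 1; i < 3; i++)
    {
        if (penalty(&children[i], newx, newy) < penalty(&children[best], newx, newy))
            best = i;
    }
    printf("descend into child %d (penalty %.1f)\n",
           best, penalty(&children[best], newx, newy));
    return 0;
}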
+
+/*
+ * Insert tuples to a given page.
+ *
+ * This is analogous to gistinserttuples() in the regular insertion code.
+ *
+ * Returns the block number of the page where the (first) new or updated tuple
+ * was inserted. Usually that's the original page, but might be a sibling page
+ * if the original page was split.
+ *
+ * Caller should hold a lock on 'buffer' on entry. This function will unlock
+ * and unpin it.
+ */
+static BlockNumber
+gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
+ IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
+ BlockNumber parentblk, OffsetNumber downlinkoffnum)
+{
+ GISTBuildBuffers *gfbb = buildstate->gfbb;
+ List *splitinfo;
+ bool is_split;
+ BlockNumber placed_to_blk = InvalidBlockNumber;
+
+ is_split = gistplacetopage(buildstate->indexrel,
+ buildstate->freespace,
+ buildstate->giststate,
+ buffer,
+ itup, ntup, oldoffnum, &placed_to_blk,
+ InvalidBuffer,
+ &splitinfo,
+ false,
+ buildstate->heaprel, true);
+
+ /*
+ * If this is a root split, update the root path item kept in memory. This
+ * ensures that all path stacks are always complete, including all parent
+	 * nodes up to the root. That simplifies the algorithm for re-finding the
+	 * correct parent.
+ */
+ if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
+ {
+ Page page = BufferGetPage(buffer);
+ OffsetNumber off;
+ OffsetNumber maxoff;
+
+ Assert(level == gfbb->rootlevel);
+ gfbb->rootlevel++;
+
+ elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
+
+ /*
+ * All the downlinks on the old root page are now on one of the child
+ * pages. Visit all the new child pages to memorize the parents of the
+ * grandchildren.
+ */
+ if (gfbb->rootlevel > 1)
+ {
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (off = FirstOffsetNumber; off <= maxoff; off++)
+ {
+ ItemId iid = PageGetItemId(page, off);
+ IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
+ BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+ Buffer childbuf = ReadBuffer(buildstate->indexrel, childblkno);
+
+ LockBuffer(childbuf, GIST_SHARE);
+ gistMemorizeAllDownlinks(buildstate, childbuf);
+ UnlockReleaseBuffer(childbuf);
+
+ /*
+ * Also remember that the parent of the new child page is the
+ * root block.
+ */
+ gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
+ }
+ }
+ }
+
+ if (splitinfo)
+ {
+ /*
+		 * Insert the downlinks into the parent. This is analogous to
+ * gistfinishsplit() in the regular insertion code, but the locking is
+ * simpler, and we have to maintain the buffers on internal nodes and
+ * the parent map.
+ */
+ IndexTuple *downlinks;
+ int ndownlinks,
+ i;
+ Buffer parentBuffer;
+ ListCell *lc;
+
+ /* Parent may have changed since we memorized this path. */
+ parentBuffer =
+ gistBufferingFindCorrectParent(buildstate,
+ BufferGetBlockNumber(buffer),
+ level,
+ &parentblk,
+ &downlinkoffnum);
+
+ /*
+ * If there's a buffer associated with this page, that needs to be
+ * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
+ * downlinks in 'splitinfo', to make sure they're consistent not only
+ * with the tuples already on the pages, but also the tuples in the
+ * buffers that will eventually be inserted to them.
+ */
+ gistRelocateBuildBuffersOnSplit(gfbb,
+ buildstate->giststate,
+ buildstate->indexrel,
+ level,
+ buffer, splitinfo);
+
+ /* Create an array of all the downlink tuples */
+ ndownlinks = list_length(splitinfo);
+ downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
+ i = 0;
+ foreach(lc, splitinfo)
+ {
+ GISTPageSplitInfo *splitinfo = lfirst(lc);
+
+ /*
+ * Remember the parent of each new child page in our parent map.
+ * This assumes that the downlinks fit on the parent page. If the
+ * parent page is split, too, when we recurse up to insert the
+ * downlinks, the recursive gistbufferinginserttuples() call will
+ * update the map again.
+ */
+ if (level > 0)
+ gistMemorizeParent(buildstate,
+ BufferGetBlockNumber(splitinfo->buf),
+ BufferGetBlockNumber(parentBuffer));
+
+ /*
+ * Also update the parent map for all the downlinks that got moved
+ * to a different page. (actually this also loops through the
+ * downlinks that stayed on the original page, but it does no
+ * harm).
+ */
+ if (level > 1)
+ gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
+
+ /*
+ * Since there's no concurrent access, we can release the lower
+ * level buffers immediately. This includes the original page.
+ */
+ UnlockReleaseBuffer(splitinfo->buf);
+ downlinks[i++] = splitinfo->downlink;
+ }
+
+ /* Insert them into parent. */
+ gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
+ downlinks, ndownlinks, downlinkoffnum,
+ InvalidBlockNumber, InvalidOffsetNumber);
+
+ list_free_deep(splitinfo); /* we don't need this anymore */
+ }
+ else
+ UnlockReleaseBuffer(buffer);
+
+ return placed_to_blk;
+}
+
+/*
+ * Find the downlink pointing to a child page.
+ *
+ * 'childblkno' indicates the child page to find the parent for. 'level' is
+ * the level of the child. On entry, *parentblkno and *downlinkoffnum can
+ * point to a location where the downlink used to be - we will check that
+ * location first, and save some cycles if it hasn't moved. The function
+ * returns a buffer containing the downlink, exclusively-locked, and
+ * *parentblkno and *downlinkoffnum are set to the real location of the
+ * downlink.
+ *
+ * If the child page is a leaf (level == 0), the caller must supply a correct
+ * parentblkno. Otherwise we use the parent map hash table to find the parent
+ * block.
+ *
+ * This function serves the same purpose as gistFindCorrectParent() during
+ * normal index inserts, but this is simpler because we don't need to deal
+ * with concurrent inserts.
+ */
+static Buffer
+gistBufferingFindCorrectParent(GISTBuildState *buildstate,
+ BlockNumber childblkno, int level,
+ BlockNumber *parentblkno,
+ OffsetNumber *downlinkoffnum)
+{
+ BlockNumber parent;
+ Buffer buffer;
+ Page page;
+ OffsetNumber maxoff;
+ OffsetNumber off;
+
+ if (level > 0)
+ parent = gistGetParent(buildstate, childblkno);
+ else
+ {
+ /*
+ * For a leaf page, the caller must supply a correct parent block
+ * number.
+ */
+ if (*parentblkno == InvalidBlockNumber)
+ elog(ERROR, "no parent buffer provided of child %d", childblkno);
+ parent = *parentblkno;
+ }
+
+ buffer = ReadBuffer(buildstate->indexrel, parent);
+ page = BufferGetPage(buffer);
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ gistcheckpage(buildstate->indexrel, buffer);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+	/* Check whether the downlink is still where it used to be */
+ if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
+ *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
+ {
+ ItemId iid = PageGetItemId(page, *downlinkoffnum);
+ IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+ if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
+ {
+ /* Still there */
+ return buffer;
+ }
+ }
+
+ /*
+ * Downlink was not at the offset where it used to be. Scan the page to
+ * find it. During normal gist insertions, it might've moved to another
+ * page, to the right, but during a buffering build, we keep track of the
+ * parent of each page in the lookup table so we should always know what
+ * page it's on.
+ */
+ for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
+ {
+ ItemId iid = PageGetItemId(page, off);
+ IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+ if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
+ {
+			/* found it */
+ *downlinkoffnum = off;
+ return buffer;
+ }
+ }
+
+ elog(ERROR, "failed to re-find parent for block %u", childblkno);
+ return InvalidBuffer; /* keep compiler quiet */
+}
+
+/*
+ * Process the buffer emptying queue. Emptying one buffer can cause other
+ * buffers to be queued for emptying. This function iterates until this
+ * cascading emptying process has finished, i.e. until the queue is empty.
+ */
+static void
+gistProcessEmptyingQueue(GISTBuildState *buildstate)
+{
+ GISTBuildBuffers *gfbb = buildstate->gfbb;
+
+	/* Iterate while there are elements in the buffer emptying queue. */
+ while (gfbb->bufferEmptyingQueue != NIL)
+ {
+ GISTNodeBuffer *emptyingNodeBuffer;
+
+		/* Get the next node buffer from the emptying queue. */
+ emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
+ gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
+ emptyingNodeBuffer->queuedForEmptying = false;
+
+ /*
+		 * We are going to load the last pages of the buffers that the tuples
+		 * will be emptied into, so unload any previously loaded buffers first.
+ */
+ gistUnloadNodeBuffers(gfbb);
+
+ /*
+ * Pop tuples from the buffer and run them down to the buffers at
+ * lower level, or leaf pages. We continue until one of the lower
+ * level buffers fills up, or this buffer runs empty.
+ *
+ * In Arge et al's paper, the buffer emptying is stopped after
+ * processing 1/2 node buffer worth of tuples, to avoid overfilling
+ * any of the lower level buffers. However, it's more efficient to
+ * keep going until one of the lower level buffers actually fills up,
+ * so that's what we do. This doesn't need to be exact, if a buffer
+ * overfills by a few tuples, there's no harm done.
+ */
+ while (true)
+ {
+ IndexTuple itup;
+
+ /* Get next index tuple from the buffer */
+ if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
+ break;
+
+ /*
+ * Run it down to the underlying node buffer or leaf page.
+ *
+ * Note: it's possible that the buffer we're emptying splits as a
+ * result of this call. If that happens, our emptyingNodeBuffer
+ * points to the left half of the split. After split, it's very
+ * likely that the new left buffer is no longer over the half-full
+ * threshold, but we might as well keep flushing tuples from it
+ * until we fill a lower-level buffer.
+ */
+ if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
+ {
+ /*
+ * A lower level buffer filled up. Stop emptying this buffer,
+ * to avoid overflowing the lower level buffer.
+ */
+ break;
+ }
+
+ /* Free all the memory allocated during index tuple processing */
+ MemoryContextReset(buildstate->giststate->tempCxt);
+ }
+ }
+}
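The emptying queue is a plain work list: draining one buffer can push tuples into lower buffers, and any of those that overflow are queued in turn, so the loop runs until nothing is left queued. The standalone sketch below models that cascade on a tiny made-up tree, with integer tuple counts standing in for real node buffers.

/*
 * Standalone sketch (not part of the patch): a cascading work queue. Each
 * node holds a tuple count; "emptying" a node pushes its tuples down to its
 * children, and any child that overflows is queued for emptying in turn.
 * The tree shape and thresholds are made up for illustration.
 */
#include <stdio.h>

#define NNODES     7
#define THRESHOLD  8

static int  tuples[NNODES];     /* tuples buffered at each node */
/* children of each node in a small complete binary tree, -1 = leaf */
static const int child[NNODES][2] = {
    {1, 2}, {3, 4}, {5, 6}, {-1, -1}, {-1, -1}, {-1, -1}, {-1, -1}
};

static int  queue[64];
static int  qhead = 0, qtail = 0;

static void
enqueue(int node)
{
    queue[qtail++] = node;
}

int
main(void)
{
    tuples[0] = 20;             /* root buffer starts overfull */
    enqueue(0);

    while (qhead < qtail)
    {
        int     node = queue[qhead++];

        if (child[node][0] < 0)
            continue;           /* leaf: tuples have reached their destination */

        /* split the buffered tuples between the two children */
        int     half = tuples[node] / 2;

        tuples[child[node][0]] += half;
        tuples[child[node][1]] += tuples[node] - half;
        tuples[node] = 0;

        /* queue any child buffer that overflowed, cascading the emptying */
        for (int i = 0; i < 2; i++)
        {
            if (tuples[child[node][i]] > THRESHOLD)
                enqueue(child[node][i]);
        }
    }

    for (int i = 0; i < NNODES; i++)
        printf("node %d: %d tuples\n", i, tuples[i]);
    return 0;
}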
+
+/*
+ * Empty all node buffers, from top to bottom. This is done at the end of
+ * index build to flush all remaining tuples to the index.
+ *
+ * Note: This destroys the buffersOnLevels lists, so no more tuples should
+ * be inserted into the buffers after this call.
+ */
+static void
+gistEmptyAllBuffers(GISTBuildState *buildstate)
+{
+ GISTBuildBuffers *gfbb = buildstate->gfbb;
+ MemoryContext oldCtx;
+ int i;
+
+ oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
+
+ /*
+ * Iterate through the levels from top to bottom.
+ */
+ for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
+ {
+ /*
+ * Empty all buffers on this level. Note that new buffers can pop up
+ * in the list during the processing, as a result of page splits, so a
+ * simple walk through the list won't work. We remove buffers from the
+ * list when we see them empty; a buffer can't become non-empty once
+ * it's been fully emptied.
+ */
+ while (gfbb->buffersOnLevels[i] != NIL)
+ {
+ GISTNodeBuffer *nodeBuffer;
+
+ nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
+
+ if (nodeBuffer->blocksCount != 0)
+ {
+ /*
+ * Add this buffer to the emptying queue, and proceed to empty
+ * the queue.
+ */
+ if (!nodeBuffer->queuedForEmptying)
+ {
+ MemoryContextSwitchTo(gfbb->context);
+ nodeBuffer->queuedForEmptying = true;
+ gfbb->bufferEmptyingQueue =
+ lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
+ MemoryContextSwitchTo(buildstate->giststate->tempCxt);
+ }
+ gistProcessEmptyingQueue(buildstate);
+ }
+ else
+ gfbb->buffersOnLevels[i] =
+ list_delete_first(gfbb->buffersOnLevels[i]);
+ }
+ elog(DEBUG2, "emptied all buffers at level %d", i);
+ }
+ MemoryContextSwitchTo(oldCtx);
+}
+
+/*
+ * Get the depth of the GiST index.
+ */
+static int
+gistGetMaxLevel(Relation index)
+{
+ int maxLevel;
+ BlockNumber blkno;
+
+ /*
+ * Traverse down the tree, starting from the root, until we hit the leaf
+ * level.
+ */
+ maxLevel = 0;
+ blkno = GIST_ROOT_BLKNO;
+ while (true)
+ {
+ Buffer buffer;
+ Page page;
+ IndexTuple itup;
+
+ buffer = ReadBuffer(index, blkno);
+
+ /*
+ * There's no concurrent access during index build, so locking is just
+ * pro forma.
+ */
+ LockBuffer(buffer, GIST_SHARE);
+ page = (Page) BufferGetPage(buffer);
+
+ if (GistPageIsLeaf(page))
+ {
+ /* We hit the bottom, so we're done. */
+ UnlockReleaseBuffer(buffer);
+ break;
+ }
+
+ /*
+ * Pick the first downlink on the page, and follow it. It doesn't
+ * matter which downlink we choose, the tree has the same depth
+ * everywhere, so we just pick the first one.
+ */
+ itup = (IndexTuple) PageGetItem(page,
+ PageGetItemId(page, FirstOffsetNumber));
+ blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+ UnlockReleaseBuffer(buffer);
+
+ /*
+		 * We're descending the tree, which means there is at least one more
+		 * level below this one.
+ */
+ maxLevel++;
+ }
+ return maxLevel;
+}
+
+
+/*
+ * Routines for managing the parent map.
+ *
+ * Whenever a page is split, we need to insert the downlinks into the parent.
+ * We need to somehow find the parent page to do that. In normal insertions,
+ * we keep a stack of nodes visited when we descend the tree. However, in
+ * buffering build, we can start descending the tree from any internal node,
+ * when we empty a buffer by cascading tuples to its children. So we don't
+ * have a full stack up to the root available at that time.
+ *
+ * So instead, we maintain a hash table to track the parent of every internal
+ * page. We don't need to track the parents of leaf nodes, however. Whenever
+ * we insert to a leaf, we've just descended down from its parent, so we know
+ * its immediate parent already. This helps a lot to limit the memory used
+ * by this hash table.
+ *
+ * Whenever an internal node is split, the parent map needs to be updated.
+ * The parent of the new child page needs to be recorded, and the entries
+ * for all pages whose downlinks are moved to a new page by the split need
+ * to be updated as well.
+ *
+ * We also update the parent map whenever we descend the tree. That might seem
+ * unnecessary, because we maintain the map whenever a downlink is moved or
+ * created, but it is needed because we switch to buffering mode after
+ * creating a tree with regular index inserts. Any pages created before
+ * switching to buffering mode will not be present in the parent map initially,
+ * but will be added there the first time we visit them.
+ */
+
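The sketch below shows the child-to-parent map in isolation as a toy open-addressing table keyed by block number; the real code uses the backend's dynahash facility (hash_create()/hash_search()) as set up in gistInitParentMap() below, so the hashing details here are illustrative only.

/*
 * Standalone sketch (not part of the patch): a minimal child-block ->
 * parent-block map with open addressing. The real code uses the backend's
 * dynahash facility (hash_create/hash_search) for the same purpose.
 */
#include <stdint.h>
#include <stdio.h>

#define MAP_SIZE 1024           /* must be a power of two */
#define NO_BLOCK UINT32_MAX     /* marks an unused slot */

typedef uint32_t BlockNo;

typedef struct ParentMapSlot
{
    BlockNo     child;
    BlockNo     parent;
} ParentMapSlot;

static ParentMapSlot map[MAP_SIZE];

/* Find the slot holding 'child', or the free slot where it would go. */
static unsigned
slot_for(BlockNo child)
{
    unsigned    i = (unsigned) (child * 2654435761u) & (MAP_SIZE - 1);

    while (map[i].child != NO_BLOCK && map[i].child != child)
        i = (i + 1) & (MAP_SIZE - 1);
    return i;
}

/* analogous to gistMemorizeParent(): overwrites any previous entry */
static void
memorize_parent(BlockNo child, BlockNo parent)
{
    unsigned    i = slot_for(child);

    map[i].child = child;
    map[i].parent = parent;
}

/* analogous to gistGetParent(): returns NO_BLOCK if the child is unknown */
static BlockNo
get_parent(BlockNo child)
{
    unsigned    i = slot_for(child);

    return map[i].child == child ? map[i].parent : NO_BLOCK;
}

int
main(void)
{
    for (int i = 0; i < MAP_SIZE; i++)
        map[i].child = NO_BLOCK;

    memorize_parent(42, 7);
    memorize_parent(43, 7);
    memorize_parent(42, 9);     /* page 42 moved under a new parent */

    printf("parent of 42 is %u\n", get_parent(42));     /* prints 9 */
    printf("parent of 43 is %u\n", get_parent(43));     /* prints 7 */
    return 0;
}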
+typedef struct
+{
+ BlockNumber childblkno; /* hash key */
+ BlockNumber parentblkno;
+} ParentMapEntry;
+
+static void
+gistInitParentMap(GISTBuildState *buildstate)
+{
+ HASHCTL hashCtl;
+
+ hashCtl.keysize = sizeof(BlockNumber);
+ hashCtl.entrysize = sizeof(ParentMapEntry);
+ hashCtl.hcxt = CurrentMemoryContext;
+ buildstate->parentMap = hash_create("gistbuild parent map",
+ 1024,
+ &hashCtl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+}
+
+static void
+gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
+{
+ ParentMapEntry *entry;
+ bool found;
+
+ entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
+ (const void *) &child,
+ HASH_ENTER,
+ &found);
+ entry->parentblkno = parent;
+}
+
+/*
+ * Scan all downlinks on a page, and memorize their parent.
+ */
+static void
+gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
+{
+ OffsetNumber maxoff;
+ OffsetNumber off;
+ BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
+ Page page = BufferGetPage(parentbuf);
+
+ Assert(!GistPageIsLeaf(page));
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (off = FirstOffsetNumber; off <= maxoff; off++)
+ {
+ ItemId iid = PageGetItemId(page, off);
+ IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
+ BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+ gistMemorizeParent(buildstate, childblkno, parentblkno);
+ }
+}
+
+static BlockNumber
+gistGetParent(GISTBuildState *buildstate, BlockNumber child)
+{
+ ParentMapEntry *entry;
+ bool found;
+
+ /* Find node buffer in hash table */
+ entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
+ (const void *) &child,
+ HASH_FIND,
+ &found);
+ if (!found)
+ elog(ERROR, "could not find parent of block %d in lookup table", child);
+
+ return entry->parentblkno;
+}
diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c
new file mode 100644
index 0000000..4ad67c8
--- /dev/null
+++ b/src/backend/access/gist/gistbuildbuffers.c
@@ -0,0 +1,776 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistbuildbuffers.c
+ * node buffer management functions for GiST buffering build algorithm.
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistbuildbuffers.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/gist_private.h"
+#include "catalog/index.h"
+#include "miscadmin.h"
+#include "storage/buffile.h"
+#include "storage/bufmgr.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+static GISTNodeBufferPage *gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb);
+static void gistAddLoadedBuffer(GISTBuildBuffers *gfbb,
+ GISTNodeBuffer *nodeBuffer);
+static void gistLoadNodeBuffer(GISTBuildBuffers *gfbb,
+ GISTNodeBuffer *nodeBuffer);
+static void gistUnloadNodeBuffer(GISTBuildBuffers *gfbb,
+ GISTNodeBuffer *nodeBuffer);
+static void gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer,
+ IndexTuple item);
+static void gistGetItupFromPage(GISTNodeBufferPage *pageBuffer,
+ IndexTuple *item);
+static long gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb);
+static void gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum);
+
+static void ReadTempFileBlock(BufFile *file, long blknum, void *ptr);
+static void WriteTempFileBlock(BufFile *file, long blknum, void *ptr);
+
+
+/*
+ * Initialize GiST build buffers.
+ */
+GISTBuildBuffers *
+gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel)
+{
+ GISTBuildBuffers *gfbb;
+ HASHCTL hashCtl;
+
+ gfbb = palloc(sizeof(GISTBuildBuffers));
+ gfbb->pagesPerBuffer = pagesPerBuffer;
+ gfbb->levelStep = levelStep;
+
+ /*
+ * Create a temporary file to hold buffer pages that are swapped out of
+ * memory.
+ */
+ gfbb->pfile = BufFileCreateTemp(false);
+ gfbb->nFileBlocks = 0;
+
+ /* Initialize free page management. */
+ gfbb->nFreeBlocks = 0;
+ gfbb->freeBlocksLen = 32;
+ gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long));
+
+ /*
+ * The current memory context will be used for all in-memory data
+ * structures of buffers that must persist for the duration of the
+ * buffering build.
+ */
+ gfbb->context = CurrentMemoryContext;
+
+ /*
+ * The nodeBuffersTab hash table maps index block numbers to their node
+ * buffers.
+ */
+ memset(&hashCtl, 0, sizeof(hashCtl));
+ hashCtl.keysize = sizeof(BlockNumber);
+ hashCtl.entrysize = sizeof(GISTNodeBuffer);
+ hashCtl.hcxt = CurrentMemoryContext;
+ gfbb->nodeBuffersTab = hash_create("gistbuildbuffers",
+ 1024,
+ &hashCtl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+ gfbb->bufferEmptyingQueue = NIL;
+
+ /*
+ * Per-level lists of node buffers, for the final buffer-emptying process.
+ * Node buffers are added here when they are created.
+ */
+ gfbb->buffersOnLevelsLen = 1;
+ gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) *
+ gfbb->buffersOnLevelsLen);
+ gfbb->buffersOnLevels[0] = NIL;
+
+ /*
+ * Array of node buffers whose last pages are currently loaded into main
+ * memory.
+ */
+ gfbb->loadedBuffersLen = 32;
+ gfbb->loadedBuffers = (GISTNodeBuffer **) palloc(gfbb->loadedBuffersLen *
+ sizeof(GISTNodeBuffer *));
+ gfbb->loadedBuffersCount = 0;
+
+ gfbb->rootlevel = maxLevel;
+
+ return gfbb;
+}
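+
+/*
+ * Rough lifecycle sketch of these structures (illustrative only; the real
+ * control flow lives in gistbuild.c):
+ *
+ *    gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep, maxLevel);
+ *    buf = gistGetNodeBuffer(gfbb, giststate, blkno, level);
+ *    gistPushItupToNodeBuffer(gfbb, buf, itup);
+ *    ...
+ *    while (gistPopItupFromNodeBuffer(gfbb, buf, &itup))
+ *        ... cascade itup down to the buffers of blkno's children ...
+ *    gistUnloadNodeBuffers(gfbb);    -- flush in-memory pages to temp file
+ *    gistFreeBuildBuffers(gfbb);
+ */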
+
+/*
+ * Returns a node buffer for the given block. The buffer is created if it
+ * doesn't exist yet.
+ */
+GISTNodeBuffer *
+gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
+ BlockNumber nodeBlocknum, int level)
+{
+ GISTNodeBuffer *nodeBuffer;
+ bool found;
+
+ /* Find node buffer in hash table */
+ nodeBuffer = (GISTNodeBuffer *) hash_search(gfbb->nodeBuffersTab,
+ (const void *) &nodeBlocknum,
+ HASH_ENTER,
+ &found);
+ if (!found)
+ {
+ /*
+ * Node buffer wasn't found. Initialize the new buffer as empty.
+ */
+ MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
+
+ /* nodeBuffer->nodeBlocknum is the hash key and was filled in already */
+ nodeBuffer->blocksCount = 0;
+ nodeBuffer->pageBlocknum = InvalidBlockNumber;
+ nodeBuffer->pageBuffer = NULL;
+ nodeBuffer->queuedForEmptying = false;
+ nodeBuffer->isTemp = false;
+ nodeBuffer->level = level;
+
+ /*
+ * Add this buffer to the list of buffers on this level. Enlarge
+ * buffersOnLevels array if needed.
+ */
+ if (level >= gfbb->buffersOnLevelsLen)
+ {
+ int i;
+
+ gfbb->buffersOnLevels =
+ (List **) repalloc(gfbb->buffersOnLevels,
+ (level + 1) * sizeof(List *));
+
+ /* initialize the enlarged portion */
+ for (i = gfbb->buffersOnLevelsLen; i <= level; i++)
+ gfbb->buffersOnLevels[i] = NIL;
+ gfbb->buffersOnLevelsLen = level + 1;
+ }
+
+ /*
+ * Prepend the new buffer to the list of buffers on this level. Putting
+ * the new buffer at the beginning of the list is deliberate: in the
+ * final emptying phase we loop through all buffers at each level and
+ * flush them. If a page is split during the emptying, it's more
+ * efficient to flush the newly split pages first, before moving on to
+ * pre-existing pages on the level. The buffers just created during the
+ * page split are likely still in cache, so flushing them immediately is
+ * more efficient than putting them at the end of the queue.
+ */
+ gfbb->buffersOnLevels[level] = lcons(nodeBuffer,
+ gfbb->buffersOnLevels[level]);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ return nodeBuffer;
+}
+
+/*
+ * Allocate memory for a buffer page.
+ */
+static GISTNodeBufferPage *
+gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb)
+{
+ GISTNodeBufferPage *pageBuffer;
+
+ pageBuffer = (GISTNodeBufferPage *) MemoryContextAllocZero(gfbb->context,
+ BLCKSZ);
+ pageBuffer->prev = InvalidBlockNumber;
+
+ /* Set page free space */
+ PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET;
+ return pageBuffer;
+}
+
+/*
+ * Add specified buffer into loadedBuffers array.
+ */
+static void
+gistAddLoadedBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
+{
+ /* Never add a temporary buffer to the array */
+ if (nodeBuffer->isTemp)
+ return;
+
+ /* Enlarge the array if needed */
+ if (gfbb->loadedBuffersCount >= gfbb->loadedBuffersLen)
+ {
+ gfbb->loadedBuffersLen *= 2;
+ gfbb->loadedBuffers = (GISTNodeBuffer **)
+ repalloc(gfbb->loadedBuffers,
+ gfbb->loadedBuffersLen * sizeof(GISTNodeBuffer *));
+ }
+
+ gfbb->loadedBuffers[gfbb->loadedBuffersCount] = nodeBuffer;
+ gfbb->loadedBuffersCount++;
+}
+
+/*
+ * Load last page of node buffer into main memory.
+ */
+static void
+gistLoadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
+{
+ /* Check if we really should load something */
+ if (!nodeBuffer->pageBuffer && nodeBuffer->blocksCount > 0)
+ {
+ /* Allocate memory for page */
+ nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
+
+ /* Read block from temporary file */
+ ReadTempFileBlock(gfbb->pfile, nodeBuffer->pageBlocknum,
+ nodeBuffer->pageBuffer);
+
+ /* Mark file block as free */
+ gistBuffersReleaseBlock(gfbb, nodeBuffer->pageBlocknum);
+
+ /* Mark node buffer as loaded */
+ gistAddLoadedBuffer(gfbb, nodeBuffer);
+ nodeBuffer->pageBlocknum = InvalidBlockNumber;
+ }
+}
+
+/*
+ * Write last page of node buffer to the disk.
+ */
+static void
+gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
+{
+ /* Check if we have something to write */
+ if (nodeBuffer->pageBuffer)
+ {
+ BlockNumber blkno;
+
+ /* Get free file block */
+ blkno = gistBuffersGetFreeBlock(gfbb);
+
+ /* Write block to the temporary file */
+ WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer);
+
+ /* Free memory of that page */
+ pfree(nodeBuffer->pageBuffer);
+ nodeBuffer->pageBuffer = NULL;
+
+ /* Save block number */
+ nodeBuffer->pageBlocknum = blkno;
+ }
+}
+
+/*
+ * Write last pages of all node buffers to the disk.
+ */
+void
+gistUnloadNodeBuffers(GISTBuildBuffers *gfbb)
+{
+ int i;
+
+ /* Unload all the buffers that have a page loaded in memory. */
+ for (i = 0; i < gfbb->loadedBuffersCount; i++)
+ gistUnloadNodeBuffer(gfbb, gfbb->loadedBuffers[i]);
+
+ /* Now there are no node buffers with loaded last page */
+ gfbb->loadedBuffersCount = 0;
+}
+
+/*
+ * Add index tuple to buffer page.
+ */
+static void
+gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, IndexTuple itup)
+{
+ Size itupsz = IndexTupleSize(itup);
+ char *ptr;
+
+ /* There should be enough space. */
+ Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(itupsz));
+
+ /* Reduce free space value of page to reserve a spot for the tuple. */
+ PAGE_FREE_SPACE(pageBuffer) -= MAXALIGN(itupsz);
+
+ /* Get pointer to the spot we reserved (ie. end of free space). */
+ ptr = (char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET
+ + PAGE_FREE_SPACE(pageBuffer);
+
+ /* Copy the index tuple there. */
+ memcpy(ptr, itup, itupsz);
+}
+
+/*
+ * Get last item from buffer page and remove it from page.
+ */
+static void
+gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, IndexTuple *itup)
+{
+ IndexTuple ptr;
+ Size itupsz;
+
+ Assert(!PAGE_IS_EMPTY(pageBuffer)); /* Page shouldn't be empty */
+
+ /* Get pointer to last index tuple */
+ ptr = (IndexTuple) ((char *) pageBuffer
+ + BUFFER_PAGE_DATA_OFFSET
+ + PAGE_FREE_SPACE(pageBuffer));
+ itupsz = IndexTupleSize(ptr);
+
+ /* Make a copy of the tuple */
+ *itup = (IndexTuple) palloc(itupsz);
+ memcpy(*itup, ptr, itupsz);
+
+ /* Mark the space used by the tuple as free */
+ PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(itupsz);
+}
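+
+/*
+ * Worked example of the free-space bookkeeping above (numbers assume the
+ * default BLCKSZ of 8192 and 8-byte MAXALIGN): an empty buffer page starts
+ * with PAGE_FREE_SPACE = 8192 - BUFFER_PAGE_DATA_OFFSET. Placing a 36-byte
+ * tuple shrinks that by MAXALIGN(36) = 40, and the tuple is copied at offset
+ * BUFFER_PAGE_DATA_OFFSET + PAGE_FREE_SPACE, i.e. flush against the end of
+ * the page; the next tuple lands just below it. gistGetItupFromPage() then
+ * pops tuples in the reverse (LIFO) order, growing PAGE_FREE_SPACE back as
+ * it goes.
+ */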
+
+/*
+ * Push an index tuple to node buffer.
+ */
+void
+gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer,
+ IndexTuple itup)
+{
+ /*
+ * Most of the memory operations here must happen in the buffering
+ * build's persistent context, so switch to it.
+ */
+ MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
+
+ /*
+ * If the buffer is currently empty, create the first page.
+ */
+ if (nodeBuffer->blocksCount == 0)
+ {
+ nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
+ nodeBuffer->blocksCount = 1;
+ gistAddLoadedBuffer(gfbb, nodeBuffer);
+ }
+
+ /* Load last page of node buffer if it wasn't in memory already */
+ if (!nodeBuffer->pageBuffer)
+ gistLoadNodeBuffer(gfbb, nodeBuffer);
+
+ /*
+ * Check if there is enough space on the last page for the tuple.
+ */
+ if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup))
+ {
+ /*
+ * Nope. Swap previous block to disk and allocate a new one.
+ */
+ BlockNumber blkno;
+
+ /* Write filled page to the disk */
+ blkno = gistBuffersGetFreeBlock(gfbb);
+ WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer);
+
+ /*
+ * Reset the in-memory page as empty, and link the previous block to
+ * the new page by storing its block number in the prev-link.
+ */
+ PAGE_FREE_SPACE(nodeBuffer->pageBuffer) =
+ BLCKSZ - MAXALIGN(offsetof(GISTNodeBufferPage, tupledata));
+ nodeBuffer->pageBuffer->prev = blkno;
+
+ /* We've just added one more page */
+ nodeBuffer->blocksCount++;
+ }
+
+ gistPlaceItupToPage(nodeBuffer->pageBuffer, itup);
+
+ /*
+ * If the buffer just overflowed, add it to the emptying queue.
+ */
+ if (BUFFER_OVERFLOWED(nodeBuffer, gfbb) && !nodeBuffer->queuedForEmptying)
+ {
+ gfbb->bufferEmptyingQueue = lcons(nodeBuffer,
+ gfbb->bufferEmptyingQueue);
+ nodeBuffer->queuedForEmptying = true;
+ }
+
+ /* Restore memory context */
+ MemoryContextSwitchTo(oldcxt);
+}
+
+/*
+ * Remove one index tuple from the node buffer. Returns true on success, or
+ * false if the node buffer is empty.
+ */
+bool
+gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer,
+ IndexTuple *itup)
+{
+ /*
+ * If node buffer is empty then return false.
+ */
+ if (nodeBuffer->blocksCount <= 0)
+ return false;
+
+ /* Load last page of node buffer if needed */
+ if (!nodeBuffer->pageBuffer)
+ gistLoadNodeBuffer(gfbb, nodeBuffer);
+
+ /*
+ * Get index tuple from last non-empty page.
+ */
+ gistGetItupFromPage(nodeBuffer->pageBuffer, itup);
+
+ /*
+ * If we just removed the last tuple from the page, fetch previous page on
+ * this node buffer (if any).
+ */
+ if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer))
+ {
+ BlockNumber prevblkno;
+
+ /*
+ * blocksCount includes the page in pageBuffer, so decrease it now.
+ */
+ nodeBuffer->blocksCount--;
+
+ /*
+ * If there are more pages, fetch the previous one.
+ */
+ prevblkno = nodeBuffer->pageBuffer->prev;
+ if (prevblkno != InvalidBlockNumber)
+ {
+ /* There is a previous page. Fetch it. */
+ Assert(nodeBuffer->blocksCount > 0);
+ ReadTempFileBlock(gfbb->pfile, prevblkno, nodeBuffer->pageBuffer);
+
+ /*
+ * Now that we've read the block in memory, we can release its
+ * on-disk block for reuse.
+ */
+ gistBuffersReleaseBlock(gfbb, prevblkno);
+ }
+ else
+ {
+ /* No more pages. Free memory. */
+ Assert(nodeBuffer->blocksCount == 0);
+ pfree(nodeBuffer->pageBuffer);
+ nodeBuffer->pageBuffer = NULL;
+ }
+ }
+ return true;
+}
+
+/*
+ * Select a currently unused block for writing to.
+ */
+static long
+gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb)
+{
+ /*
+ * If there are multiple free blocks, we select the one appearing last in
+ * freeBlocks[]. If there are none, assign the next block at the end of
+ * the file (causing the file to be extended).
+ */
+ if (gfbb->nFreeBlocks > 0)
+ return gfbb->freeBlocks[--gfbb->nFreeBlocks];
+ else
+ return gfbb->nFileBlocks++;
+}
+
+/*
+ * Return a block# to the freelist.
+ */
+static void
+gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum)
+{
+ int ndx;
+
+ /* Enlarge freeBlocks array if full. */
+ if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen)
+ {
+ gfbb->freeBlocksLen *= 2;
+ gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks,
+ gfbb->freeBlocksLen *
+ sizeof(long));
+ }
+
+ /* Add blocknum to array */
+ ndx = gfbb->nFreeBlocks++;
+ gfbb->freeBlocks[ndx] = blocknum;
+}
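+
+/*
+ * Illustrative example of the free-block bookkeeping: a fresh build starts
+ * with nFileBlocks = 0, so the first three writes are assigned blocks 0, 1
+ * and 2. If block 1 is later released, freeBlocks[] holds {1} and the next
+ * gistBuffersGetFreeBlock() call hands block 1 back out instead of extending
+ * the temporary file to block 3.
+ */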
+
+/*
+ * Free buffering build data structure.
+ */
+void
+gistFreeBuildBuffers(GISTBuildBuffers *gfbb)
+{
+ /* Close buffers file. */
+ BufFileClose(gfbb->pfile);
+
+ /* All other things will be freed on memory context release */
+}
+
+/*
+ * Data structure representing information about a target node buffer, used
+ * when relocating index tuples from a split node's buffer.
+ */
+typedef struct
+{
+ GISTENTRY entry[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ GISTPageSplitInfo *splitinfo;
+ GISTNodeBuffer *nodeBuffer;
+} RelocationBufferInfo;
+
+/*
+ * At page split, distribute tuples from the buffer of the split page to
+ * new buffers for the created page halves. This also adjusts the downlinks
+ * in 'splitinfo' to include the tuples in the buffers.
+ */
+void
+gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
+ Relation r, int level,
+ Buffer buffer, List *splitinfo)
+{
+ RelocationBufferInfo *relocationBuffersInfos;
+ bool found;
+ GISTNodeBuffer *nodeBuffer;
+ BlockNumber blocknum;
+ IndexTuple itup;
+ int splitPagesCount = 0,
+ i;
+ GISTENTRY entry[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ GISTNodeBuffer oldBuf;
+ ListCell *lc;
+
+ /* If the split page doesn't have buffers, we have nothing to do. */
+ if (!LEVEL_HAS_BUFFERS(level, gfbb))
+ return;
+
+ /*
+ * Get the node buffer of the split page.
+ */
+ blocknum = BufferGetBlockNumber(buffer);
+ nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum,
+ HASH_FIND, &found);
+ if (!found)
+ {
+ /* The page has no buffer, so we have nothing to do. */
+ return;
+ }
+
+ /*
+ * Make a copy of the old buffer, as we're going to reuse it as the buffer
+ * for the new left page, which is on the same block as the old page.
+ * That's not true for the root page, but that's fine because we never
+ * have a buffer on the root page anyway. The original algorithm as
+ * described by Arge et al did keep a root buffer, but it's of no use, as
+ * you might as well read the tuples straight from the heap instead of
+ * the root buffer.
+ */
+ Assert(blocknum != GIST_ROOT_BLKNO);
+ memcpy(&oldBuf, nodeBuffer, sizeof(GISTNodeBuffer));
+ oldBuf.isTemp = true;
+
+ /* Reset the old buffer, used for the new left page from now on */
+ nodeBuffer->blocksCount = 0;
+ nodeBuffer->pageBuffer = NULL;
+ nodeBuffer->pageBlocknum = InvalidBlockNumber;
+
+ /*
+ * Allocate memory for information about relocation buffers.
+ */
+ splitPagesCount = list_length(splitinfo);
+ relocationBuffersInfos =
+ (RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) *
+ splitPagesCount);
+
+ /*
+ * Fill relocation buffers information for node buffers of pages produced
+ * by split.
+ */
+ i = 0;
+ foreach(lc, splitinfo)
+ {
+ GISTPageSplitInfo *si = (GISTPageSplitInfo *) lfirst(lc);
+ GISTNodeBuffer *newNodeBuffer;
+
+ /* Decompress parent index tuple of node buffer page. */
+ gistDeCompressAtt(giststate, r,
+ si->downlink, NULL, (OffsetNumber) 0,
+ relocationBuffersInfos[i].entry,
+ relocationBuffersInfos[i].isnull);
+
+ /*
+ * Create a node buffer for the page. The leftmost half is on the same
+ * block as the old page before split, so for the leftmost half this
+ * will return the original buffer. The tuples on the original buffer
+ * were relinked to the temporary buffer, so the original one is now
+ * empty.
+ */
+ newNodeBuffer = gistGetNodeBuffer(gfbb, giststate, BufferGetBlockNumber(si->buf), level);
+
+ relocationBuffersInfos[i].nodeBuffer = newNodeBuffer;
+ relocationBuffersInfos[i].splitinfo = si;
+
+ i++;
+ }
+
+ /*
+ * Loop through all index tuples in the buffer of the page being split,
+ * moving them to buffers for the new pages. We try to move each tuple to
+ * the page that will result in the lowest penalty for the leading column
+ * or, in the case of a tie, the lowest penalty for the earliest column
+ * that is not tied.
+ *
+ * The page searching logic is very similar to gistchoose().
+ */
+ while (gistPopItupFromNodeBuffer(gfbb, &oldBuf, &itup))
+ {
+ float best_penalty[INDEX_MAX_KEYS];
+ int i,
+ which;
+ IndexTuple newtup;
+ RelocationBufferInfo *targetBufferInfo;
+
+ gistDeCompressAtt(giststate, r,
+ itup, NULL, (OffsetNumber) 0, entry, isnull);
+
+ /* default to using first page (shouldn't matter) */
+ which = 0;
+
+ /*
+ * best_penalty[j] is the best penalty we have seen so far for column
+ * j, or -1 when we haven't yet examined column j. Array entries to
+ * the right of the first -1 are undefined.
+ */
+ best_penalty[0] = -1;
+
+ /*
+ * Loop over possible target pages, looking for one to move this tuple
+ * to.
+ */
+ for (i = 0; i < splitPagesCount; i++)
+ {
+ RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i];
+ bool zero_penalty;
+ int j;
+
+ zero_penalty = true;
+
+ /* Loop over index attributes. */
+ for (j = 0; j < IndexRelationGetNumberOfKeyAttributes(r); j++)
+ {
+ float usize;
+
+ /* Compute penalty for this column. */
+ usize = gistpenalty(giststate, j,
+ &splitPageInfo->entry[j],
+ splitPageInfo->isnull[j],
+ &entry[j], isnull[j]);
+ if (usize > 0)
+ zero_penalty = false;
+
+ if (best_penalty[j] < 0 || usize < best_penalty[j])
+ {
+ /*
+ * New best penalty for column. Tentatively select this
+ * page as the target, and record the best penalty. Then
+ * reset the next column's penalty to "unknown" (and
+ * indirectly, the same for all the ones to its right).
+ * This will force us to adopt this page's penalty values
+ * as the best for all the remaining columns during
+ * subsequent loop iterations.
+ */
+ which = i;
+ best_penalty[j] = usize;
+
+ if (j < IndexRelationGetNumberOfKeyAttributes(r) - 1)
+ best_penalty[j + 1] = -1;
+ }
+ else if (best_penalty[j] == usize)
+ {
+ /*
+ * The current page is exactly as good for this column as
+ * the best page seen so far. The next iteration of this
+ * loop will compare the next column.
+ */
+ }
+ else
+ {
+ /*
+ * The current page is worse for this column than the best
+ * page seen so far. Skip the remaining columns and move
+ * on to the next page, if any.
+ */
+ zero_penalty = false; /* so outer loop won't exit */
+ break;
+ }
+ }
+
+ /*
+ * If we find a page with zero penalty for all columns, there's no
+ * need to examine remaining pages; just break out of the loop and
+ * return it.
+ */
+ if (zero_penalty)
+ break;
+ }
+
+ /* OK, "which" is the page index to push the tuple to */
+ targetBufferInfo = &relocationBuffersInfos[which];
+
+ /* Push item to selected node buffer */
+ gistPushItupToNodeBuffer(gfbb, targetBufferInfo->nodeBuffer, itup);
+
+ /* Adjust the downlink for this page, if needed. */
+ newtup = gistgetadjusted(r, targetBufferInfo->splitinfo->downlink,
+ itup, giststate);
+ if (newtup)
+ {
+ gistDeCompressAtt(giststate, r,
+ newtup, NULL, (OffsetNumber) 0,
+ targetBufferInfo->entry,
+ targetBufferInfo->isnull);
+
+ targetBufferInfo->splitinfo->downlink = newtup;
+ }
+ }
+
+ pfree(relocationBuffersInfos);
+}
+
+
+/*
+ * Wrappers around BufFile operations. The main difference is that these
+ * wrappers report errors with ereport(), so that the callers don't need
+ * to check the return code.
+ */
+
+static void
+ReadTempFileBlock(BufFile *file, long blknum, void *ptr)
+{
+ size_t nread;
+
+ if (BufFileSeekBlock(file, blknum) != 0)
+ elog(ERROR, "could not seek to block %ld in temporary file", blknum);
+ nread = BufFileRead(file, ptr, BLCKSZ);
+ if (nread != BLCKSZ)
+ elog(ERROR, "could not read temporary file: read only %zu of %zu bytes",
+ nread, (size_t) BLCKSZ);
+}
+
+static void
+WriteTempFileBlock(BufFile *file, long blknum, void *ptr)
+{
+ if (BufFileSeekBlock(file, blknum) != 0)
+ elog(ERROR, "could not seek to block %ld in temporary file", blknum);
+ BufFileWrite(file, ptr, BLCKSZ);
+}
diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
new file mode 100644
index 0000000..a4edcc7
--- /dev/null
+++ b/src/backend/access/gist/gistget.c
@@ -0,0 +1,803 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistget.c
+ * fetch tuples from a GiST scan.
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistget.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/gist_private.h"
+#include "access/relscan.h"
+#include "lib/pairingheap.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "utils/float.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+/*
+ * gistkillitems() -- set LP_DEAD state for items an indexscan caller has
+ * told us were killed.
+ *
+ * We re-read page here, so it's important to check page LSN. If the page
+ * has been modified since the last read (as determined by LSN), we cannot
+ * flag any entries because it is possible that the old entry was vacuumed
+ * away and the TID was re-used by a completely different heap tuple.
+ */
+static void
+gistkillitems(IndexScanDesc scan)
+{
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+ Buffer buffer;
+ Page page;
+ OffsetNumber offnum;
+ ItemId iid;
+ int i;
+ bool killedsomething = false;
+
+ Assert(so->curBlkno != InvalidBlockNumber);
+ Assert(!XLogRecPtrIsInvalid(so->curPageLSN));
+ Assert(so->killedItems != NULL);
+
+ buffer = ReadBuffer(scan->indexRelation, so->curBlkno);
+ if (!BufferIsValid(buffer))
+ return;
+
+ LockBuffer(buffer, GIST_SHARE);
+ gistcheckpage(scan->indexRelation, buffer);
+ page = BufferGetPage(buffer);
+
+ /*
+ * If the page LSN differs, the page was modified since the last read.
+ * The killedItems offsets might no longer be valid, so it is not safe
+ * to apply LP_DEAD hints.
+ */
+ if (BufferGetLSNAtomic(buffer) != so->curPageLSN)
+ {
+ UnlockReleaseBuffer(buffer);
+ so->numKilled = 0; /* reset counter */
+ return;
+ }
+
+ Assert(GistPageIsLeaf(page));
+
+ /*
+ * Mark all killedItems as dead. We need no additional recheck, because,
+ * if page was modified, curPageLSN must have changed.
+ */
+ for (i = 0; i < so->numKilled; i++)
+ {
+ offnum = so->killedItems[i];
+ iid = PageGetItemId(page, offnum);
+ ItemIdMarkDead(iid);
+ killedsomething = true;
+ }
+
+ if (killedsomething)
+ {
+ GistMarkPageHasGarbage(page);
+ MarkBufferDirtyHint(buffer, true);
+ }
+
+ UnlockReleaseBuffer(buffer);
+
+ /*
+ * Always reset the scan state, so we don't look for the same items on
+ * other pages.
+ */
+ so->numKilled = 0;
+}
+
+/*
+ * gistindex_keytest() -- does this index tuple satisfy the scan key(s)?
+ *
+ * The index tuple might represent either a heap tuple or a lower index page,
+ * depending on whether the containing page is a leaf page or not.
+ *
+ * On success return for a heap tuple, *recheck_p is set to indicate whether
+ * the quals need to be rechecked. We recheck if any of the consistent()
+ * functions request it. recheck is not interesting when examining a non-leaf
+ * entry, since we must visit the lower index page if there's any doubt.
+ * Similarly, *recheck_distances_p is set to indicate whether the distances
+ * need to be rechecked, and it is also ignored for non-leaf entries.
+ *
+ * If we are doing an ordered scan, so->distances[] is filled with distance
+ * data from the distance() functions before returning success.
+ *
+ * We must decompress the key in the IndexTuple before passing it to the
+ * sk_funcs (which actually are the opclass Consistent or Distance methods).
+ *
+ * Note that this function is always invoked in a short-lived memory context,
+ * so we don't need to worry about cleaning up allocated memory, either here
+ * or in the implementation of any Consistent or Distance methods.
+ */
+static bool
+gistindex_keytest(IndexScanDesc scan,
+ IndexTuple tuple,
+ Page page,
+ OffsetNumber offset,
+ bool *recheck_p,
+ bool *recheck_distances_p)
+{
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+ GISTSTATE *giststate = so->giststate;
+ ScanKey key = scan->keyData;
+ int keySize = scan->numberOfKeys;
+ IndexOrderByDistance *distance_p;
+ Relation r = scan->indexRelation;
+
+ *recheck_p = false;
+ *recheck_distances_p = false;
+
+ /*
+ * If it's a leftover invalid tuple from pre-9.1, treat it as a match with
+ * minimum possible distances. This means we'll always follow it to the
+ * referenced page.
+ */
+ if (GistTupleIsInvalid(tuple))
+ {
+ int i;
+
+ if (GistPageIsLeaf(page)) /* shouldn't happen */
+ elog(ERROR, "invalid GiST tuple found on leaf page");
+ for (i = 0; i < scan->numberOfOrderBys; i++)
+ {
+ so->distances[i].value = -get_float8_infinity();
+ so->distances[i].isnull = false;
+ }
+ return true;
+ }
+
+ /* Check whether it matches according to the Consistent functions */
+ while (keySize > 0)
+ {
+ Datum datum;
+ bool isNull;
+
+ datum = index_getattr(tuple,
+ key->sk_attno,
+ giststate->leafTupdesc,
+ &isNull);
+
+ if (key->sk_flags & SK_ISNULL)
+ {
+ /*
+ * On a non-leaf page we can't conclude that the children contain no
+ * NULL values, because of the GiST assumption that union(VAL, NULL)
+ * is VAL. But if the key on a non-leaf page IS NULL, then all its
+ * children are NULL.
+ */
+ if (key->sk_flags & SK_SEARCHNULL)
+ {
+ if (GistPageIsLeaf(page) && !isNull)
+ return false;
+ }
+ else
+ {
+ Assert(key->sk_flags & SK_SEARCHNOTNULL);
+ if (isNull)
+ return false;
+ }
+ }
+ else if (isNull)
+ {
+ return false;
+ }
+ else
+ {
+ Datum test;
+ bool recheck;
+ GISTENTRY de;
+
+ gistdentryinit(giststate, key->sk_attno - 1, &de,
+ datum, r, page, offset,
+ false, isNull);
+
+ /*
+ * Call the Consistent function to evaluate the test. The
+ * arguments are the index datum (as a GISTENTRY*), the comparison
+ * datum, the comparison operator's strategy number and subtype
+ * from pg_amop, and the recheck flag.
+ *
+ * (Presently there's no need to pass the subtype since it'll
+ * always be zero, but might as well pass it for possible future
+ * use.)
+ *
+ * We initialize the recheck flag to true (the safest assumption)
+ * in case the Consistent function forgets to set it.
+ */
+ recheck = true;
+
+ test = FunctionCall5Coll(&key->sk_func,
+ key->sk_collation,
+ PointerGetDatum(&de),
+ key->sk_argument,
+ Int16GetDatum(key->sk_strategy),
+ ObjectIdGetDatum(key->sk_subtype),
+ PointerGetDatum(&recheck));
+
+ if (!DatumGetBool(test))
+ return false;
+ *recheck_p |= recheck;
+ }
+
+ key++;
+ keySize--;
+ }
+
+ /* OK, it passes --- now let's compute the distances */
+ key = scan->orderByData;
+ distance_p = so->distances;
+ keySize = scan->numberOfOrderBys;
+ while (keySize > 0)
+ {
+ Datum datum;
+ bool isNull;
+
+ datum = index_getattr(tuple,
+ key->sk_attno,
+ giststate->leafTupdesc,
+ &isNull);
+
+ if ((key->sk_flags & SK_ISNULL) || isNull)
+ {
+ /* Assume distance computes as null */
+ distance_p->value = 0.0;
+ distance_p->isnull = true;
+ }
+ else
+ {
+ Datum dist;
+ bool recheck;
+ GISTENTRY de;
+
+ gistdentryinit(giststate, key->sk_attno - 1, &de,
+ datum, r, page, offset,
+ false, isNull);
+
+ /*
+ * Call the Distance function to evaluate the distance. The
+ * arguments are the index datum (as a GISTENTRY*), the comparison
+ * datum, the ordering operator's strategy number and subtype from
+ * pg_amop, and the recheck flag.
+ *
+ * (Presently there's no need to pass the subtype since it'll
+ * always be zero, but might as well pass it for possible future
+ * use.)
+ *
+ * If the function sets the recheck flag, the returned distance is
+ * a lower bound on the true distance and needs to be rechecked.
+ * We initialize the flag to 'false'. This flag was added in
+ * version 9.5; distance functions written before that won't know
+ * about the flag, but are expected to never be lossy.
+ */
+ recheck = false;
+ dist = FunctionCall5Coll(&key->sk_func,
+ key->sk_collation,
+ PointerGetDatum(&de),
+ key->sk_argument,
+ Int16GetDatum(key->sk_strategy),
+ ObjectIdGetDatum(key->sk_subtype),
+ PointerGetDatum(&recheck));
+ *recheck_distances_p |= recheck;
+ distance_p->value = DatumGetFloat8(dist);
+ distance_p->isnull = false;
+ }
+
+ key++;
+ distance_p++;
+ keySize--;
+ }
+
+ return true;
+}
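+
+/*
+ * For illustration, the opclass side of the Consistent calling convention
+ * described above looks roughly like this (a hypothetical sketch;
+ * "my_consistent" and "my_contains" are placeholder names, see
+ * gist_box_consistent() in gistproc.c for an actual in-tree example):
+ *
+ *    Datum
+ *    my_consistent(PG_FUNCTION_ARGS)
+ *    {
+ *        GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+ *        Datum       query = PG_GETARG_DATUM(1);
+ *        StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+ *        bool       *recheck = (bool *) PG_GETARG_POINTER(4);
+ *
+ *        *recheck = false;    -- or true, if the answer is only approximate
+ *        PG_RETURN_BOOL(my_contains(entry->key, query, strategy,
+ *                                   GIST_LEAF(entry)));
+ *    }
+ *
+ * (Argument 3 is the subtype from pg_amop, currently always zero.)
+ */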
+
+/*
+ * Scan all items on the GiST index page identified by *pageItem, and insert
+ * them into the queue (or directly to output areas)
+ *
+ * scan: index scan we are executing
+ * pageItem: search queue item identifying an index page to scan
+ * myDistances: distances array associated with pageItem, or NULL at the root
+ * tbm: if not NULL, gistgetbitmap's output bitmap
+ * ntids: if not NULL, gistgetbitmap's output tuple counter
+ *
+ * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap
+ * tuples should be reported directly into the bitmap. If they are NULL,
+ * we're doing a plain or ordered indexscan. For a plain indexscan, heap
+ * tuple TIDs are returned into so->pageData[]. For an ordered indexscan,
+ * heap tuple TIDs are pushed into individual search queue items. In an
+ * index-only scan, reconstructed index tuples are returned along with the
+ * TIDs.
+ *
+ * If we detect that the index page has split since we saw its downlink
+ * in the parent, we push its new right sibling onto the queue so the
+ * sibling will be processed next.
+ */
+static void
+gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
+ IndexOrderByDistance *myDistances, TIDBitmap *tbm, int64 *ntids)
+{
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+ GISTSTATE *giststate = so->giststate;
+ Relation r = scan->indexRelation;
+ Buffer buffer;
+ Page page;
+ GISTPageOpaque opaque;
+ OffsetNumber maxoff;
+ OffsetNumber i;
+ MemoryContext oldcxt;
+
+ Assert(!GISTSearchItemIsHeap(*pageItem));
+
+ buffer = ReadBuffer(scan->indexRelation, pageItem->blkno);
+ LockBuffer(buffer, GIST_SHARE);
+ PredicateLockPage(r, BufferGetBlockNumber(buffer), scan->xs_snapshot);
+ gistcheckpage(scan->indexRelation, buffer);
+ page = BufferGetPage(buffer);
+ TestForOldSnapshot(scan->xs_snapshot, r, page);
+ opaque = GistPageGetOpaque(page);
+
+ /*
+ * Check if we need to follow the rightlink. We need to follow it if the
+ * page was concurrently split since we visited the parent (in which case
+ * parentlsn < nsn), or if the system crashed after a page split but
+ * before the downlink was inserted into the parent.
+ */
+ if (!XLogRecPtrIsInvalid(pageItem->data.parentlsn) &&
+ (GistFollowRight(page) ||
+ pageItem->data.parentlsn < GistPageGetNSN(page)) &&
+ opaque->rightlink != InvalidBlockNumber /* sanity check */ )
+ {
+ /* There was a page split, follow right link to add pages */
+ GISTSearchItem *item;
+
+ /* This can't happen when starting at the root */
+ Assert(myDistances != NULL);
+
+ oldcxt = MemoryContextSwitchTo(so->queueCxt);
+
+ /* Create new GISTSearchItem for the right sibling index page */
+ item = palloc(SizeOfGISTSearchItem(scan->numberOfOrderBys));
+ item->blkno = opaque->rightlink;
+ item->data.parentlsn = pageItem->data.parentlsn;
+
+ /* Insert it into the queue using same distances as for this page */
+ memcpy(item->distances, myDistances,
+ sizeof(item->distances[0]) * scan->numberOfOrderBys);
+
+ pairingheap_add(so->queue, &item->phNode);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ /*
+ * Check if the page was deleted after we saw the downlink. There's
+ * nothing of interest on a deleted page. Note that we must do this after
+ * checking the NSN for concurrent splits! It's possible that the page
+ * originally contained some tuples that are visible to us, but was split
+ * so that all the visible tuples were moved to another page, and then
+ * this page was deleted.
+ */
+ if (GistPageIsDeleted(page))
+ {
+ UnlockReleaseBuffer(buffer);
+ return;
+ }
+
+ so->nPageData = so->curPageData = 0;
+ scan->xs_hitup = NULL; /* might point into pageDataCxt */
+ if (so->pageDataCxt)
+ MemoryContextReset(so->pageDataCxt);
+
+ /*
+ * We save the LSN of the page as we read it, so that we know whether it
+ * is safe to apply LP_DEAD hints to the page later. This allows us to drop
+ * the pin for MVCC scans, which allows vacuum to avoid blocking.
+ */
+ so->curPageLSN = BufferGetLSNAtomic(buffer);
+
+ /*
+ * check all tuples on page
+ */
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ ItemId iid = PageGetItemId(page, i);
+ IndexTuple it;
+ bool match;
+ bool recheck;
+ bool recheck_distances;
+
+ /*
+ * If the scan specifies not to return killed tuples, then we treat a
+ * killed tuple as not passing the qual.
+ */
+ if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+ continue;
+
+ it = (IndexTuple) PageGetItem(page, iid);
+
+ /*
+ * Must call gistindex_keytest in tempCxt, and clean up any leftover
+ * junk afterward.
+ */
+ oldcxt = MemoryContextSwitchTo(so->giststate->tempCxt);
+
+ match = gistindex_keytest(scan, it, page, i,
+ &recheck, &recheck_distances);
+
+ MemoryContextSwitchTo(oldcxt);
+ MemoryContextReset(so->giststate->tempCxt);
+
+ /* Ignore tuple if it doesn't match */
+ if (!match)
+ continue;
+
+ if (tbm && GistPageIsLeaf(page))
+ {
+ /*
+ * getbitmap scan, so just push heap tuple TIDs into the bitmap
+ * without worrying about ordering
+ */
+ tbm_add_tuples(tbm, &it->t_tid, 1, recheck);
+ (*ntids)++;
+ }
+ else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page))
+ {
+ /*
+ * Non-ordered scan, so report tuples in so->pageData[]
+ */
+ so->pageData[so->nPageData].heapPtr = it->t_tid;
+ so->pageData[so->nPageData].recheck = recheck;
+ so->pageData[so->nPageData].offnum = i;
+
+ /*
+ * In an index-only scan, also fetch the data from the tuple. The
+ * reconstructed tuples are stored in pageDataCxt.
+ */
+ if (scan->xs_want_itup)
+ {
+ oldcxt = MemoryContextSwitchTo(so->pageDataCxt);
+ so->pageData[so->nPageData].recontup =
+ gistFetchTuple(giststate, r, it);
+ MemoryContextSwitchTo(oldcxt);
+ }
+ so->nPageData++;
+ }
+ else
+ {
+ /*
+ * Must push item into search queue. We get here for any lower
+ * index page, and also for heap tuples if doing an ordered
+ * search.
+ */
+ GISTSearchItem *item;
+ int nOrderBys = scan->numberOfOrderBys;
+
+ oldcxt = MemoryContextSwitchTo(so->queueCxt);
+
+ /* Create new GISTSearchItem for this item */
+ item = palloc(SizeOfGISTSearchItem(scan->numberOfOrderBys));
+
+ if (GistPageIsLeaf(page))
+ {
+ /* Creating heap-tuple GISTSearchItem */
+ item->blkno = InvalidBlockNumber;
+ item->data.heap.heapPtr = it->t_tid;
+ item->data.heap.recheck = recheck;
+ item->data.heap.recheckDistances = recheck_distances;
+
+ /*
+ * In an index-only scan, also fetch the data from the tuple.
+ */
+ if (scan->xs_want_itup)
+ item->data.heap.recontup = gistFetchTuple(giststate, r, it);
+ }
+ else
+ {
+ /* Creating index-page GISTSearchItem */
+ item->blkno = ItemPointerGetBlockNumber(&it->t_tid);
+
+ /*
+ * LSN of current page is lsn of parent page for child. We
+ * only have a shared lock, so we need to get the LSN
+ * atomically.
+ */
+ item->data.parentlsn = BufferGetLSNAtomic(buffer);
+ }
+
+ /* Insert it into the queue using new distance data */
+ memcpy(item->distances, so->distances,
+ sizeof(item->distances[0]) * nOrderBys);
+
+ pairingheap_add(so->queue, &item->phNode);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+ }
+
+ UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * Extract next item (in order) from search queue
+ *
+ * Returns a GISTSearchItem or NULL. Caller must pfree item when done with it.
+ */
+static GISTSearchItem *
+getNextGISTSearchItem(GISTScanOpaque so)
+{
+ GISTSearchItem *item;
+
+ if (!pairingheap_is_empty(so->queue))
+ {
+ item = (GISTSearchItem *) pairingheap_remove_first(so->queue);
+ }
+ else
+ {
+ /* Done when both heaps are empty */
+ item = NULL;
+ }
+
+ /* Return item; caller is responsible to pfree it */
+ return item;
+}
+
+/*
+ * Fetch next heap tuple in an ordered search
+ */
+static bool
+getNextNearest(IndexScanDesc scan)
+{
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+ bool res = false;
+
+ if (scan->xs_hitup)
+ {
+ /* free previously returned tuple */
+ pfree(scan->xs_hitup);
+ scan->xs_hitup = NULL;
+ }
+
+ do
+ {
+ GISTSearchItem *item = getNextGISTSearchItem(so);
+
+ if (!item)
+ break;
+
+ if (GISTSearchItemIsHeap(*item))
+ {
+ /* found a heap item at currently minimal distance */
+ scan->xs_heaptid = item->data.heap.heapPtr;
+ scan->xs_recheck = item->data.heap.recheck;
+
+ index_store_float8_orderby_distances(scan, so->orderByTypes,
+ item->distances,
+ item->data.heap.recheckDistances);
+
+ /* in an index-only scan, also return the reconstructed tuple. */
+ if (scan->xs_want_itup)
+ scan->xs_hitup = item->data.heap.recontup;
+ res = true;
+ }
+ else
+ {
+ /* visit an index page, extract its items into queue */
+ CHECK_FOR_INTERRUPTS();
+
+ gistScanPage(scan, item, item->distances, NULL, NULL);
+ }
+
+ pfree(item);
+ } while (!res);
+
+ return res;
+}
+
+/*
+ * gistgettuple() -- Get the next tuple in the scan
+ */
+bool
+gistgettuple(IndexScanDesc scan, ScanDirection dir)
+{
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+
+ if (dir != ForwardScanDirection)
+ elog(ERROR, "GiST only supports forward scan direction");
+
+ if (!so->qual_ok)
+ return false;
+
+ if (so->firstCall)
+ {
+ /* Begin the scan by processing the root page */
+ GISTSearchItem fakeItem;
+
+ pgstat_count_index_scan(scan->indexRelation);
+
+ so->firstCall = false;
+ so->curPageData = so->nPageData = 0;
+ scan->xs_hitup = NULL;
+ if (so->pageDataCxt)
+ MemoryContextReset(so->pageDataCxt);
+
+ fakeItem.blkno = GIST_ROOT_BLKNO;
+ memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN));
+ gistScanPage(scan, &fakeItem, NULL, NULL, NULL);
+ }
+
+ if (scan->numberOfOrderBys > 0)
+ {
+ /* Must fetch tuples in strict distance order */
+ return getNextNearest(scan);
+ }
+ else
+ {
+ /* Fetch tuples index-page-at-a-time */
+ for (;;)
+ {
+ if (so->curPageData < so->nPageData)
+ {
+ if (scan->kill_prior_tuple && so->curPageData > 0)
+ {
+
+ if (so->killedItems == NULL)
+ {
+ MemoryContext oldCxt =
+ MemoryContextSwitchTo(so->giststate->scanCxt);
+
+ so->killedItems =
+ (OffsetNumber *) palloc(MaxIndexTuplesPerPage
+ * sizeof(OffsetNumber));
+
+ MemoryContextSwitchTo(oldCxt);
+ }
+ if (so->numKilled < MaxIndexTuplesPerPage)
+ so->killedItems[so->numKilled++] =
+ so->pageData[so->curPageData - 1].offnum;
+ }
+ /* continuing to return tuples from a leaf page */
+ scan->xs_heaptid = so->pageData[so->curPageData].heapPtr;
+ scan->xs_recheck = so->pageData[so->curPageData].recheck;
+
+ /* in an index-only scan, also return the reconstructed tuple */
+ if (scan->xs_want_itup)
+ scan->xs_hitup = so->pageData[so->curPageData].recontup;
+
+ so->curPageData++;
+
+ return true;
+ }
+
+ /*
+ * Check the last returned tuple and add it to killedItems if
+ * necessary
+ */
+ if (scan->kill_prior_tuple
+ && so->curPageData > 0
+ && so->curPageData == so->nPageData)
+ {
+
+ if (so->killedItems == NULL)
+ {
+ MemoryContext oldCxt =
+ MemoryContextSwitchTo(so->giststate->scanCxt);
+
+ so->killedItems =
+ (OffsetNumber *) palloc(MaxIndexTuplesPerPage
+ * sizeof(OffsetNumber));
+
+ MemoryContextSwitchTo(oldCxt);
+ }
+ if (so->numKilled < MaxIndexTuplesPerPage)
+ so->killedItems[so->numKilled++] =
+ so->pageData[so->curPageData - 1].offnum;
+ }
+ /* find and process the next index page */
+ do
+ {
+ GISTSearchItem *item;
+
+ if ((so->curBlkno != InvalidBlockNumber) && (so->numKilled > 0))
+ gistkillitems(scan);
+
+ item = getNextGISTSearchItem(so);
+
+ if (!item)
+ return false;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* save current item BlockNumber for next gistkillitems() call */
+ so->curBlkno = item->blkno;
+
+ /*
+ * While scanning a leaf page, ItemPointers of matching heap
+ * tuples are stored in so->pageData. If there are any on
+ * this page, we fall out of the inner "do" and loop around to
+ * return them.
+ */
+ gistScanPage(scan, item, item->distances, NULL, NULL);
+
+ pfree(item);
+ } while (so->nPageData == 0);
+ }
+ }
+}
+
+/*
+ * gistgetbitmap() -- Get a bitmap of all heap tuple locations
+ */
+int64
+gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
+{
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+ int64 ntids = 0;
+ GISTSearchItem fakeItem;
+
+ if (!so->qual_ok)
+ return 0;
+
+ pgstat_count_index_scan(scan->indexRelation);
+
+ /* Begin the scan by processing the root page */
+ so->curPageData = so->nPageData = 0;
+ scan->xs_hitup = NULL;
+ if (so->pageDataCxt)
+ MemoryContextReset(so->pageDataCxt);
+
+ fakeItem.blkno = GIST_ROOT_BLKNO;
+ memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN));
+ gistScanPage(scan, &fakeItem, NULL, tbm, &ntids);
+
+ /*
+ * While scanning a leaf page, ItemPointers of matching heap tuples will
+ * be stored directly into tbm, so we don't need to deal with them here.
+ */
+ for (;;)
+ {
+ GISTSearchItem *item = getNextGISTSearchItem(so);
+
+ if (!item)
+ break;
+
+ CHECK_FOR_INTERRUPTS();
+
+ gistScanPage(scan, item, item->distances, tbm, &ntids);
+
+ pfree(item);
+ }
+
+ return ntids;
+}
+
+/*
+ * Can we do index-only scans on the given index column?
+ *
+ * Opclasses that implement a fetch function support index-only scans.
+ * Opclasses without compression functions also support index-only scans.
+ * Included attributes always can be fetched for index-only scans.
+ */
+bool
+gistcanreturn(Relation index, int attno)
+{
+ if (attno > IndexRelationGetNumberOfKeyAttributes(index) ||
+ OidIsValid(index_getprocid(index, attno, GIST_FETCH_PROC)) ||
+ !OidIsValid(index_getprocid(index, attno, GIST_COMPRESS_PROC)))
+ return true;
+ else
+ return false;
+}
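+
+/*
+ * For example (hypothetical opclasses): an opclass that stores its keys
+ * uncompressed (no GIST_COMPRESS_PROC) can simply hand back the stored
+ * datum, and an opclass that does compress its keys can still support
+ * index-only scans by supplying a GIST_FETCH_PROC that reconstructs the
+ * original value; for any other key column gistcanreturn() answers false.
+ */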
diff --git a/src/backend/access/gist/gistproc.c b/src/backend/access/gist/gistproc.c
new file mode 100644
index 0000000..9ace64c
--- /dev/null
+++ b/src/backend/access/gist/gistproc.c
@@ -0,0 +1,1542 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistproc.c
+ * Support procedures for GiSTs over 2-D objects (boxes, polygons, circles,
+ * points).
+ *
+ * This gives R-tree behavior, with Guttman's poly-time split algorithm.
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistproc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/gist.h"
+#include "access/stratnum.h"
+#include "utils/builtins.h"
+#include "utils/float.h"
+#include "utils/geo_decls.h"
+
+
+static bool gist_box_leaf_consistent(BOX *key, BOX *query,
+ StrategyNumber strategy);
+static bool rtree_internal_consistent(BOX *key, BOX *query,
+ StrategyNumber strategy);
+
+/* Minimum accepted ratio of split */
+#define LIMIT_RATIO 0.3
+
+
+/**************************************************
+ * Box ops
+ **************************************************/
+
+/*
+ * Calculates union of two boxes, a and b. The result is stored in *n.
+ */
+static void
+rt_box_union(BOX *n, const BOX *a, const BOX *b)
+{
+ n->high.x = float8_max(a->high.x, b->high.x);
+ n->high.y = float8_max(a->high.y, b->high.y);
+ n->low.x = float8_min(a->low.x, b->low.x);
+ n->low.y = float8_min(a->low.y, b->low.y);
+}
+
+/*
+ * Size of a BOX for penalty-calculation purposes.
+ * The result can be +Infinity, but not NaN.
+ */
+static float8
+size_box(const BOX *box)
+{
+ /*
+ * Check for zero-width cases. Note that we define the size of a zero-
+ * by-infinity box as zero. It's important to special-case this somehow,
+ * as naively multiplying infinity by zero will produce NaN.
+ *
+ * The less-than cases should not happen, but if they do, say "zero".
+ */
+ if (float8_le(box->high.x, box->low.x) ||
+ float8_le(box->high.y, box->low.y))
+ return 0.0;
+
+ /*
+ * We treat NaN as larger than +Infinity, so any distance involving a NaN
+ * and a non-NaN is infinite. Note the previous check eliminated the
+ * possibility that the low fields are NaNs.
+ */
+ if (isnan(box->high.x) || isnan(box->high.y))
+ return get_float8_infinity();
+ return float8_mul(float8_mi(box->high.x, box->low.x),
+ float8_mi(box->high.y, box->low.y));
+}
+
+/*
+ * Return amount by which the union of the two boxes is larger than
+ * the original BOX's area. The result can be +Infinity, but not NaN.
+ */
+static float8
+box_penalty(const BOX *original, const BOX *new)
+{
+ BOX unionbox;
+
+ rt_box_union(&unionbox, original, new);
+ return float8_mi(size_box(&unionbox), size_box(original));
+}
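+
+/*
+ * For example (illustrative numbers only): if the original box is
+ * (0,0)-(2,2), area 4, and the new entry is (1,1)-(3,3), the union is
+ * (0,0)-(3,3), area 9, so the penalty is 9 - 4 = 5. A new entry that is
+ * already contained in the original box has penalty 0.
+ */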
+
+/*
+ * The GiST Consistent method for boxes
+ *
+ * Should return false if, for all data items x below entry,
+ * the predicate "x op query" must be false, where op is the operator
+ * corresponding to the strategy in the pg_amop table.
+ */
+Datum
+gist_box_consistent(PG_FUNCTION_ARGS)
+{
+ GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+ BOX *query = PG_GETARG_BOX_P(1);
+ StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+
+ /* Oid subtype = PG_GETARG_OID(3); */
+ bool *recheck = (bool *) PG_GETARG_POINTER(4);
+
+ /* All cases served by this function are exact */
+ *recheck = false;
+
+ if (DatumGetBoxP(entry->key) == NULL || query == NULL)
+ PG_RETURN_BOOL(false);
+
+ /*
+ * if entry is not leaf, use rtree_internal_consistent, else use
+ * gist_box_leaf_consistent
+ */
+ if (GIST_LEAF(entry))
+ PG_RETURN_BOOL(gist_box_leaf_consistent(DatumGetBoxP(entry->key),
+ query,
+ strategy));
+ else
+ PG_RETURN_BOOL(rtree_internal_consistent(DatumGetBoxP(entry->key),
+ query,
+ strategy));
+}
+
+/*
+ * Increase BOX b to include addon.
+ */
+static void
+adjustBox(BOX *b, const BOX *addon)
+{
+ if (float8_lt(b->high.x, addon->high.x))
+ b->high.x = addon->high.x;
+ if (float8_gt(b->low.x, addon->low.x))
+ b->low.x = addon->low.x;
+ if (float8_lt(b->high.y, addon->high.y))
+ b->high.y = addon->high.y;
+ if (float8_gt(b->low.y, addon->low.y))
+ b->low.y = addon->low.y;
+}
+
+/*
+ * The GiST Union method for boxes
+ *
+ * returns the minimal bounding box that encloses all the entries in entryvec
+ */
+Datum
+gist_box_union(PG_FUNCTION_ARGS)
+{
+ GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
+ int *sizep = (int *) PG_GETARG_POINTER(1);
+ int numranges,
+ i;
+ BOX *cur,
+ *pageunion;
+
+ numranges = entryvec->n;
+ pageunion = (BOX *) palloc(sizeof(BOX));
+ cur = DatumGetBoxP(entryvec->vector[0].key);
+ memcpy((void *) pageunion, (void *) cur, sizeof(BOX));
+
+ for (i = 1; i < numranges; i++)
+ {
+ cur = DatumGetBoxP(entryvec->vector[i].key);
+ adjustBox(pageunion, cur);
+ }
+ *sizep = sizeof(BOX);
+
+ PG_RETURN_POINTER(pageunion);
+}
+
+/*
+ * We store boxes as boxes in GiST indexes, so we do not need
+ * compress, decompress, or fetch functions.
+ */
+
+/*
+ * The GiST Penalty method for boxes (also used for points)
+ *
+ * As in the R-tree paper, we use change in area as our penalty metric
+ */
+Datum
+gist_box_penalty(PG_FUNCTION_ARGS)
+{
+ GISTENTRY *origentry = (GISTENTRY *) PG_GETARG_POINTER(0);
+ GISTENTRY *newentry = (GISTENTRY *) PG_GETARG_POINTER(1);
+ float *result = (float *) PG_GETARG_POINTER(2);
+ BOX *origbox = DatumGetBoxP(origentry->key);
+ BOX *newbox = DatumGetBoxP(newentry->key);
+
+ *result = (float) box_penalty(origbox, newbox);
+ PG_RETURN_POINTER(result);
+}
+
+/*
+ * Trivial split: half of the entries are placed on one page,
+ * and the other half on the other.
+ */
+static void
+fallbackSplit(GistEntryVector *entryvec, GIST_SPLITVEC *v)
+{
+ OffsetNumber i,
+ maxoff;
+ BOX *unionL = NULL,
+ *unionR = NULL;
+ int nbytes;
+
+ maxoff = entryvec->n - 1;
+
+ nbytes = (maxoff + 2) * sizeof(OffsetNumber);
+ v->spl_left = (OffsetNumber *) palloc(nbytes);
+ v->spl_right = (OffsetNumber *) palloc(nbytes);
+ v->spl_nleft = v->spl_nright = 0;
+
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ BOX *cur = DatumGetBoxP(entryvec->vector[i].key);
+
+ if (i <= (maxoff - FirstOffsetNumber + 1) / 2)
+ {
+ v->spl_left[v->spl_nleft] = i;
+ if (unionL == NULL)
+ {
+ unionL = (BOX *) palloc(sizeof(BOX));
+ *unionL = *cur;
+ }
+ else
+ adjustBox(unionL, cur);
+
+ v->spl_nleft++;
+ }
+ else
+ {
+ v->spl_right[v->spl_nright] = i;
+ if (unionR == NULL)
+ {
+ unionR = (BOX *) palloc(sizeof(BOX));
+ *unionR = *cur;
+ }
+ else
+ adjustBox(unionR, cur);
+
+ v->spl_nright++;
+ }
+ }
+
+ v->spl_ldatum = BoxPGetDatum(unionL);
+ v->spl_rdatum = BoxPGetDatum(unionR);
+}
+
+/*
+ * Represents information about an entry that can be placed in either group
+ * without affecting the overlap along the selected axis (a "common entry").
+ */
+typedef struct
+{
+ /* Index of entry in the initial array */
+ int index;
+ /* Delta between penalties of entry insertion into different groups */
+ float8 delta;
+} CommonEntry;
+
+/*
+ * Context for g_box_consider_split. Contains information about currently
+ * selected split and some general information.
+ */
+typedef struct
+{
+ int entriesCount; /* total number of entries being split */
+ BOX boundingBox; /* minimum bounding box across all entries */
+
+ /* Information about currently selected split follows */
+
+ bool first; /* true if no split was selected yet */
+
+ float8 leftUpper; /* upper bound of left interval */
+ float8 rightLower; /* lower bound of right interval */
+
+ float4 ratio;
+ float4 overlap;
+ int dim; /* axis of this split */
+ float8 range; /* width of general MBR projection to the
+ * selected axis */
+} ConsiderSplitContext;
+
+/*
+ * An interval representing the projection of a box onto one axis.
+ */
+typedef struct
+{
+ float8 lower,
+ upper;
+} SplitInterval;
+
+/*
+ * Interval comparison function by lower bound of the interval.
+ */
+static int
+interval_cmp_lower(const void *i1, const void *i2)
+{
+ float8 lower1 = ((const SplitInterval *) i1)->lower,
+ lower2 = ((const SplitInterval *) i2)->lower;
+
+ return float8_cmp_internal(lower1, lower2);
+}
+
+/*
+ * Interval comparison function by upper bound of the interval.
+ */
+static int
+interval_cmp_upper(const void *i1, const void *i2)
+{
+ float8 upper1 = ((const SplitInterval *) i1)->upper,
+ upper2 = ((const SplitInterval *) i2)->upper;
+
+ return float8_cmp_internal(upper1, upper2);
+}
+
+/*
+ * Replace negative (or NaN) value with zero.
+ */
+static inline float
+non_negative(float val)
+{
+ if (val >= 0.0f)
+ return val;
+ else
+ return 0.0f;
+}
+
+/*
+ * Consider replacement of currently selected split with the better one.
+ */
+static inline void
+g_box_consider_split(ConsiderSplitContext *context, int dimNum,
+ float8 rightLower, int minLeftCount,
+ float8 leftUpper, int maxLeftCount)
+{
+ int leftCount,
+ rightCount;
+ float4 ratio,
+ overlap;
+ float8 range;
+
+ /*
+ * Calculate the distribution of entries, assuming that the common
+ * entries are spread as evenly as possible between the two groups.
+ */
+ if (minLeftCount >= (context->entriesCount + 1) / 2)
+ {
+ leftCount = minLeftCount;
+ }
+ else
+ {
+ if (maxLeftCount <= context->entriesCount / 2)
+ leftCount = maxLeftCount;
+ else
+ leftCount = context->entriesCount / 2;
+ }
+ rightCount = context->entriesCount - leftCount;
+
+ /*
+ * Ratio of the split: the size of the smaller group divided by the
+ * total number of entries (a worked example follows this function).
+ */
+ ratio = float4_div(Min(leftCount, rightCount), context->entriesCount);
+
+ if (ratio > LIMIT_RATIO)
+ {
+ bool selectthis = false;
+
+ /*
+ * The ratio is acceptable, so compare the current split with the
+ * previously selected one. Within one dimension we look for minimal
+ * overlap (allowing negative values) and, among splits with the same
+ * overlap, the best ratio. We switch dimensions if we find a smaller
+ * non-negative overlap, or a larger range with the same overlap.
+ */
+ if (dimNum == 0)
+ range = float8_mi(context->boundingBox.high.x,
+ context->boundingBox.low.x);
+ else
+ range = float8_mi(context->boundingBox.high.y,
+ context->boundingBox.low.y);
+
+ overlap = float8_div(float8_mi(leftUpper, rightLower), range);
+
+ /* If there is no previous selection, select this */
+ if (context->first)
+ selectthis = true;
+ else if (context->dim == dimNum)
+ {
+ /*
+ * Within the same dimension, choose the new split if it has a
+ * smaller overlap, or same overlap but better ratio.
+ */
+ if (overlap < context->overlap ||
+ (overlap == context->overlap && ratio > context->ratio))
+ selectthis = true;
+ }
+ else
+ {
+ /*
+ * Across dimensions, choose the new split if it has a smaller
+ * *non-negative* overlap, or the same *non-negative* overlap but a
+ * bigger range. This condition differs from the one described in
+ * the article. On datasets where leaf MBRs don't overlap each
+ * other, non-overlapping splits (i.e. splits with zero
+ * *non-negative* overlap) are frequently possible. In that case
+ * splits tend to stay in one dimension, because the most distant
+ * non-overlapping splits (i.e. those with the lowest negative
+ * overlap) tend to lie in the same dimension as the previous
+ * split. The MBRs then become very elongated along the other
+ * dimension, which leads to bad search performance. Using range as
+ * the second split criterion makes MBRs more square. Using
+ * *non-negative* overlap instead of plain overlap as the first
+ * criterion gives the range criterion a chance to matter, because
+ * all non-overlapping splits are equivalent under it.
+ */
+ if (non_negative(overlap) < non_negative(context->overlap) ||
+ (range > context->range &&
+ non_negative(overlap) <= non_negative(context->overlap)))
+ selectthis = true;
+ }
+
+ if (selectthis)
+ {
+ /* save information about selected split */
+ context->first = false;
+ context->ratio = ratio;
+ context->range = range;
+ context->overlap = overlap;
+ context->rightLower = rightLower;
+ context->leftUpper = leftUpper;
+ context->dim = dimNum;
+ }
+ }
+}
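+
+/*
+ * Worked example of the ratio check in g_box_consider_split() (illustrative
+ * numbers): with entriesCount = 10, minLeftCount = 2 and maxLeftCount = 8,
+ * the common entries are assumed to spread evenly, giving leftCount = 5,
+ * rightCount = 5 and ratio = 0.5, which passes LIMIT_RATIO (0.3). With
+ * minLeftCount = 9 the best achievable ratio is 1/10 = 0.1, so that split
+ * is rejected outright.
+ */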
+
+/*
+ * Compare common entries by their deltas.
+ */
+static int
+common_entry_cmp(const void *i1, const void *i2)
+{
+ float8 delta1 = ((const CommonEntry *) i1)->delta,
+ delta2 = ((const CommonEntry *) i2)->delta;
+
+ return float8_cmp_internal(delta1, delta2);
+}
+
+/*
+ * --------------------------------------------------------------------------
+ * Double sorting split algorithm. This is used for both boxes and points.
+ *
+ * The algorithm finds split of boxes by considering splits along each axis.
+ * Each entry is first projected as an interval on the X-axis, and different
+ * ways to split the intervals into two groups are considered, trying to
+ * minimize the overlap of the groups. Then the same is repeated for the
+ * Y-axis, and the overall best split is chosen. The quality of a split is
+ * determined by overlap along that axis and some other criteria (see
+ * g_box_consider_split).
+ *
+ * After that, all the entries are divided into three groups:
+ *
+ * 1) Entries which should be placed to the left group
+ * 2) Entries which should be placed to the right group
+ * 3) "Common entries" which can be placed to any of groups without affecting
+ * of overlap along selected axis.
+ *
+ * The common entries are distributed by minimizing penalty.
+ *
+ * For details see:
+ * "A new double sorting-based node splitting algorithm for R-tree", A. Korotkov
+ * http://syrcose.ispras.ru/2011/files/SYRCoSE2011_Proceedings.pdf#page=36
+ * --------------------------------------------------------------------------
+ */
+Datum
+gist_box_picksplit(PG_FUNCTION_ARGS)
+{
+ GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
+ GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1);
+ OffsetNumber i,
+ maxoff;
+ ConsiderSplitContext context;
+ BOX *box,
+ *leftBox,
+ *rightBox;
+ int dim,
+ commonEntriesCount;
+ SplitInterval *intervalsLower,
+ *intervalsUpper;
+ CommonEntry *commonEntries;
+ int nentries;
+
+ memset(&context, 0, sizeof(ConsiderSplitContext));
+
+ maxoff = entryvec->n - 1;
+ nentries = context.entriesCount = maxoff - FirstOffsetNumber + 1;
+
+ /* Allocate arrays for intervals along axes */
+ intervalsLower = (SplitInterval *) palloc(nentries * sizeof(SplitInterval));
+ intervalsUpper = (SplitInterval *) palloc(nentries * sizeof(SplitInterval));
+
+ /*
+ * Calculate the overall minimum bounding box over all the entries.
+ */
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ box = DatumGetBoxP(entryvec->vector[i].key);
+ if (i == FirstOffsetNumber)
+ context.boundingBox = *box;
+ else
+ adjustBox(&context.boundingBox, box);
+ }
+
+ /*
+ * Iterate over axes for optimal split searching.
+ */
+ context.first = true; /* nothing selected yet */
+ for (dim = 0; dim < 2; dim++)
+ {
+ float8 leftUpper,
+ rightLower;
+ int i1,
+ i2;
+
+ /* Project each entry as an interval on the selected axis. */
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ box = DatumGetBoxP(entryvec->vector[i].key);
+ if (dim == 0)
+ {
+ intervalsLower[i - FirstOffsetNumber].lower = box->low.x;
+ intervalsLower[i - FirstOffsetNumber].upper = box->high.x;
+ }
+ else
+ {
+ intervalsLower[i - FirstOffsetNumber].lower = box->low.y;
+ intervalsLower[i - FirstOffsetNumber].upper = box->high.y;
+ }
+ }
+
+ /*
+ * Make two arrays of intervals: one sorted by lower bound and another
+ * sorted by upper bound.
+ */
+ memcpy(intervalsUpper, intervalsLower,
+ sizeof(SplitInterval) * nentries);
+ qsort(intervalsLower, nentries, sizeof(SplitInterval),
+ interval_cmp_lower);
+ qsort(intervalsUpper, nentries, sizeof(SplitInterval),
+ interval_cmp_upper);
+
+ /*----
+ * The goal is to form a left and right interval, so that every entry
+ * interval is contained by either left or right interval (or both).
+ *
+ * For example, with the intervals (0,1), (1,3), (2,3), (2,4):
+ *
+ * 0 1 2 3 4
+ * +-+
+ *   +---+
+ *     +-+
+ *     +---+
+ *
+ * The left and right intervals are of the form (0,a) and (b,4).
+ * We first consider splits where b is the lower bound of an entry.
+ * We iterate through all entries, and for each b, calculate the
+ * smallest possible a. Then we consider splits where a is the
+ * upper bound of an entry, and for each a, calculate the greatest
+ * possible b.
+ *
+ * In the above example, the first loop would consider splits:
+ * b=0: (0,1)-(0,4)
+ * b=1: (0,1)-(1,4)
+ * b=2: (0,3)-(2,4)
+ *
+ * And the second loop:
+ * a=1: (0,1)-(1,4)
+ * a=3: (0,3)-(2,4)
+ * a=4: (0,4)-(2,4)
+ */
+
+ /*
+ * Iterate over lower bound of right group, finding smallest possible
+ * upper bound of left group.
+ */
+ i1 = 0;
+ i2 = 0;
+ rightLower = intervalsLower[i1].lower;
+ leftUpper = intervalsUpper[i2].lower;
+ while (true)
+ {
+ /*
+ * Find next lower bound of right group.
+ */
+ while (i1 < nentries &&
+ float8_eq(rightLower, intervalsLower[i1].lower))
+ {
+ if (float8_lt(leftUpper, intervalsLower[i1].upper))
+ leftUpper = intervalsLower[i1].upper;
+ i1++;
+ }
+ if (i1 >= nentries)
+ break;
+ rightLower = intervalsLower[i1].lower;
+
+ /*
+ * Count the intervals that must be placed in the left group in any case.
+ */
+ while (i2 < nentries &&
+ float8_le(intervalsUpper[i2].upper, leftUpper))
+ i2++;
+
+ /*
+ * Consider found split.
+ */
+ g_box_consider_split(&context, dim, rightLower, i1, leftUpper, i2);
+ }
+
+ /*
+ * Iterate over upper bound of left group, finding greatest possible
+ * lower bound of right group.
+ */
+ i1 = nentries - 1;
+ i2 = nentries - 1;
+ rightLower = intervalsLower[i1].upper;
+ leftUpper = intervalsUpper[i2].upper;
+ while (true)
+ {
+ /*
+ * Find next upper bound of left group.
+ */
+ while (i2 >= 0 && float8_eq(leftUpper, intervalsUpper[i2].upper))
+ {
+ if (float8_gt(rightLower, intervalsUpper[i2].lower))
+ rightLower = intervalsUpper[i2].lower;
+ i2--;
+ }
+ if (i2 < 0)
+ break;
+ leftUpper = intervalsUpper[i2].upper;
+
+ /*
+ * Count the intervals that must be placed in the right group in any case.
+ */
+ while (i1 >= 0 && float8_ge(intervalsLower[i1].lower, rightLower))
+ i1--;
+
+ /*
+ * Consider found split.
+ */
+ g_box_consider_split(&context, dim,
+ rightLower, i1 + 1, leftUpper, i2 + 1);
+ }
+ }
+
+ /*
+ * If we failed to find any acceptable splits, use trivial split.
+ */
+ if (context.first)
+ {
+ fallbackSplit(entryvec, v);
+ PG_RETURN_POINTER(v);
+ }
+
+ /*
+ * Ok, we have now selected the split across one axis.
+ *
+ * While considering the splits, we already determined that there will be
+ * enough entries in both groups to reach the desired ratio, but we did
+ * not memorize which entries go to which group. So determine that now.
+ */
+
+ /* Allocate vectors for results */
+ v->spl_left = (OffsetNumber *) palloc(nentries * sizeof(OffsetNumber));
+ v->spl_right = (OffsetNumber *) palloc(nentries * sizeof(OffsetNumber));
+ v->spl_nleft = 0;
+ v->spl_nright = 0;
+
+ /* Allocate bounding boxes of left and right groups */
+ leftBox = palloc0(sizeof(BOX));
+ rightBox = palloc0(sizeof(BOX));
+
+ /*
+ * Allocate an array for "common entries" - entries which can be placed in
+ * either group without affecting the overlap along the selected axis.
+ */
+ commonEntriesCount = 0;
+ commonEntries = (CommonEntry *) palloc(nentries * sizeof(CommonEntry));
+
+ /* Helper macros to place an entry in the left or right group */
+#define PLACE_LEFT(box, off) \
+ do { \
+ if (v->spl_nleft > 0) \
+ adjustBox(leftBox, box); \
+ else \
+ *leftBox = *(box); \
+ v->spl_left[v->spl_nleft++] = off; \
+ } while(0)
+
+#define PLACE_RIGHT(box, off) \
+ do { \
+ if (v->spl_nright > 0) \
+ adjustBox(rightBox, box); \
+ else \
+ *rightBox = *(box); \
+ v->spl_right[v->spl_nright++] = off; \
+ } while(0)
+
+ /*
+ * Distribute entries which can be distributed unambiguously, and collect
+ * common entries.
+ */
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ float8 lower,
+ upper;
+
+ /*
+ * Get upper and lower bounds along selected axis.
+ */
+ box = DatumGetBoxP(entryvec->vector[i].key);
+ if (context.dim == 0)
+ {
+ lower = box->low.x;
+ upper = box->high.x;
+ }
+ else
+ {
+ lower = box->low.y;
+ upper = box->high.y;
+ }
+
+ if (float8_le(upper, context.leftUpper))
+ {
+ /* Fits in the left group */
+ if (float8_ge(lower, context.rightLower))
+ {
+ /* Also fits in the right group, so it's a "common entry" */
+ commonEntries[commonEntriesCount++].index = i;
+ }
+ else
+ {
+ /* Doesn't fit in the right group, so put it in the left group */
+ PLACE_LEFT(box, i);
+ }
+ }
+ else
+ {
+ /*
+ * Each entry must fit in either the left or the right group. Since this
+ * entry didn't fit in the left group, it had better fit in the right
+ * group.
+ */
+ Assert(float8_ge(lower, context.rightLower));
+
+ /* Doesn't fit in the left group, so put it in the right group */
+ PLACE_RIGHT(box, i);
+ }
+ }
+
+ /*
+ * Distribute "common entries", if any.
+ */
+ if (commonEntriesCount > 0)
+ {
+ /*
+ * Calculate minimum number of entries that must be placed in both
+ * groups, to reach LIMIT_RATIO.
+ */
+ int m = ceil(LIMIT_RATIO * nentries);
+
+ /*
+ * Calculate the delta between the penalties of joining each "common
+ * entry" to the two groups.
+ */
+ for (i = 0; i < commonEntriesCount; i++)
+ {
+ box = DatumGetBoxP(entryvec->vector[commonEntries[i].index].key);
+ commonEntries[i].delta = Abs(float8_mi(box_penalty(leftBox, box),
+ box_penalty(rightBox, box)));
+ }
+
+ /*
+ * Sort "common entries" by calculated deltas in order to distribute
+ * the most ambiguous entries first.
+ */
+ qsort(commonEntries, commonEntriesCount, sizeof(CommonEntry), common_entry_cmp);
+
+ /*
+ * Distribute "common entries" between groups.
+ */
+ for (i = 0; i < commonEntriesCount; i++)
+ {
+ box = DatumGetBoxP(entryvec->vector[commonEntries[i].index].key);
+
+ /*
+ * Check if we have to place this entry in either group to achieve
+ * LIMIT_RATIO.
+ */
+ if (v->spl_nleft + (commonEntriesCount - i) <= m)
+ PLACE_LEFT(box, commonEntries[i].index);
+ else if (v->spl_nright + (commonEntriesCount - i) <= m)
+ PLACE_RIGHT(box, commonEntries[i].index);
+ else
+ {
+ /* Otherwise select the group by minimal penalty */
+ if (box_penalty(leftBox, box) < box_penalty(rightBox, box))
+ PLACE_LEFT(box, commonEntries[i].index);
+ else
+ PLACE_RIGHT(box, commonEntries[i].index);
+ }
+ }
+ }
+
+ v->spl_ldatum = PointerGetDatum(leftBox);
+ v->spl_rdatum = PointerGetDatum(rightBox);
+ PG_RETURN_POINTER(v);
+}
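
A minimal standalone sketch of the axis sweep above (not part of this patch, and independent of PostgreSQL headers): it enumerates candidate one-axis splits of the form left = (min, a), right = (b, max) for the example intervals from the comment block, keeping the candidate with the least overlap a - b. The real code also sweeps the sorted upper bounds, enforces LIMIT_RATIO, and repeats the procedure for both axes; all names below are local to the sketch.

#include <stdio.h>
#include <stdlib.h>

typedef struct { double lower, upper; } Interval;

static int cmp_lower(const void *a, const void *b)
{
    double d = ((const Interval *) a)->lower - ((const Interval *) b)->lower;
    return (d > 0) - (d < 0);
}

int main(void)
{
    /* The same example intervals as in the comment block above */
    Interval iv[] = {{0, 1}, {1, 3}, {2, 3}, {2, 4}};
    int n = 4, i, j, have_best = 0;
    double bestOverlap = 0, bestA = 0, bestB = 0;

    qsort(iv, n, sizeof(Interval), cmp_lower);

    /*
     * Consider each entry's lower bound as the lower bound b of the right
     * group.  Entries whose lower bound is < b cannot go right, so the left
     * group's upper bound a must cover their maximum upper bound.  Keep the
     * candidate with the least overlap (a - b; may be negative).
     */
    for (i = 1; i < n; i++)
    {
        double b = iv[i].lower;
        double a = iv[0].upper;

        for (j = 1; j < i; j++)
            if (iv[j].upper > a)
                a = iv[j].upper;

        if (!have_best || a - b < bestOverlap)
        {
            bestOverlap = a - b;
            bestA = a;
            bestB = b;
            have_best = 1;
        }
    }

    printf("best split: left=(min,%g), right=(%g,max), overlap=%g\n",
           bestA, bestB, bestOverlap);
    return 0;
}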
+
+/*
+ * Equality method
+ *
+ * This is used for boxes, points, circles, and polygons, all of which store
+ * boxes as GiST index entries.
+ *
+ * Returns true only when boxes are exactly the same. We can't use fuzzy
+ * comparisons here without breaking index consistency; therefore, this isn't
+ * equivalent to box_same().
+ */
+Datum
+gist_box_same(PG_FUNCTION_ARGS)
+{
+ BOX *b1 = PG_GETARG_BOX_P(0);
+ BOX *b2 = PG_GETARG_BOX_P(1);
+ bool *result = (bool *) PG_GETARG_POINTER(2);
+
+ if (b1 && b2)
+ *result = (float8_eq(b1->low.x, b2->low.x) &&
+ float8_eq(b1->low.y, b2->low.y) &&
+ float8_eq(b1->high.x, b2->high.x) &&
+ float8_eq(b1->high.y, b2->high.y));
+ else
+ *result = (b1 == NULL && b2 == NULL);
+ PG_RETURN_POINTER(result);
+}
+
+/*
+ * Leaf-level consistency for boxes: just apply the query operator
+ */
+static bool
+gist_box_leaf_consistent(BOX *key, BOX *query, StrategyNumber strategy)
+{
+ bool retval;
+
+ switch (strategy)
+ {
+ case RTLeftStrategyNumber:
+ retval = DatumGetBool(DirectFunctionCall2(box_left,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTOverLeftStrategyNumber:
+ retval = DatumGetBool(DirectFunctionCall2(box_overleft,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTOverlapStrategyNumber:
+ retval = DatumGetBool(DirectFunctionCall2(box_overlap,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTOverRightStrategyNumber:
+ retval = DatumGetBool(DirectFunctionCall2(box_overright,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTRightStrategyNumber:
+ retval = DatumGetBool(DirectFunctionCall2(box_right,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTSameStrategyNumber:
+ retval = DatumGetBool(DirectFunctionCall2(box_same,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTContainsStrategyNumber:
+ case RTOldContainsStrategyNumber:
+ retval = DatumGetBool(DirectFunctionCall2(box_contain,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTContainedByStrategyNumber:
+ case RTOldContainedByStrategyNumber:
+ retval = DatumGetBool(DirectFunctionCall2(box_contained,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTOverBelowStrategyNumber:
+ retval = DatumGetBool(DirectFunctionCall2(box_overbelow,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTBelowStrategyNumber:
+ retval = DatumGetBool(DirectFunctionCall2(box_below,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTAboveStrategyNumber:
+ retval = DatumGetBool(DirectFunctionCall2(box_above,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTOverAboveStrategyNumber:
+ retval = DatumGetBool(DirectFunctionCall2(box_overabove,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ retval = false; /* keep compiler quiet */
+ break;
+ }
+ return retval;
+}
+
+/*****************************************
+ * Common rtree functions (for boxes, polygons, and circles)
+ *****************************************/
+
+/*
+ * Internal-page consistency for all these types
+ *
+ * We can use the same function since all types use bounding boxes as the
+ * internal-page representation.
+ */
+static bool
+rtree_internal_consistent(BOX *key, BOX *query, StrategyNumber strategy)
+{
+ bool retval;
+
+ switch (strategy)
+ {
+ case RTLeftStrategyNumber:
+ retval = !DatumGetBool(DirectFunctionCall2(box_overright,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTOverLeftStrategyNumber:
+ retval = !DatumGetBool(DirectFunctionCall2(box_right,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTOverlapStrategyNumber:
+ retval = DatumGetBool(DirectFunctionCall2(box_overlap,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTOverRightStrategyNumber:
+ retval = !DatumGetBool(DirectFunctionCall2(box_left,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTRightStrategyNumber:
+ retval = !DatumGetBool(DirectFunctionCall2(box_overleft,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTSameStrategyNumber:
+ case RTContainsStrategyNumber:
+ case RTOldContainsStrategyNumber:
+ retval = DatumGetBool(DirectFunctionCall2(box_contain,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTContainedByStrategyNumber:
+ case RTOldContainedByStrategyNumber:
+ retval = DatumGetBool(DirectFunctionCall2(box_overlap,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTOverBelowStrategyNumber:
+ retval = !DatumGetBool(DirectFunctionCall2(box_above,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTBelowStrategyNumber:
+ retval = !DatumGetBool(DirectFunctionCall2(box_overabove,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTAboveStrategyNumber:
+ retval = !DatumGetBool(DirectFunctionCall2(box_overbelow,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ case RTOverAboveStrategyNumber:
+ retval = !DatumGetBool(DirectFunctionCall2(box_below,
+ PointerGetDatum(key),
+ PointerGetDatum(query)));
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ retval = false; /* keep compiler quiet */
+ break;
+ }
+ return retval;
+}
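
The negations above follow from internal-page keys being unions: a subtree can contain an entry strictly left of the query only if its bounding key is not "overlapping or to the right of" the query. Below is a minimal one-dimensional, standalone sketch of that reasoning (not PostgreSQL code; the real operators work on two-dimensional boxes and use fuzzy comparisons).

#include <stdbool.h>
#include <stdio.h>

typedef struct { double lo, hi; } Range;

/* leaf-level operator: key strictly left of query */
static bool range_left(Range key, Range query)
{
    return key.hi < query.lo;
}

/* leaf-level operator: key overlaps or is to the right of query */
static bool range_overright(Range key, Range query)
{
    return key.lo >= query.lo;
}

/* internal-page test: may the subtree under unionKey contain a match? */
static bool internal_consistent_left(Range unionKey, Range query)
{
    return !range_overright(unionKey, query);
}

int main(void)
{
    Range query = {5, 7};
    Range unionA = {0, 9};  /* extends left of the query: must descend */
    Range unionB = {6, 9};  /* starts at/after query.lo: can be skipped */

    printf("descend A: %d, descend B: %d\n",
           internal_consistent_left(unionA, query),
           internal_consistent_left(unionB, query));
    printf("leaf check for (1,3): %d\n", range_left((Range){1, 3}, query));
    return 0;
}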
+
+/**************************************************
+ * Polygon ops
+ **************************************************/
+
+/*
+ * GiST compress for polygons: represent a polygon by its bounding box
+ */
+Datum
+gist_poly_compress(PG_FUNCTION_ARGS)
+{
+ GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+ GISTENTRY *retval;
+
+ if (entry->leafkey)
+ {
+ POLYGON *in = DatumGetPolygonP(entry->key);
+ BOX *r;
+
+ r = (BOX *) palloc(sizeof(BOX));
+ memcpy((void *) r, (void *) &(in->boundbox), sizeof(BOX));
+
+ retval = (GISTENTRY *) palloc(sizeof(GISTENTRY));
+ gistentryinit(*retval, PointerGetDatum(r),
+ entry->rel, entry->page,
+ entry->offset, false);
+ }
+ else
+ retval = entry;
+ PG_RETURN_POINTER(retval);
+}
+
+/*
+ * The GiST Consistent method for polygons
+ */
+Datum
+gist_poly_consistent(PG_FUNCTION_ARGS)
+{
+ GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+ POLYGON *query = PG_GETARG_POLYGON_P(1);
+ StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+
+ /* Oid subtype = PG_GETARG_OID(3); */
+ bool *recheck = (bool *) PG_GETARG_POINTER(4);
+ bool result;
+
+ /* All cases served by this function are inexact */
+ *recheck = true;
+
+ if (DatumGetBoxP(entry->key) == NULL || query == NULL)
+ PG_RETURN_BOOL(false);
+
+ /*
+ * Since the operators require recheck anyway, we can just use
+ * rtree_internal_consistent even at leaf nodes. (This works in part
+ * because the index entries are bounding boxes, not polygons.)
+ */
+ result = rtree_internal_consistent(DatumGetBoxP(entry->key),
+ &(query->boundbox), strategy);
+
+ /* Avoid memory leak if supplied poly is toasted */
+ PG_FREE_IF_COPY(query, 1);
+
+ PG_RETURN_BOOL(result);
+}
+
+/**************************************************
+ * Circle ops
+ **************************************************/
+
+/*
+ * GiST compress for circles: represent a circle by its bounding box
+ */
+Datum
+gist_circle_compress(PG_FUNCTION_ARGS)
+{
+ GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+ GISTENTRY *retval;
+
+ if (entry->leafkey)
+ {
+ CIRCLE *in = DatumGetCircleP(entry->key);
+ BOX *r;
+
+ r = (BOX *) palloc(sizeof(BOX));
+ r->high.x = float8_pl(in->center.x, in->radius);
+ r->low.x = float8_mi(in->center.x, in->radius);
+ r->high.y = float8_pl(in->center.y, in->radius);
+ r->low.y = float8_mi(in->center.y, in->radius);
+
+ retval = (GISTENTRY *) palloc(sizeof(GISTENTRY));
+ gistentryinit(*retval, PointerGetDatum(r),
+ entry->rel, entry->page,
+ entry->offset, false);
+ }
+ else
+ retval = entry;
+ PG_RETURN_POINTER(retval);
+}
+
+/*
+ * The GiST Consistent method for circles
+ */
+Datum
+gist_circle_consistent(PG_FUNCTION_ARGS)
+{
+ GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+ CIRCLE *query = PG_GETARG_CIRCLE_P(1);
+ StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+
+ /* Oid subtype = PG_GETARG_OID(3); */
+ bool *recheck = (bool *) PG_GETARG_POINTER(4);
+ BOX bbox;
+ bool result;
+
+ /* All cases served by this function are inexact */
+ *recheck = true;
+
+ if (DatumGetBoxP(entry->key) == NULL || query == NULL)
+ PG_RETURN_BOOL(false);
+
+ /*
+ * Since the operators require recheck anyway, we can just use
+ * rtree_internal_consistent even at leaf nodes. (This works in part
+ * because the index entries are bounding boxes, not circles.)
+ */
+ bbox.high.x = float8_pl(query->center.x, query->radius);
+ bbox.low.x = float8_mi(query->center.x, query->radius);
+ bbox.high.y = float8_pl(query->center.y, query->radius);
+ bbox.low.y = float8_mi(query->center.y, query->radius);
+
+ result = rtree_internal_consistent(DatumGetBoxP(entry->key),
+ &bbox, strategy);
+
+ PG_RETURN_BOOL(result);
+}
+
+/**************************************************
+ * Point ops
+ **************************************************/
+
+Datum
+gist_point_compress(PG_FUNCTION_ARGS)
+{
+ GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+
+ if (entry->leafkey) /* Point, actually */
+ {
+ BOX *box = palloc(sizeof(BOX));
+ Point *point = DatumGetPointP(entry->key);
+ GISTENTRY *retval = palloc(sizeof(GISTENTRY));
+
+ box->high = box->low = *point;
+
+ gistentryinit(*retval, BoxPGetDatum(box),
+ entry->rel, entry->page, entry->offset, false);
+
+ PG_RETURN_POINTER(retval);
+ }
+
+ PG_RETURN_POINTER(entry);
+}
+
+/*
+ * GiST Fetch method for point
+ *
+ * Get point coordinates from its bounding box coordinates and form new
+ * gistentry.
+ */
+Datum
+gist_point_fetch(PG_FUNCTION_ARGS)
+{
+ GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+ BOX *in = DatumGetBoxP(entry->key);
+ Point *r;
+ GISTENTRY *retval;
+
+ retval = palloc(sizeof(GISTENTRY));
+
+ r = (Point *) palloc(sizeof(Point));
+ r->x = in->high.x;
+ r->y = in->high.y;
+ gistentryinit(*retval, PointerGetDatum(r),
+ entry->rel, entry->page,
+ entry->offset, false);
+
+ PG_RETURN_POINTER(retval);
+}
+
+
+#define point_point_distance(p1,p2) \
+ DatumGetFloat8(DirectFunctionCall2(point_distance, \
+ PointPGetDatum(p1), PointPGetDatum(p2)))
+
+static float8
+computeDistance(bool isLeaf, BOX *box, Point *point)
+{
+ float8 result = 0.0;
+
+ if (isLeaf)
+ {
+ /* simple point to point distance */
+ result = point_point_distance(point, &box->low);
+ }
+ else if (point->x <= box->high.x && point->x >= box->low.x &&
+ point->y <= box->high.y && point->y >= box->low.y)
+ {
+ /* point inside the box */
+ result = 0.0;
+ }
+ else if (point->x <= box->high.x && point->x >= box->low.x)
+ {
+ /* point is above or below the box */
+ Assert(box->low.y <= box->high.y);
+ if (point->y > box->high.y)
+ result = float8_mi(point->y, box->high.y);
+ else if (point->y < box->low.y)
+ result = float8_mi(box->low.y, point->y);
+ else
+ elog(ERROR, "inconsistent point values");
+ }
+ else if (point->y <= box->high.y && point->y >= box->low.y)
+ {
+ /* point is to left or right of box */
+ Assert(box->low.x <= box->high.x);
+ if (point->x > box->high.x)
+ result = float8_mi(point->x, box->high.x);
+ else if (point->x < box->low.x)
+ result = float8_mi(box->low.x, point->x);
+ else
+ elog(ERROR, "inconsistent point values");
+ }
+ else
+ {
+ /* closest point will be a vertex */
+ Point p;
+ float8 subresult;
+
+ result = point_point_distance(point, &box->low);
+
+ subresult = point_point_distance(point, &box->high);
+ if (result > subresult)
+ result = subresult;
+
+ p.x = box->low.x;
+ p.y = box->high.y;
+ subresult = point_point_distance(point, &p);
+ if (result > subresult)
+ result = subresult;
+
+ p.x = box->high.x;
+ p.y = box->low.y;
+ subresult = point_point_distance(point, &p);
+ if (result > subresult)
+ result = subresult;
+ }
+
+ return result;
+}
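
A quick standalone arithmetic check of the cases handled by computeDistance() above, assuming a hypothetical box with corners (0,0) and (2,2); this is only an illustration, not part of the patch (link with -lm).

#include <math.h>
#include <stdio.h>

int main(void)
{
    /* hypothetical box: low = (0,0), high = (2,2) */
    double hix = 2.0, hiy = 2.0;

    /* (1,5): within the box's x-range, above it -> vertical gap only */
    printf("above:  %g\n", 5.0 - hiy);                    /* 3 */

    /* (5,5): outside both ranges -> distance to nearest corner (2,2) */
    printf("corner: %g\n", hypot(5.0 - hix, 5.0 - hiy));  /* ~4.243 */

    /* (1,1): inside the box -> 0 */
    printf("inside: %g\n", 0.0);
    return 0;
}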
+
+static bool
+gist_point_consistent_internal(StrategyNumber strategy,
+ bool isLeaf, BOX *key, Point *query)
+{
+ bool result = false;
+
+ switch (strategy)
+ {
+ case RTLeftStrategyNumber:
+ result = FPlt(key->low.x, query->x);
+ break;
+ case RTRightStrategyNumber:
+ result = FPgt(key->high.x, query->x);
+ break;
+ case RTAboveStrategyNumber:
+ result = FPgt(key->high.y, query->y);
+ break;
+ case RTBelowStrategyNumber:
+ result = FPlt(key->low.y, query->y);
+ break;
+ case RTSameStrategyNumber:
+ if (isLeaf)
+ {
+ /* key.high must equal key.low, so we can disregard it */
+ result = (FPeq(key->low.x, query->x) &&
+ FPeq(key->low.y, query->y));
+ }
+ else
+ {
+ result = (FPle(query->x, key->high.x) &&
+ FPge(query->x, key->low.x) &&
+ FPle(query->y, key->high.y) &&
+ FPge(query->y, key->low.y));
+ }
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ result = false; /* keep compiler quiet */
+ break;
+ }
+
+ return result;
+}
+
+#define GeoStrategyNumberOffset 20
+#define PointStrategyNumberGroup 0
+#define BoxStrategyNumberGroup 1
+#define PolygonStrategyNumberGroup 2
+#define CircleStrategyNumberGroup 3
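
gist_point_consistent() below recovers the strategy group and the underlying RT* strategy with division and modulus against GeoStrategyNumberOffset. A standalone sketch of that decoding, using made-up numbers rather than actual catalog strategy values:

#include <stdio.h>

#define GEO_OFFSET 20        /* mirrors GeoStrategyNumberOffset above */

int main(void)
{
    int base = 3;            /* hypothetical RT* strategy number */
    int group = 2;           /* hypothetical strategy group */
    int strategy = group * GEO_OFFSET + base;

    printf("strategy=%d -> group=%d, base=%d\n",
           strategy, strategy / GEO_OFFSET, strategy % GEO_OFFSET);
    return 0;
}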
+
+Datum
+gist_point_consistent(PG_FUNCTION_ARGS)
+{
+ GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+ StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+ bool *recheck = (bool *) PG_GETARG_POINTER(4);
+ bool result;
+ StrategyNumber strategyGroup = strategy / GeoStrategyNumberOffset;
+
+ switch (strategyGroup)
+ {
+ case PointStrategyNumberGroup:
+ result = gist_point_consistent_internal(strategy % GeoStrategyNumberOffset,
+ GIST_LEAF(entry),
+ DatumGetBoxP(entry->key),
+ PG_GETARG_POINT_P(1));
+ *recheck = false;
+ break;
+ case BoxStrategyNumberGroup:
+ {
+ /*
+ * The only operator in this group is point <@ box (on_pb), so
+ * we needn't examine strategy again.
+ *
+ * For historical reasons, on_pb uses exact rather than fuzzy
+ * comparisons. We could use box_overlap when at an internal
+ * page, but that would lead to possibly visiting child pages
+ * uselessly, because box_overlap uses fuzzy comparisons.
+ * Instead we write a non-fuzzy overlap test. The same code
+ * will also serve for leaf-page tests, since leaf keys have
+ * high == low.
+ */
+ BOX *query,
+ *key;
+
+ query = PG_GETARG_BOX_P(1);
+ key = DatumGetBoxP(entry->key);
+
+ result = (key->high.x >= query->low.x &&
+ key->low.x <= query->high.x &&
+ key->high.y >= query->low.y &&
+ key->low.y <= query->high.y);
+ *recheck = false;
+ }
+ break;
+ case PolygonStrategyNumberGroup:
+ {
+ POLYGON *query = PG_GETARG_POLYGON_P(1);
+
+ result = DatumGetBool(DirectFunctionCall5(gist_poly_consistent,
+ PointerGetDatum(entry),
+ PolygonPGetDatum(query),
+ Int16GetDatum(RTOverlapStrategyNumber),
+ 0, PointerGetDatum(recheck)));
+
+ if (GIST_LEAF(entry) && result)
+ {
+ /*
+ * We are at a leaf page, and the quick check showed that the point
+ * overlaps the polygon's bounding box
+ */
+ BOX *box = DatumGetBoxP(entry->key);
+
+ Assert(box->high.x == box->low.x
+ && box->high.y == box->low.y);
+ result = DatumGetBool(DirectFunctionCall2(poly_contain_pt,
+ PolygonPGetDatum(query),
+ PointPGetDatum(&box->high)));
+ *recheck = false;
+ }
+ }
+ break;
+ case CircleStrategyNumberGroup:
+ {
+ CIRCLE *query = PG_GETARG_CIRCLE_P(1);
+
+ result = DatumGetBool(DirectFunctionCall5(gist_circle_consistent,
+ PointerGetDatum(entry),
+ CirclePGetDatum(query),
+ Int16GetDatum(RTOverlapStrategyNumber),
+ 0, PointerGetDatum(recheck)));
+
+ if (GIST_LEAF(entry) && result)
+ {
+ /*
+ * We are at a leaf page, and the quick check showed that the point
+ * overlaps the circle's bounding box
+ */
+ BOX *box = DatumGetBoxP(entry->key);
+
+ Assert(box->high.x == box->low.x
+ && box->high.y == box->low.y);
+ result = DatumGetBool(DirectFunctionCall2(circle_contain_pt,
+ CirclePGetDatum(query),
+ PointPGetDatum(&box->high)));
+ *recheck = false;
+ }
+ }
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ result = false; /* keep compiler quiet */
+ break;
+ }
+
+ PG_RETURN_BOOL(result);
+}
+
+Datum
+gist_point_distance(PG_FUNCTION_ARGS)
+{
+ GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+ StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+ float8 distance;
+ StrategyNumber strategyGroup = strategy / GeoStrategyNumberOffset;
+
+ switch (strategyGroup)
+ {
+ case PointStrategyNumberGroup:
+ distance = computeDistance(GIST_LEAF(entry),
+ DatumGetBoxP(entry->key),
+ PG_GETARG_POINT_P(1));
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ distance = 0.0; /* keep compiler quiet */
+ break;
+ }
+
+ PG_RETURN_FLOAT8(distance);
+}
+
+static float8
+gist_bbox_distance(GISTENTRY *entry, Datum query, StrategyNumber strategy)
+{
+ float8 distance;
+ StrategyNumber strategyGroup = strategy / GeoStrategyNumberOffset;
+
+ switch (strategyGroup)
+ {
+ case PointStrategyNumberGroup:
+ distance = computeDistance(false,
+ DatumGetBoxP(entry->key),
+ DatumGetPointP(query));
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ distance = 0.0; /* keep compiler quiet */
+ }
+
+ return distance;
+}
+
+Datum
+gist_box_distance(PG_FUNCTION_ARGS)
+{
+ GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+ Datum query = PG_GETARG_DATUM(1);
+ StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+
+ /* Oid subtype = PG_GETARG_OID(3); */
+ /* bool *recheck = (bool *) PG_GETARG_POINTER(4); */
+ float8 distance;
+
+ distance = gist_bbox_distance(entry, query, strategy);
+
+ PG_RETURN_FLOAT8(distance);
+}
+
+/*
+ * The inexact GiST distance methods for geometric types that store bounding
+ * boxes.
+ *
+ * Compute lossy distance from point to index entries. The result is inexact
+ * because index entries are bounding boxes, not the exact shapes of the
+ * indexed geometric types. We use the distance from the point to the MBR of
+ * the index entry, which is a lower-bound estimate of the distance from the
+ * point to the indexed geometric type.
+ */
+Datum
+gist_circle_distance(PG_FUNCTION_ARGS)
+{
+ GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+ Datum query = PG_GETARG_DATUM(1);
+ StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+
+ /* Oid subtype = PG_GETARG_OID(3); */
+ bool *recheck = (bool *) PG_GETARG_POINTER(4);
+ float8 distance;
+
+ distance = gist_bbox_distance(entry, query, strategy);
+ *recheck = true;
+
+ PG_RETURN_FLOAT8(distance);
+}
+
+Datum
+gist_poly_distance(PG_FUNCTION_ARGS)
+{
+ GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+ Datum query = PG_GETARG_DATUM(1);
+ StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+
+ /* Oid subtype = PG_GETARG_OID(3); */
+ bool *recheck = (bool *) PG_GETARG_POINTER(4);
+ float8 distance;
+
+ distance = gist_bbox_distance(entry, query, strategy);
+ *recheck = true;
+
+ PG_RETURN_FLOAT8(distance);
+}
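
A standalone numeric check of the lower-bound claim in the comment above gist_circle_distance() (an illustration only, not part of this patch): for a circle of radius 1 centered at the origin and a query point at (3,4), the true distance is 5 - 1 = 4, while the distance to the circle's bounding box (-1,-1)..(1,1) is sqrt(13) ≈ 3.61, which never exceeds the true distance; hence the lossy result and *recheck = true.

#include <math.h>
#include <stdio.h>

int main(void)
{
    /* circle: center (0,0), radius 1; query point (3,4) */
    double true_dist = hypot(3.0, 4.0) - 1.0;          /* 4 */

    /* bounding box (-1,-1)..(1,1); nearest corner to (3,4) is (1,1) */
    double bbox_dist = hypot(3.0 - 1.0, 4.0 - 1.0);    /* ~3.606 */

    printf("bbox=%g <= true=%g\n", bbox_dist, true_dist);
    return 0;
}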
diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c
new file mode 100644
index 0000000..b8aa77f
--- /dev/null
+++ b/src/backend/access/gist/gistscan.c
@@ -0,0 +1,358 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistscan.c
+ * routines to manage scans on GiST index relations
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/gist_private.h"
+#include "access/gistscan.h"
+#include "access/relscan.h"
+#include "utils/float.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+
+/*
+ * Pairing heap comparison function for the GISTSearchItem queue
+ */
+static int
+pairingheap_GISTSearchItem_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg)
+{
+ const GISTSearchItem *sa = (const GISTSearchItem *) a;
+ const GISTSearchItem *sb = (const GISTSearchItem *) b;
+ IndexScanDesc scan = (IndexScanDesc) arg;
+ int i;
+
+ /* Order according to distance comparison */
+ for (i = 0; i < scan->numberOfOrderBys; i++)
+ {
+ if (sa->distances[i].isnull)
+ {
+ if (!sb->distances[i].isnull)
+ return -1;
+ }
+ else if (sb->distances[i].isnull)
+ {
+ return 1;
+ }
+ else
+ {
+ int cmp = -float8_cmp_internal(sa->distances[i].value,
+ sb->distances[i].value);
+
+ if (cmp != 0)
+ return cmp;
+ }
+ }
+
+ /* Heap items go before inner pages, to ensure a depth-first search */
+ if (GISTSearchItemIsHeap(*sa) && !GISTSearchItemIsHeap(*sb))
+ return 1;
+ if (!GISTSearchItemIsHeap(*sa) && GISTSearchItemIsHeap(*sb))
+ return -1;
+
+ return 0;
+}
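
A simplified, standalone illustration of the visit order that the comparison function above establishes (this is not the real pairing heap, just a qsort-based stand-in): smaller distances come out first, NULL distances come out last, and at equal distance heap (leaf) items precede inner pages.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct
{
    double distance;
    bool isnull;             /* no distance available (sorts last) */
    bool isheap;             /* true = heap tuple, false = inner page */
    const char *tag;
} Item;

/* qsort comparator producing the visit order described above */
static int visit_order_cmp(const void *a, const void *b)
{
    const Item *ia = a, *ib = b;

    if (ia->isnull != ib->isnull)
        return ia->isnull ? 1 : -1;      /* NULL distances last */
    if (!ia->isnull)
    {
        if (ia->distance < ib->distance) return -1;
        if (ia->distance > ib->distance) return 1;
    }
    if (ia->isheap != ib->isheap)
        return ia->isheap ? -1 : 1;      /* heap items before inner pages */
    return 0;
}

int main(void)
{
    Item queue[] = {
        {2.0, false, false, "inner d=2"},
        {1.0, false, true,  "heap  d=1"},
        {0.0, true,  false, "inner d=NULL"},
        {1.0, false, false, "inner d=1"},
    };
    int i, n = sizeof(queue) / sizeof(queue[0]);

    qsort(queue, n, sizeof(Item), visit_order_cmp);
    for (i = 0; i < n; i++)
        printf("%s\n", queue[i].tag);
    return 0;
}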
+
+
+/*
+ * Index AM API functions for scanning GiST indexes
+ */
+
+IndexScanDesc
+gistbeginscan(Relation r, int nkeys, int norderbys)
+{
+ IndexScanDesc scan;
+ GISTSTATE *giststate;
+ GISTScanOpaque so;
+ MemoryContext oldCxt;
+
+ scan = RelationGetIndexScan(r, nkeys, norderbys);
+
+ /* First, set up a GISTSTATE with a scan-lifespan memory context */
+ giststate = initGISTstate(scan->indexRelation);
+
+ /*
+ * Everything made below is in the scanCxt, or is a child of the scanCxt,
+ * so it'll all go away automatically in gistendscan.
+ */
+ oldCxt = MemoryContextSwitchTo(giststate->scanCxt);
+
+ /* initialize opaque data */
+ so = (GISTScanOpaque) palloc0(sizeof(GISTScanOpaqueData));
+ so->giststate = giststate;
+ giststate->tempCxt = createTempGistContext();
+ so->queue = NULL;
+ so->queueCxt = giststate->scanCxt; /* see gistrescan */
+
+ /* workspaces with size dependent on numberOfOrderBys: */
+ so->distances = palloc(sizeof(so->distances[0]) * scan->numberOfOrderBys);
+ so->qual_ok = true; /* in case there are zero keys */
+ if (scan->numberOfOrderBys > 0)
+ {
+ scan->xs_orderbyvals = palloc0(sizeof(Datum) * scan->numberOfOrderBys);
+ scan->xs_orderbynulls = palloc(sizeof(bool) * scan->numberOfOrderBys);
+ memset(scan->xs_orderbynulls, true, sizeof(bool) * scan->numberOfOrderBys);
+ }
+
+ so->killedItems = NULL; /* until needed */
+ so->numKilled = 0;
+ so->curBlkno = InvalidBlockNumber;
+ so->curPageLSN = InvalidXLogRecPtr;
+
+ scan->opaque = so;
+
+ /*
+ * All fields required for index-only scans are initialized in gistrescan,
+ * as we don't know yet if we're doing an index-only scan or not.
+ */
+
+ MemoryContextSwitchTo(oldCxt);
+
+ return scan;
+}
+
+void
+gistrescan(IndexScanDesc scan, ScanKey key, int nkeys,
+ ScanKey orderbys, int norderbys)
+{
+ /* nkeys and norderbys arguments are ignored */
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+ bool first_time;
+ int i;
+ MemoryContext oldCxt;
+
+ /* rescan an existing indexscan --- reset state */
+
+ /*
+ * The first time through, we create the search queue in the scanCxt.
+ * Subsequent times through, we create the queue in a separate queueCxt,
+ * which is created on the second call and reset on later calls. Thus, in
+ * the common case where a scan is only rescan'd once, we just put the
+ * queue in scanCxt and don't pay the overhead of making a second memory
+ * context. If we do rescan more than once, the first queue is just left
+ * for dead until end of scan; this small wastage seems worth the savings
+ * in the common case.
+ */
+ if (so->queue == NULL)
+ {
+ /* first time through */
+ Assert(so->queueCxt == so->giststate->scanCxt);
+ first_time = true;
+ }
+ else if (so->queueCxt == so->giststate->scanCxt)
+ {
+ /* second time through */
+ so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt,
+ "GiST queue context",
+ ALLOCSET_DEFAULT_SIZES);
+ first_time = false;
+ }
+ else
+ {
+ /* third or later time through */
+ MemoryContextReset(so->queueCxt);
+ first_time = false;
+ }
+
+ /*
+ * If we're doing an index-only scan, on the first call, also initialize a
+ * tuple descriptor to represent the returned index tuples and create a
+ * memory context to hold them during the scan.
+ */
+ if (scan->xs_want_itup && !scan->xs_hitupdesc)
+ {
+ int natts;
+ int nkeyatts;
+ int attno;
+
+ /*
+ * The storage type of the index can be different from the original
+ * datatype being indexed, so we cannot just grab the index's tuple
+ * descriptor. Instead, construct a descriptor with the original data
+ * types.
+ */
+ natts = RelationGetNumberOfAttributes(scan->indexRelation);
+ nkeyatts = IndexRelationGetNumberOfKeyAttributes(scan->indexRelation);
+ so->giststate->fetchTupdesc = CreateTemplateTupleDesc(natts);
+ for (attno = 1; attno <= nkeyatts; attno++)
+ {
+ TupleDescInitEntry(so->giststate->fetchTupdesc, attno, NULL,
+ scan->indexRelation->rd_opcintype[attno - 1],
+ -1, 0);
+ }
+
+ for (; attno <= natts; attno++)
+ {
+ /* taking opcintype from giststate->tupdesc */
+ TupleDescInitEntry(so->giststate->fetchTupdesc, attno, NULL,
+ TupleDescAttr(so->giststate->leafTupdesc,
+ attno - 1)->atttypid,
+ -1, 0);
+ }
+ scan->xs_hitupdesc = so->giststate->fetchTupdesc;
+
+ /* Also create a memory context that will hold the returned tuples */
+ so->pageDataCxt = AllocSetContextCreate(so->giststate->scanCxt,
+ "GiST page data context",
+ ALLOCSET_DEFAULT_SIZES);
+ }
+
+ /* create new, empty pairing heap for search queue */
+ oldCxt = MemoryContextSwitchTo(so->queueCxt);
+ so->queue = pairingheap_allocate(pairingheap_GISTSearchItem_cmp, scan);
+ MemoryContextSwitchTo(oldCxt);
+
+ so->firstCall = true;
+
+ /* Update scan key, if a new one is given */
+ if (key && scan->numberOfKeys > 0)
+ {
+ void **fn_extras = NULL;
+
+ /*
+ * If this isn't the first time through, preserve the fn_extra
+ * pointers, so that if the consistentFns are using them to cache
+ * data, that data is not leaked across a rescan.
+ */
+ if (!first_time)
+ {
+ fn_extras = (void **) palloc(scan->numberOfKeys * sizeof(void *));
+ for (i = 0; i < scan->numberOfKeys; i++)
+ fn_extras[i] = scan->keyData[i].sk_func.fn_extra;
+ }
+
+ memmove(scan->keyData, key,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+
+ /*
+ * Modify the scan key so that the Consistent method is called for all
+ * comparisons. The original operator is passed to the Consistent
+ * function in the form of its strategy number, which is available
+ * from the sk_strategy field, and its subtype from the sk_subtype
+ * field.
+ *
+ * Next, if any of the keys is NULL and that key is not marked with
+ * SK_SEARCHNULL/SK_SEARCHNOTNULL, then nothing can be found (ie, we
+ * assume all indexable operators are strict).
+ */
+ so->qual_ok = true;
+
+ for (i = 0; i < scan->numberOfKeys; i++)
+ {
+ ScanKey skey = scan->keyData + i;
+
+ /*
+ * Copy consistent support function to ScanKey structure instead
+ * of function implementing filtering operator.
+ */
+ fmgr_info_copy(&(skey->sk_func),
+ &(so->giststate->consistentFn[skey->sk_attno - 1]),
+ so->giststate->scanCxt);
+
+ /* Restore prior fn_extra pointers, if not first time */
+ if (!first_time)
+ skey->sk_func.fn_extra = fn_extras[i];
+
+ if (skey->sk_flags & SK_ISNULL)
+ {
+ if (!(skey->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL)))
+ so->qual_ok = false;
+ }
+ }
+
+ if (!first_time)
+ pfree(fn_extras);
+ }
+
+ /* Update order-by key, if a new one is given */
+ if (orderbys && scan->numberOfOrderBys > 0)
+ {
+ void **fn_extras = NULL;
+
+ /* As above, preserve fn_extra if not first time through */
+ if (!first_time)
+ {
+ fn_extras = (void **) palloc(scan->numberOfOrderBys * sizeof(void *));
+ for (i = 0; i < scan->numberOfOrderBys; i++)
+ fn_extras[i] = scan->orderByData[i].sk_func.fn_extra;
+ }
+
+ memmove(scan->orderByData, orderbys,
+ scan->numberOfOrderBys * sizeof(ScanKeyData));
+
+ so->orderByTypes = (Oid *) palloc(scan->numberOfOrderBys * sizeof(Oid));
+
+ /*
+ * Modify the order-by key so that the Distance method is called for
+ * all comparisons. The original operator is passed to the Distance
+ * function in the form of its strategy number, which is available
+ * from the sk_strategy field, and its subtype from the sk_subtype
+ * field.
+ */
+ for (i = 0; i < scan->numberOfOrderBys; i++)
+ {
+ ScanKey skey = scan->orderByData + i;
+ FmgrInfo *finfo = &(so->giststate->distanceFn[skey->sk_attno - 1]);
+
+ /* Check we actually have a distance function ... */
+ if (!OidIsValid(finfo->fn_oid))
+ elog(ERROR, "missing support function %d for attribute %d of index \"%s\"",
+ GIST_DISTANCE_PROC, skey->sk_attno,
+ RelationGetRelationName(scan->indexRelation));
+
+ /*
+ * Look up the datatype returned by the original ordering
+ * operator. GiST always uses a float8 for the distance function,
+ * but the ordering operator could be anything else.
+ *
+ * XXX: The distance function is only allowed to be lossy if the
+ * ordering operator's result type is float4 or float8. Otherwise
+ * we don't know how to return the distance to the executor. But
+ * we cannot check that here, as we won't know if the distance
+ * function is lossy until it returns *recheck = true for the
+ * first time.
+ */
+ so->orderByTypes[i] = get_func_rettype(skey->sk_func.fn_oid);
+
+ /*
+ * Copy distance support function to ScanKey structure instead of
+ * function implementing ordering operator.
+ */
+ fmgr_info_copy(&(skey->sk_func), finfo, so->giststate->scanCxt);
+
+ /* Restore prior fn_extra pointers, if not first time */
+ if (!first_time)
+ skey->sk_func.fn_extra = fn_extras[i];
+ }
+
+ if (!first_time)
+ pfree(fn_extras);
+ }
+
+ /* any previous xs_hitup will have been pfree'd in context resets above */
+ scan->xs_hitup = NULL;
+}
+
+void
+gistendscan(IndexScanDesc scan)
+{
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+
+ /*
+ * freeGISTstate is enough to clean up everything made by gistbeginscan,
+ * as well as the queueCxt if there is a separate context for it.
+ */
+ freeGISTstate(so->giststate);
+}
diff --git a/src/backend/access/gist/gistsplit.c b/src/backend/access/gist/gistsplit.c
new file mode 100644
index 0000000..1bf48b4
--- /dev/null
+++ b/src/backend/access/gist/gistsplit.c
@@ -0,0 +1,779 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistsplit.c
+ * Multi-column page splitting algorithm
+ *
+ * This file is concerned with making good page-split decisions in multi-column
+ * GiST indexes. The opclass-specific picksplit functions can only be expected
+ * to produce answers based on a single column. We first run the picksplit
+ * function for column 1; then, if there are more columns, we check if any of
+ * the tuples are "don't cares" so far as the column 1 split is concerned
+ * (that is, they could go to either side for no additional penalty). If so,
+ * we try to redistribute those tuples on the basis of the next column.
+ * Repeat till we're out of columns.
+ *
+ * gistSplitByKey() is the entry point to this file.
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistsplit.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/gist_private.h"
+#include "utils/rel.h"
+
+typedef struct
+{
+ OffsetNumber *entries;
+ int len;
+ Datum *attr;
+ bool *isnull;
+ bool *dontcare;
+} GistSplitUnion;
+
+
+/*
+ * Form unions of subkeys in itvec[] entries listed in gsvp->entries[],
+ * ignoring any tuples that are marked in gsvp->dontcare[]. Subroutine for
+ * gistunionsubkey.
+ */
+static void
+gistunionsubkeyvec(GISTSTATE *giststate, IndexTuple *itvec,
+ GistSplitUnion *gsvp)
+{
+ IndexTuple *cleanedItVec;
+ int i,
+ cleanedLen = 0;
+
+ cleanedItVec = (IndexTuple *) palloc(sizeof(IndexTuple) * gsvp->len);
+
+ for (i = 0; i < gsvp->len; i++)
+ {
+ if (gsvp->dontcare && gsvp->dontcare[gsvp->entries[i]])
+ continue;
+
+ cleanedItVec[cleanedLen++] = itvec[gsvp->entries[i] - 1];
+ }
+
+ gistMakeUnionItVec(giststate, cleanedItVec, cleanedLen,
+ gsvp->attr, gsvp->isnull);
+
+ pfree(cleanedItVec);
+}
+
+/*
+ * Recompute unions of left- and right-side subkeys after a page split,
+ * ignoring any tuples that are marked in spl->spl_dontcare[].
+ *
+ * Note: we always recompute union keys for all index columns. In some cases
+ * this might represent duplicate work for the leftmost column(s), but it's
+ * not safe to assume that "zero penalty to move a tuple" means "the union
+ * key doesn't change at all". Penalty functions aren't 100% accurate.
+ */
+static void
+gistunionsubkey(GISTSTATE *giststate, IndexTuple *itvec, GistSplitVector *spl)
+{
+ GistSplitUnion gsvp;
+
+ gsvp.dontcare = spl->spl_dontcare;
+
+ gsvp.entries = spl->splitVector.spl_left;
+ gsvp.len = spl->splitVector.spl_nleft;
+ gsvp.attr = spl->spl_lattr;
+ gsvp.isnull = spl->spl_lisnull;
+
+ gistunionsubkeyvec(giststate, itvec, &gsvp);
+
+ gsvp.entries = spl->splitVector.spl_right;
+ gsvp.len = spl->splitVector.spl_nright;
+ gsvp.attr = spl->spl_rattr;
+ gsvp.isnull = spl->spl_risnull;
+
+ gistunionsubkeyvec(giststate, itvec, &gsvp);
+}
+
+/*
+ * Find tuples that are "don't cares", that is could be moved to the other
+ * side of the split with zero penalty, so far as the attno column is
+ * concerned.
+ *
+ * Don't-care tuples are marked by setting the corresponding entry in
+ * spl->spl_dontcare[] to "true". Caller must have initialized that array
+ * to zeroes.
+ *
+ * Returns number of don't-cares found.
+ */
+static int
+findDontCares(Relation r, GISTSTATE *giststate, GISTENTRY *valvec,
+ GistSplitVector *spl, int attno)
+{
+ int i;
+ GISTENTRY entry;
+ int NumDontCare = 0;
+
+ /*
+ * First, search the left-side tuples to see if any have zero penalty to
+ * be added to the right-side union key.
+ *
+ * attno column is known all-not-null (see gistSplitByKey), so we need not
+ * check for nulls
+ */
+ gistentryinit(entry, spl->splitVector.spl_rdatum, r, NULL,
+ (OffsetNumber) 0, false);
+ for (i = 0; i < spl->splitVector.spl_nleft; i++)
+ {
+ int j = spl->splitVector.spl_left[i];
+ float penalty = gistpenalty(giststate, attno, &entry, false,
+ &valvec[j], false);
+
+ if (penalty == 0.0)
+ {
+ spl->spl_dontcare[j] = true;
+ NumDontCare++;
+ }
+ }
+
+ /* And conversely for the right-side tuples */
+ gistentryinit(entry, spl->splitVector.spl_ldatum, r, NULL,
+ (OffsetNumber) 0, false);
+ for (i = 0; i < spl->splitVector.spl_nright; i++)
+ {
+ int j = spl->splitVector.spl_right[i];
+ float penalty = gistpenalty(giststate, attno, &entry, false,
+ &valvec[j], false);
+
+ if (penalty == 0.0)
+ {
+ spl->spl_dontcare[j] = true;
+ NumDontCare++;
+ }
+ }
+
+ return NumDontCare;
+}
+
+/*
+ * Remove tuples that are marked don't-cares from the tuple index array a[]
+ * of length *len. This is applied separately to the spl_left and spl_right
+ * arrays.
+ */
+static void
+removeDontCares(OffsetNumber *a, int *len, const bool *dontcare)
+{
+ int origlen,
+ newlen,
+ i;
+ OffsetNumber *curwpos;
+
+ origlen = newlen = *len;
+ curwpos = a;
+ for (i = 0; i < origlen; i++)
+ {
+ OffsetNumber ai = a[i];
+
+ if (dontcare[ai] == false)
+ {
+ /* re-emit item into a[] */
+ *curwpos = ai;
+ curwpos++;
+ }
+ else
+ newlen--;
+ }
+
+ *len = newlen;
+}
+
+/*
+ * Place a single don't-care tuple into either the left or right side of the
+ * split, according to which has least penalty for merging the tuple into
+ * the previously-computed union keys. We need consider only columns starting
+ * at attno.
+ */
+static void
+placeOne(Relation r, GISTSTATE *giststate, GistSplitVector *v,
+ IndexTuple itup, OffsetNumber off, int attno)
+{
+ GISTENTRY identry[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ bool toLeft = true;
+
+ gistDeCompressAtt(giststate, r, itup, NULL, (OffsetNumber) 0,
+ identry, isnull);
+
+ for (; attno < giststate->nonLeafTupdesc->natts; attno++)
+ {
+ float lpenalty,
+ rpenalty;
+ GISTENTRY entry;
+
+ gistentryinit(entry, v->spl_lattr[attno], r, NULL, 0, false);
+ lpenalty = gistpenalty(giststate, attno, &entry, v->spl_lisnull[attno],
+ identry + attno, isnull[attno]);
+ gistentryinit(entry, v->spl_rattr[attno], r, NULL, 0, false);
+ rpenalty = gistpenalty(giststate, attno, &entry, v->spl_risnull[attno],
+ identry + attno, isnull[attno]);
+
+ if (lpenalty != rpenalty)
+ {
+ if (lpenalty > rpenalty)
+ toLeft = false;
+ break;
+ }
+ }
+
+ if (toLeft)
+ v->splitVector.spl_left[v->splitVector.spl_nleft++] = off;
+ else
+ v->splitVector.spl_right[v->splitVector.spl_nright++] = off;
+}
+
+#define SWAPVAR( s, d, t ) \
+do { \
+ (t) = (s); \
+ (s) = (d); \
+ (d) = (t); \
+} while(0)
+
+/*
+ * Clean up when we did a secondary split but the user-defined PickSplit
+ * method didn't support it (leaving spl_ldatum_exists or spl_rdatum_exists
+ * true).
+ *
+ * We consider whether to swap the left and right outputs of the secondary
+ * split; this can be worthwhile if the penalty for merging those tuples into
+ * the previously chosen sets is less that way.
+ *
+ * In any case we must update the union datums for the current column by
+ * adding in the previous union keys (oldL/oldR), since the user-defined
+ * PickSplit method didn't do so.
+ */
+static void
+supportSecondarySplit(Relation r, GISTSTATE *giststate, int attno,
+ GIST_SPLITVEC *sv, Datum oldL, Datum oldR)
+{
+ bool leaveOnLeft = true,
+ tmpBool;
+ GISTENTRY entryL,
+ entryR,
+ entrySL,
+ entrySR;
+
+ gistentryinit(entryL, oldL, r, NULL, 0, false);
+ gistentryinit(entryR, oldR, r, NULL, 0, false);
+ gistentryinit(entrySL, sv->spl_ldatum, r, NULL, 0, false);
+ gistentryinit(entrySR, sv->spl_rdatum, r, NULL, 0, false);
+
+ if (sv->spl_ldatum_exists && sv->spl_rdatum_exists)
+ {
+ float penalty1,
+ penalty2;
+
+ penalty1 = gistpenalty(giststate, attno, &entryL, false, &entrySL, false) +
+ gistpenalty(giststate, attno, &entryR, false, &entrySR, false);
+ penalty2 = gistpenalty(giststate, attno, &entryL, false, &entrySR, false) +
+ gistpenalty(giststate, attno, &entryR, false, &entrySL, false);
+
+ if (penalty1 > penalty2)
+ leaveOnLeft = false;
+ }
+ else
+ {
+ GISTENTRY *entry1 = (sv->spl_ldatum_exists) ? &entryL : &entryR;
+ float penalty1,
+ penalty2;
+
+ /*
+ * There is only one previously defined union, so we just choose whether
+ * to swap by the lower penalty for that side. We can only get here if a
+ * secondary split happened to have all NULLs in its column in the
+ * tuples that the outer recursion level had assigned to one side.
+ * (Note that the null checks in gistSplitByKey don't prevent the
+ * case, because they'll only be checking tuples that were considered
+ * don't-cares at the outer recursion level, not the tuples that went
+ * into determining the passed-down left and right union keys.)
+ */
+ penalty1 = gistpenalty(giststate, attno, entry1, false, &entrySL, false);
+ penalty2 = gistpenalty(giststate, attno, entry1, false, &entrySR, false);
+
+ if (penalty1 < penalty2)
+ leaveOnLeft = (sv->spl_ldatum_exists) ? true : false;
+ else
+ leaveOnLeft = (sv->spl_rdatum_exists) ? true : false;
+ }
+
+ if (leaveOnLeft == false)
+ {
+ /*
+ * swap left and right
+ */
+ OffsetNumber *off,
+ noff;
+ Datum datum;
+
+ SWAPVAR(sv->spl_left, sv->spl_right, off);
+ SWAPVAR(sv->spl_nleft, sv->spl_nright, noff);
+ SWAPVAR(sv->spl_ldatum, sv->spl_rdatum, datum);
+ gistentryinit(entrySL, sv->spl_ldatum, r, NULL, 0, false);
+ gistentryinit(entrySR, sv->spl_rdatum, r, NULL, 0, false);
+ }
+
+ if (sv->spl_ldatum_exists)
+ gistMakeUnionKey(giststate, attno, &entryL, false, &entrySL, false,
+ &sv->spl_ldatum, &tmpBool);
+
+ if (sv->spl_rdatum_exists)
+ gistMakeUnionKey(giststate, attno, &entryR, false, &entrySR, false,
+ &sv->spl_rdatum, &tmpBool);
+
+ sv->spl_ldatum_exists = sv->spl_rdatum_exists = false;
+}
+
+/*
+ * Trivial picksplit implementation. This function is called only
+ * if the user-defined picksplit puts all keys on the same side of the split.
+ * That is a bug in the user-defined picksplit, but we don't want to fail.
+ */
+static void
+genericPickSplit(GISTSTATE *giststate, GistEntryVector *entryvec, GIST_SPLITVEC *v, int attno)
+{
+ OffsetNumber i,
+ maxoff;
+ int nbytes;
+ GistEntryVector *evec;
+
+ maxoff = entryvec->n - 1;
+
+ nbytes = (maxoff + 2) * sizeof(OffsetNumber);
+
+ v->spl_left = (OffsetNumber *) palloc(nbytes);
+ v->spl_right = (OffsetNumber *) palloc(nbytes);
+ v->spl_nleft = v->spl_nright = 0;
+
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ if (i <= (maxoff - FirstOffsetNumber + 1) / 2)
+ {
+ v->spl_left[v->spl_nleft] = i;
+ v->spl_nleft++;
+ }
+ else
+ {
+ v->spl_right[v->spl_nright] = i;
+ v->spl_nright++;
+ }
+ }
+
+ /*
+ * Form union datums for each side
+ */
+ evec = palloc(sizeof(GISTENTRY) * entryvec->n + GEVHDRSZ);
+
+ evec->n = v->spl_nleft;
+ memcpy(evec->vector, entryvec->vector + FirstOffsetNumber,
+ sizeof(GISTENTRY) * evec->n);
+ v->spl_ldatum = FunctionCall2Coll(&giststate->unionFn[attno],
+ giststate->supportCollation[attno],
+ PointerGetDatum(evec),
+ PointerGetDatum(&nbytes));
+
+ evec->n = v->spl_nright;
+ memcpy(evec->vector, entryvec->vector + FirstOffsetNumber + v->spl_nleft,
+ sizeof(GISTENTRY) * evec->n);
+ v->spl_rdatum = FunctionCall2Coll(&giststate->unionFn[attno],
+ giststate->supportCollation[attno],
+ PointerGetDatum(evec),
+ PointerGetDatum(&nbytes));
+}
+
+/*
+ * Calls user picksplit method for attno column to split tuples into
+ * two vectors.
+ *
+ * Returns false if split is complete (there are no more index columns, or
+ * there is no need to consider them because split is optimal already).
+ *
+ * Returns true and v->spl_dontcare = NULL if the picksplit result is
+ * degenerate (all tuples seem to be don't-cares), so we should just
+ * disregard this column and split on the next column(s) instead.
+ *
+ * Returns true and v->spl_dontcare != NULL if there are don't-care tuples
+ * that could be relocated based on the next column(s). The don't-care
+ * tuples have been removed from the split and must be reinserted by caller.
+ * There is at least one non-don't-care tuple on each side of the split,
+ * and union keys for all columns are updated to include just those tuples.
+ *
+ * A true result implies there is at least one more index column.
+ */
+static bool
+gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVector *v,
+ IndexTuple *itup, int len, GISTSTATE *giststate)
+{
+ GIST_SPLITVEC *sv = &v->splitVector;
+
+ /*
+ * Prepare spl_ldatum/spl_rdatum/spl_ldatum_exists/spl_rdatum_exists in
+ * case we are doing a secondary split (see comments in gist.h).
+ */
+ sv->spl_ldatum_exists = (v->spl_lisnull[attno]) ? false : true;
+ sv->spl_rdatum_exists = (v->spl_risnull[attno]) ? false : true;
+ sv->spl_ldatum = v->spl_lattr[attno];
+ sv->spl_rdatum = v->spl_rattr[attno];
+
+ /*
+ * Let the opclass-specific PickSplit method do its thing. Note that at
+ * this point we know there are no null keys in the entryvec.
+ */
+ FunctionCall2Coll(&giststate->picksplitFn[attno],
+ giststate->supportCollation[attno],
+ PointerGetDatum(entryvec),
+ PointerGetDatum(sv));
+
+ if (sv->spl_nleft == 0 || sv->spl_nright == 0)
+ {
+ /*
+ * User-defined picksplit failed to create an actual split, ie it put
+ * everything on the same side. Complain but cope.
+ */
+ ereport(DEBUG1,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("picksplit method for column %d of index \"%s\" failed",
+ attno + 1, RelationGetRelationName(r)),
+ errhint("The index is not optimal. To optimize it, contact a developer, or try to use the column as the second one in the CREATE INDEX command.")));
+
+ /*
+ * Reinit GIST_SPLITVEC. Although these fields are not used by
+ * genericPickSplit(), set them up for further processing
+ */
+ sv->spl_ldatum_exists = (v->spl_lisnull[attno]) ? false : true;
+ sv->spl_rdatum_exists = (v->spl_risnull[attno]) ? false : true;
+ sv->spl_ldatum = v->spl_lattr[attno];
+ sv->spl_rdatum = v->spl_rattr[attno];
+
+ /* Do a generic split */
+ genericPickSplit(giststate, entryvec, sv, attno);
+ }
+ else
+ {
+ /* hack for compatibility with old picksplit API */
+ if (sv->spl_left[sv->spl_nleft - 1] == InvalidOffsetNumber)
+ sv->spl_left[sv->spl_nleft - 1] = (OffsetNumber) (entryvec->n - 1);
+ if (sv->spl_right[sv->spl_nright - 1] == InvalidOffsetNumber)
+ sv->spl_right[sv->spl_nright - 1] = (OffsetNumber) (entryvec->n - 1);
+ }
+
+ /* Clean up if PickSplit didn't take care of a secondary split */
+ if (sv->spl_ldatum_exists || sv->spl_rdatum_exists)
+ supportSecondarySplit(r, giststate, attno, sv,
+ v->spl_lattr[attno], v->spl_rattr[attno]);
+
+ /* emit union datums computed by PickSplit back to v arrays */
+ v->spl_lattr[attno] = sv->spl_ldatum;
+ v->spl_rattr[attno] = sv->spl_rdatum;
+ v->spl_lisnull[attno] = false;
+ v->spl_risnull[attno] = false;
+
+ /*
+ * If index columns remain, then consider whether we can improve the split
+ * by using them.
+ */
+ v->spl_dontcare = NULL;
+
+ if (attno + 1 < giststate->nonLeafTupdesc->natts)
+ {
+ int NumDontCare;
+
+ /*
+ * Make a quick check to see if left and right union keys are equal;
+ * if so, the split is certainly degenerate, so tell caller to
+ * re-split with the next column.
+ */
+ if (gistKeyIsEQ(giststate, attno, sv->spl_ldatum, sv->spl_rdatum))
+ return true;
+
+ /*
+ * Locate don't-care tuples, if any. If there are none, the split is
+ * optimal, so just fall out and return false.
+ */
+ v->spl_dontcare = (bool *) palloc0(sizeof(bool) * (entryvec->n + 1));
+
+ NumDontCare = findDontCares(r, giststate, entryvec->vector, v, attno);
+
+ if (NumDontCare > 0)
+ {
+ /*
+ * Remove don't-cares from spl_left[] and spl_right[].
+ */
+ removeDontCares(sv->spl_left, &sv->spl_nleft, v->spl_dontcare);
+ removeDontCares(sv->spl_right, &sv->spl_nright, v->spl_dontcare);
+
+ /*
+ * If all tuples on either side were don't-cares, the split is
+ * degenerate, and we're best off to ignore it and split on the
+ * next column. (We used to try to press on with a secondary
+ * split by forcing a random tuple on each side to be treated as
+ * non-don't-care, but it seems unlikely that that technique
+ * really gives a better result. Note that we don't want to try a
+ * secondary split with empty left or right primary split sides,
+ * because then there is no union key on that side for the
+ * PickSplit function to try to expand, so it can have no good
+ * figure of merit for what it's doing. Also note that this check
+ * ensures we can't produce a bogus one-side-only split in the
+ * NumDontCare == 1 special case below.)
+ */
+ if (sv->spl_nleft == 0 || sv->spl_nright == 0)
+ {
+ v->spl_dontcare = NULL;
+ return true;
+ }
+
+ /*
+ * Recompute union keys, considering only non-don't-care tuples.
+ * NOTE: this will set union keys for remaining index columns,
+ * which will cause later calls of gistUserPicksplit to pass those
+ * values down to user-defined PickSplit methods with
+ * spl_ldatum_exists/spl_rdatum_exists set true.
+ */
+ gistunionsubkey(giststate, itup, v);
+
+ if (NumDontCare == 1)
+ {
+ /*
+ * If there's only one don't-care tuple then we can't do a
+ * PickSplit on it, so just choose whether to send it left or
+ * right by comparing penalties. We needed the
+ * gistunionsubkey step anyway so that we have appropriate
+ * union keys for figuring the penalties.
+ */
+ OffsetNumber toMove;
+
+ /* find it ... */
+ for (toMove = FirstOffsetNumber; toMove < entryvec->n; toMove++)
+ {
+ if (v->spl_dontcare[toMove])
+ break;
+ }
+ Assert(toMove < entryvec->n);
+
+ /* ... and assign it to cheaper side */
+ placeOne(r, giststate, v, itup[toMove - 1], toMove, attno + 1);
+
+ /*
+ * At this point the union keys are wrong, but we don't care
+ * because we're done splitting. The outermost recursion
+ * level of gistSplitByKey will fix things before returning.
+ */
+ }
+ else
+ return true;
+ }
+ }
+
+ return false;
+}
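
A tiny standalone sketch of the "don't care" notion used above (not PostgreSQL code), with one-dimensional interval keys and a penalty defined as the enlargement of the union key: a tuple assigned to one side is a don't-care if adding it to the other side's union costs nothing, so a later column can decide where it finally goes.

#include <stdio.h>

typedef struct { double lo, hi; } Key;

/* penalty = how much the union key u must grow to absorb k */
static double penalty(Key u, Key k)
{
    double lo = k.lo < u.lo ? k.lo : u.lo;
    double hi = k.hi > u.hi ? k.hi : u.hi;

    return (hi - lo) - (u.hi - u.lo);
}

int main(void)
{
    /* union keys of the two sides after splitting on the first column */
    Key leftUnion = {0, 5};
    Key rightUnion = {4, 10};

    /* a tuple currently on the left whose key already fits inside the
     * right side's union: zero penalty to move it, hence a "don't care" */
    Key tuple = {4, 5};

    printf("penalty to move right = %g -> don't care = %d\n",
           penalty(rightUnion, tuple),
           penalty(rightUnion, tuple) == 0.0);
    return 0;
}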
+
+/*
+ * simply split page in half
+ */
+static void
+gistSplitHalf(GIST_SPLITVEC *v, int len)
+{
+ int i;
+
+ v->spl_nright = v->spl_nleft = 0;
+ v->spl_left = (OffsetNumber *) palloc(len * sizeof(OffsetNumber));
+ v->spl_right = (OffsetNumber *) palloc(len * sizeof(OffsetNumber));
+ for (i = 1; i <= len; i++)
+ if (i < len / 2)
+ v->spl_right[v->spl_nright++] = i;
+ else
+ v->spl_left[v->spl_nleft++] = i;
+
+ /* we need not compute union keys, caller took care of it */
+}
+
+/*
+ * gistSplitByKey: main entry point for page-splitting algorithm
+ *
+ * r: index relation
+ * page: page being split
+ * itup: array of IndexTuples to be processed
+ * len: number of IndexTuples to be processed (must be at least 2)
+ * giststate: additional info about index
+ * v: working state and output area
+ * attno: column we are working on (zero-based index)
+ *
+ * Outside caller must initialize v->spl_lisnull and v->spl_risnull arrays
+ * to all-true. On return, spl_left/spl_nleft contain indexes of tuples
+ * to go left, spl_right/spl_nright contain indexes of tuples to go right,
+ * spl_lattr/spl_lisnull contain left-side union key values, and
+ * spl_rattr/spl_risnull contain right-side union key values. Other fields
+ * in this struct are workspace for this file.
+ *
+ * Outside caller must pass zero for attno. The function may internally
+ * recurse to the next column by passing attno+1.
+ */
+void
+gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len,
+ GISTSTATE *giststate, GistSplitVector *v, int attno)
+{
+ GistEntryVector *entryvec;
+ OffsetNumber *offNullTuples;
+ int nOffNullTuples = 0;
+ int i;
+
+ /* generate the item array, and identify tuples with null keys */
+ /* note that entryvec->vector[0] goes unused in this code */
+ entryvec = palloc(GEVHDRSZ + (len + 1) * sizeof(GISTENTRY));
+ entryvec->n = len + 1;
+ offNullTuples = (OffsetNumber *) palloc(len * sizeof(OffsetNumber));
+
+ for (i = 1; i <= len; i++)
+ {
+ Datum datum;
+ bool IsNull;
+
+ datum = index_getattr(itup[i - 1], attno + 1, giststate->leafTupdesc,
+ &IsNull);
+ gistdentryinit(giststate, attno, &(entryvec->vector[i]),
+ datum, r, page, i,
+ false, IsNull);
+ if (IsNull)
+ offNullTuples[nOffNullTuples++] = i;
+ }
+
+ if (nOffNullTuples == len)
+ {
+ /*
+ * Corner case: All keys in attno column are null, so just transfer
+ * our attention to the next column. If there's no next column, just
+ * split page in half.
+ */
+ v->spl_risnull[attno] = v->spl_lisnull[attno] = true;
+
+ if (attno + 1 < giststate->nonLeafTupdesc->natts)
+ gistSplitByKey(r, page, itup, len, giststate, v, attno + 1);
+ else
+ gistSplitHalf(&v->splitVector, len);
+ }
+ else if (nOffNullTuples > 0)
+ {
+ int j = 0;
+
+ /*
+ * We don't want to mix NULL and not-NULL keys on one page, so split
+ * nulls to right page and not-nulls to left.
+ */
+ v->splitVector.spl_right = offNullTuples;
+ v->splitVector.spl_nright = nOffNullTuples;
+ v->spl_risnull[attno] = true;
+
+ v->splitVector.spl_left = (OffsetNumber *) palloc(len * sizeof(OffsetNumber));
+ v->splitVector.spl_nleft = 0;
+ for (i = 1; i <= len; i++)
+ if (j < v->splitVector.spl_nright && offNullTuples[j] == i)
+ j++;
+ else
+ v->splitVector.spl_left[v->splitVector.spl_nleft++] = i;
+
+ /* Compute union keys, unless outer recursion level will handle it */
+ if (attno == 0 && giststate->nonLeafTupdesc->natts == 1)
+ {
+ v->spl_dontcare = NULL;
+ gistunionsubkey(giststate, itup, v);
+ }
+ }
+ else
+ {
+ /*
+ * All keys are not-null, so apply user-defined PickSplit method
+ */
+ if (gistUserPicksplit(r, entryvec, attno, v, itup, len, giststate))
+ {
+ /*
+ * Splitting on attno column is not optimal, so consider
+ * redistributing don't-care tuples according to the next column
+ */
+ Assert(attno + 1 < giststate->nonLeafTupdesc->natts);
+
+ if (v->spl_dontcare == NULL)
+ {
+ /*
+ * This split was actually degenerate, so ignore it altogether
+ * and just split according to the next column.
+ */
+ gistSplitByKey(r, page, itup, len, giststate, v, attno + 1);
+ }
+ else
+ {
+ /*
+ * Form an array of just the don't-care tuples to pass to a
+ * recursive invocation of this function for the next column.
+ */
+ IndexTuple *newitup = (IndexTuple *) palloc(len * sizeof(IndexTuple));
+ OffsetNumber *map = (OffsetNumber *) palloc(len * sizeof(OffsetNumber));
+ int newlen = 0;
+ GIST_SPLITVEC backupSplit;
+
+ for (i = 0; i < len; i++)
+ {
+ if (v->spl_dontcare[i + 1])
+ {
+ newitup[newlen] = itup[i];
+ map[newlen] = i + 1;
+ newlen++;
+ }
+ }
+
+ Assert(newlen > 0);
+
+ /*
+ * Make a backup copy of v->splitVector, since the recursive
+ * call will overwrite that with its own result.
+ */
+ backupSplit = v->splitVector;
+ backupSplit.spl_left = (OffsetNumber *) palloc(sizeof(OffsetNumber) * len);
+ memcpy(backupSplit.spl_left, v->splitVector.spl_left, sizeof(OffsetNumber) * v->splitVector.spl_nleft);
+ backupSplit.spl_right = (OffsetNumber *) palloc(sizeof(OffsetNumber) * len);
+ memcpy(backupSplit.spl_right, v->splitVector.spl_right, sizeof(OffsetNumber) * v->splitVector.spl_nright);
+
+ /* Recursively decide how to split the don't-care tuples */
+ gistSplitByKey(r, page, newitup, newlen, giststate, v, attno + 1);
+
+ /* Merge result of subsplit with non-don't-care tuples */
+ for (i = 0; i < v->splitVector.spl_nleft; i++)
+ backupSplit.spl_left[backupSplit.spl_nleft++] = map[v->splitVector.spl_left[i] - 1];
+ for (i = 0; i < v->splitVector.spl_nright; i++)
+ backupSplit.spl_right[backupSplit.spl_nright++] = map[v->splitVector.spl_right[i] - 1];
+
+ v->splitVector = backupSplit;
+ }
+ }
+ }
+
+ /*
+ * If we're handling a multicolumn index, at the end of the recursion
+ * recompute the left and right union datums for all index columns. This
+ * makes sure we hand back correct union datums in all corner cases,
+ * including when we haven't processed all columns to start with, or when
+ * a secondary split moved "don't care" tuples from one side to the other
+ * (we really shouldn't assume that that didn't change the union datums).
+ *
+ * Note: when we're in an internal recursion (attno > 0), we do not worry
+ * about whether the union datums we return with are sensible, since
+ * calling levels won't care. Also, in a single-column index, we expect
+ * that PickSplit (or the special cases above) produced correct union
+ * datums.
+ */
+ if (attno == 0 && giststate->nonLeafTupdesc->natts > 1)
+ {
+ v->spl_dontcare = NULL;
+ gistunionsubkey(giststate, itup, v);
+ }
+}
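+
+/*
+ * Editorial illustration -- not part of the original file.  A minimal sketch
+ * of the calling convention described in the header comment above: the
+ * caller pre-sets both isnull arrays to all-true and always starts at column
+ * zero.  The real caller is gistSplit() in gist.c; the function name below is
+ * hypothetical.
+ */
+#ifdef NOT_USED
+static void
+example_call_gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len,
+                            GISTSTATE *giststate)
+{
+ GistSplitVector v;
+ int i;
+
+ memset(&v, 0, sizeof(GistSplitVector));
+ /* every column starts out NULL on both sides */
+ for (i = 0; i < giststate->nonLeafTupdesc->natts; i++)
+ v.spl_lisnull[i] = v.spl_risnull[i] = true;
+
+ gistSplitByKey(r, page, itup, len, giststate, &v, 0);
+
+ /* v.splitVector now holds the tuple offsets assigned to each half */
+}
+#endif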
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
new file mode 100644
index 0000000..765329b
--- /dev/null
+++ b/src/backend/access/gist/gistutil.c
@@ -0,0 +1,1054 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistutil.c
+ * utility routines for the postgres GiST index access method.
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistutil.c
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/gist_private.h"
+#include "access/htup_details.h"
+#include "access/reloptions.h"
+#include "catalog/pg_opclass.h"
+#include "storage/indexfsm.h"
+#include "storage/lmgr.h"
+#include "utils/float.h"
+#include "utils/lsyscache.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+
+/*
+ * Write an itup vector to a page; performs no free-space checking.
+ */
+void
+gistfillbuffer(Page page, IndexTuple *itup, int len, OffsetNumber off)
+{
+ OffsetNumber l = InvalidOffsetNumber;
+ int i;
+
+ if (off == InvalidOffsetNumber)
+ off = (PageIsEmpty(page)) ? FirstOffsetNumber :
+ OffsetNumberNext(PageGetMaxOffsetNumber(page));
+
+ for (i = 0; i < len; i++)
+ {
+ Size sz = IndexTupleSize(itup[i]);
+
+ l = PageAddItem(page, (Item) itup[i], sz, off, false, false);
+ if (l == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to GiST index page, item %d out of %d, size %d bytes",
+ i, len, (int) sz);
+ off++;
+ }
+}
+
+/*
+ * Check space for itup vector on page
+ */
+bool
+gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace)
+{
+ unsigned int size = freespace,
+ deleted = 0;
+ int i;
+
+ for (i = 0; i < len; i++)
+ size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData);
+
+ if (todelete != InvalidOffsetNumber)
+ {
+ IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, todelete));
+
+ deleted = IndexTupleSize(itup) + sizeof(ItemIdData);
+ }
+
+ return (PageGetFreeSpace(page) + deleted < size);
+}
+
+bool
+gistfitpage(IndexTuple *itvec, int len)
+{
+ int i;
+ Size size = 0;
+
+ for (i = 0; i < len; i++)
+ size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData);
+
+ /* TODO: Consider fillfactor */
+ return (size <= GiSTPageSize);
+}
+
+/*
+ * Read buffer into itup vector
+ */
+IndexTuple *
+gistextractpage(Page page, int *len /* out */ )
+{
+ OffsetNumber i,
+ maxoff;
+ IndexTuple *itvec;
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ *len = maxoff;
+ itvec = palloc(sizeof(IndexTuple) * maxoff);
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ itvec[i - FirstOffsetNumber] = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
+
+ return itvec;
+}
+
+/*
+ * join two vectors into one
+ */
+IndexTuple *
+gistjoinvector(IndexTuple *itvec, int *len, IndexTuple *additvec, int addlen)
+{
+ itvec = (IndexTuple *) repalloc((void *) itvec, sizeof(IndexTuple) * ((*len) + addlen));
+ memmove(&itvec[*len], additvec, sizeof(IndexTuple) * addlen);
+ *len += addlen;
+ return itvec;
+}
+
+/*
+ * make plain IndexTuple vector
+ */
+
+IndexTupleData *
+gistfillitupvec(IndexTuple *vec, int veclen, int *memlen)
+{
+ char *ptr,
+ *ret;
+ int i;
+
+ *memlen = 0;
+
+ for (i = 0; i < veclen; i++)
+ *memlen += IndexTupleSize(vec[i]);
+
+ ptr = ret = palloc(*memlen);
+
+ for (i = 0; i < veclen; i++)
+ {
+ memcpy(ptr, vec[i], IndexTupleSize(vec[i]));
+ ptr += IndexTupleSize(vec[i]);
+ }
+
+ return (IndexTupleData *) ret;
+}
+
+/*
+ * Make unions of keys in IndexTuple vector (one union datum per index column).
+ * Union Datums are returned into the attr/isnull arrays.
+ * Resulting Datums aren't compressed.
+ */
+void
+gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len,
+ Datum *attr, bool *isnull)
+{
+ int i;
+ GistEntryVector *evec;
+ int attrsize;
+
+ evec = (GistEntryVector *) palloc((len + 2) * sizeof(GISTENTRY) + GEVHDRSZ);
+
+ for (i = 0; i < giststate->nonLeafTupdesc->natts; i++)
+ {
+ int j;
+
+ /* Collect non-null datums for this column */
+ evec->n = 0;
+ for (j = 0; j < len; j++)
+ {
+ Datum datum;
+ bool IsNull;
+
+ datum = index_getattr(itvec[j], i + 1, giststate->leafTupdesc,
+ &IsNull);
+ if (IsNull)
+ continue;
+
+ gistdentryinit(giststate, i,
+ evec->vector + evec->n,
+ datum,
+ NULL, NULL, (OffsetNumber) 0,
+ false, IsNull);
+ evec->n++;
+ }
+
+ /* If this column was all NULLs, the union is NULL */
+ if (evec->n == 0)
+ {
+ attr[i] = (Datum) 0;
+ isnull[i] = true;
+ }
+ else
+ {
+ if (evec->n == 1)
+ {
+ /* unionFn may expect at least two inputs */
+ evec->n = 2;
+ evec->vector[1] = evec->vector[0];
+ }
+
+ /* Make union and store in attr array */
+ attr[i] = FunctionCall2Coll(&giststate->unionFn[i],
+ giststate->supportCollation[i],
+ PointerGetDatum(evec),
+ PointerGetDatum(&attrsize));
+
+ isnull[i] = false;
+ }
+ }
+}
+
+/*
+ * Return an IndexTuple containing the result of applying the "union"
+ * method to the specified IndexTuple vector.
+ */
+IndexTuple
+gistunion(Relation r, IndexTuple *itvec, int len, GISTSTATE *giststate)
+{
+ Datum attr[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+
+ gistMakeUnionItVec(giststate, itvec, len, attr, isnull);
+
+ return gistFormTuple(giststate, r, attr, isnull, false);
+}
+
+/*
+ * makes the union of two keys
+ */
+void
+gistMakeUnionKey(GISTSTATE *giststate, int attno,
+ GISTENTRY *entry1, bool isnull1,
+ GISTENTRY *entry2, bool isnull2,
+ Datum *dst, bool *dstisnull)
+{
+ /* we need a GistEntryVector with room for exactly 2 elements */
+ union
+ {
+ GistEntryVector gev;
+ char padding[2 * sizeof(GISTENTRY) + GEVHDRSZ];
+ } storage;
+ GistEntryVector *evec = &storage.gev;
+ int dstsize;
+
+ evec->n = 2;
+
+ if (isnull1 && isnull2)
+ {
+ *dstisnull = true;
+ *dst = (Datum) 0;
+ }
+ else
+ {
+ if (isnull1 == false && isnull2 == false)
+ {
+ evec->vector[0] = *entry1;
+ evec->vector[1] = *entry2;
+ }
+ else if (isnull1 == false)
+ {
+ evec->vector[0] = *entry1;
+ evec->vector[1] = *entry1;
+ }
+ else
+ {
+ evec->vector[0] = *entry2;
+ evec->vector[1] = *entry2;
+ }
+
+ *dstisnull = false;
+ *dst = FunctionCall2Coll(&giststate->unionFn[attno],
+ giststate->supportCollation[attno],
+ PointerGetDatum(evec),
+ PointerGetDatum(&dstsize));
+ }
+}
+
+bool
+gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b)
+{
+ bool result;
+
+ FunctionCall3Coll(&giststate->equalFn[attno],
+ giststate->supportCollation[attno],
+ a, b,
+ PointerGetDatum(&result));
+ return result;
+}
+
+/*
+ * Decompress all keys in tuple
+ */
+void
+gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p,
+ OffsetNumber o, GISTENTRY *attdata, bool *isnull)
+{
+ int i;
+
+ for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++)
+ {
+ Datum datum;
+
+ datum = index_getattr(tuple, i + 1, giststate->leafTupdesc, &isnull[i]);
+ gistdentryinit(giststate, i, &attdata[i],
+ datum, r, p, o,
+ false, isnull[i]);
+ }
+}
+
+/*
+ * Forms the union of oldtup and addtup; if the union equals oldtup, returns NULL
+ */
+IndexTuple
+gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *giststate)
+{
+ bool neednew = false;
+ GISTENTRY oldentries[INDEX_MAX_KEYS],
+ addentries[INDEX_MAX_KEYS];
+ bool oldisnull[INDEX_MAX_KEYS],
+ addisnull[INDEX_MAX_KEYS];
+ Datum attr[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ IndexTuple newtup = NULL;
+ int i;
+
+ gistDeCompressAtt(giststate, r, oldtup, NULL,
+ (OffsetNumber) 0, oldentries, oldisnull);
+
+ gistDeCompressAtt(giststate, r, addtup, NULL,
+ (OffsetNumber) 0, addentries, addisnull);
+
+ for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++)
+ {
+ gistMakeUnionKey(giststate, i,
+ oldentries + i, oldisnull[i],
+ addentries + i, addisnull[i],
+ attr + i, isnull + i);
+
+ if (neednew)
+ /* we already need new key, so we can skip check */
+ continue;
+
+ if (isnull[i])
+ /* union of key may be NULL if and only if both keys are NULL */
+ continue;
+
+ if (!addisnull[i])
+ {
+ if (oldisnull[i] ||
+ !gistKeyIsEQ(giststate, i, oldentries[i].key, attr[i]))
+ neednew = true;
+ }
+ }
+
+ if (neednew)
+ {
+ /* need to update key */
+ newtup = gistFormTuple(giststate, r, attr, isnull, false);
+ newtup->t_tid = oldtup->t_tid;
+ }
+
+ return newtup;
+}
+
+/*
+ * Search an upper index page for the entry with lowest penalty for insertion
+ * of the new index key contained in "it".
+ *
+ * Returns the index of the page entry to insert into.
+ */
+OffsetNumber
+gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */
+ GISTSTATE *giststate)
+{
+ OffsetNumber result;
+ OffsetNumber maxoff;
+ OffsetNumber i;
+ float best_penalty[INDEX_MAX_KEYS];
+ GISTENTRY entry,
+ identry[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ int keep_current_best;
+
+ Assert(!GistPageIsLeaf(p));
+
+ gistDeCompressAtt(giststate, r,
+ it, NULL, (OffsetNumber) 0,
+ identry, isnull);
+
+ /* we'll return FirstOffsetNumber if page is empty (shouldn't happen) */
+ result = FirstOffsetNumber;
+
+ /*
+ * The index may have multiple columns, and there's a penalty value for
+ * each column. The penalty associated with a column that appears earlier
+ * in the index definition is strictly more important than the penalty of
+ * a column that appears later in the index definition.
+ *
+ * best_penalty[j] is the best penalty we have seen so far for column j,
+ * or -1 when we haven't yet examined column j. Array entries to the
+ * right of the first -1 are undefined.
+ */
+ best_penalty[0] = -1;
+
+ /*
+ * If we find a tuple that's exactly as good as the currently best one, we
+ * could use either one. When inserting a lot of tuples with the same or
+ * similar keys, it's preferable to descend down the same path when
+ * possible, as that's more cache-friendly. On the other hand, if all
+ * inserts land on the same leaf page after a split, we're never going to
+ * insert anything to the other half of the split, and will end up using
+ * only 50% of the available space. Distributing the inserts evenly would
+ * lead to better space usage, but that hurts cache-locality during
+ * insertion. To get the best of both worlds, when we find a tuple that's
+ * exactly as good as the previous best, choose randomly whether to stick
+ * to the old best, or use the new one. Once we decide to stick to the
+ * old best, we keep sticking to it for any subsequent equally good tuples
+ * we might find. This favors tuples with low offsets, but still allows
+ * some inserts to go to other equally-good subtrees.
+ *
+ * keep_current_best is -1 if we haven't yet had to make a random choice
+ * whether to keep the current best tuple. If we have done so, and
+ * decided to keep it, keep_current_best is 1; if we've decided to
+ * replace, keep_current_best is 0. (This state will be reset to -1 as
+ * soon as we've made the replacement, but sometimes we make the choice in
+ * advance of actually finding a replacement best tuple.)
+ */
+ keep_current_best = -1;
+
+ /*
+ * Loop over tuples on page.
+ */
+ maxoff = PageGetMaxOffsetNumber(p);
+ Assert(maxoff >= FirstOffsetNumber);
+
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ IndexTuple itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i));
+ bool zero_penalty;
+ int j;
+
+ zero_penalty = true;
+
+ /* Loop over index attributes. */
+ for (j = 0; j < IndexRelationGetNumberOfKeyAttributes(r); j++)
+ {
+ Datum datum;
+ float usize;
+ bool IsNull;
+
+ /* Compute penalty for this column. */
+ datum = index_getattr(itup, j + 1, giststate->leafTupdesc,
+ &IsNull);
+ gistdentryinit(giststate, j, &entry, datum, r, p, i,
+ false, IsNull);
+ usize = gistpenalty(giststate, j, &entry, IsNull,
+ &identry[j], isnull[j]);
+ if (usize > 0)
+ zero_penalty = false;
+
+ if (best_penalty[j] < 0 || usize < best_penalty[j])
+ {
+ /*
+ * New best penalty for column. Tentatively select this tuple
+ * as the target, and record the best penalty. Then reset the
+ * next column's penalty to "unknown" (and indirectly, the
+ * same for all the ones to its right). This will force us to
+ * adopt this tuple's penalty values as the best for all the
+ * remaining columns during subsequent loop iterations.
+ */
+ result = i;
+ best_penalty[j] = usize;
+
+ if (j < IndexRelationGetNumberOfKeyAttributes(r) - 1)
+ best_penalty[j + 1] = -1;
+
+ /* we have new best, so reset keep-it decision */
+ keep_current_best = -1;
+ }
+ else if (best_penalty[j] == usize)
+ {
+ /*
+ * The current tuple is exactly as good for this column as the
+ * best tuple seen so far. The next iteration of this loop
+ * will compare the next column.
+ */
+ }
+ else
+ {
+ /*
+ * The current tuple is worse for this column than the best
+ * tuple seen so far. Skip the remaining columns and move on
+ * to the next tuple, if any.
+ */
+ zero_penalty = false; /* so outer loop won't exit */
+ break;
+ }
+ }
+
+ /*
+ * If we looped past the last column, and did not update "result",
+ * then this tuple is exactly as good as the prior best tuple.
+ */
+ if (j == IndexRelationGetNumberOfKeyAttributes(r) && result != i)
+ {
+ if (keep_current_best == -1)
+ {
+ /* we didn't make the random choice yet for this old best */
+ keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0;
+ }
+ if (keep_current_best == 0)
+ {
+ /* we choose to use the new tuple */
+ result = i;
+ /* choose again if there are even more exactly-as-good ones */
+ keep_current_best = -1;
+ }
+ }
+
+ /*
+ * If we find a tuple with zero penalty for all columns, and we've
+ * decided we don't want to search for another tuple with equal
+ * penalty, there's no need to examine remaining tuples; just break
+ * out of the loop and return it.
+ */
+ if (zero_penalty)
+ {
+ if (keep_current_best == -1)
+ {
+ /* we didn't make the random choice yet for this old best */
+ keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0;
+ }
+ if (keep_current_best == 1)
+ break;
+ }
+ }
+
+ return result;
+}
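+
+/*
+ * Editorial illustration -- not part of the original file.  The per-column
+ * penalty comparison performed above is lexicographic: an earlier column's
+ * penalty always dominates later ones.  A self-contained restatement of that
+ * ordering rule (the helper name is hypothetical):
+ */
+#ifdef NOT_USED
+/* returns <0 / 0 / >0 if penalty vector a is better / equal / worse than b */
+static int
+example_compare_penalty_vectors(const float *a, const float *b, int ncolumns)
+{
+ int j;
+
+ for (j = 0; j < ncolumns; j++)
+ {
+ if (a[j] < b[j])
+ return -1;
+ if (a[j] > b[j])
+ return 1;
+ /* equal on this column: look at the next, less significant one */
+ }
+ return 0; /* exactly as good; gistchoose breaks such ties randomly */
+}
+#endif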
+
+/*
+ * initialize a GiST entry with a decompressed version of key
+ */
+void
+gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e,
+ Datum k, Relation r, Page pg, OffsetNumber o,
+ bool l, bool isNull)
+{
+ if (!isNull)
+ {
+ GISTENTRY *dep;
+
+ gistentryinit(*e, k, r, pg, o, l);
+
+ /* there may not be a decompress function in opclass */
+ if (!OidIsValid(giststate->decompressFn[nkey].fn_oid))
+ return;
+
+ dep = (GISTENTRY *)
+ DatumGetPointer(FunctionCall1Coll(&giststate->decompressFn[nkey],
+ giststate->supportCollation[nkey],
+ PointerGetDatum(e)));
+ /* decompressFn may just return the given pointer */
+ if (dep != e)
+ gistentryinit(*e, dep->key, dep->rel, dep->page, dep->offset,
+ dep->leafkey);
+ }
+ else
+ gistentryinit(*e, (Datum) 0, r, pg, o, l);
+}
+
+IndexTuple
+gistFormTuple(GISTSTATE *giststate, Relation r,
+ Datum attdata[], bool isnull[], bool isleaf)
+{
+ Datum compatt[INDEX_MAX_KEYS];
+ int i;
+ IndexTuple res;
+
+ /*
+ * Call the compress method on each attribute.
+ */
+ for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++)
+ {
+ if (isnull[i])
+ compatt[i] = (Datum) 0;
+ else
+ {
+ GISTENTRY centry;
+ GISTENTRY *cep;
+
+ gistentryinit(centry, attdata[i], r, NULL, (OffsetNumber) 0,
+ isleaf);
+ /* there may not be a compress function in opclass */
+ if (OidIsValid(giststate->compressFn[i].fn_oid))
+ cep = (GISTENTRY *)
+ DatumGetPointer(FunctionCall1Coll(&giststate->compressFn[i],
+ giststate->supportCollation[i],
+ PointerGetDatum(&centry)));
+ else
+ cep = &centry;
+ compatt[i] = cep->key;
+ }
+ }
+
+ if (isleaf)
+ {
+ /*
+ * Emplace each included attribute if any.
+ */
+ for (; i < r->rd_att->natts; i++)
+ {
+ if (isnull[i])
+ compatt[i] = (Datum) 0;
+ else
+ compatt[i] = attdata[i];
+ }
+ }
+
+ res = index_form_tuple(isleaf ? giststate->leafTupdesc :
+ giststate->nonLeafTupdesc,
+ compatt, isnull);
+
+ /*
+ * The offset number on tuples on internal pages is unused. For historical
+ * reasons, it is set to 0xffff.
+ */
+ ItemPointerSetOffsetNumber(&(res->t_tid), 0xffff);
+ return res;
+}
+
+/*
+ * initialize a GiST entry with fetched value in key field
+ */
+static Datum
+gistFetchAtt(GISTSTATE *giststate, int nkey, Datum k, Relation r)
+{
+ GISTENTRY fentry;
+ GISTENTRY *fep;
+
+ gistentryinit(fentry, k, r, NULL, (OffsetNumber) 0, false);
+
+ fep = (GISTENTRY *)
+ DatumGetPointer(FunctionCall1Coll(&giststate->fetchFn[nkey],
+ giststate->supportCollation[nkey],
+ PointerGetDatum(&fentry)));
+
+ /* fetchFn set 'key', return it to the caller */
+ return fep->key;
+}
+
+/*
+ * Fetch all keys in tuple.
+ * Returns a new HeapTuple containing the originally-indexed data.
+ */
+HeapTuple
+gistFetchTuple(GISTSTATE *giststate, Relation r, IndexTuple tuple)
+{
+ MemoryContext oldcxt = MemoryContextSwitchTo(giststate->tempCxt);
+ Datum fetchatt[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ int i;
+
+ for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++)
+ {
+ Datum datum;
+
+ datum = index_getattr(tuple, i + 1, giststate->leafTupdesc, &isnull[i]);
+
+ if (giststate->fetchFn[i].fn_oid != InvalidOid)
+ {
+ if (!isnull[i])
+ fetchatt[i] = gistFetchAtt(giststate, i, datum, r);
+ else
+ fetchatt[i] = (Datum) 0;
+ }
+ else if (giststate->compressFn[i].fn_oid == InvalidOid)
+ {
+ /*
+ * If the opclass does not provide a compress method that could
+ * change the original value, the attribute is necessarily stored
+ * in its original form.
+ */
+ if (!isnull[i])
+ fetchatt[i] = datum;
+ else
+ fetchatt[i] = (Datum) 0;
+ }
+ else
+ {
+ /*
+ * Index-only scans not supported for this column. Since the
+ * planner chose an index-only scan anyway, it is not interested
+ * in this column, and we can replace it with a NULL.
+ */
+ isnull[i] = true;
+ fetchatt[i] = (Datum) 0;
+ }
+ }
+
+ /*
+ * Get each included attribute.
+ */
+ for (; i < r->rd_att->natts; i++)
+ {
+ fetchatt[i] = index_getattr(tuple, i + 1, giststate->leafTupdesc,
+ &isnull[i]);
+ }
+ MemoryContextSwitchTo(oldcxt);
+
+ return heap_form_tuple(giststate->fetchTupdesc, fetchatt, isnull);
+}
+
+float
+gistpenalty(GISTSTATE *giststate, int attno,
+ GISTENTRY *orig, bool isNullOrig,
+ GISTENTRY *add, bool isNullAdd)
+{
+ float penalty = 0.0;
+
+ if (giststate->penaltyFn[attno].fn_strict == false ||
+ (isNullOrig == false && isNullAdd == false))
+ {
+ FunctionCall3Coll(&giststate->penaltyFn[attno],
+ giststate->supportCollation[attno],
+ PointerGetDatum(orig),
+ PointerGetDatum(add),
+ PointerGetDatum(&penalty));
+ /* disallow negative or NaN penalty */
+ if (isnan(penalty) || penalty < 0.0)
+ penalty = 0.0;
+ }
+ else if (isNullOrig && isNullAdd)
+ penalty = 0.0;
+ else
+ {
+ /* try to prevent mixing null and non-null values */
+ penalty = get_float4_infinity();
+ }
+
+ return penalty;
+}
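+
+/*
+ * Editorial summary -- not part of the original file.  With a strict penalty
+ * function, gistpenalty() above behaves as follows:
+ *
+ *   both keys NULL        -> 0.0
+ *   exactly one key NULL  -> +infinity (discourages mixing NULL and non-NULL)
+ *   neither key NULL      -> opclass penalty, clamped to >= 0 and non-NaN
+ *
+ * A non-strict penalty function is invoked even when a key is NULL.
+ */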
+
+/*
+ * Initialize a new index page
+ */
+void
+GISTInitBuffer(Buffer b, uint32 f)
+{
+ GISTPageOpaque opaque;
+ Page page;
+ Size pageSize;
+
+ pageSize = BufferGetPageSize(b);
+ page = BufferGetPage(b);
+ PageInit(page, pageSize, sizeof(GISTPageOpaqueData));
+
+ opaque = GistPageGetOpaque(page);
+ /* page was already zeroed by PageInit, so this is not needed: */
+ /* memset(&(opaque->nsn), 0, sizeof(GistNSN)); */
+ opaque->rightlink = InvalidBlockNumber;
+ opaque->flags = f;
+ opaque->gist_page_id = GIST_PAGE_ID;
+}
+
+/*
+ * Verify that a freshly-read page looks sane.
+ */
+void
+gistcheckpage(Relation rel, Buffer buf)
+{
+ Page page = BufferGetPage(buf);
+
+ /*
+ * ReadBuffer verifies that every newly-read page passes
+ * PageHeaderIsValid, which means it either contains a reasonably sane
+ * page header or is all-zero. We have to defend against the all-zero
+ * case, however.
+ */
+ if (PageIsNew(page))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" contains unexpected zero page at block %u",
+ RelationGetRelationName(rel),
+ BufferGetBlockNumber(buf)),
+ errhint("Please REINDEX it.")));
+
+ /*
+ * Additionally check that the special area looks sane.
+ */
+ if (PageGetSpecialSize(page) != MAXALIGN(sizeof(GISTPageOpaqueData)))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" contains corrupted page at block %u",
+ RelationGetRelationName(rel),
+ BufferGetBlockNumber(buf)),
+ errhint("Please REINDEX it.")));
+}
+
+
+/*
+ * Allocate a new page (either by recycling, or by extending the index file)
+ *
+ * The returned buffer is already pinned and exclusive-locked
+ *
+ * Caller is responsible for initializing the page by calling GISTInitBuffer
+ */
+Buffer
+gistNewBuffer(Relation r)
+{
+ Buffer buffer;
+ bool needLock;
+
+ /* First, try to get a page from FSM */
+ for (;;)
+ {
+ BlockNumber blkno = GetFreeIndexPage(r);
+
+ if (blkno == InvalidBlockNumber)
+ break; /* nothing left in FSM */
+
+ buffer = ReadBuffer(r, blkno);
+
+ /*
+ * We have to guard against the possibility that someone else already
+ * recycled this page; the buffer may be locked if so.
+ */
+ if (ConditionalLockBuffer(buffer))
+ {
+ Page page = BufferGetPage(buffer);
+
+ /*
+ * If the page was never initialized, it's OK to use.
+ */
+ if (PageIsNew(page))
+ return buffer;
+
+ gistcheckpage(r, buffer);
+
+ /*
+ * Otherwise, recycle it if deleted, and too old to have any
+ * processes interested in it.
+ */
+ if (gistPageRecyclable(page))
+ {
+ /*
+ * If we are generating WAL for Hot Standby then create a WAL
+ * record that will allow us to conflict with queries running
+ * on standby, in case they have snapshots older than the
+ * page's deleteXid.
+ */
+ if (XLogStandbyInfoActive() && RelationNeedsWAL(r))
+ gistXLogPageReuse(r, blkno, GistPageGetDeleteXid(page));
+
+ return buffer;
+ }
+
+ LockBuffer(buffer, GIST_UNLOCK);
+ }
+
+ /* Can't use it, so release buffer and try again */
+ ReleaseBuffer(buffer);
+ }
+
+ /* Must extend the file */
+ needLock = !RELATION_IS_LOCAL(r);
+
+ if (needLock)
+ LockRelationForExtension(r, ExclusiveLock);
+
+ buffer = ReadBuffer(r, P_NEW);
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+
+ if (needLock)
+ UnlockRelationForExtension(r, ExclusiveLock);
+
+ return buffer;
+}
+
+/* Can this page be recycled yet? */
+bool
+gistPageRecyclable(Page page)
+{
+ if (PageIsNew(page))
+ return true;
+ if (GistPageIsDeleted(page))
+ {
+ /*
+ * The page was deleted, but when? If it was just deleted, a scan
+ * might have seen the downlink to it, and will read the page later.
+ * As long as that can happen, we must keep the deleted page around as
+ * a tombstone.
+ *
+ * Compare the deletion XID with RecentGlobalXmin. If deleteXid <
+ * RecentGlobalXmin, then no scan that's still in progress could have
+ * seen its downlink, and we can recycle it.
+ */
+ FullTransactionId deletexid_full = GistPageGetDeleteXid(page);
+ FullTransactionId recentxmin_full = GetFullRecentGlobalXmin();
+
+ if (FullTransactionIdPrecedes(deletexid_full, recentxmin_full))
+ return true;
+ }
+ return false;
+}
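+
+/*
+ * Editorial note -- not part of the original file.  In short: a never-
+ * initialized page is always recyclable, while a deleted page becomes
+ * recyclable only once its recorded deleteXid precedes RecentGlobalXmin,
+ * i.e. once no still-running scan could follow a stale downlink to it.
+ */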
+
+bytea *
+gistoptions(Datum reloptions, bool validate)
+{
+ static const relopt_parse_elt tab[] = {
+ {"fillfactor", RELOPT_TYPE_INT, offsetof(GiSTOptions, fillfactor)},
+ {"buffering", RELOPT_TYPE_ENUM, offsetof(GiSTOptions, buffering_mode)}
+ };
+
+ return (bytea *) build_reloptions(reloptions, validate,
+ RELOPT_KIND_GIST,
+ sizeof(GiSTOptions),
+ tab, lengthof(tab));
+}
+
+/*
+ * gistproperty() -- Check boolean properties of indexes.
+ *
+ * This is optional for most AMs, but is required for GiST because the core
+ * property code doesn't support AMPROP_DISTANCE_ORDERABLE. We also handle
+ * AMPROP_RETURNABLE here to save opening the rel to call gistcanreturn.
+ */
+bool
+gistproperty(Oid index_oid, int attno,
+ IndexAMProperty prop, const char *propname,
+ bool *res, bool *isnull)
+{
+ Oid opclass,
+ opfamily,
+ opcintype;
+ int16 procno;
+
+ /* Only answer column-level inquiries */
+ if (attno == 0)
+ return false;
+
+ /*
+ * Currently, GiST distance-ordered scans require that there be a distance
+ * function in the opclass with the default types (i.e. the one loaded
+ * into the relcache entry, see initGISTstate). So we assume that if such
+ * a function exists, then there's a reason for it (rather than grubbing
+ * through all the opfamily's operators to find an ordered one).
+ *
+ * Essentially the same code can test whether we support returning the
+ * column data, since that's true if the opclass provides a fetch proc.
+ */
+
+ switch (prop)
+ {
+ case AMPROP_DISTANCE_ORDERABLE:
+ procno = GIST_DISTANCE_PROC;
+ break;
+ case AMPROP_RETURNABLE:
+ procno = GIST_FETCH_PROC;
+ break;
+ default:
+ return false;
+ }
+
+ /* First we need to know the column's opclass. */
+ opclass = get_index_column_opclass(index_oid, attno);
+ if (!OidIsValid(opclass))
+ {
+ *isnull = true;
+ return true;
+ }
+
+ /* Now look up the opclass family and input datatype. */
+ if (!get_opclass_opfamily_and_input_type(opclass, &opfamily, &opcintype))
+ {
+ *isnull = true;
+ return true;
+ }
+
+ /* And now we can check whether the function is provided. */
+
+ *res = SearchSysCacheExists4(AMPROCNUM,
+ ObjectIdGetDatum(opfamily),
+ ObjectIdGetDatum(opcintype),
+ ObjectIdGetDatum(opcintype),
+ Int16GetDatum(procno));
+
+ /*
+ * Special case: even without a fetch function, AMPROP_RETURNABLE is true
+ * if the opclass has no compress function.
+ */
+ if (prop == AMPROP_RETURNABLE && !*res)
+ {
+ *res = !SearchSysCacheExists4(AMPROCNUM,
+ ObjectIdGetDatum(opfamily),
+ ObjectIdGetDatum(opcintype),
+ ObjectIdGetDatum(opcintype),
+ Int16GetDatum(GIST_COMPRESS_PROC));
+ }
+
+ *isnull = false;
+
+ return true;
+}
+
+/*
+ * Some indexes are not WAL-logged, but we need LSNs to detect concurrent page
+ * splits anyway. This function provides a fake sequence of LSNs for that
+ * purpose.
+ */
+XLogRecPtr
+gistGetFakeLSN(Relation rel)
+{
+ if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
+ {
+ /*
+ * Temporary relations are only accessible in our session, so a simple
+ * backend-local counter will do.
+ */
+ static XLogRecPtr counter = FirstNormalUnloggedLSN;
+
+ return counter++;
+ }
+ else if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
+ {
+ /*
+ * WAL-logging on this relation will start after commit, so its LSNs
+ * must be distinct numbers smaller than the LSN at the next commit.
+ * Emit a dummy WAL record if insert-LSN hasn't advanced after the
+ * last call.
+ */
+ static XLogRecPtr lastlsn = InvalidXLogRecPtr;
+ XLogRecPtr currlsn = GetXLogInsertRecPtr();
+
+ /* Shouldn't be called for WAL-logging relations */
+ Assert(!RelationNeedsWAL(rel));
+
+ /* No need for an actual record if we already have a distinct LSN */
+ if (!XLogRecPtrIsInvalid(lastlsn) && lastlsn == currlsn)
+ currlsn = gistXLogAssignLSN();
+
+ lastlsn = currlsn;
+ return currlsn;
+ }
+ else
+ {
+ /*
+ * Unlogged relations are accessible from other backends, and survive
+ * (clean) restarts. GetFakeLSNForUnloggedRel() handles that for us.
+ */
+ Assert(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED);
+ return GetFakeLSNForUnloggedRel();
+ }
+}
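+
+/*
+ * Editorial summary -- not part of the original file.  The three branches
+ * above map relpersistence to an LSN source:
+ *
+ *   RELPERSISTENCE_TEMP       backend-local counter (session-private data)
+ *   RELPERSISTENCE_PERMANENT  current WAL insert position, advanced with a
+ *                             dummy record if it hasn't moved since last call
+ *   RELPERSISTENCE_UNLOGGED   GetFakeLSNForUnloggedRel()
+ */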
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
new file mode 100644
index 0000000..a9c616c
--- /dev/null
+++ b/src/backend/access/gist/gistvacuum.c
@@ -0,0 +1,658 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistvacuum.c
+ * vacuuming routines for the postgres GiST index access method.
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistvacuum.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/gist_private.h"
+#include "access/transam.h"
+#include "commands/vacuum.h"
+#include "lib/integerset.h"
+#include "miscadmin.h"
+#include "storage/indexfsm.h"
+#include "storage/lmgr.h"
+#include "utils/memutils.h"
+
+/* Working state needed by gistbulkdelete */
+typedef struct
+{
+ IndexVacuumInfo *info;
+ IndexBulkDeleteResult *stats;
+ IndexBulkDeleteCallback callback;
+ void *callback_state;
+ GistNSN startNSN;
+
+ /*
+ * These are used to memorize all internal pages and all empty leaf pages
+ * encountered during the scan; that information is used later to delete
+ * the empty pages.
+ */
+ IntegerSet *internal_page_set;
+ IntegerSet *empty_leaf_set;
+ MemoryContext page_set_context;
+} GistVacState;
+
+static void gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state);
+static void gistvacuumpage(GistVacState *vstate, BlockNumber blkno,
+ BlockNumber orig_blkno);
+static void gistvacuum_delete_empty_pages(IndexVacuumInfo *info,
+ GistVacState *vstate);
+static bool gistdeletepage(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ Buffer buffer, OffsetNumber downlink,
+ Buffer leafBuffer);
+
+/*
+ * VACUUM bulkdelete stage: remove index entries.
+ */
+IndexBulkDeleteResult *
+gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state)
+{
+ /* allocate stats if first time through, else re-use existing struct */
+ if (stats == NULL)
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+
+ gistvacuumscan(info, stats, callback, callback_state);
+
+ return stats;
+}
+
+/*
+ * VACUUM cleanup stage: delete empty pages, and update index statistics.
+ */
+IndexBulkDeleteResult *
+gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
+{
+ /* No-op in ANALYZE ONLY mode */
+ if (info->analyze_only)
+ return stats;
+
+ /*
+ * If gistbulkdelete was called, we need not do anything, just return the
+ * stats from the latest gistbulkdelete call. If it wasn't called, we
+ * still need to do a pass over the index, to obtain index statistics.
+ */
+ if (stats == NULL)
+ {
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+ gistvacuumscan(info, stats, NULL, NULL);
+ }
+
+ /*
+ * It's quite possible for us to be fooled by concurrent page splits into
+ * double-counting some index tuples, so disbelieve any total that exceeds
+ * the underlying heap's count ... if we know that accurately. Otherwise
+ * this might just make matters worse.
+ */
+ if (!info->estimated_count)
+ {
+ if (stats->num_index_tuples > info->num_heap_tuples)
+ stats->num_index_tuples = info->num_heap_tuples;
+ }
+
+ return stats;
+}
+
+/*
+ * gistvacuumscan --- scan the index for VACUUMing purposes
+ *
+ * This scans the index for leaf tuples that are deletable according to the
+ * vacuum callback, and updates the stats. Both gistbulkdelete and
+ * gistvacuumcleanup invoke this (the latter only if no gistbulkdelete call
+ * occurred).
+ *
+ * This also makes note of any empty leaf pages, as well as all internal
+ * pages while looping over all index pages. After scanning all the pages, we
+ * remove the empty pages so that they can be reused. Any deleted pages are
+ * added directly to the free space map. (They should've been added there
+ * when they were originally deleted, already, but it's possible that the FSM
+ * was lost at a crash, for example.)
+ *
+ * The caller is responsible for initially allocating/zeroing a stats struct.
+ */
+static void
+gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state)
+{
+ Relation rel = info->index;
+ GistVacState vstate;
+ BlockNumber num_pages;
+ bool needLock;
+ BlockNumber blkno;
+ MemoryContext oldctx;
+
+ /*
+ * Reset counts that will be incremented during the scan; needed in case
+ * of multiple scans during a single VACUUM command.
+ */
+ stats->estimated_count = false;
+ stats->num_index_tuples = 0;
+ stats->pages_deleted = 0;
+ stats->pages_free = 0;
+
+ /*
+ * Create the integer sets to remember all the internal and the empty leaf
+ * pages in page_set_context. Internally, the integer set will remember
+ * this context so that the subsequent allocations for these integer sets
+ * will be done from the same context.
+ */
+ vstate.page_set_context = GenerationContextCreate(CurrentMemoryContext,
+ "GiST VACUUM page set context",
+ 16 * 1024);
+ oldctx = MemoryContextSwitchTo(vstate.page_set_context);
+ vstate.internal_page_set = intset_create();
+ vstate.empty_leaf_set = intset_create();
+ MemoryContextSwitchTo(oldctx);
+
+ /* Set up info to pass down to gistvacuumpage */
+ vstate.info = info;
+ vstate.stats = stats;
+ vstate.callback = callback;
+ vstate.callback_state = callback_state;
+ if (RelationNeedsWAL(rel))
+ vstate.startNSN = GetInsertRecPtr();
+ else
+ vstate.startNSN = gistGetFakeLSN(rel);
+
+ /*
+ * The outer loop iterates over all index pages, in physical order (we
+ * hope the kernel will cooperate in providing read-ahead for speed). It
+ * is critical that we visit all leaf pages, including ones added after we
+ * start the scan, else we might fail to delete some deletable tuples.
+ * Hence, we must repeatedly check the relation length. We must acquire
+ * the relation-extension lock while doing so to avoid a race condition:
+ * if someone else is extending the relation, there is a window where
+ * bufmgr/smgr have created a new all-zero page but it hasn't yet been
+ * write-locked by gistNewBuffer(). If we manage to scan such a page
+ * here, we'll improperly assume it can be recycled. Taking the lock
+ * synchronizes things enough to prevent a problem: either num_pages won't
+ * include the new page, or gistNewBuffer already has write lock on the
+ * buffer and it will be fully initialized before we can examine it. (See
+ * also vacuumlazy.c, which has the same issue.) Also, we need not worry
+ * if a page is added immediately after we look; the page splitting code
+ * already has write-lock on the left page before it adds a right page, so
+ * we must already have processed any tuples due to be moved into such a
+ * page.
+ *
+ * We can skip locking for new or temp relations, however, since no one
+ * else could be accessing them.
+ */
+ needLock = !RELATION_IS_LOCAL(rel);
+
+ blkno = GIST_ROOT_BLKNO;
+ for (;;)
+ {
+ /* Get the current relation length */
+ if (needLock)
+ LockRelationForExtension(rel, ExclusiveLock);
+ num_pages = RelationGetNumberOfBlocks(rel);
+ if (needLock)
+ UnlockRelationForExtension(rel, ExclusiveLock);
+
+ /* Quit if we've scanned the whole relation */
+ if (blkno >= num_pages)
+ break;
+ /* Iterate over pages, then loop back to recheck length */
+ for (; blkno < num_pages; blkno++)
+ gistvacuumpage(&vstate, blkno, blkno);
+ }
+
+ /*
+ * If we found any recyclable pages (and recorded them in the FSM), then
+ * forcibly update the upper-level FSM pages to ensure that searchers can
+ * find them. It's possible that the pages were also found during
+ * previous scans and so this is a waste of time, but it's cheap enough
+ * relative to scanning the index that it shouldn't matter much, and
+ * making sure that free pages are available sooner not later seems
+ * worthwhile.
+ *
+ * Note that if no recyclable pages exist, we don't bother vacuuming the
+ * FSM at all.
+ */
+ if (stats->pages_free > 0)
+ IndexFreeSpaceMapVacuum(rel);
+
+ /* update statistics */
+ stats->num_pages = num_pages;
+
+ /*
+ * If we saw any empty pages, try to unlink them from the tree so that
+ * they can be reused.
+ */
+ gistvacuum_delete_empty_pages(info, &vstate);
+
+ /* we don't need the internal and empty page sets anymore */
+ MemoryContextDelete(vstate.page_set_context);
+ vstate.page_set_context = NULL;
+ vstate.internal_page_set = NULL;
+ vstate.empty_leaf_set = NULL;
+}
+
+/*
+ * gistvacuumpage --- VACUUM one page
+ *
+ * This processes a single page for gistbulkdelete(). In some cases we
+ * must go back and re-examine previously-scanned pages; this routine
+ * recurses when necessary to handle that case.
+ *
+ * blkno is the page to process. orig_blkno is the highest block number
+ * reached by the outer gistvacuumscan loop (the same as blkno, unless we
+ * are recursing to re-examine a previous page).
+ */
+static void
+gistvacuumpage(GistVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
+{
+ IndexVacuumInfo *info = vstate->info;
+ IndexBulkDeleteCallback callback = vstate->callback;
+ void *callback_state = vstate->callback_state;
+ Relation rel = info->index;
+ Buffer buffer;
+ Page page;
+ BlockNumber recurse_to;
+
+restart:
+ recurse_to = InvalidBlockNumber;
+
+ /* call vacuum_delay_point while not holding any buffer lock */
+ vacuum_delay_point();
+
+ buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+ info->strategy);
+
+ /*
+ * We are not going to stay here for long, so aggressively grab an
+ * exclusive lock.
+ */
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ page = (Page) BufferGetPage(buffer);
+
+ if (gistPageRecyclable(page))
+ {
+ /* Okay to recycle this page */
+ RecordFreeIndexPage(rel, blkno);
+ vstate->stats->pages_free++;
+ vstate->stats->pages_deleted++;
+ }
+ else if (GistPageIsDeleted(page))
+ {
+ /* Already deleted, but can't recycle yet */
+ vstate->stats->pages_deleted++;
+ }
+ else if (GistPageIsLeaf(page))
+ {
+ OffsetNumber todelete[MaxOffsetNumber];
+ int ntodelete = 0;
+ int nremain;
+ GISTPageOpaque opaque = GistPageGetOpaque(page);
+ OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * Check whether we need to recurse back to earlier pages. What we
+ * are concerned about is a page split that happened since we started
+ * the vacuum scan. If the split moved some tuples to a lower page
+ * then we might have missed 'em. If so, set up for tail recursion.
+ *
+ * This is similar to the checks we do during searches, when following
+ * a downlink, but we don't need to jump to higher-numbered pages,
+ * because we will process them later, anyway.
+ */
+ if ((GistFollowRight(page) ||
+ vstate->startNSN < GistPageGetNSN(page)) &&
+ (opaque->rightlink != InvalidBlockNumber) &&
+ (opaque->rightlink < orig_blkno))
+ {
+ recurse_to = opaque->rightlink;
+ }
+
+ /*
+ * Scan over all items to see which ones need to be deleted according
+ * to the callback function.
+ */
+ if (callback)
+ {
+ OffsetNumber off;
+
+ for (off = FirstOffsetNumber;
+ off <= maxoff;
+ off = OffsetNumberNext(off))
+ {
+ ItemId iid = PageGetItemId(page, off);
+ IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+ if (callback(&(idxtuple->t_tid), callback_state))
+ todelete[ntodelete++] = off;
+ }
+ }
+
+ /*
+ * Apply any needed deletes. We issue just one WAL record per page,
+ * so as to minimize WAL traffic.
+ */
+ if (ntodelete > 0)
+ {
+ START_CRIT_SECTION();
+
+ MarkBufferDirty(buffer);
+
+ PageIndexMultiDelete(page, todelete, ntodelete);
+ GistMarkTuplesDeleted(page);
+
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+
+ recptr = gistXLogUpdate(buffer,
+ todelete, ntodelete,
+ NULL, 0, InvalidBuffer);
+ PageSetLSN(page, recptr);
+ }
+ else
+ PageSetLSN(page, gistGetFakeLSN(rel));
+
+ END_CRIT_SECTION();
+
+ vstate->stats->tuples_removed += ntodelete;
+ /* must recompute maxoff */
+ maxoff = PageGetMaxOffsetNumber(page);
+ }
+
+ nremain = maxoff - FirstOffsetNumber + 1;
+ if (nremain == 0)
+ {
+ /*
+ * The page is now completely empty. Remember its block number,
+ * so that we will try to delete the page in the second stage.
+ *
+ * Skip this when recursing, because IntegerSet requires that the
+ * values are added in ascending order. The next VACUUM will pick
+ * it up.
+ */
+ if (blkno == orig_blkno)
+ intset_add_member(vstate->empty_leaf_set, blkno);
+ }
+ else
+ vstate->stats->num_index_tuples += nremain;
+ }
+ else
+ {
+ /*
+ * On an internal page, check for "invalid tuples", left behind by an
+ * incomplete page split on PostgreSQL 9.0 or below. These are not
+ * created by newer PostgreSQL versions, but unfortunately, there is
+ * no version number anywhere in a GiST index, so we don't know
+ * whether this index might still contain invalid tuples or not.
+ */
+ OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+ OffsetNumber off;
+
+ for (off = FirstOffsetNumber;
+ off <= maxoff;
+ off = OffsetNumberNext(off))
+ {
+ ItemId iid = PageGetItemId(page, off);
+ IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+ if (GistTupleIsInvalid(idxtuple))
+ ereport(LOG,
+ (errmsg("index \"%s\" contains an inner tuple marked as invalid",
+ RelationGetRelationName(rel)),
+ errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
+ errhint("Please REINDEX it.")));
+ }
+
+ /*
+ * Remember the block number of this page, so that we can revisit it
+ * later in gistvacuum_delete_empty_pages(), when we search for
+ * parents of empty leaf pages.
+ */
+ if (blkno == orig_blkno)
+ intset_add_member(vstate->internal_page_set, blkno);
+ }
+
+ UnlockReleaseBuffer(buffer);
+
+ /*
+ * This is really tail recursion, but if the compiler is too stupid to
+ * optimize it as such, we'd eat an uncomfortably large amount of stack
+ * space per recursion level (due to the todelete[] array). A failure is
+ * improbable since the number of levels isn't likely to be large ... but
+ * just in case, let's hand-optimize into a loop.
+ */
+ if (recurse_to != InvalidBlockNumber)
+ {
+ blkno = recurse_to;
+ goto restart;
+ }
+}
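+
+/*
+ * Editorial note -- not part of the original file.  The rightlink recursion
+ * above only revisits blocks numbered below orig_blkno; right siblings with
+ * higher block numbers are reached later by the outer physical-order scan.
+ */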
+
+/*
+ * Scan all internal pages, and try to delete their empty child pages.
+ */
+static void
+gistvacuum_delete_empty_pages(IndexVacuumInfo *info, GistVacState *vstate)
+{
+ Relation rel = info->index;
+ BlockNumber empty_pages_remaining;
+ uint64 blkno;
+
+ /*
+ * Rescan all inner pages to find those that have empty child pages.
+ */
+ empty_pages_remaining = intset_num_entries(vstate->empty_leaf_set);
+ intset_begin_iterate(vstate->internal_page_set);
+ while (empty_pages_remaining > 0 &&
+ intset_iterate_next(vstate->internal_page_set, &blkno))
+ {
+ Buffer buffer;
+ Page page;
+ OffsetNumber off,
+ maxoff;
+ OffsetNumber todelete[MaxOffsetNumber];
+ BlockNumber leafs_to_delete[MaxOffsetNumber];
+ int ntodelete;
+ int deleted;
+
+ buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber) blkno,
+ RBM_NORMAL, info->strategy);
+
+ LockBuffer(buffer, GIST_SHARE);
+ page = (Page) BufferGetPage(buffer);
+
+ if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page))
+ {
+ /*
+ * This page was an internal page earlier, but now it's something
+ * else. Shouldn't happen...
+ */
+ Assert(false);
+ UnlockReleaseBuffer(buffer);
+ continue;
+ }
+
+ /*
+ * Scan all the downlinks, and see if any of them point to empty leaf
+ * pages.
+ */
+ maxoff = PageGetMaxOffsetNumber(page);
+ ntodelete = 0;
+ for (off = FirstOffsetNumber;
+ off <= maxoff && ntodelete < maxoff - 1;
+ off = OffsetNumberNext(off))
+ {
+ ItemId iid = PageGetItemId(page, off);
+ IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
+ BlockNumber leafblk;
+
+ leafblk = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+ if (intset_is_member(vstate->empty_leaf_set, leafblk))
+ {
+ leafs_to_delete[ntodelete] = leafblk;
+ todelete[ntodelete++] = off;
+ }
+ }
+
+ /*
+ * In order to avoid deadlock, the child page must be locked before the
+ * parent, so we must release the lock on the parent, lock the child,
+ * and then re-acquire the lock on the parent. (And we wouldn't want to
+ * do I/O while holding a lock, anyway.)
+ *
+ * At the instant that we're not holding a lock on the parent, the
+ * downlink might get moved by a concurrent insert, so we must
+ * re-check that it still points to the same child page after we have
+ * acquired both locks. Also, another backend might have inserted a
+ * tuple to the page, so that it is no longer empty. gistdeletepage()
+ * re-checks all these conditions.
+ */
+ LockBuffer(buffer, GIST_UNLOCK);
+
+ deleted = 0;
+ for (int i = 0; i < ntodelete; i++)
+ {
+ Buffer leafbuf;
+
+ /*
+ * Don't remove the last downlink from the parent. That would
+ * confuse the insertion code.
+ */
+ if (PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
+ break;
+
+ leafbuf = ReadBufferExtended(rel, MAIN_FORKNUM, leafs_to_delete[i],
+ RBM_NORMAL, info->strategy);
+ LockBuffer(leafbuf, GIST_EXCLUSIVE);
+ gistcheckpage(rel, leafbuf);
+
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ if (gistdeletepage(info, vstate->stats,
+ buffer, todelete[i] - deleted,
+ leafbuf))
+ deleted++;
+ LockBuffer(buffer, GIST_UNLOCK);
+
+ UnlockReleaseBuffer(leafbuf);
+ }
+
+ ReleaseBuffer(buffer);
+
+ /* update stats */
+ vstate->stats->pages_removed += deleted;
+
+ /*
+ * We can stop the scan as soon as we have seen the downlinks, even if
+ * we were not able to remove them all.
+ */
+ empty_pages_remaining -= ntodelete;
+ }
+}
+
+/*
+ * gistdeletepage takes a leaf page, and its parent, and tries to delete the
+ * leaf. Both pages must be locked.
+ *
+ * Even if the page was empty when we first saw it, a concurrent inserter might
+ * have added a tuple to it since. Similarly, the downlink might have moved.
+ * We re-check all the conditions, to make sure the page is still deletable,
+ * before modifying anything.
+ *
+ * Returns true if the page was deleted, and false if a concurrent update
+ * prevented it.
+ */
+static bool
+gistdeletepage(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ Buffer parentBuffer, OffsetNumber downlink,
+ Buffer leafBuffer)
+{
+ Page parentPage = BufferGetPage(parentBuffer);
+ Page leafPage = BufferGetPage(leafBuffer);
+ ItemId iid;
+ IndexTuple idxtuple;
+ XLogRecPtr recptr;
+ FullTransactionId txid;
+
+ /*
+ * Check that the leaf is still empty and deletable.
+ */
+ if (!GistPageIsLeaf(leafPage))
+ {
+ /* a leaf page should never become a non-leaf page */
+ Assert(false);
+ return false;
+ }
+
+ if (GistFollowRight(leafPage))
+ return false; /* don't mess with a concurrent page split */
+
+ if (PageGetMaxOffsetNumber(leafPage) != InvalidOffsetNumber)
+ return false; /* not empty anymore */
+
+ /*
+ * Ok, the leaf is deletable. Is the downlink in the parent page still
+ * valid? It might have been moved by a concurrent insert. We could try
+ * to re-find it by scanning the page again, possibly moving right if the
+ * was split. But for now, let's keep it simple and just give up. The
+ * next VACUUM will pick it up.
+ */
+ if (PageIsNew(parentPage) || GistPageIsDeleted(parentPage) ||
+ GistPageIsLeaf(parentPage))
+ {
+ /* shouldn't happen, internal pages are never deleted */
+ Assert(false);
+ return false;
+ }
+
+ if (PageGetMaxOffsetNumber(parentPage) < downlink
+ || PageGetMaxOffsetNumber(parentPage) <= FirstOffsetNumber)
+ return false;
+
+ iid = PageGetItemId(parentPage, downlink);
+ idxtuple = (IndexTuple) PageGetItem(parentPage, iid);
+ if (BufferGetBlockNumber(leafBuffer) !=
+ ItemPointerGetBlockNumber(&(idxtuple->t_tid)))
+ return false;
+
+ /*
+ * All good, proceed with the deletion.
+ *
+ * The page cannot be immediately recycled, because in-progress scans that
+ * saw the downlink might still visit it. Mark the page with the current
+ * next-XID counter, so that we know when it can be recycled. Once that
+ * XID becomes older than GlobalXmin, we know that all scans that are
+ * currently in progress must have ended. (That's much more conservative
+ * than needed, but let's keep it safe and simple.)
+ */
+ txid = ReadNextFullTransactionId();
+
+ START_CRIT_SECTION();
+
+ /* mark the page as deleted */
+ MarkBufferDirty(leafBuffer);
+ GistPageSetDeleted(leafPage, txid);
+ stats->pages_deleted++;
+
+ /* remove the downlink from the parent */
+ MarkBufferDirty(parentBuffer);
+ PageIndexTupleDelete(parentPage, downlink);
+
+ if (RelationNeedsWAL(info->index))
+ recptr = gistXLogPageDelete(leafBuffer, txid, parentBuffer, downlink);
+ else
+ recptr = gistGetFakeLSN(info->index);
+ PageSetLSN(parentPage, recptr);
+ PageSetLSN(leafPage, recptr);
+
+ END_CRIT_SECTION();
+
+ return true;
+}
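+
+/*
+ * Editorial summary -- not part of the original file.  Before modifying
+ * anything, gistdeletepage() above re-verifies, with the child locked before
+ * the parent, that (1) the leaf is still empty and not mid-split, (2) the
+ * parent is still a live internal page, and (3) the downlink still points at
+ * this leaf.  Any failed check defers the deletion to a later VACUUM.
+ */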
diff --git a/src/backend/access/gist/gistvalidate.c b/src/backend/access/gist/gistvalidate.c
new file mode 100644
index 0000000..a285736
--- /dev/null
+++ b/src/backend/access/gist/gistvalidate.c
@@ -0,0 +1,281 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistvalidate.c
+ * Opclass validator for GiST.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistvalidate.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/amvalidate.h"
+#include "access/gist_private.h"
+#include "access/htup_details.h"
+#include "catalog/pg_amop.h"
+#include "catalog/pg_amproc.h"
+#include "catalog/pg_opclass.h"
+#include "catalog/pg_opfamily.h"
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/regproc.h"
+#include "utils/syscache.h"
+
+
+/*
+ * Validator for a GiST opclass.
+ */
+bool
+gistvalidate(Oid opclassoid)
+{
+ bool result = true;
+ HeapTuple classtup;
+ Form_pg_opclass classform;
+ Oid opfamilyoid;
+ Oid opcintype;
+ Oid opckeytype;
+ char *opclassname;
+ HeapTuple familytup;
+ Form_pg_opfamily familyform;
+ char *opfamilyname;
+ CatCList *proclist,
+ *oprlist;
+ List *grouplist;
+ OpFamilyOpFuncGroup *opclassgroup;
+ int i;
+ ListCell *lc;
+
+ /* Fetch opclass information */
+ classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid));
+ if (!HeapTupleIsValid(classtup))
+ elog(ERROR, "cache lookup failed for operator class %u", opclassoid);
+ classform = (Form_pg_opclass) GETSTRUCT(classtup);
+
+ opfamilyoid = classform->opcfamily;
+ opcintype = classform->opcintype;
+ opckeytype = classform->opckeytype;
+ if (!OidIsValid(opckeytype))
+ opckeytype = opcintype;
+ opclassname = NameStr(classform->opcname);
+
+ /* Fetch opfamily information */
+ familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid));
+ if (!HeapTupleIsValid(familytup))
+ elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid);
+ familyform = (Form_pg_opfamily) GETSTRUCT(familytup);
+
+ opfamilyname = NameStr(familyform->opfname);
+
+ /* Fetch all operators and support functions of the opfamily */
+ oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid));
+ proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid));
+
+ /* Check individual support functions */
+ for (i = 0; i < proclist->n_members; i++)
+ {
+ HeapTuple proctup = &proclist->members[i]->tuple;
+ Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup);
+ bool ok;
+
+ /*
+ * All GiST support functions should be registered with matching
+ * left/right types
+ */
+ if (procform->amproclefttype != procform->amprocrighttype)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains support function %s with different left and right input types",
+ opfamilyname, "gist",
+ format_procedure(procform->amproc))));
+ result = false;
+ }
+
+ /*
+ * We can't check signatures except within the specific opclass, since
+ * we need to know the associated opckeytype in many cases.
+ */
+ if (procform->amproclefttype != opcintype)
+ continue;
+
+ /* Check procedure numbers and function signatures */
+ switch (procform->amprocnum)
+ {
+ case GIST_CONSISTENT_PROC:
+ ok = check_amproc_signature(procform->amproc, BOOLOID, false,
+ 5, 5, INTERNALOID, opcintype,
+ INT2OID, OIDOID, INTERNALOID);
+ break;
+ case GIST_UNION_PROC:
+ ok = check_amproc_signature(procform->amproc, opckeytype, false,
+ 2, 2, INTERNALOID, INTERNALOID);
+ break;
+ case GIST_COMPRESS_PROC:
+ case GIST_DECOMPRESS_PROC:
+ case GIST_FETCH_PROC:
+ ok = check_amproc_signature(procform->amproc, INTERNALOID, true,
+ 1, 1, INTERNALOID);
+ break;
+ case GIST_PENALTY_PROC:
+ ok = check_amproc_signature(procform->amproc, INTERNALOID, true,
+ 3, 3, INTERNALOID,
+ INTERNALOID, INTERNALOID);
+ break;
+ case GIST_PICKSPLIT_PROC:
+ ok = check_amproc_signature(procform->amproc, INTERNALOID, true,
+ 2, 2, INTERNALOID, INTERNALOID);
+ break;
+ case GIST_EQUAL_PROC:
+ ok = check_amproc_signature(procform->amproc, INTERNALOID, false,
+ 3, 3, opckeytype, opckeytype,
+ INTERNALOID);
+ break;
+ case GIST_DISTANCE_PROC:
+ ok = check_amproc_signature(procform->amproc, FLOAT8OID, false,
+ 5, 5, INTERNALOID, opcintype,
+ INT2OID, OIDOID, INTERNALOID);
+ break;
+ case GIST_OPTIONS_PROC:
+ ok = check_amoptsproc_signature(procform->amproc);
+ break;
+ default:
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d",
+ opfamilyname, "gist",
+ format_procedure(procform->amproc),
+ procform->amprocnum)));
+ result = false;
+ continue; /* don't want additional message */
+ }
+
+ if (!ok)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d",
+ opfamilyname, "gist",
+ format_procedure(procform->amproc),
+ procform->amprocnum)));
+ result = false;
+ }
+ }
+
+ /* Check individual operators */
+ for (i = 0; i < oprlist->n_members; i++)
+ {
+ HeapTuple oprtup = &oprlist->members[i]->tuple;
+ Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup);
+ Oid op_rettype;
+
+ /* TODO: Check that only allowed strategy numbers exist */
+ if (oprform->amopstrategy < 1)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d",
+ opfamilyname, "gist",
+ format_operator(oprform->amopopr),
+ oprform->amopstrategy)));
+ result = false;
+ }
+
+ /* GiST supports ORDER BY operators */
+ if (oprform->amoppurpose != AMOP_SEARCH)
+ {
+ /* ... but must have matching distance proc */
+ if (!OidIsValid(get_opfamily_proc(opfamilyoid,
+ oprform->amoplefttype,
+ oprform->amoplefttype,
+ GIST_DISTANCE_PROC)))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains unsupported ORDER BY specification for operator %s",
+ opfamilyname, "gist",
+ format_operator(oprform->amopopr))));
+ result = false;
+ }
+ /* ... and operator result must match the claimed btree opfamily */
+ op_rettype = get_op_rettype(oprform->amopopr);
+ if (!opfamily_can_sort_type(oprform->amopsortfamily, op_rettype))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains incorrect ORDER BY opfamily specification for operator %s",
+ opfamilyname, "gist",
+ format_operator(oprform->amopopr))));
+ result = false;
+ }
+ }
+ else
+ {
+ /* Search operators must always return bool */
+ op_rettype = BOOLOID;
+ }
+
+ /* Check operator signature */
+ if (!check_amop_signature(oprform->amopopr, op_rettype,
+ oprform->amoplefttype,
+ oprform->amoprighttype))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature",
+ opfamilyname, "gist",
+ format_operator(oprform->amopopr))));
+ result = false;
+ }
+ }
+
+ /* Now check for inconsistent groups of operators/functions */
+ grouplist = identify_opfamily_groups(oprlist, proclist);
+ opclassgroup = NULL;
+ foreach(lc, grouplist)
+ {
+ OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc);
+
+ /* Remember the group exactly matching the test opclass */
+ if (thisgroup->lefttype == opcintype &&
+ thisgroup->righttype == opcintype)
+ opclassgroup = thisgroup;
+
+ /*
+ * There is not a lot we can do to check the operator sets, since each
+ * GiST opclass is more or less a law unto itself, and some contain
+ * only operators that are binary-compatible with the opclass datatype
+ * (meaning that empty operator sets can be OK). That case also means
+ * that we shouldn't insist on nonempty function sets except for the
+ * opclass's own group.
+ */
+ }
+
+ /* Check that the originally-named opclass is complete */
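+	/* (bit i of functionset is set when support function number i exists) */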
+ for (i = 1; i <= GISTNProcs; i++)
+ {
+ if (opclassgroup &&
+ (opclassgroup->functionset & (((uint64) 1) << i)) != 0)
+ continue; /* got it */
+ if (i == GIST_DISTANCE_PROC || i == GIST_FETCH_PROC ||
+ i == GIST_COMPRESS_PROC || i == GIST_DECOMPRESS_PROC ||
+ i == GIST_OPTIONS_PROC)
+ continue; /* optional methods */
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator class \"%s\" of access method %s is missing support function %d",
+ opclassname, "gist", i)));
+ result = false;
+ }
+
+ ReleaseCatCacheList(proclist);
+ ReleaseCatCacheList(oprlist);
+ ReleaseSysCache(familytup);
+ ReleaseSysCache(classtup);
+
+ return result;
+}
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
new file mode 100644
index 0000000..b60dba0
--- /dev/null
+++ b/src/backend/access/gist/gistxlog.c
@@ -0,0 +1,717 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistxlog.c
+ * WAL replay logic for GiST.
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistxlog.c
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/bufmask.h"
+#include "access/gist_private.h"
+#include "access/gistxlog.h"
+#include "access/heapam_xlog.h"
+#include "access/transam.h"
+#include "access/xloginsert.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "storage/procarray.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+static MemoryContext opCtx; /* working memory for operations */
+
+/*
+ * Replay the clearing of F_FOLLOW_RIGHT flag on a child page.
+ *
+ * Even if the WAL record includes a full-page image, we have to update the
+ * follow-right flag, because that change is not included in the full-page
+ * image. To be sure that the intermediate state with the wrong flag value is
+ * not visible to concurrent Hot Standby queries, this function handles
+ * restoring the full-page image as well as updating the flag. (Note that
+ * we never need to do anything else to the child page in the current WAL
+ * action.)
+ */
+static void
+gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ Buffer buffer;
+ Page page;
+ XLogRedoAction action;
+
+ /*
+ * Note that we still update the page even if it was restored from a full
+ * page image, because the updated NSN is not included in the image.
+ */
+ action = XLogReadBufferForRedo(record, block_id, &buffer);
+ if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
+ {
+ page = BufferGetPage(buffer);
+
+ GistPageSetNSN(page, lsn);
+ GistClearFollowRight(page);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * redo any page update (except page split)
+ */
+static void
+gistRedoPageUpdateRecord(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ char *begin;
+ char *data;
+ Size datalen;
+ int ninserted = 0;
+
+ data = begin = XLogRecGetBlockData(record, 0, &datalen);
+
+ page = (Page) BufferGetPage(buffer);
+
+ if (xldata->ntodelete == 1 && xldata->ntoinsert == 1)
+ {
+ /*
+ * When replacing one tuple with one other tuple, we must use
+ * PageIndexTupleOverwrite for consistency with gistplacetopage.
+ */
+ OffsetNumber offnum = *((OffsetNumber *) data);
+ IndexTuple itup;
+ Size itupsize;
+
+ data += sizeof(OffsetNumber);
+ itup = (IndexTuple) data;
+ itupsize = IndexTupleSize(itup);
+ if (!PageIndexTupleOverwrite(page, offnum, (Item) itup, itupsize))
+ elog(ERROR, "failed to add item to GiST index page, size %d bytes",
+ (int) itupsize);
+ data += itupsize;
+ /* should be nothing left after consuming 1 tuple */
+ Assert(data - begin == datalen);
+ /* update insertion count for assert check below */
+ ninserted++;
+ }
+ else if (xldata->ntodelete > 0)
+ {
+ /* Otherwise, delete old tuples if any */
+ OffsetNumber *todelete = (OffsetNumber *) data;
+
+ data += sizeof(OffsetNumber) * xldata->ntodelete;
+
+ PageIndexMultiDelete(page, todelete, xldata->ntodelete);
+ if (GistPageIsLeaf(page))
+ GistMarkTuplesDeleted(page);
+ }
+
+ /* Add new tuples if any */
+ if (data - begin < datalen)
+ {
+ OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber :
+ OffsetNumberNext(PageGetMaxOffsetNumber(page));
+
+ while (data - begin < datalen)
+ {
+ IndexTuple itup = (IndexTuple) data;
+ Size sz = IndexTupleSize(itup);
+ OffsetNumber l;
+
+ data += sz;
+
+ l = PageAddItem(page, (Item) itup, sz, off, false, false);
+ if (l == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to GiST index page, size %d bytes",
+ (int) sz);
+ off++;
+ ninserted++;
+ }
+ }
+
+ /* Check that XLOG record contained expected number of tuples */
+ Assert(ninserted == xldata->ntoinsert);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+
+ /*
+ * Fix follow-right data on left child page
+ *
+ * This must be done while still holding the lock on the target page. Note
+ * that even if the target page no longer exists, we still attempt to
+ * replay the change on the child page.
+ */
+ if (XLogRecHasBlockRef(record, 1))
+ gistRedoClearFollowRight(record, 1);
+
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+
+/*
+ * redo delete on gist index page to remove tuples marked as DEAD during index
+ * tuple insertion
+ */
+static void
+gistRedoDeleteRecord(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ gistxlogDelete *xldata = (gistxlogDelete *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+
+ /*
+ * If we have any conflict processing to do, it must happen before we
+ * update the page.
+ *
+ * GiST delete records can conflict with standby queries. You might think
+ * that vacuum records would conflict as well, but we've handled that
+ * already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
+ * cleaned by the vacuum of the heap and so we can resolve any conflicts
+ * just once when that arrives. After that we know that no conflicts
+ * exist from individual gist vacuum records on that index.
+ */
+ if (InHotStandby)
+ {
+ RelFileNode rnode;
+
+ XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
+
+ ResolveRecoveryConflictWithSnapshot(xldata->latestRemovedXid, rnode);
+ }
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(buffer);
+
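+		/*
+		 * The offsets to delete, if any, are stored in the record's main
+		 * data area immediately after the fixed-size gistxlogDelete header;
+		 * see gistXLogDelete().
+		 */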
+ if (XLogRecGetDataLen(record) > SizeOfGistxlogDelete)
+ {
+ OffsetNumber *todelete;
+
+ todelete = (OffsetNumber *) ((char *) xldata + SizeOfGistxlogDelete);
+
+ PageIndexMultiDelete(page, todelete, xldata->ntodelete);
+ }
+
+ GistClearPageHasGarbage(page);
+ GistMarkTuplesDeleted(page);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * Decode the per-page tuple data of a page-split record.  Returns a palloc'd
+ * array of pointers to the index tuples; the tuples themselves are not copied
+ * and still point into the given buffer.  *n is set to the number of tuples.
+ */
+static IndexTuple *
+decodePageSplitRecord(char *begin, int len, int *n)
+{
+ char *ptr;
+ int i = 0;
+ IndexTuple *tuples;
+
+ /* extract the number of tuples */
+ memcpy(n, begin, sizeof(int));
+ ptr = begin + sizeof(int);
+
+ tuples = palloc(*n * sizeof(IndexTuple));
+
+ for (i = 0; i < *n; i++)
+ {
+ Assert(ptr - begin < len);
+ tuples[i] = (IndexTuple) ptr;
+ ptr += IndexTupleSize((IndexTuple) ptr);
+ }
+ Assert(ptr - begin == len);
+
+ return tuples;
+}
+
+static void
+gistRedoPageSplitRecord(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record);
+ Buffer firstbuffer = InvalidBuffer;
+ Buffer buffer;
+ Page page;
+ int i;
+ bool isrootsplit = false;
+
+ /*
+ * We must hold lock on the first-listed page throughout the action,
+ * including while updating the left child page (if any). We can unlock
+ * remaining pages in the list as soon as they've been written, because
+ * there is no path for concurrent queries to reach those pages without
+ * first visiting the first-listed page.
+ */
+
+ /* loop around all pages */
+ for (i = 0; i < xldata->npage; i++)
+ {
+ int flags;
+ char *data;
+ Size datalen;
+ int num;
+ BlockNumber blkno;
+ IndexTuple *tuples;
+
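+		/*
+		 * Block reference 0 is reserved for the left child page (if any);
+		 * the pages produced by the split are registered starting at block
+		 * reference 1, in the same order as in gistXLogSplit().
+		 */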
+ XLogRecGetBlockTag(record, i + 1, NULL, NULL, &blkno);
+ if (blkno == GIST_ROOT_BLKNO)
+ {
+ Assert(i == 0);
+ isrootsplit = true;
+ }
+
+ buffer = XLogInitBufferForRedo(record, i + 1);
+ page = (Page) BufferGetPage(buffer);
+ data = XLogRecGetBlockData(record, i + 1, &datalen);
+
+ tuples = decodePageSplitRecord(data, datalen, &num);
+
+ /* ok, clear buffer */
+ if (xldata->origleaf && blkno != GIST_ROOT_BLKNO)
+ flags = F_LEAF;
+ else
+ flags = 0;
+ GISTInitBuffer(buffer, flags);
+
+ /* and fill it */
+ gistfillbuffer(page, tuples, num, FirstOffsetNumber);
+
+ if (blkno == GIST_ROOT_BLKNO)
+ {
+ GistPageGetOpaque(page)->rightlink = InvalidBlockNumber;
+ GistPageSetNSN(page, xldata->orignsn);
+ GistClearFollowRight(page);
+ }
+ else
+ {
+ if (i < xldata->npage - 1)
+ {
+ BlockNumber nextblkno;
+
+ XLogRecGetBlockTag(record, i + 2, NULL, NULL, &nextblkno);
+ GistPageGetOpaque(page)->rightlink = nextblkno;
+ }
+ else
+ GistPageGetOpaque(page)->rightlink = xldata->origrlink;
+ GistPageSetNSN(page, xldata->orignsn);
+ if (i < xldata->npage - 1 && !isrootsplit &&
+ xldata->markfollowright)
+ GistMarkFollowRight(page);
+ else
+ GistClearFollowRight(page);
+ }
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+
+ if (i == 0)
+ firstbuffer = buffer;
+ else
+ UnlockReleaseBuffer(buffer);
+ }
+
+ /* Fix follow-right data on left child page, if any */
+ if (XLogRecHasBlockRef(record, 0))
+ gistRedoClearFollowRight(record, 0);
+
+ /* Finally, release lock on the first page */
+ UnlockReleaseBuffer(firstbuffer);
+}
+
+/* redo page deletion */
+static void
+gistRedoPageDelete(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record);
+ Buffer parentBuffer;
+ Buffer leafBuffer;
+
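+	/* Block 0 is the leaf page being deleted, block 1 is its parent page */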
+ if (XLogReadBufferForRedo(record, 0, &leafBuffer) == BLK_NEEDS_REDO)
+ {
+ Page page = (Page) BufferGetPage(leafBuffer);
+
+ GistPageSetDeleted(page, xldata->deleteXid);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(leafBuffer);
+ }
+
+ if (XLogReadBufferForRedo(record, 1, &parentBuffer) == BLK_NEEDS_REDO)
+ {
+ Page page = (Page) BufferGetPage(parentBuffer);
+
+ PageIndexTupleDelete(page, xldata->downlinkOffset);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(parentBuffer);
+ }
+
+ if (BufferIsValid(parentBuffer))
+ UnlockReleaseBuffer(parentBuffer);
+ if (BufferIsValid(leafBuffer))
+ UnlockReleaseBuffer(leafBuffer);
+}
+
+static void
+gistRedoPageReuse(XLogReaderState *record)
+{
+ gistxlogPageReuse *xlrec = (gistxlogPageReuse *) XLogRecGetData(record);
+
+ /*
+ * PAGE_REUSE records exist to provide a conflict point when we reuse
+ * pages in the index via the FSM. That's all they do though.
+ *
+ * latestRemovedXid was the page's deleteXid. The deleteXid <
+ * RecentGlobalXmin test in gistPageRecyclable() conceptually mirrors the
+ * pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs().
+ * Consequently, one XID value achieves the same exclusion effect on
+ * master and standby.
+ */
+ if (InHotStandby)
+ {
+ FullTransactionId latestRemovedFullXid = xlrec->latestRemovedFullXid;
+ FullTransactionId nextFullXid = ReadNextFullTransactionId();
+ uint64 diff;
+
+ /*
+ * ResolveRecoveryConflictWithSnapshot operates on 32-bit
+ * TransactionIds, so truncate the logged FullTransactionId. If the
+ * logged value is very old, so that XID wrap-around already happened
+ * on it, there can't be any snapshots that still see it.
+ */
+ diff = U64FromFullTransactionId(nextFullXid) -
+ U64FromFullTransactionId(latestRemovedFullXid);
+ if (diff < MaxTransactionId / 2)
+ {
+ TransactionId latestRemovedXid;
+
+ latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid);
+ ResolveRecoveryConflictWithSnapshot(latestRemovedXid,
+ xlrec->node);
+ }
+ }
+}
+
+void
+gist_redo(XLogReaderState *record)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+ MemoryContext oldCxt;
+
+	/*
+	 * No extra conflict processing is needed here; the record types that can
+	 * conflict with standby queries resolve those conflicts themselves (see
+	 * gistRedoDeleteRecord and gistRedoPageReuse). NB: If we ever implement
+	 * an optimization similar to the one b-tree has and remove killed tuples
+	 * outside VACUUM, we'll need to handle that here.
+	 */
+
+ oldCxt = MemoryContextSwitchTo(opCtx);
+ switch (info)
+ {
+ case XLOG_GIST_PAGE_UPDATE:
+ gistRedoPageUpdateRecord(record);
+ break;
+ case XLOG_GIST_DELETE:
+ gistRedoDeleteRecord(record);
+ break;
+ case XLOG_GIST_PAGE_REUSE:
+ gistRedoPageReuse(record);
+ break;
+ case XLOG_GIST_PAGE_SPLIT:
+ gistRedoPageSplitRecord(record);
+ break;
+ case XLOG_GIST_PAGE_DELETE:
+ gistRedoPageDelete(record);
+ break;
+ case XLOG_GIST_ASSIGN_LSN:
+ /* nop. See gistGetFakeLSN(). */
+ break;
+ default:
+ elog(PANIC, "gist_redo: unknown op code %u", info);
+ }
+
+ MemoryContextSwitchTo(oldCxt);
+ MemoryContextReset(opCtx);
+}
+
+void
+gist_xlog_startup(void)
+{
+ opCtx = createTempGistContext();
+}
+
+void
+gist_xlog_cleanup(void)
+{
+ MemoryContextDelete(opCtx);
+}
+
+/*
+ * Mask a GiST page before running consistency checks on it.
+ */
+void
+gist_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+
+ mask_page_lsn_and_checksum(page);
+
+ mask_page_hint_bits(page);
+ mask_unused_space(page);
+
+ /*
+ * NSN is nothing but a special purpose LSN. Hence, mask it for the same
+ * reason as mask_page_lsn_and_checksum.
+ */
+ GistPageSetNSN(page, (uint64) MASK_MARKER);
+
+ /*
+ * We update F_FOLLOW_RIGHT flag on the left child after writing WAL
+ * record. Hence, mask this flag. See gistplacetopage() for details.
+ */
+ GistMarkFollowRight(page);
+
+ if (GistPageIsLeaf(page))
+ {
+ /*
+ * In gist leaf pages, it is possible to modify the LP_FLAGS without
+ * emitting any WAL record. Hence, mask the line pointer flags. See
+ * gistkillitems() for details.
+ */
+ mask_lp_flags(page);
+ }
+
+ /*
+ * During gist redo, we never mark a page as garbage. Hence, mask it to
+ * ignore any differences.
+ */
+ GistClearPageHasGarbage(page);
+}
+
+/*
+ * Write WAL record of a page split.
+ */
+XLogRecPtr
+gistXLogSplit(bool page_is_leaf,
+ SplitedPageLayout *dist,
+ BlockNumber origrlink, GistNSN orignsn,
+ Buffer leftchildbuf, bool markfollowright)
+{
+ gistxlogPageSplit xlrec;
+ SplitedPageLayout *ptr;
+ int npage = 0;
+ XLogRecPtr recptr;
+ int i;
+
+ for (ptr = dist; ptr; ptr = ptr->next)
+ npage++;
+
+ xlrec.origrlink = origrlink;
+ xlrec.orignsn = orignsn;
+ xlrec.origleaf = page_is_leaf;
+ xlrec.npage = (uint16) npage;
+ xlrec.markfollowright = markfollowright;
+
+ XLogBeginInsert();
+
+ /*
+ * Include a full page image of the child buf. (only necessary if a
+ * checkpoint happened since the child page was split)
+ */
+ if (BufferIsValid(leftchildbuf))
+ XLogRegisterBuffer(0, leftchildbuf, REGBUF_STANDARD);
+
+ /*
+ * NOTE: We register a lot of data. The caller must've called
+ * XLogEnsureRecordSpace() to prepare for that. We cannot do it here,
+ * because we're already in a critical section. If you change the number
+ * of buffer or data registrations here, make sure you modify the
+ * XLogEnsureRecordSpace() calls accordingly!
+ */
+ XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageSplit));
+
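+	/*
+	 * Register the pages produced by the split, starting at block reference
+	 * 1; block 0 is reserved for the left child buffer registered above.
+	 */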
+ i = 1;
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ XLogRegisterBuffer(i, ptr->buffer, REGBUF_WILL_INIT);
+ XLogRegisterBufData(i, (char *) &(ptr->block.num), sizeof(int));
+ XLogRegisterBufData(i, (char *) ptr->list, ptr->lenlist);
+ i++;
+ }
+
+ recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT);
+
+ return recptr;
+}
+
+/*
+ * Write XLOG record describing a page deletion. This also includes removal of
+ * the downlink from the parent page.
+ */
+XLogRecPtr
+gistXLogPageDelete(Buffer buffer, FullTransactionId xid,
+ Buffer parentBuffer, OffsetNumber downlinkOffset)
+{
+ gistxlogPageDelete xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.deleteXid = xid;
+ xlrec.downlinkOffset = downlinkOffset;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfGistxlogPageDelete);
+
+ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+ XLogRegisterBuffer(1, parentBuffer, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE);
+
+ return recptr;
+}
+
+/*
+ * Write an empty XLOG record to assign a distinct LSN.
+ */
+XLogRecPtr
+gistXLogAssignLSN(void)
+{
+ int dummy = 0;
+
+ /*
+	 * Records other than SWITCH_WAL must have some content, so we register a
+	 * dummy integer 0 to satisfy that restriction.
+ */
+ XLogBeginInsert();
+ XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
+ XLogRegisterData((char *) &dummy, sizeof(dummy));
+ return XLogInsert(RM_GIST_ID, XLOG_GIST_ASSIGN_LSN);
+}
+
+/*
+ * Write XLOG record about reuse of a deleted page.
+ */
+void
+gistXLogPageReuse(Relation rel, BlockNumber blkno, FullTransactionId latestRemovedXid)
+{
+ gistxlogPageReuse xlrec_reuse;
+
+ /*
+ * Note that we don't register the buffer with the record, because this
+ * operation doesn't modify the page. This record only exists to provide a
+ * conflict point for Hot Standby.
+ */
+
+ /* XLOG stuff */
+ xlrec_reuse.node = rel->rd_node;
+ xlrec_reuse.block = blkno;
+ xlrec_reuse.latestRemovedFullXid = latestRemovedXid;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec_reuse, SizeOfGistxlogPageReuse);
+
+ XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_REUSE);
+}
+
+/*
+ * Write XLOG record describing a page update. The update can include any
+ * number of deletions and/or insertions of tuples on a single index page.
+ *
+ * If this update inserts a downlink for a split page, also record that
+ * the F_FOLLOW_RIGHT flag on the child page is cleared and NSN set.
+ *
+ * Note that both the todelete array and the tuples are marked as belonging
+ * to the target buffer; they need not be stored in XLOG if XLogInsert decides
+ * to log the whole buffer contents instead.
+ */
+XLogRecPtr
+gistXLogUpdate(Buffer buffer,
+ OffsetNumber *todelete, int ntodelete,
+ IndexTuple *itup, int ituplen,
+ Buffer leftchildbuf)
+{
+ gistxlogPageUpdate xlrec;
+ int i;
+ XLogRecPtr recptr;
+
+ xlrec.ntodelete = ntodelete;
+ xlrec.ntoinsert = ituplen;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageUpdate));
+
+ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+ XLogRegisterBufData(0, (char *) todelete, sizeof(OffsetNumber) * ntodelete);
+
+ /* new tuples */
+ for (i = 0; i < ituplen; i++)
+ XLogRegisterBufData(0, (char *) (itup[i]), IndexTupleSize(itup[i]));
+
+ /*
+ * Include a full page image of the child buf. (only necessary if a
+ * checkpoint happened since the child page was split)
+ */
+ if (BufferIsValid(leftchildbuf))
+ XLogRegisterBuffer(1, leftchildbuf, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE);
+
+ return recptr;
+}
+
+/*
+ * Write XLOG record describing a delete of leaf index tuples marked as DEAD
+ * during new tuple insertion. One may think that this case is already covered
+ * by gistXLogUpdate(). But deletion of index tuples might conflict with
+ * standby queries and needs special handling.
+ */
+XLogRecPtr
+gistXLogDelete(Buffer buffer, OffsetNumber *todelete, int ntodelete,
+ TransactionId latestRemovedXid)
+{
+ gistxlogDelete xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.latestRemovedXid = latestRemovedXid;
+ xlrec.ntodelete = ntodelete;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfGistxlogDelete);
+
+ /*
+ * We need the target-offsets array whether or not we store the whole
+ * buffer, to allow us to find the latestRemovedXid on a standby server.
+ */
+ XLogRegisterData((char *) todelete, ntodelete * sizeof(OffsetNumber));
+
+ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_DELETE);
+
+ return recptr;
+}