author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:17:33 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:17:33 +0000
commit    5e45211a64149b3c659b90ff2de6fa982a5a93ed (patch)
tree      739caf8c461053357daa9f162bef34516c7bf452 /src/backend/access/spgist
parent    Initial commit. (diff)
Adding upstream version 15.5. (tag: upstream/15.5)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/access/spgist')
-rw-r--r--  src/backend/access/spgist/Makefile            28
-rw-r--r--  src/backend/access/spgist/README              389
-rw-r--r--  src/backend/access/spgist/spgdoinsert.c      2357
-rw-r--r--  src/backend/access/spgist/spginsert.c         243
-rw-r--r--  src/backend/access/spgist/spgkdtreeproc.c     349
-rw-r--r--  src/backend/access/spgist/spgproc.c            88
-rw-r--r--  src/backend/access/spgist/spgquadtreeproc.c   471
-rw-r--r--  src/backend/access/spgist/spgscan.c          1097
-rw-r--r--  src/backend/access/spgist/spgtextproc.c       699
-rw-r--r--  src/backend/access/spgist/spgutils.c         1349
-rw-r--r--  src/backend/access/spgist/spgvacuum.c         975
-rw-r--r--  src/backend/access/spgist/spgvalidate.c       392
-rw-r--r--  src/backend/access/spgist/spgxlog.c          1013
13 files changed, 9450 insertions, 0 deletions
diff --git a/src/backend/access/spgist/Makefile b/src/backend/access/spgist/Makefile
new file mode 100644
index 0000000..8ed3b4a
--- /dev/null
+++ b/src/backend/access/spgist/Makefile
@@ -0,0 +1,28 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for access/spgist
+#
+# IDENTIFICATION
+# src/backend/access/spgist/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/spgist
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ spgdoinsert.o \
+ spginsert.o \
+ spgkdtreeproc.o \
+ spgproc.o \
+ spgquadtreeproc.o \
+ spgscan.o \
+ spgtextproc.o \
+ spgutils.o \
+ spgvacuum.o \
+ spgvalidate.o \
+ spgxlog.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/spgist/README b/src/backend/access/spgist/README
new file mode 100644
index 0000000..7117e02
--- /dev/null
+++ b/src/backend/access/spgist/README
@@ -0,0 +1,389 @@
+src/backend/access/spgist/README
+
+SP-GiST is an abbreviation of space-partitioned GiST. It provides a
+generalized infrastructure for implementing space-partitioned data
+structures, such as quadtrees, k-d trees, and radix trees (tries). When
+implemented in main memory, these structures are usually designed as a set of
+dynamically-allocated nodes linked by pointers. This is not suitable for
+direct storing on disk, since the chains of pointers can be rather long and
+require too many disk accesses. In contrast, disk based data structures
+should have a high fanout to minimize I/O. The challenge is to map tree
+nodes to disk pages in such a way that the search algorithm accesses only a
+few disk pages, even if it traverses many nodes.
+
+
+COMMON STRUCTURE DESCRIPTION
+
+Logically, an SP-GiST tree is a set of tuples, each of which can be either
+an inner or leaf tuple. Each inner tuple contains "nodes", which are
+(label,pointer) pairs, where the pointer (ItemPointerData) is a pointer to
+another inner tuple or to the head of a list of leaf tuples. Inner tuples
+can have different numbers of nodes (children). Branches can be of different
+depths (in fact, there is no code to support balancing), which means that
+the tree is not balanced. However, leaf and inner tuples cannot
+be intermixed at the same level: a downlink from a node of an inner tuple
+leads either to one inner tuple, or to a list of leaf tuples.
+
+The SP-GiST core requires that inner and leaf tuples fit on a single index
+page, and even more stringently that the list of leaf tuples reached from a
+single inner-tuple node all be stored on the same index page. (Restricting
+such lists to not cross pages reduces seeks, and allows the list links to be
+stored as simple 2-byte OffsetNumbers.) SP-GiST index opclasses should
+therefore ensure that not too many nodes can be needed in one inner tuple,
+and that inner-tuple prefixes and leaf-node datum values not be too large.
+
+Inner and leaf tuples are stored separately: the former are stored only on
+"inner" pages, the latter only on "leaf" pages. Also, there are special
+restrictions on the root page. Early in an index's life, when there is only
+one page's worth of data, the root page contains an unorganized set of leaf
+tuples. After the first page split has occurred, the root is required to
+contain exactly one inner tuple.
+
+When the search traversal algorithm reaches an inner tuple, it chooses a set
+of nodes through which to continue the traversal in depth. If it reaches a
+leaf page, it scans the list of leaf tuples to find the ones that match the
+query. SP-GiST also supports ordered (nearest-neighbor) searches: during such
+a scan, pending nodes are kept in a priority queue, so the traversal proceeds
+in closest-first order.
+
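+A minimal standalone sketch of that closest-first strategy (purely
+illustrative: the toy linear-scan queue and node numbering below are
+assumptions made for the example, while the real implementation in
+spgscan.c uses a pairing heap over actual index pages):
+
+    #include <stdio.h>
+
+    typedef struct PendingItem
+    {
+        int     node_id;    /* which pending tree node to visit */
+        double  distance;   /* lower-bound distance to the query */
+        int     used;       /* set once the item has been visited */
+    } PendingItem;
+
+    /* Pop the unvisited pending item with the smallest distance. */
+    static PendingItem *
+    pop_closest(PendingItem *queue, int n)
+    {
+        PendingItem *best = NULL;
+        int         i;
+
+        for (i = 0; i < n; i++)
+            if (!queue[i].used &&
+                (best == NULL || queue[i].distance < best->distance))
+                best = &queue[i];
+        if (best)
+            best->used = 1;
+        return best;
+    }
+
+    int
+    main(void)
+    {
+        PendingItem queue[] = {{1, 4.2, 0}, {2, 1.5, 0}, {3, 3.0, 0}};
+        PendingItem *item;
+
+        /* Visits nodes in closest-first order: 2, then 3, then 1. */
+        while ((item = pop_closest(queue, 3)) != NULL)
+            printf("visit node %d (distance %.1f)\n",
+                   item->node_id, item->distance);
+        return 0;
+    }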
+
+The insertion algorithm descends the tree similarly, except it must choose
+just one node to descend to from each inner tuple. Insertion might also have
+to modify the inner tuple before it can descend: it could add a new node, or
+it could "split" the tuple to obtain a less-specific prefix that can match
+the value to be inserted. If it's necessary to append a new leaf tuple to a
+list and there is no free space on the page, then SP-GiST creates a new inner
+tuple and distributes the leaf tuples into a set of lists spread, perhaps,
+across several pages.
+
+An inner tuple consists of:
+
+ optional prefix value - all successors must be consistent with it.
+ Example:
+ radix tree - prefix value is a common prefix string
+ quad tree - centroid
+ k-d tree - one coordinate
+
+ list of nodes, where node is a (label, pointer) pair.
+ Example of a label: a single character for radix tree
+
+A leaf tuple consists of:
+
+ a leaf value
+ Example:
+ radix tree - the rest of string (postfix)
+ quad and k-d tree - the point itself
+
+ ItemPointer to the corresponding heap tuple
+ nextOffset, the offset number of the next leaf tuple in its chain on
+ a leaf page
+
+ optional nulls bitmask
+ optional INCLUDE-column values
+
+For compatibility with pre-v14 indexes, a leaf tuple has a nulls bitmask
+only if there are null values (among the leaf value and the INCLUDE values)
+*and* there is at least one INCLUDE column. The null-ness of the leaf
+value can be inferred from whether the tuple is on a "nulls page" (see below),
+so it is not necessary to represent it explicitly. But we include it anyway
+in a bitmask used with INCLUDE values, so that standard tuple deconstruction
+code can be used.
+
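+As a rough illustration, the logical content of the two tuple kinds could be
+sketched like this (a conceptual sketch only, using PostgreSQL's usual Datum,
+ItemPointerData, and OffsetNumber types; the actual bit-packed on-disk
+layouts are SpGistInnerTupleData, SpGistNodeTupleData, and
+SpGistLeafTupleData in spgist_private.h):
+
+    typedef struct SketchNode
+    {
+        Datum           label;      /* e.g. one character, for a radix tree */
+        ItemPointerData downlink;   /* child inner tuple, or head of a
+                                     * list of leaf tuples */
+    } SketchNode;
+
+    typedef struct SketchInnerTuple
+    {
+        bool        hasPrefix;      /* is a prefix value present? */
+        Datum       prefix;         /* optional prefix value */
+        int         nNodes;         /* number of child nodes */
+        SketchNode *nodes;          /* the (label, downlink) pairs */
+    } SketchInnerTuple;
+
+    typedef struct SketchLeafTuple
+    {
+        Datum           value;      /* leaf value, e.g. a string postfix */
+        ItemPointerData heapPtr;    /* the indexed heap tuple */
+        OffsetNumber    nextOffset; /* next leaf tuple in the chain, or
+                                     * InvalidOffsetNumber at chain's end */
+    } SketchLeafTuple;
+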
+
+NULLS HANDLING
+
+We assume that SPGiST-indexable operators are strict (can never succeed for
+null inputs). It is still desirable to index nulls, so that whole-table
+indexscans are possible and so that "x IS NULL" can be implemented by an
+SPGiST indexscan. However, we prefer that SPGiST index opclasses not have
+to cope with nulls. Therefore, the main tree of an SPGiST index does not
+include any null entries. We store null entries in a separate SPGiST tree
+occupying a disjoint set of pages (in particular, its own root page).
+Insertions and searches in the nulls tree do not use any of the
+opclass-supplied functions, but just use hardwired logic comparable to
+AllTheSame cases in the normal tree.
+
+
+INSERTION ALGORITHM
+
+The insertion algorithm is designed to keep the tree in a consistent state
+at every moment. Here is a simplified specification of the insertion
+algorithm
+(numbers refer to notes below):
+
+ Start with the first tuple on the root page (1)
+
+ loop:
+ if (page is leaf) then
+ if (enough space)
+ insert on page and exit (5)
+ else (7)
+ call PickSplitFn() (2)
+ end if
+ else
+ switch (chooseFn())
+ case MatchNode - descend through selected node
+ case AddNode - add node and then retry chooseFn (3, 6)
+ case SplitTuple - split inner tuple to prefix and postfix, then
+ retry chooseFn with the prefix tuple (4, 6)
+ end if
+
+Notes:
+
+(1) Initially, we just dump leaf tuples into the root page until it is full;
+then we split it. Once the root is not a leaf page, it can have only one
+inner tuple, so as to keep the amount of free space on the root as large as
+possible. Both of these rules are meant to postpone doing PickSplit on the
+root for as long as possible, so that the topmost partitioning of the search
+space is as good as we can easily make it.
+
+(2) The current implementation can do a picksplit and insert a new leaf tuple
+in one operation, if the new list of leaf tuples fits on one page. That is
+always possible for trees with small nodes, such as quad trees or k-d trees,
+but radix trees may require another picksplit.
+
+(3) Adding a node must keep the size of the inner tuple small enough to fit
+on a page. Even so, the enlarged inner tuple could become too large to be
+stored on its current page because of the other tuples there. In that case
+it will be moved to another inner page (see the notes about page management).
+When moving a tuple to another page, we can't change the offset numbers of
+the other tuples on the page, else we'd invalidate the downlink pointers to
+them. To prevent that, SP-GiST leaves a "placeholder" tuple behind, which can
+be reused later whenever another tuple is added to the page. See also the
+Concurrency and Vacuum sections below. Right now only radix trees add nodes
+to existing tuples; quad trees and k-d trees create all possible nodes at
+once in the PickSplitFn() call.
+
+(4) A prefix value might only partially match a new value, so the SplitTuple
+action allows breaking the current tree branch into upper and lower sections.
+Another way to say it is that we can split the current inner tuple into
+"prefix" and "postfix" parts, where the prefix part is able to match the
+incoming new value. Consider an example of insertion into a radix tree. We
+use the following notation, where a tuple's id is just for discussion (no
+such id is actually stored):
+
+inner tuple: {tuple id}(prefix string)[ comma separated list of node labels ]
+leaf tuple: {tuple id}<value>
+
+Suppose we need to insert string 'www.gogo.com' into inner tuple
+
+ {1}(www.google.com/)[a, i]
+
+The string does not match the prefix so we cannot descend. We must
+split the inner tuple into two tuples:
+
+ {2}(www.go)[o] - prefix tuple
+ |
+ {3}(gle.com/)[a,i] - postfix tuple
+
+On the next iteration of the loop we find that 'www.gogo.com' matches the
+prefix, but not any node label, so we add a node [g] to tuple {2}:
+
+ NIL (no child exists yet)
+ |
+ {2}(www.go)[o, g]
+ |
+ {3}(gle.com/)[a,i]
+
+Now we can descend through the [g] node, which will cause us to update
+the target string to just 'o.com'. Finally, we'll insert a leaf tuple
+bearing that string:
+
+ {4}<o.com>
+ |
+ {2}(www.go)[o, g]
+ |
+ {3}(gle.com/)[a,i]
+
+As we can see, the original tuple's node array moves to the postfix tuple
+without any changes. Note also that the SP-GiST core assumes that the prefix
+tuple is no larger than the old inner tuple, which allows us to store the
+prefix tuple directly in place of the old inner tuple. The SP-GiST core will
+try to store the postfix tuple on the same page if possible, but will use
+another page if there is not enough free space (see notes 5 and 6).
+Currently, quad and k-d trees don't use this feature, because they have no
+concept of a prefix being "inconsistent" with a new value; they grow their
+depth only through PickSplitFn() calls.
+
+(5) If the pointer from the parent's node is a NIL pointer, the algorithm
+chooses a leaf page to store the new tuple on. First, it tries to use the
+last-used leaf page with the largest free space (which we track in each
+backend), to better utilize disk space. If that page doesn't have enough
+room, the algorithm allocates a new page.
+
+(6) Management of inner pages is very similar to management of leaf pages,
+described in (5).
+
+(7) Actually, the current implementation can move the whole list of leaf
+tuples, plus the new tuple, to another page, if the list is short enough.
+This improves space utilization, but doesn't change the basic algorithm.
+
+
+CONCURRENCY
+
+While descending the tree, the insertion algorithm holds exclusive locks on
+two tree levels at a time, i.e., both the parent and child pages (but parent
+and child pages can be the same; see the notes above). There is a possibility of
+deadlock between two insertions if there are cross-referenced pages in
+different branches. That is, if inner tuple on page M has a child on page N
+while an inner tuple from another branch is on page N and has a child on
+page M, then two insertions descending the two branches could deadlock,
+since they will each hold their parent page's lock while trying to get the
+child page's lock.
+
+Currently, we deal with this by conditionally locking buffers as we descend
+the tree. If we fail to get lock on a buffer, we release both buffers and
+restart the insertion process. This is potentially inefficient, but the
+locking costs of a more deterministic approach seem very high.
+
+To reduce the number of cases where that happens, we introduce the concept of
+"triple parity" of pages: if an inner tuple is on a page with BlockNumber N,
+then its child tuples should be placed on the same page, or else on a page
+with BlockNumber M where (N+1) mod 3 == M mod 3. This rule ensures that tuples
+on page M will have no children on page N, since (M+1) mod 3 != N mod 3.
+That makes it unlikely that two insertion processes will conflict against
+each other while descending the tree. It's not perfect though: in the first
+place, we could still get a deadlock among three or more insertion processes,
+and in the second place, it's impractical to preserve this invariant in every
+case when we expand or split an inner tuple. So we still have to allow for
+deadlocks.
+
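+The parity rule itself is a one-liner. Here is a standalone sketch (the
+function name is made up for illustration; the real code applies the rule
+through the GBUF_* flags passed to SpGistGetBuffer() in spgutils.c):
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    typedef uint32_t BlockNumber;
+
+    /*
+     * May a child of the inner tuple on page "parent" be placed on page
+     * "child" under the triple-parity rule?
+     */
+    static bool
+    triple_parity_ok(BlockNumber parent, BlockNumber child)
+    {
+        return parent == child || (parent + 1) % 3 == child % 3;
+    }
+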
+Insertion may also need to take locks on an additional inner and/or leaf page
+to add tuples of the right type(s), when there's not enough room on the pages
+it descended through. However, we don't care exactly which such page we add
+to, so deadlocks can be avoided by conditionally locking the additional
+buffers: if we fail to get lock on an additional page, just try another one.
+
+The search traversal algorithm is rather traditional. At each non-leaf
+level, it share-locks the page, identifies which node(s) in the current
+inner tuple need to be visited, and puts those addresses on a stack of pages
+to examine later. It then releases the lock on the current buffer before
+visiting the next stack item. So only one page is locked at a time, and no
+deadlock is possible. In exchange, we have to worry about race conditions:
+by the time
+we arrive at a pointed-to page, a concurrent insertion could have replaced
+the target inner tuple (or leaf tuple chain) with data placed elsewhere.
+To handle that, whenever the insertion algorithm changes a nonempty downlink
+in an inner tuple, it places a "redirect tuple" in place of the lower-level
+inner tuple or leaf-tuple chain head that the link formerly led to. Scans
+(though not insertions) must be prepared to honor such redirects. Only a
+scan that had already visited the parent level could possibly reach such a
+redirect tuple, so we can remove redirects once all active transactions have
+been flushed out of the system.
+
+
+DEAD TUPLES
+
+Tuples on leaf pages can be in one of four states:
+
+SPGIST_LIVE: normal, live pointer to a heap tuple.
+
+SPGIST_REDIRECT: placeholder that contains a link to another place in the
+index. When a chain of leaf tuples has to be moved to another page, a
+redirect tuple is inserted in place of the chain's head tuple. The parent
+inner tuple's downlink is updated when this happens, but concurrent scans
+might be "in flight" from the parent page to the child page (since they
+release lock on the parent page before attempting to lock the child).
+The redirect pointer serves to tell such a scan where to go. A redirect
+pointer is only needed for as long as such concurrent scans could be in
+progress. Eventually, it's converted into a PLACEHOLDER dead tuple by
+VACUUM, and is then a candidate for replacement. Searches that find such
+a tuple (which should never be part of a chain) should immediately proceed
+to the other place, forgetting about the redirect tuple. Insertions that
+reach such a tuple should raise an error, since a valid downlink should never
+point to such a tuple.
+
+SPGIST_DEAD: tuple is dead, but it cannot be removed or moved to a
+different offset on the page because there is a link leading to it from
+some inner tuple elsewhere in the index. (Such a tuple is never part of a
+chain, since we don't need one unless there is nothing live left in its
+chain.) Searches should ignore such entries. If an insertion action
+arrives at such a tuple, it should either replace it in-place (if there's
+room on the page to hold the desired new leaf tuple) or replace it with a
+redirection pointer to wherever it puts the new leaf tuple.
+
+SPGIST_PLACEHOLDER: tuple is dead, and there are known to be no links to
+it from elsewhere. When a live tuple is deleted or moved away, and not
+replaced by a redirect pointer, it is replaced by a placeholder to keep
+the offsets of later tuples on the same page from changing. Placeholders
+can be freely replaced when adding a new tuple to the page, and also
+VACUUM will delete any that are at the end of the range of valid tuple
+offsets. Both searches and insertions should complain if a link from
+elsewhere leads them to a placeholder tuple.
+
+When the root page is also a leaf, all its tuples should be in the LIVE
+state; there's no need for the other states, since there are no links to the
+root's tuples and no need to preserve their offset numbers.
+
+Tuples on inner pages can be in LIVE, REDIRECT, or PLACEHOLDER states.
+The REDIRECT state has the same function as on leaf pages, to send
+concurrent searches to the place where they need to go after an inner
+tuple is moved to another page. Expired REDIRECT pointers are converted
+to PLACEHOLDER status by VACUUM, and are then candidates for replacement.
+DEAD state is not currently possible, since VACUUM does not attempt to
+remove unused inner tuples.
+
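+In code form, the reaction a search should have to each state can be
+sketched as below (a self-contained sketch: scan_sees() is a made-up helper,
+while the SPGIST_* values mirror the definitions in spgist_private.h):
+
+    #include <stdio.h>
+
+    #define SPGIST_LIVE         0   /* normal live tuple */
+    #define SPGIST_REDIRECT     1   /* temporary forwarding pointer */
+    #define SPGIST_DEAD         2   /* dead, but a downlink points to it */
+    #define SPGIST_PLACEHOLDER  3   /* dead and unreferenced */
+
+    static void
+    scan_sees(int tupstate)
+    {
+        switch (tupstate)
+        {
+            case SPGIST_LIVE:
+                printf("test tuple against the query, follow its chain\n");
+                break;
+            case SPGIST_REDIRECT:
+                printf("continue at the redirection target\n");
+                break;
+            case SPGIST_DEAD:
+                printf("ignore this tuple\n");
+                break;
+            case SPGIST_PLACEHOLDER:
+                printf("complain: no valid link should lead here\n");
+                break;
+        }
+    }
+
+    int
+    main(void)
+    {
+        int s;
+
+        for (s = SPGIST_LIVE; s <= SPGIST_PLACEHOLDER; s++)
+            scan_sees(s);
+        return 0;
+    }
+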
+
+VACUUM
+
+VACUUM (or more precisely, spgbulkdelete) performs a single sequential scan
+over the entire index. On both leaf and inner pages, we can convert old
+REDIRECT tuples into PLACEHOLDER status, and then remove any PLACEHOLDERs
+that are at the end of the page (since they aren't needed to preserve the
+offsets of any live tuples). On leaf pages, we scan for tuples that need
+to be deleted because their heap TIDs match a vacuum target TID.
+
+If we find a deletable tuple that is not at the head of its chain, we
+can simply replace it with a PLACEHOLDER, updating the chain links to
+remove it from the chain. If it is at the head of its chain, but there's
+at least one live tuple remaining in the chain, we move that live tuple
+to the head tuple's offset, replacing it with a PLACEHOLDER to preserve
+the offsets of other tuples. This keeps the parent inner tuple's downlink
+valid. If we find ourselves deleting all live tuples in a chain, we
+replace the head tuple with a DEAD tuple and the rest with PLACEHOLDERS.
+The parent inner tuple's downlink thus points to the DEAD tuple, and the
+rules explained in the previous section keep everything working.
+
+VACUUM doesn't know a priori which tuples are the heads of their chains, but
+it can easily figure that out by constructing a predecessor array that's
+the reverse map of the nextOffset links (i.e., when we see that tuple x links
+to tuple y, we set predecessor[y] = x). Then the head tuples are the ones
+with no predecessor.
+
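+A self-contained sketch of the predecessor-array trick, on toy data (the
+real code lives in spgvacuum.c; here 0 plays the role of
+InvalidOffsetNumber as the end-of-chain marker):
+
+    #include <stdio.h>
+
+    #define MAX_OFFSET 6
+
+    int
+    main(void)
+    {
+        /* next[x] = nextOffset link of the tuple at offset x (1-based) */
+        int     next[MAX_OFFSET + 1] = {0, 3, 0, 5, 0, 0, 2};
+        int     predecessor[MAX_OFFSET + 1] = {0};
+        int     x, y;
+
+        /* Build the reverse map of the chain links. */
+        for (x = 1; x <= MAX_OFFSET; x++)
+            if (next[x] != 0)
+                predecessor[next[x]] = x;
+
+        /* Tuples with no predecessor are the heads of their chains. */
+        for (y = 1; y <= MAX_OFFSET; y++)
+            if (predecessor[y] == 0)
+                printf("offset %d heads a chain\n", y);     /* 1, 4, 6 */
+        return 0;
+    }
+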
+Because insertions can occur while VACUUM runs, a pure sequential scan
+could miss deleting some target leaf tuples, because they could get moved
+from a not-yet-visited leaf page to an already-visited leaf page as a
+consequence of a PickSplit or MoveLeafs operation. Failing to delete any
+target TID is not acceptable, so we have to extend the algorithm to cope
+with such cases. We recognize that such a move might have occurred when
+we see a leaf-page REDIRECT tuple whose XID indicates it might have been
+created after the VACUUM scan started. We add the redirection target TID
+to a "pending list" of places we need to recheck. Between pages of the
+main sequential scan, we empty the pending list by visiting each listed
+TID. If it points to an inner tuple (from a PickSplit), add each downlink
+TID to the pending list. If it points to a leaf page, vacuum that page.
+(We could just vacuum the single pointed-to chain, but vacuuming the
+whole page simplifies the code and reduces the odds of VACUUM having to
+modify the same page multiple times.) To ensure that pending-list
+processing can never get into an endless loop, even in the face of
+concurrent index changes, we don't remove list entries immediately but
+only after we've completed all pending-list processing; instead we just
+mark items as done after processing them. Adding a TID that's already in
+the list is a no-op, whether or not that item is marked done yet.
+
+spgbulkdelete also updates the index's free space map.
+
+Currently, spgvacuumcleanup has nothing to do if spgbulkdelete was
+performed; otherwise, it does an spgbulkdelete scan with an empty target
+list, so as to clean up redirections and placeholders, update the free
+space map, and gather statistics.
+
+
+LAST USED PAGE MANAGEMENT
+
+The list of last-used pages contains four pages: a leaf page and three
+inner pages, one from each "triple parity" group. (Actually, there's one
+such list for the main tree and a separate one for the nulls tree.) This
+list is stored between calls on the index meta page, but updates are never
+WAL-logged, so as to decrease WAL traffic. Incorrect data on the meta page
+isn't critical, because we can simply allocate a new page if needed.
+
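+A conceptual sketch of such a cache (purely illustrative field and struct
+names; the real structure is SpGistLUPCache in spgist_private.h, consulted
+by SpGistGetBuffer() and refreshed by SpGistSetLastUsedPage()):
+
+    typedef struct LastUsedPage
+    {
+        BlockNumber blkno;      /* block number, or InvalidBlockNumber */
+        int         freeSpace;  /* our remembered free space on it */
+    } LastUsedPage;
+
+    /* one leaf slot, plus one inner slot per triple-parity group */
+    typedef struct LastUsedPages
+    {
+        LastUsedPage leafPage;
+        LastUsedPage innerPage[3];
+    } LastUsedPages;
+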
+
+AUTHORS
+
+ Teodor Sigaev <teodor@sigaev.ru>
+ Oleg Bartunov <oleg@sai.msu.su>
diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c
new file mode 100644
index 0000000..e84b5ed
--- /dev/null
+++ b/src/backend/access/spgist/spgdoinsert.c
@@ -0,0 +1,2357 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgdoinsert.c
+ * implementation of insert algorithm
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgdoinsert.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/spgist_private.h"
+#include "access/spgxlog.h"
+#include "access/xloginsert.h"
+#include "common/pg_prng.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "utils/rel.h"
+
+
+/*
+ * SPPageDesc tracks all info about a page we are inserting into. In some
+ * situations it actually identifies a tuple, or even a specific node within
+ * an inner tuple. But any of the fields can be invalid. If the buffer
+ * field is valid, it implies we hold pin and exclusive lock on that buffer.
+ * The page pointer should be valid exactly when the buffer is.
+ */
+typedef struct SPPageDesc
+{
+ BlockNumber blkno; /* block number, or InvalidBlockNumber */
+ Buffer buffer; /* page's buffer number, or InvalidBuffer */
+ Page page; /* pointer to page buffer, or NULL */
+ OffsetNumber offnum; /* offset of tuple, or InvalidOffsetNumber */
+ int node; /* node number within inner tuple, or -1 */
+} SPPageDesc;
+
+
+/*
+ * Set the item pointer in the nodeN'th entry in inner tuple tup. This
+ * is used to update the parent inner tuple's downlink after a move or
+ * split operation.
+ */
+void
+spgUpdateNodeLink(SpGistInnerTuple tup, int nodeN,
+ BlockNumber blkno, OffsetNumber offset)
+{
+ int i;
+ SpGistNodeTuple node;
+
+ SGITITERATE(tup, i, node)
+ {
+ if (i == nodeN)
+ {
+ ItemPointerSet(&node->t_tid, blkno, offset);
+ return;
+ }
+ }
+
+ elog(ERROR, "failed to find requested node %d in SPGiST inner tuple",
+ nodeN);
+}
+
+/*
+ * Form a new inner tuple containing one more node than the given one, with
+ * the specified label datum, inserted at offset "offset" in the node array.
+ * The new tuple's prefix is the same as the old one's.
+ *
+ * Note that the new node initially has an invalid downlink. We'll find a
+ * page to point it to later.
+ */
+static SpGistInnerTuple
+addNode(SpGistState *state, SpGistInnerTuple tuple, Datum label, int offset)
+{
+ SpGistNodeTuple node,
+ *nodes;
+ int i;
+
+ /* if offset is negative, insert at end */
+ if (offset < 0)
+ offset = tuple->nNodes;
+ else if (offset > tuple->nNodes)
+ elog(ERROR, "invalid offset for adding node to SPGiST inner tuple");
+
+ nodes = palloc(sizeof(SpGistNodeTuple) * (tuple->nNodes + 1));
+ SGITITERATE(tuple, i, node)
+ {
+ if (i < offset)
+ nodes[i] = node;
+ else
+ nodes[i + 1] = node;
+ }
+
+ nodes[offset] = spgFormNodeTuple(state, label, false);
+
+ return spgFormInnerTuple(state,
+ (tuple->prefixSize > 0),
+ SGITDATUM(tuple, state),
+ tuple->nNodes + 1,
+ nodes);
+}
+
+/* qsort comparator for sorting OffsetNumbers */
+static int
+cmpOffsetNumbers(const void *a, const void *b)
+{
+ if (*(const OffsetNumber *) a == *(const OffsetNumber *) b)
+ return 0;
+ return (*(const OffsetNumber *) a > *(const OffsetNumber *) b) ? 1 : -1;
+}
+
+/*
+ * Delete multiple tuples from an index page, preserving tuple offset numbers.
+ *
+ * The first tuple in the given list is replaced with a dead tuple of type
+ * "firststate" (REDIRECT/DEAD/PLACEHOLDER); the remaining tuples are replaced
+ * with dead tuples of type "reststate". If either firststate or reststate
+ * is REDIRECT, blkno/offnum specify where to link to.
+ *
+ * NB: this is used during WAL replay, so beware of trying to make it too
+ * smart. In particular, it shouldn't use "state" except for calling
+ * spgFormDeadTuple(). This is also used in a critical section, so no
+ * pallocs either!
+ */
+void
+spgPageIndexMultiDelete(SpGistState *state, Page page,
+ OffsetNumber *itemnos, int nitems,
+ int firststate, int reststate,
+ BlockNumber blkno, OffsetNumber offnum)
+{
+ OffsetNumber firstItem;
+ OffsetNumber sortednos[MaxIndexTuplesPerPage];
+ SpGistDeadTuple tuple = NULL;
+ int i;
+
+ if (nitems == 0)
+ return; /* nothing to do */
+
+ /*
+ * For efficiency we want to use PageIndexMultiDelete, which requires the
+ * targets to be listed in sorted order, so we have to sort the itemnos
+ * array. (This also greatly simplifies the math for reinserting the
+ * replacement tuples.) However, we must not scribble on the caller's
+ * array, so we have to make a copy.
+ */
+ memcpy(sortednos, itemnos, sizeof(OffsetNumber) * nitems);
+ if (nitems > 1)
+ qsort(sortednos, nitems, sizeof(OffsetNumber), cmpOffsetNumbers);
+
+ PageIndexMultiDelete(page, sortednos, nitems);
+
+ firstItem = itemnos[0];
+
+ for (i = 0; i < nitems; i++)
+ {
+ OffsetNumber itemno = sortednos[i];
+ int tupstate;
+
+ tupstate = (itemno == firstItem) ? firststate : reststate;
+ if (tuple == NULL || tuple->tupstate != tupstate)
+ tuple = spgFormDeadTuple(state, tupstate, blkno, offnum);
+
+ if (PageAddItem(page, (Item) tuple, tuple->size,
+ itemno, false, false) != itemno)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ tuple->size);
+
+ if (tupstate == SPGIST_REDIRECT)
+ SpGistPageGetOpaque(page)->nRedirection++;
+ else if (tupstate == SPGIST_PLACEHOLDER)
+ SpGistPageGetOpaque(page)->nPlaceholder++;
+ }
+}
+
+/*
+ * Update the parent inner tuple's downlink, and mark the parent buffer
+ * dirty (this must be the last change to the parent page in the current
+ * WAL action).
+ */
+static void
+saveNodeLink(Relation index, SPPageDesc *parent,
+ BlockNumber blkno, OffsetNumber offnum)
+{
+ SpGistInnerTuple innerTuple;
+
+ innerTuple = (SpGistInnerTuple) PageGetItem(parent->page,
+ PageGetItemId(parent->page, parent->offnum));
+
+ spgUpdateNodeLink(innerTuple, parent->node, blkno, offnum);
+
+ MarkBufferDirty(parent->buffer);
+}
+
+/*
+ * Add a leaf tuple to a leaf page where there is known to be room for it
+ */
+static void
+addLeafTuple(Relation index, SpGistState *state, SpGistLeafTuple leafTuple,
+ SPPageDesc *current, SPPageDesc *parent, bool isNulls, bool isNew)
+{
+ spgxlogAddLeaf xlrec;
+
+ xlrec.newPage = isNew;
+ xlrec.storesNulls = isNulls;
+
+ /* these will be filled below as needed */
+ xlrec.offnumLeaf = InvalidOffsetNumber;
+ xlrec.offnumHeadLeaf = InvalidOffsetNumber;
+ xlrec.offnumParent = InvalidOffsetNumber;
+ xlrec.nodeI = 0;
+
+ START_CRIT_SECTION();
+
+ if (current->offnum == InvalidOffsetNumber ||
+ SpGistBlockIsRoot(current->blkno))
+ {
+ /* Tuple is not part of a chain */
+ SGLT_SET_NEXTOFFSET(leafTuple, InvalidOffsetNumber);
+ current->offnum = SpGistPageAddNewItem(state, current->page,
+ (Item) leafTuple, leafTuple->size,
+ NULL, false);
+
+ xlrec.offnumLeaf = current->offnum;
+
+ /* Must update parent's downlink if any */
+ if (parent->buffer != InvalidBuffer)
+ {
+ xlrec.offnumParent = parent->offnum;
+ xlrec.nodeI = parent->node;
+
+ saveNodeLink(index, parent, current->blkno, current->offnum);
+ }
+ }
+ else
+ {
+ /*
+ * Tuple must be inserted into existing chain. We mustn't change the
+ * chain's head address, but we don't need to chase the entire chain
+ * to put the tuple at the end; we can insert it second.
+ *
+ * Also, it's possible that the "chain" consists only of a DEAD tuple,
+ * in which case we should replace the DEAD tuple in-place.
+ */
+ SpGistLeafTuple head;
+ OffsetNumber offnum;
+
+ head = (SpGistLeafTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, current->offnum));
+ if (head->tupstate == SPGIST_LIVE)
+ {
+ SGLT_SET_NEXTOFFSET(leafTuple, SGLT_GET_NEXTOFFSET(head));
+ offnum = SpGistPageAddNewItem(state, current->page,
+ (Item) leafTuple, leafTuple->size,
+ NULL, false);
+
+ /*
+ * re-get head of list because it could have been moved on page,
+ * and set new second element
+ */
+ head = (SpGistLeafTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, current->offnum));
+ SGLT_SET_NEXTOFFSET(head, offnum);
+
+ xlrec.offnumLeaf = offnum;
+ xlrec.offnumHeadLeaf = current->offnum;
+ }
+ else if (head->tupstate == SPGIST_DEAD)
+ {
+ SGLT_SET_NEXTOFFSET(leafTuple, InvalidOffsetNumber);
+ PageIndexTupleDelete(current->page, current->offnum);
+ if (PageAddItem(current->page,
+ (Item) leafTuple, leafTuple->size,
+ current->offnum, false, false) != current->offnum)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ leafTuple->size);
+
+ /* WAL replay distinguishes this case by equal offnums */
+ xlrec.offnumLeaf = current->offnum;
+ xlrec.offnumHeadLeaf = current->offnum;
+ }
+ else
+ elog(ERROR, "unexpected SPGiST tuple state: %d", head->tupstate);
+ }
+
+ MarkBufferDirty(current->buffer);
+
+ if (RelationNeedsWAL(index) && !state->isBuild)
+ {
+ XLogRecPtr recptr;
+ int flags;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+ XLogRegisterData((char *) leafTuple, leafTuple->size);
+
+ flags = REGBUF_STANDARD;
+ if (xlrec.newPage)
+ flags |= REGBUF_WILL_INIT;
+ XLogRegisterBuffer(0, current->buffer, flags);
+ if (xlrec.offnumParent != InvalidOffsetNumber)
+ XLogRegisterBuffer(1, parent->buffer, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_LEAF);
+
+ PageSetLSN(current->page, recptr);
+
+ /* update parent only if we actually changed it */
+ if (xlrec.offnumParent != InvalidOffsetNumber)
+ {
+ PageSetLSN(parent->page, recptr);
+ }
+ }
+
+ END_CRIT_SECTION();
+}
+
+/*
+ * Count the number and total size of leaf tuples in the chain starting at
+ * current->offnum. Return number into *nToSplit and total size as function
+ * result.
+ *
+ * Klugy special case when considering the root page (i.e., root is a leaf
+ * page, but we're about to split for the first time): return fake large
+ * values to force spgdoinsert() to take the doPickSplit rather than
+ * moveLeafs code path. moveLeafs is not prepared to deal with the root page.
+ */
+static int
+checkSplitConditions(Relation index, SpGistState *state,
+ SPPageDesc *current, int *nToSplit)
+{
+ int i,
+ n = 0,
+ totalSize = 0;
+
+ if (SpGistBlockIsRoot(current->blkno))
+ {
+ /* return impossible values to force split */
+ *nToSplit = BLCKSZ;
+ return BLCKSZ;
+ }
+
+ i = current->offnum;
+ while (i != InvalidOffsetNumber)
+ {
+ SpGistLeafTuple it;
+
+ Assert(i >= FirstOffsetNumber &&
+ i <= PageGetMaxOffsetNumber(current->page));
+ it = (SpGistLeafTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, i));
+ if (it->tupstate == SPGIST_LIVE)
+ {
+ n++;
+ totalSize += it->size + sizeof(ItemIdData);
+ }
+ else if (it->tupstate == SPGIST_DEAD)
+ {
+ /* We could see a DEAD tuple as first/only chain item */
+ Assert(i == current->offnum);
+ Assert(SGLT_GET_NEXTOFFSET(it) == InvalidOffsetNumber);
+ /* Don't count it in result, because it won't go to other page */
+ }
+ else
+ elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate);
+
+ i = SGLT_GET_NEXTOFFSET(it);
+ }
+
+ *nToSplit = n;
+
+ return totalSize;
+}
+
+/*
+ * current points to a leaf-tuple chain that we wanted to add newLeafTuple to,
+ * but the chain has to be moved because there's not enough room to add
+ * newLeafTuple to its page. We use this method when the chain contains
+ * very little data so a split would be inefficient. We are sure we can
+ * fit the chain plus newLeafTuple on one other page.
+ */
+static void
+moveLeafs(Relation index, SpGistState *state,
+ SPPageDesc *current, SPPageDesc *parent,
+ SpGistLeafTuple newLeafTuple, bool isNulls)
+{
+ int i,
+ nDelete,
+ nInsert,
+ size;
+ Buffer nbuf;
+ Page npage;
+ SpGistLeafTuple it;
+ OffsetNumber r = InvalidOffsetNumber,
+ startOffset = InvalidOffsetNumber;
+ bool replaceDead = false;
+ OffsetNumber *toDelete;
+ OffsetNumber *toInsert;
+ BlockNumber nblkno;
+ spgxlogMoveLeafs xlrec;
+ char *leafdata,
+ *leafptr;
+
+ /* This doesn't work on the root page */
+ Assert(parent->buffer != InvalidBuffer);
+ Assert(parent->buffer != current->buffer);
+
+ /* Locate the tuples to be moved, and count up the space needed */
+ i = PageGetMaxOffsetNumber(current->page);
+ toDelete = (OffsetNumber *) palloc(sizeof(OffsetNumber) * i);
+ toInsert = (OffsetNumber *) palloc(sizeof(OffsetNumber) * (i + 1));
+
+ size = newLeafTuple->size + sizeof(ItemIdData);
+
+ nDelete = 0;
+ i = current->offnum;
+ while (i != InvalidOffsetNumber)
+ {
+ SpGistLeafTuple it;
+
+ Assert(i >= FirstOffsetNumber &&
+ i <= PageGetMaxOffsetNumber(current->page));
+ it = (SpGistLeafTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, i));
+
+ if (it->tupstate == SPGIST_LIVE)
+ {
+ toDelete[nDelete] = i;
+ size += it->size + sizeof(ItemIdData);
+ nDelete++;
+ }
+ else if (it->tupstate == SPGIST_DEAD)
+ {
+ /* We could see a DEAD tuple as first/only chain item */
+ Assert(i == current->offnum);
+ Assert(SGLT_GET_NEXTOFFSET(it) == InvalidOffsetNumber);
+ /* We don't want to move it, so don't count it in size */
+ toDelete[nDelete] = i;
+ nDelete++;
+ replaceDead = true;
+ }
+ else
+ elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate);
+
+ i = SGLT_GET_NEXTOFFSET(it);
+ }
+
+ /* Find a leaf page that will hold them */
+ nbuf = SpGistGetBuffer(index, GBUF_LEAF | (isNulls ? GBUF_NULLS : 0),
+ size, &xlrec.newPage);
+ npage = BufferGetPage(nbuf);
+ nblkno = BufferGetBlockNumber(nbuf);
+ Assert(nblkno != current->blkno);
+
+ leafdata = leafptr = palloc(size);
+
+ START_CRIT_SECTION();
+
+ /* copy all the old tuples to new page, unless they're dead */
+ nInsert = 0;
+ if (!replaceDead)
+ {
+ for (i = 0; i < nDelete; i++)
+ {
+ it = (SpGistLeafTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, toDelete[i]));
+ Assert(it->tupstate == SPGIST_LIVE);
+
+ /*
+ * Update chain link (notice the chain order gets reversed, but we
+ * don't care). We're modifying the tuple on the source page
+ * here, but it's okay since we're about to delete it.
+ */
+ SGLT_SET_NEXTOFFSET(it, r);
+
+ r = SpGistPageAddNewItem(state, npage, (Item) it, it->size,
+ &startOffset, false);
+
+ toInsert[nInsert] = r;
+ nInsert++;
+
+ /* save modified tuple into leafdata as well */
+ memcpy(leafptr, it, it->size);
+ leafptr += it->size;
+ }
+ }
+
+ /* add the new tuple as well */
+ SGLT_SET_NEXTOFFSET(newLeafTuple, r);
+ r = SpGistPageAddNewItem(state, npage,
+ (Item) newLeafTuple, newLeafTuple->size,
+ &startOffset, false);
+ toInsert[nInsert] = r;
+ nInsert++;
+ memcpy(leafptr, newLeafTuple, newLeafTuple->size);
+ leafptr += newLeafTuple->size;
+
+ /*
+ * Now delete the old tuples, leaving a redirection pointer behind for the
+ * first one, unless we're doing an index build, in which case there can't
+ * be any concurrent scan and we need not provide a redirect.
+ */
+ spgPageIndexMultiDelete(state, current->page, toDelete, nDelete,
+ state->isBuild ? SPGIST_PLACEHOLDER : SPGIST_REDIRECT,
+ SPGIST_PLACEHOLDER,
+ nblkno, r);
+
+ /* Update parent's downlink and mark parent page dirty */
+ saveNodeLink(index, parent, nblkno, r);
+
+ /* Mark the leaf pages too */
+ MarkBufferDirty(current->buffer);
+ MarkBufferDirty(nbuf);
+
+ if (RelationNeedsWAL(index) && !state->isBuild)
+ {
+ XLogRecPtr recptr;
+
+ /* prepare WAL info */
+ STORE_STATE(state, xlrec.stateSrc);
+
+ xlrec.nMoves = nDelete;
+ xlrec.replaceDead = replaceDead;
+ xlrec.storesNulls = isNulls;
+
+ xlrec.offnumParent = parent->offnum;
+ xlrec.nodeI = parent->node;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfSpgxlogMoveLeafs);
+ XLogRegisterData((char *) toDelete,
+ sizeof(OffsetNumber) * nDelete);
+ XLogRegisterData((char *) toInsert,
+ sizeof(OffsetNumber) * nInsert);
+ XLogRegisterData((char *) leafdata, leafptr - leafdata);
+
+ XLogRegisterBuffer(0, current->buffer, REGBUF_STANDARD);
+ XLogRegisterBuffer(1, nbuf, REGBUF_STANDARD | (xlrec.newPage ? REGBUF_WILL_INIT : 0));
+ XLogRegisterBuffer(2, parent->buffer, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_MOVE_LEAFS);
+
+ PageSetLSN(current->page, recptr);
+ PageSetLSN(npage, recptr);
+ PageSetLSN(parent->page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* Update local free-space cache and release new buffer */
+ SpGistSetLastUsedPage(index, nbuf);
+ UnlockReleaseBuffer(nbuf);
+}
+
+/*
+ * Update previously-created redirection tuple with appropriate destination
+ *
+ * We use this when it's not convenient to know the destination first.
+ * The tuple should have been made with the "impossible" destination of
+ * the metapage.
+ */
+static void
+setRedirectionTuple(SPPageDesc *current, OffsetNumber position,
+ BlockNumber blkno, OffsetNumber offnum)
+{
+ SpGistDeadTuple dt;
+
+ dt = (SpGistDeadTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, position));
+ Assert(dt->tupstate == SPGIST_REDIRECT);
+ Assert(ItemPointerGetBlockNumber(&dt->pointer) == SPGIST_METAPAGE_BLKNO);
+ ItemPointerSet(&dt->pointer, blkno, offnum);
+}
+
+/*
+ * Test to see if the user-defined picksplit function failed to do its job,
+ * ie, it put all the leaf tuples into the same node.
+ * If so, randomly divide the tuples into several nodes (all with the same
+ * label) and return true to select allTheSame mode for this inner tuple.
+ *
+ * (This code is also used to forcibly select allTheSame mode for nulls.)
+ *
+ * If we know that the leaf tuples wouldn't all fit on one page, then we
+ * exclude the last tuple (which is the incoming new tuple that forced a split)
+ * from the check to see if more than one node is used. The reason for this
+ * is that if the existing tuples are put into only one chain, then even if
+ * we move them all to an empty page, there would still not be room for the
+ * new tuple, so we'd get into an infinite loop of picksplit attempts.
+ * Forcing allTheSame mode dodges this problem by ensuring the old tuples will
+ * be split across pages. (Exercise for the reader: figure out why this
+ * fixes the problem even when there is only one old tuple.)
+ */
+static bool
+checkAllTheSame(spgPickSplitIn *in, spgPickSplitOut *out, bool tooBig,
+ bool *includeNew)
+{
+ int theNode;
+ int limit;
+ int i;
+
+ /* For the moment, assume we can include the new leaf tuple */
+ *includeNew = true;
+
+ /* If there's only the new leaf tuple, don't select allTheSame mode */
+ if (in->nTuples <= 1)
+ return false;
+
+ /* If tuple set doesn't fit on one page, ignore the new tuple in test */
+ limit = tooBig ? in->nTuples - 1 : in->nTuples;
+
+ /* Check to see if more than one node is populated */
+ theNode = out->mapTuplesToNodes[0];
+ for (i = 1; i < limit; i++)
+ {
+ if (out->mapTuplesToNodes[i] != theNode)
+ return false;
+ }
+
+ /* Nope, so override the picksplit function's decisions */
+
+ /* If the new tuple is in its own node, it can't be included in split */
+ if (tooBig && out->mapTuplesToNodes[in->nTuples - 1] != theNode)
+ *includeNew = false;
+
+ out->nNodes = 8; /* arbitrary number of child nodes */
+
+ /* Random assignment of tuples to nodes (note we include new tuple) */
+ for (i = 0; i < in->nTuples; i++)
+ out->mapTuplesToNodes[i] = i % out->nNodes;
+
+ /* The opclass may not use node labels, but if it does, duplicate 'em */
+ if (out->nodeLabels)
+ {
+ Datum theLabel = out->nodeLabels[theNode];
+
+ out->nodeLabels = (Datum *) palloc(sizeof(Datum) * out->nNodes);
+ for (i = 0; i < out->nNodes; i++)
+ out->nodeLabels[i] = theLabel;
+ }
+
+ /* We don't touch the prefix or the leaf tuple datum assignments */
+
+ return true;
+}
+
+/*
+ * current points to a leaf-tuple chain that we wanted to add newLeafTuple to,
+ * but the chain has to be split because there's not enough room to add
+ * newLeafTuple to its page.
+ *
+ * This function splits the leaf tuple set according to picksplit's rules,
+ * creating one or more new chains that are spread across the current page
+ * and an additional leaf page (we assume that two leaf pages will be
+ * sufficient). A new inner tuple is created, and the parent downlink
+ * pointer is updated to point to that inner tuple instead of the leaf chain.
+ *
+ * On exit, current contains the address of the new inner tuple.
+ *
+ * Returns true if we successfully inserted newLeafTuple during this function,
+ * false if caller still has to do it (meaning another picksplit operation is
+ * probably needed). Failure could occur if the picksplit result is fairly
+ * unbalanced, or if newLeafTuple is just plain too big to fit on a page.
+ * Because we force the picksplit result to be at least two chains, each
+ * cycle will get rid of at least one leaf tuple from the chain, so the loop
+ * will eventually terminate if lack of balance is the issue. If the tuple
+ * is too big, we assume that repeated picksplit operations will eventually
+ * make it small enough by repeated prefix-stripping. A broken opclass could
+ * make this an infinite loop, though, so spgdoinsert() checks that the
+ * leaf datums get smaller each time.
+ */
+static bool
+doPickSplit(Relation index, SpGistState *state,
+ SPPageDesc *current, SPPageDesc *parent,
+ SpGistLeafTuple newLeafTuple,
+ int level, bool isNulls, bool isNew)
+{
+ bool insertedNew = false;
+ spgPickSplitIn in;
+ spgPickSplitOut out;
+ FmgrInfo *procinfo;
+ bool includeNew;
+ int i,
+ max,
+ n;
+ SpGistInnerTuple innerTuple;
+ SpGistNodeTuple node,
+ *nodes;
+ Buffer newInnerBuffer,
+ newLeafBuffer;
+ uint8 *leafPageSelect;
+ int *leafSizes;
+ OffsetNumber *toDelete;
+ OffsetNumber *toInsert;
+ OffsetNumber redirectTuplePos = InvalidOffsetNumber;
+ OffsetNumber startOffsets[2];
+ SpGistLeafTuple *oldLeafs;
+ SpGistLeafTuple *newLeafs;
+ Datum leafDatums[INDEX_MAX_KEYS];
+ bool leafIsnulls[INDEX_MAX_KEYS];
+ int spaceToDelete;
+ int currentFreeSpace;
+ int totalLeafSizes;
+ bool allTheSame;
+ spgxlogPickSplit xlrec;
+ char *leafdata,
+ *leafptr;
+ SPPageDesc saveCurrent;
+ int nToDelete,
+ nToInsert,
+ maxToInclude;
+
+ in.level = level;
+
+ /*
+ * Allocate per-leaf-tuple work arrays with max possible size
+ */
+ max = PageGetMaxOffsetNumber(current->page);
+ n = max + 1;
+ in.datums = (Datum *) palloc(sizeof(Datum) * n);
+ toDelete = (OffsetNumber *) palloc(sizeof(OffsetNumber) * n);
+ toInsert = (OffsetNumber *) palloc(sizeof(OffsetNumber) * n);
+ oldLeafs = (SpGistLeafTuple *) palloc(sizeof(SpGistLeafTuple) * n);
+ newLeafs = (SpGistLeafTuple *) palloc(sizeof(SpGistLeafTuple) * n);
+ leafPageSelect = (uint8 *) palloc(sizeof(uint8) * n);
+
+ STORE_STATE(state, xlrec.stateSrc);
+
+ /*
+ * Form list of leaf tuples which will be distributed as split result;
+ * also, count up the amount of space that will be freed from current.
+ * (Note that in the non-root case, we won't actually delete the old
+ * tuples, only replace them with redirects or placeholders.)
+ */
+ nToInsert = 0;
+ nToDelete = 0;
+ spaceToDelete = 0;
+ if (SpGistBlockIsRoot(current->blkno))
+ {
+ /*
+ * We are splitting the root (which up to now is also a leaf page).
+ * Its tuples are not linked, so scan sequentially to get them all. We
+ * ignore the original value of current->offnum.
+ */
+ for (i = FirstOffsetNumber; i <= max; i++)
+ {
+ SpGistLeafTuple it;
+
+ it = (SpGistLeafTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, i));
+ if (it->tupstate == SPGIST_LIVE)
+ {
+ in.datums[nToInsert] =
+ isNulls ? (Datum) 0 : SGLTDATUM(it, state);
+ oldLeafs[nToInsert] = it;
+ nToInsert++;
+ toDelete[nToDelete] = i;
+ nToDelete++;
+ /* we will delete the tuple altogether, so count full space */
+ spaceToDelete += it->size + sizeof(ItemIdData);
+ }
+ else /* tuples on root should be live */
+ elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate);
+ }
+ }
+ else
+ {
+ /* Normal case, just collect the leaf tuples in the chain */
+ i = current->offnum;
+ while (i != InvalidOffsetNumber)
+ {
+ SpGistLeafTuple it;
+
+ Assert(i >= FirstOffsetNumber && i <= max);
+ it = (SpGistLeafTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, i));
+ if (it->tupstate == SPGIST_LIVE)
+ {
+ in.datums[nToInsert] =
+ isNulls ? (Datum) 0 : SGLTDATUM(it, state);
+ oldLeafs[nToInsert] = it;
+ nToInsert++;
+ toDelete[nToDelete] = i;
+ nToDelete++;
+ /* we will not delete the tuple, only replace with dead */
+ Assert(it->size >= SGDTSIZE);
+ spaceToDelete += it->size - SGDTSIZE;
+ }
+ else if (it->tupstate == SPGIST_DEAD)
+ {
+ /* We could see a DEAD tuple as first/only chain item */
+ Assert(i == current->offnum);
+ Assert(SGLT_GET_NEXTOFFSET(it) == InvalidOffsetNumber);
+ toDelete[nToDelete] = i;
+ nToDelete++;
+ /* replacing it with redirect will save no space */
+ }
+ else
+ elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate);
+
+ i = SGLT_GET_NEXTOFFSET(it);
+ }
+ }
+ in.nTuples = nToInsert;
+
+ /*
+ * We might not actually insert the new tuple, because another picksplit
+ * may be necessary if its value is too large; but we will try to allocate
+ * enough space to include it, and in any case it has to be included in the
+ * input to the picksplit function. So don't increment nToInsert yet.
+ */
+ in.datums[in.nTuples] =
+ isNulls ? (Datum) 0 : SGLTDATUM(newLeafTuple, state);
+ oldLeafs[in.nTuples] = newLeafTuple;
+ in.nTuples++;
+
+ memset(&out, 0, sizeof(out));
+
+ if (!isNulls)
+ {
+ /*
+ * Perform split using user-defined method.
+ */
+ procinfo = index_getprocinfo(index, 1, SPGIST_PICKSPLIT_PROC);
+ FunctionCall2Coll(procinfo,
+ index->rd_indcollation[0],
+ PointerGetDatum(&in),
+ PointerGetDatum(&out));
+
+ /*
+ * Form new leaf tuples and count up the total space needed.
+ */
+ totalLeafSizes = 0;
+ for (i = 0; i < in.nTuples; i++)
+ {
+ if (state->leafTupDesc->natts > 1)
+ spgDeformLeafTuple(oldLeafs[i],
+ state->leafTupDesc,
+ leafDatums,
+ leafIsnulls,
+ isNulls);
+
+ leafDatums[spgKeyColumn] = out.leafTupleDatums[i];
+ leafIsnulls[spgKeyColumn] = false;
+
+ newLeafs[i] = spgFormLeafTuple(state, &oldLeafs[i]->heapPtr,
+ leafDatums,
+ leafIsnulls);
+ totalLeafSizes += newLeafs[i]->size + sizeof(ItemIdData);
+ }
+ }
+ else
+ {
+ /*
+ * Perform dummy split that puts all tuples into one node.
+ * checkAllTheSame will override this and force allTheSame mode.
+ */
+ out.hasPrefix = false;
+ out.nNodes = 1;
+ out.nodeLabels = NULL;
+ out.mapTuplesToNodes = palloc0(sizeof(int) * in.nTuples);
+
+ /*
+ * Form new leaf tuples and count up the total space needed.
+ */
+ totalLeafSizes = 0;
+ for (i = 0; i < in.nTuples; i++)
+ {
+ if (state->leafTupDesc->natts > 1)
+ spgDeformLeafTuple(oldLeafs[i],
+ state->leafTupDesc,
+ leafDatums,
+ leafIsnulls,
+ isNulls);
+
+ /*
+ * Nulls tree can contain only null key values.
+ */
+ leafDatums[spgKeyColumn] = (Datum) 0;
+ leafIsnulls[spgKeyColumn] = true;
+
+ newLeafs[i] = spgFormLeafTuple(state, &oldLeafs[i]->heapPtr,
+ leafDatums,
+ leafIsnulls);
+ totalLeafSizes += newLeafs[i]->size + sizeof(ItemIdData);
+ }
+ }
+
+ /*
+ * Check to see if the picksplit function failed to separate the values,
+ * ie, it put them all into the same child node. If so, select allTheSame
+ * mode and create a random split instead. See comments for
+ * checkAllTheSame as to why we need to know if the new leaf tuples could
+ * fit on one page.
+ */
+ allTheSame = checkAllTheSame(&in, &out,
+ totalLeafSizes > SPGIST_PAGE_CAPACITY,
+ &includeNew);
+
+ /*
+ * If checkAllTheSame decided we must exclude the new tuple, don't
+ * consider it any further.
+ */
+ if (includeNew)
+ maxToInclude = in.nTuples;
+ else
+ {
+ maxToInclude = in.nTuples - 1;
+ totalLeafSizes -= newLeafs[in.nTuples - 1]->size + sizeof(ItemIdData);
+ }
+
+ /*
+ * Allocate per-node work arrays. Since checkAllTheSame could replace
+ * out.nNodes with a value larger than the number of tuples on the input
+ * page, we can't allocate these arrays before here.
+ */
+ nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * out.nNodes);
+ leafSizes = (int *) palloc0(sizeof(int) * out.nNodes);
+
+ /*
+ * Form nodes of inner tuple and inner tuple itself
+ */
+ for (i = 0; i < out.nNodes; i++)
+ {
+ Datum label = (Datum) 0;
+ bool labelisnull = (out.nodeLabels == NULL);
+
+ if (!labelisnull)
+ label = out.nodeLabels[i];
+ nodes[i] = spgFormNodeTuple(state, label, labelisnull);
+ }
+ innerTuple = spgFormInnerTuple(state,
+ out.hasPrefix, out.prefixDatum,
+ out.nNodes, nodes);
+ innerTuple->allTheSame = allTheSame;
+
+ /*
+ * Update nodes[] array to point into the newly formed innerTuple, so that
+ * we can adjust their downlinks below.
+ */
+ SGITITERATE(innerTuple, i, node)
+ {
+ nodes[i] = node;
+ }
+
+ /*
+ * Re-scan new leaf tuples and count up the space needed under each node.
+ */
+ for (i = 0; i < maxToInclude; i++)
+ {
+ n = out.mapTuplesToNodes[i];
+ if (n < 0 || n >= out.nNodes)
+ elog(ERROR, "inconsistent result of SPGiST picksplit function");
+ leafSizes[n] += newLeafs[i]->size + sizeof(ItemIdData);
+ }
+
+ /*
+ * To perform the split, we must insert a new inner tuple, which can't go
+ * on a leaf page; and unless we are splitting the root page, we must then
+ * update the parent tuple's downlink to point to the inner tuple. If
+ * there is room, we'll put the new inner tuple on the same page as the
+ * parent tuple, otherwise we need another non-leaf buffer. But if the
+ * parent page is the root, we can't add the new inner tuple there,
+ * because the root page must have only one inner tuple.
+ */
+ xlrec.initInner = false;
+ if (parent->buffer != InvalidBuffer &&
+ !SpGistBlockIsRoot(parent->blkno) &&
+ (SpGistPageGetFreeSpace(parent->page, 1) >=
+ innerTuple->size + sizeof(ItemIdData)))
+ {
+ /* New inner tuple will fit on parent page */
+ newInnerBuffer = parent->buffer;
+ }
+ else if (parent->buffer != InvalidBuffer)
+ {
+ /* Send tuple to page with next triple parity (see README) */
+ newInnerBuffer = SpGistGetBuffer(index,
+ GBUF_INNER_PARITY(parent->blkno + 1) |
+ (isNulls ? GBUF_NULLS : 0),
+ innerTuple->size + sizeof(ItemIdData),
+ &xlrec.initInner);
+ }
+ else
+ {
+ /* Root page split ... inner tuple will go to root page */
+ newInnerBuffer = InvalidBuffer;
+ }
+
+ /*
+ * The new leaf tuples converted from the existing ones should require the
+ * same or less space, and therefore should all fit onto one page
+ * (although that's not necessarily the current page, since we can't
+ * delete the old tuples but only replace them with placeholders).
+ * However, the incoming new tuple might not fit along with them, in which
+ * case we might need another picksplit cycle to reduce it some more.
+ *
+ * If there's not room to put everything back onto the current page, then
+ * we decide on a per-node basis which tuples go to the new page. (We do
+ * it like that because leaf tuple chains can't cross pages, so we must
+ * place all leaf tuples belonging to the same parent node on the same
+ * page.)
+ *
+ * If we are splitting the root page (turning it from a leaf page into an
+ * inner page), then no leaf tuples can go back to the current page; they
+ * must all go somewhere else.
+ */
+ if (!SpGistBlockIsRoot(current->blkno))
+ currentFreeSpace = PageGetExactFreeSpace(current->page) + spaceToDelete;
+ else
+ currentFreeSpace = 0; /* prevent assigning any tuples to current */
+
+ xlrec.initDest = false;
+
+ if (totalLeafSizes <= currentFreeSpace)
+ {
+ /* All the leaf tuples will fit on current page */
+ newLeafBuffer = InvalidBuffer;
+ /* mark new leaf tuple as included in insertions, if allowed */
+ if (includeNew)
+ {
+ nToInsert++;
+ insertedNew = true;
+ }
+ for (i = 0; i < nToInsert; i++)
+ leafPageSelect[i] = 0; /* signifies current page */
+ }
+ else if (in.nTuples == 1 && totalLeafSizes > SPGIST_PAGE_CAPACITY)
+ {
+ /*
+ * We're trying to split up a long value by repeated suffixing, but
+ * it's not going to fit yet. Don't bother allocating a second leaf
+ * buffer that we won't be able to use.
+ */
+ newLeafBuffer = InvalidBuffer;
+ Assert(includeNew);
+ Assert(nToInsert == 0);
+ }
+ else
+ {
+ /* We will need another leaf page */
+ uint8 *nodePageSelect;
+ int curspace;
+ int newspace;
+
+ newLeafBuffer = SpGistGetBuffer(index,
+ GBUF_LEAF | (isNulls ? GBUF_NULLS : 0),
+ Min(totalLeafSizes,
+ SPGIST_PAGE_CAPACITY),
+ &xlrec.initDest);
+
+ /*
+ * Attempt to assign node groups to the two pages. We might fail to
+ * do so, even if totalLeafSizes is less than the available space,
+ * because we can't split a group across pages.
+ */
+ nodePageSelect = (uint8 *) palloc(sizeof(uint8) * out.nNodes);
+
+ curspace = currentFreeSpace;
+ newspace = PageGetExactFreeSpace(BufferGetPage(newLeafBuffer));
+ for (i = 0; i < out.nNodes; i++)
+ {
+ if (leafSizes[i] <= curspace)
+ {
+ nodePageSelect[i] = 0; /* signifies current page */
+ curspace -= leafSizes[i];
+ }
+ else
+ {
+ nodePageSelect[i] = 1; /* signifies new leaf page */
+ newspace -= leafSizes[i];
+ }
+ }
+ if (curspace >= 0 && newspace >= 0)
+ {
+ /* Successful assignment, so we can include the new leaf tuple */
+ if (includeNew)
+ {
+ nToInsert++;
+ insertedNew = true;
+ }
+ }
+ else if (includeNew)
+ {
+ /* We must exclude the new leaf tuple from the split */
+ int nodeOfNewTuple = out.mapTuplesToNodes[in.nTuples - 1];
+
+ leafSizes[nodeOfNewTuple] -=
+ newLeafs[in.nTuples - 1]->size + sizeof(ItemIdData);
+
+ /* Repeat the node assignment process --- should succeed now */
+ curspace = currentFreeSpace;
+ newspace = PageGetExactFreeSpace(BufferGetPage(newLeafBuffer));
+ for (i = 0; i < out.nNodes; i++)
+ {
+ if (leafSizes[i] <= curspace)
+ {
+ nodePageSelect[i] = 0; /* signifies current page */
+ curspace -= leafSizes[i];
+ }
+ else
+ {
+ nodePageSelect[i] = 1; /* signifies new leaf page */
+ newspace -= leafSizes[i];
+ }
+ }
+ if (curspace < 0 || newspace < 0)
+ elog(ERROR, "failed to divide leaf tuple groups across pages");
+ }
+ else
+ {
+ /* oops, we already excluded new tuple ... should not get here */
+ elog(ERROR, "failed to divide leaf tuple groups across pages");
+ }
+ /* Expand the per-node assignments to be shown per leaf tuple */
+ for (i = 0; i < nToInsert; i++)
+ {
+ n = out.mapTuplesToNodes[i];
+ leafPageSelect[i] = nodePageSelect[n];
+ }
+ }
+
+ /* Start preparing WAL record */
+ xlrec.nDelete = 0;
+ xlrec.initSrc = isNew;
+ xlrec.storesNulls = isNulls;
+ xlrec.isRootSplit = SpGistBlockIsRoot(current->blkno);
+
+ leafdata = leafptr = (char *) palloc(totalLeafSizes);
+
+ /* Here we begin making the changes to the target pages */
+ START_CRIT_SECTION();
+
+ /*
+ * Delete old leaf tuples from current buffer, except when we're splitting
+ * the root; in that case there's no need because we'll re-init the page
+ * below. We do this first to make room for reinserting new leaf tuples.
+ */
+ if (!SpGistBlockIsRoot(current->blkno))
+ {
+ /*
+ * Init buffer instead of deleting individual tuples, but only if
+ * there aren't any other live tuples and only during build; otherwise
+ * we need to set a redirection tuple for concurrent scans.
+ */
+ if (state->isBuild &&
+ nToDelete + SpGistPageGetOpaque(current->page)->nPlaceholder ==
+ PageGetMaxOffsetNumber(current->page))
+ {
+ SpGistInitBuffer(current->buffer,
+ SPGIST_LEAF | (isNulls ? SPGIST_NULLS : 0));
+ xlrec.initSrc = true;
+ }
+ else if (isNew)
+ {
+ /* don't expose the freshly init'd buffer as a backup block */
+ Assert(nToDelete == 0);
+ }
+ else
+ {
+ xlrec.nDelete = nToDelete;
+
+ if (!state->isBuild)
+ {
+ /*
+ * Need to create a redirect tuple (it will point to the new
+ * inner tuple), but right now the new tuple's location is not
+ * known yet. So, set the redirection pointer to an "impossible"
+ * value and remember its position so we can update the tuple
+ * later.
+ */
+ if (nToDelete > 0)
+ redirectTuplePos = toDelete[0];
+ spgPageIndexMultiDelete(state, current->page,
+ toDelete, nToDelete,
+ SPGIST_REDIRECT,
+ SPGIST_PLACEHOLDER,
+ SPGIST_METAPAGE_BLKNO,
+ FirstOffsetNumber);
+ }
+ else
+ {
+ /*
+ * During an index build there are no concurrent searches, so we
+ * don't need to create a redirection tuple.
+ */
+ spgPageIndexMultiDelete(state, current->page,
+ toDelete, nToDelete,
+ SPGIST_PLACEHOLDER,
+ SPGIST_PLACEHOLDER,
+ InvalidBlockNumber,
+ InvalidOffsetNumber);
+ }
+ }
+ }
+
+ /*
+ * Put leaf tuples on proper pages, and update downlinks in innerTuple's
+ * nodes.
+ */
+ startOffsets[0] = startOffsets[1] = InvalidOffsetNumber;
+ for (i = 0; i < nToInsert; i++)
+ {
+ SpGistLeafTuple it = newLeafs[i];
+ Buffer leafBuffer;
+ BlockNumber leafBlock;
+ OffsetNumber newoffset;
+
+ /* Which page is it going to? */
+ leafBuffer = leafPageSelect[i] ? newLeafBuffer : current->buffer;
+ leafBlock = BufferGetBlockNumber(leafBuffer);
+
+ /* Link tuple into correct chain for its node */
+ n = out.mapTuplesToNodes[i];
+
+ if (ItemPointerIsValid(&nodes[n]->t_tid))
+ {
+ Assert(ItemPointerGetBlockNumber(&nodes[n]->t_tid) == leafBlock);
+ SGLT_SET_NEXTOFFSET(it, ItemPointerGetOffsetNumber(&nodes[n]->t_tid));
+ }
+ else
+ SGLT_SET_NEXTOFFSET(it, InvalidOffsetNumber);
+
+ /* Insert it on page */
+ newoffset = SpGistPageAddNewItem(state, BufferGetPage(leafBuffer),
+ (Item) it, it->size,
+ &startOffsets[leafPageSelect[i]],
+ false);
+ toInsert[i] = newoffset;
+
+ /* ... and complete the chain linking */
+ ItemPointerSet(&nodes[n]->t_tid, leafBlock, newoffset);
+
+ /* Also copy leaf tuple into WAL data */
+ memcpy(leafptr, newLeafs[i], newLeafs[i]->size);
+ leafptr += newLeafs[i]->size;
+ }
+
+ /*
+ * We're done modifying the other leaf buffer (if any), so mark it dirty.
+ * current->buffer will be marked below, after we're entirely done
+ * modifying it.
+ */
+ if (newLeafBuffer != InvalidBuffer)
+ {
+ MarkBufferDirty(newLeafBuffer);
+ }
+
+ /* Remember current buffer, since we're about to change "current" */
+ saveCurrent = *current;
+
+ /*
+ * Store the new innerTuple
+ */
+ if (newInnerBuffer == parent->buffer && newInnerBuffer != InvalidBuffer)
+ {
+ /*
+ * new inner tuple goes to parent page
+ */
+ Assert(current->buffer != parent->buffer);
+
+ /* Repoint "current" at the new inner tuple */
+ current->blkno = parent->blkno;
+ current->buffer = parent->buffer;
+ current->page = parent->page;
+ xlrec.offnumInner = current->offnum =
+ SpGistPageAddNewItem(state, current->page,
+ (Item) innerTuple, innerTuple->size,
+ NULL, false);
+
+ /*
+ * Update parent node link and mark parent page dirty
+ */
+ xlrec.innerIsParent = true;
+ xlrec.offnumParent = parent->offnum;
+ xlrec.nodeI = parent->node;
+ saveNodeLink(index, parent, current->blkno, current->offnum);
+
+ /*
+ * Update redirection link (in old current buffer)
+ */
+ if (redirectTuplePos != InvalidOffsetNumber)
+ setRedirectionTuple(&saveCurrent, redirectTuplePos,
+ current->blkno, current->offnum);
+
+ /* Done modifying old current buffer, mark it dirty */
+ MarkBufferDirty(saveCurrent.buffer);
+ }
+ else if (parent->buffer != InvalidBuffer)
+ {
+ /*
+ * new inner tuple will be stored on a new page
+ */
+ Assert(newInnerBuffer != InvalidBuffer);
+
+ /* Repoint "current" at the new inner tuple */
+ current->buffer = newInnerBuffer;
+ current->blkno = BufferGetBlockNumber(current->buffer);
+ current->page = BufferGetPage(current->buffer);
+ xlrec.offnumInner = current->offnum =
+ SpGistPageAddNewItem(state, current->page,
+ (Item) innerTuple, innerTuple->size,
+ NULL, false);
+
+ /* Done modifying new current buffer, mark it dirty */
+ MarkBufferDirty(current->buffer);
+
+ /*
+ * Update parent node link and mark parent page dirty
+ */
+ xlrec.innerIsParent = (parent->buffer == current->buffer);
+ xlrec.offnumParent = parent->offnum;
+ xlrec.nodeI = parent->node;
+ saveNodeLink(index, parent, current->blkno, current->offnum);
+
+ /*
+ * Update redirection link (in old current buffer)
+ */
+ if (redirectTuplePos != InvalidOffsetNumber)
+ setRedirectionTuple(&saveCurrent, redirectTuplePos,
+ current->blkno, current->offnum);
+
+ /* Done modifying old current buffer, mark it dirty */
+ MarkBufferDirty(saveCurrent.buffer);
+ }
+ else
+ {
+ /*
+ * Splitting root page, which was a leaf but now becomes inner page
+ * (and so "current" continues to point at it)
+ */
+ Assert(SpGistBlockIsRoot(current->blkno));
+ Assert(redirectTuplePos == InvalidOffsetNumber);
+
+ SpGistInitBuffer(current->buffer, (isNulls ? SPGIST_NULLS : 0));
+ xlrec.initInner = true;
+ xlrec.innerIsParent = false;
+
+ xlrec.offnumInner = current->offnum =
+ PageAddItem(current->page, (Item) innerTuple, innerTuple->size,
+ InvalidOffsetNumber, false, false);
+ if (current->offnum != FirstOffsetNumber)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ innerTuple->size);
+
+ /* No parent link to update, nor redirection to do */
+ xlrec.offnumParent = InvalidOffsetNumber;
+ xlrec.nodeI = 0;
+
+ /* Done modifying new current buffer, mark it dirty */
+ MarkBufferDirty(current->buffer);
+
+ /* saveCurrent doesn't represent a different buffer */
+ saveCurrent.buffer = InvalidBuffer;
+ }
+
+ if (RelationNeedsWAL(index) && !state->isBuild)
+ {
+ XLogRecPtr recptr;
+ int flags;
+
+ XLogBeginInsert();
+
+ xlrec.nInsert = nToInsert;
+ XLogRegisterData((char *) &xlrec, SizeOfSpgxlogPickSplit);
+
+ XLogRegisterData((char *) toDelete,
+ sizeof(OffsetNumber) * xlrec.nDelete);
+ XLogRegisterData((char *) toInsert,
+ sizeof(OffsetNumber) * xlrec.nInsert);
+ XLogRegisterData((char *) leafPageSelect,
+ sizeof(uint8) * xlrec.nInsert);
+ XLogRegisterData((char *) innerTuple, innerTuple->size);
+ XLogRegisterData(leafdata, leafptr - leafdata);
+
+ /* Old leaf page */
+ if (BufferIsValid(saveCurrent.buffer))
+ {
+ flags = REGBUF_STANDARD;
+ if (xlrec.initSrc)
+ flags |= REGBUF_WILL_INIT;
+ XLogRegisterBuffer(0, saveCurrent.buffer, flags);
+ }
+
+ /* New leaf page */
+ if (BufferIsValid(newLeafBuffer))
+ {
+ flags = REGBUF_STANDARD;
+ if (xlrec.initDest)
+ flags |= REGBUF_WILL_INIT;
+ XLogRegisterBuffer(1, newLeafBuffer, flags);
+ }
+
+ /* Inner page */
+ flags = REGBUF_STANDARD;
+ if (xlrec.initInner)
+ flags |= REGBUF_WILL_INIT;
+ XLogRegisterBuffer(2, current->buffer, flags);
+
+ /* Parent page, if different from inner page */
+ if (parent->buffer != InvalidBuffer)
+ {
+ if (parent->buffer != current->buffer)
+ XLogRegisterBuffer(3, parent->buffer, REGBUF_STANDARD);
+ else
+ Assert(xlrec.innerIsParent);
+ }
+
+ /* Issue the WAL record */
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_PICKSPLIT);
+
+ /* Update page LSNs on all affected pages */
+ if (newLeafBuffer != InvalidBuffer)
+ {
+ Page page = BufferGetPage(newLeafBuffer);
+
+ PageSetLSN(page, recptr);
+ }
+
+ if (saveCurrent.buffer != InvalidBuffer)
+ {
+ Page page = BufferGetPage(saveCurrent.buffer);
+
+ PageSetLSN(page, recptr);
+ }
+
+ PageSetLSN(current->page, recptr);
+
+ if (parent->buffer != InvalidBuffer)
+ {
+ PageSetLSN(parent->page, recptr);
+ }
+ }
+
+ END_CRIT_SECTION();
+
+ /* Update local free-space cache and unlock buffers */
+ if (newLeafBuffer != InvalidBuffer)
+ {
+ SpGistSetLastUsedPage(index, newLeafBuffer);
+ UnlockReleaseBuffer(newLeafBuffer);
+ }
+ if (saveCurrent.buffer != InvalidBuffer)
+ {
+ SpGistSetLastUsedPage(index, saveCurrent.buffer);
+ UnlockReleaseBuffer(saveCurrent.buffer);
+ }
+
+ return insertedNew;
+}
+
+/*
+ * spgMatchNode action: descend to N'th child node of current inner tuple
+ */
+static void
+spgMatchNodeAction(Relation index, SpGistState *state,
+ SpGistInnerTuple innerTuple,
+ SPPageDesc *current, SPPageDesc *parent, int nodeN)
+{
+ int i;
+ SpGistNodeTuple node;
+
+ /* Release previous parent buffer if any */
+ if (parent->buffer != InvalidBuffer &&
+ parent->buffer != current->buffer)
+ {
+ SpGistSetLastUsedPage(index, parent->buffer);
+ UnlockReleaseBuffer(parent->buffer);
+ }
+
+ /* Repoint parent to specified node of current inner tuple */
+ parent->blkno = current->blkno;
+ parent->buffer = current->buffer;
+ parent->page = current->page;
+ parent->offnum = current->offnum;
+ parent->node = nodeN;
+
+ /* Locate that node */
+ SGITITERATE(innerTuple, i, node)
+ {
+ if (i == nodeN)
+ break;
+ }
+
+ if (i != nodeN)
+ elog(ERROR, "failed to find requested node %d in SPGiST inner tuple",
+ nodeN);
+
+ /* Point current to the downlink location, if any */
+ if (ItemPointerIsValid(&node->t_tid))
+ {
+ current->blkno = ItemPointerGetBlockNumber(&node->t_tid);
+ current->offnum = ItemPointerGetOffsetNumber(&node->t_tid);
+ }
+ else
+ {
+ /* Downlink is empty, so we'll need to find a new page */
+ current->blkno = InvalidBlockNumber;
+ current->offnum = InvalidOffsetNumber;
+ }
+
+ current->buffer = InvalidBuffer;
+ current->page = NULL;
+}
+
+/*
+ * spgAddNode action: add a node to the inner tuple at current
+ */
+static void
+spgAddNodeAction(Relation index, SpGistState *state,
+ SpGistInnerTuple innerTuple,
+ SPPageDesc *current, SPPageDesc *parent,
+ int nodeN, Datum nodeLabel)
+{
+ SpGistInnerTuple newInnerTuple;
+ spgxlogAddNode xlrec;
+
+ /* Should not be applied to nulls */
+ Assert(!SpGistPageStoresNulls(current->page));
+
+ /* Construct new inner tuple with additional node */
+ newInnerTuple = addNode(state, innerTuple, nodeLabel, nodeN);
+
+ /* Prepare WAL record */
+ STORE_STATE(state, xlrec.stateSrc);
+ xlrec.offnum = current->offnum;
+
+ /* we don't fill these unless we need to change the parent downlink */
+ xlrec.parentBlk = -1;
+ xlrec.offnumParent = InvalidOffsetNumber;
+ xlrec.nodeI = 0;
+
+ /* we don't fill these unless tuple has to be moved */
+ xlrec.offnumNew = InvalidOffsetNumber;
+ xlrec.newPage = false;
+
+ if (PageGetExactFreeSpace(current->page) >=
+ newInnerTuple->size - innerTuple->size)
+ {
+ /*
+ * We can replace the inner tuple by new version in-place
+ */
+ START_CRIT_SECTION();
+
+ PageIndexTupleDelete(current->page, current->offnum);
+ if (PageAddItem(current->page,
+ (Item) newInnerTuple, newInnerTuple->size,
+ current->offnum, false, false) != current->offnum)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ newInnerTuple->size);
+
+ MarkBufferDirty(current->buffer);
+
+ if (RelationNeedsWAL(index) && !state->isBuild)
+ {
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+ XLogRegisterData((char *) newInnerTuple, newInnerTuple->size);
+
+ XLogRegisterBuffer(0, current->buffer, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_NODE);
+
+ PageSetLSN(current->page, recptr);
+ }
+
+ END_CRIT_SECTION();
+ }
+ else
+ {
+ /*
+ * move inner tuple to another page, and update parent
+ */
+ SpGistDeadTuple dt;
+ SPPageDesc saveCurrent;
+
+ /*
+ * It should not be possible to get here for the root page, since we
+ * allow only one inner tuple on the root page, and spgFormInnerTuple
+ * always checks that inner tuples don't exceed the size of a page.
+ */
+ if (SpGistBlockIsRoot(current->blkno))
+ elog(ERROR, "cannot enlarge root tuple any more");
+ Assert(parent->buffer != InvalidBuffer);
+
+ saveCurrent = *current;
+
+ xlrec.offnumParent = parent->offnum;
+ xlrec.nodeI = parent->node;
+
+ /*
+ * obtain a new buffer with the same parity as current, since it will
+ * be a child of the same parent tuple
+ */
+ current->buffer = SpGistGetBuffer(index,
+ GBUF_INNER_PARITY(current->blkno),
+ newInnerTuple->size + sizeof(ItemIdData),
+ &xlrec.newPage);
+ current->blkno = BufferGetBlockNumber(current->buffer);
+ current->page = BufferGetPage(current->buffer);
+
+ /*
+ * Let's just make really sure the new current page isn't the same as
+ * the old one. Right now that's impossible, but if SpGistGetBuffer ever
+ * got smart enough to delete placeholder tuples before checking space,
+ * maybe it wouldn't be impossible. That case would appear to work,
+ * except that WAL replay would be subtly wrong, so a mere assert isn't
+ * enough here.
+ */
+ if (current->blkno == saveCurrent.blkno)
+ elog(ERROR, "SPGiST new buffer shouldn't be same as old buffer");
+
+ /*
+ * New current and parent buffer will both be modified; but note that
+ * parent buffer could be same as either new or old current.
+ */
+ if (parent->buffer == saveCurrent.buffer)
+ xlrec.parentBlk = 0;
+ else if (parent->buffer == current->buffer)
+ xlrec.parentBlk = 1;
+ else
+ xlrec.parentBlk = 2;
+
+ START_CRIT_SECTION();
+
+ /* insert new ... */
+ xlrec.offnumNew = current->offnum =
+ SpGistPageAddNewItem(state, current->page,
+ (Item) newInnerTuple, newInnerTuple->size,
+ NULL, false);
+
+ MarkBufferDirty(current->buffer);
+
+ /* update parent's downlink and mark parent page dirty */
+ saveNodeLink(index, parent, current->blkno, current->offnum);
+
+ /*
+ * Replace old tuple with a placeholder or redirection tuple. Unless
+ * doing an index build, we have to insert a redirection tuple for
+ * possible concurrent scans. We can't just delete it in any case,
+ * because that could change the offsets of other tuples on the page,
+ * breaking downlinks from their parents.
+ */
+ if (state->isBuild)
+ dt = spgFormDeadTuple(state, SPGIST_PLACEHOLDER,
+ InvalidBlockNumber, InvalidOffsetNumber);
+ else
+ dt = spgFormDeadTuple(state, SPGIST_REDIRECT,
+ current->blkno, current->offnum);
+
+ PageIndexTupleDelete(saveCurrent.page, saveCurrent.offnum);
+ if (PageAddItem(saveCurrent.page, (Item) dt, dt->size,
+ saveCurrent.offnum,
+ false, false) != saveCurrent.offnum)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ dt->size);
+
+ if (state->isBuild)
+ SpGistPageGetOpaque(saveCurrent.page)->nPlaceholder++;
+ else
+ SpGistPageGetOpaque(saveCurrent.page)->nRedirection++;
+
+ MarkBufferDirty(saveCurrent.buffer);
+
+ if (RelationNeedsWAL(index) && !state->isBuild)
+ {
+ XLogRecPtr recptr;
+ int flags;
+
+ XLogBeginInsert();
+
+ /* orig page */
+ XLogRegisterBuffer(0, saveCurrent.buffer, REGBUF_STANDARD);
+ /* new page */
+ flags = REGBUF_STANDARD;
+ if (xlrec.newPage)
+ flags |= REGBUF_WILL_INIT;
+ XLogRegisterBuffer(1, current->buffer, flags);
+ /* parent page (if different from orig and new) */
+ if (xlrec.parentBlk == 2)
+ XLogRegisterBuffer(2, parent->buffer, REGBUF_STANDARD);
+
+ XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+ XLogRegisterData((char *) newInnerTuple, newInnerTuple->size);
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_NODE);
+
+ /* we don't bother to check if any of these are redundant */
+ PageSetLSN(current->page, recptr);
+ PageSetLSN(parent->page, recptr);
+ PageSetLSN(saveCurrent.page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* Release saveCurrent if it's not same as current or parent */
+ if (saveCurrent.buffer != current->buffer &&
+ saveCurrent.buffer != parent->buffer)
+ {
+ SpGistSetLastUsedPage(index, saveCurrent.buffer);
+ UnlockReleaseBuffer(saveCurrent.buffer);
+ }
+ }
+}
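+
+/*
+ * Illustrative sketch of the tuple-move case above, with hypothetical page
+ * and offset numbers: if the enlarged inner tuple no longer fits at page A,
+ * offset 3, it is placed at page B, offset 1; (A,3) is then overwritten
+ * with a REDIRECT tuple pointing to (B,1), so that concurrent scans
+ * arriving at (A,3) can follow it (during an index build a PLACEHOLDER
+ * suffices), and the parent's downlink is updated to read (B,1).
+ */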
+
+/*
+ * spgSplitNode action: split inner tuple at current into prefix and postfix
+ */
+static void
+spgSplitNodeAction(Relation index, SpGistState *state,
+ SpGistInnerTuple innerTuple,
+ SPPageDesc *current, spgChooseOut *out)
+{
+ SpGistInnerTuple prefixTuple,
+ postfixTuple;
+ SpGistNodeTuple node,
+ *nodes;
+ BlockNumber postfixBlkno;
+ OffsetNumber postfixOffset;
+ int i;
+ spgxlogSplitTuple xlrec;
+ Buffer newBuffer = InvalidBuffer;
+
+ /* Should not be applied to nulls */
+ Assert(!SpGistPageStoresNulls(current->page));
+
+ /* Check opclass gave us sane values */
+ if (out->result.splitTuple.prefixNNodes <= 0 ||
+ out->result.splitTuple.prefixNNodes > SGITMAXNNODES)
+ elog(ERROR, "invalid number of prefix nodes: %d",
+ out->result.splitTuple.prefixNNodes);
+ if (out->result.splitTuple.childNodeN < 0 ||
+ out->result.splitTuple.childNodeN >=
+ out->result.splitTuple.prefixNNodes)
+ elog(ERROR, "invalid child node number: %d",
+ out->result.splitTuple.childNodeN);
+
+ /*
+ * Construct new prefix tuple with requested number of nodes. We'll fill
+ * in the childNodeN'th node's downlink below.
+ */
+ nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) *
+ out->result.splitTuple.prefixNNodes);
+
+ for (i = 0; i < out->result.splitTuple.prefixNNodes; i++)
+ {
+ Datum label = (Datum) 0;
+ bool labelisnull;
+
+ labelisnull = (out->result.splitTuple.prefixNodeLabels == NULL);
+ if (!labelisnull)
+ label = out->result.splitTuple.prefixNodeLabels[i];
+ nodes[i] = spgFormNodeTuple(state, label, labelisnull);
+ }
+
+ prefixTuple = spgFormInnerTuple(state,
+ out->result.splitTuple.prefixHasPrefix,
+ out->result.splitTuple.prefixPrefixDatum,
+ out->result.splitTuple.prefixNNodes,
+ nodes);
+
+ /* it must fit in the space that innerTuple now occupies */
+ if (prefixTuple->size > innerTuple->size)
+ elog(ERROR, "SPGiST inner-tuple split must not produce longer prefix");
+
+ /*
+ * Construct new postfix tuple, containing all nodes of innerTuple with
+ * same node datums, but with the prefix specified by the picksplit
+ * function.
+ */
+ nodes = palloc(sizeof(SpGistNodeTuple) * innerTuple->nNodes);
+ SGITITERATE(innerTuple, i, node)
+ {
+ nodes[i] = node;
+ }
+
+ postfixTuple = spgFormInnerTuple(state,
+ out->result.splitTuple.postfixHasPrefix,
+ out->result.splitTuple.postfixPrefixDatum,
+ innerTuple->nNodes, nodes);
+
+ /* Postfix tuple is allTheSame if original tuple was */
+ postfixTuple->allTheSame = innerTuple->allTheSame;
+
+ /* prep data for WAL record */
+ xlrec.newPage = false;
+
+ /*
+ * If we can't fit both tuples on the current page, get a new page for the
+ * postfix tuple. In particular, we never put the postfix tuple on the
+ * root page.
+ *
+ * For the space calculation, note that prefixTuple replaces innerTuple
+ * but postfixTuple will be a new entry.
+ */
+ if (SpGistBlockIsRoot(current->blkno) ||
+ SpGistPageGetFreeSpace(current->page, 1) + innerTuple->size <
+ prefixTuple->size + postfixTuple->size + sizeof(ItemIdData))
+ {
+ /*
+ * Choose a page with the next triple parity, because the postfix
+ * tuple is a child of the prefix tuple.
+ */
+ newBuffer = SpGistGetBuffer(index,
+ GBUF_INNER_PARITY(current->blkno + 1),
+ postfixTuple->size + sizeof(ItemIdData),
+ &xlrec.newPage);
+ }
+
+ START_CRIT_SECTION();
+
+ /*
+ * Replace old tuple by prefix tuple
+ */
+ PageIndexTupleDelete(current->page, current->offnum);
+ xlrec.offnumPrefix = PageAddItem(current->page,
+ (Item) prefixTuple, prefixTuple->size,
+ current->offnum, false, false);
+ if (xlrec.offnumPrefix != current->offnum)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ prefixTuple->size);
+
+ /*
+ * put postfix tuple into appropriate page
+ */
+ if (newBuffer == InvalidBuffer)
+ {
+ postfixBlkno = current->blkno;
+ xlrec.offnumPostfix = postfixOffset =
+ SpGistPageAddNewItem(state, current->page,
+ (Item) postfixTuple, postfixTuple->size,
+ NULL, false);
+ xlrec.postfixBlkSame = true;
+ }
+ else
+ {
+ postfixBlkno = BufferGetBlockNumber(newBuffer);
+ xlrec.offnumPostfix = postfixOffset =
+ SpGistPageAddNewItem(state, BufferGetPage(newBuffer),
+ (Item) postfixTuple, postfixTuple->size,
+ NULL, false);
+ MarkBufferDirty(newBuffer);
+ xlrec.postfixBlkSame = false;
+ }
+
+ /*
+ * And set downlink pointer in the prefix tuple to point to postfix tuple.
+ * (We can't avoid this step by doing the above two steps in opposite
+ * order, because there might not be enough space on the page to insert
+ * the postfix tuple first.) We have to update the local copy of the
+ * prefixTuple too, because that's what will be written to WAL.
+ */
+ spgUpdateNodeLink(prefixTuple, out->result.splitTuple.childNodeN,
+ postfixBlkno, postfixOffset);
+ prefixTuple = (SpGistInnerTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, current->offnum));
+ spgUpdateNodeLink(prefixTuple, out->result.splitTuple.childNodeN,
+ postfixBlkno, postfixOffset);
+
+ MarkBufferDirty(current->buffer);
+
+ if (RelationNeedsWAL(index) && !state->isBuild)
+ {
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+ XLogRegisterData((char *) prefixTuple, prefixTuple->size);
+ XLogRegisterData((char *) postfixTuple, postfixTuple->size);
+
+ XLogRegisterBuffer(0, current->buffer, REGBUF_STANDARD);
+ if (newBuffer != InvalidBuffer)
+ {
+ int flags;
+
+ flags = REGBUF_STANDARD;
+ if (xlrec.newPage)
+ flags |= REGBUF_WILL_INIT;
+ XLogRegisterBuffer(1, newBuffer, flags);
+ }
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_SPLIT_TUPLE);
+
+ PageSetLSN(current->page, recptr);
+
+ if (newBuffer != InvalidBuffer)
+ {
+ PageSetLSN(BufferGetPage(newBuffer), recptr);
+ }
+ }
+
+ END_CRIT_SECTION();
+
+ /* Update local free-space cache and release buffer */
+ if (newBuffer != InvalidBuffer)
+ {
+ SpGistSetLastUsedPage(index, newBuffer);
+ UnlockReleaseBuffer(newBuffer);
+ }
+}
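+
+/*
+ * Illustrative sketch (hypothetical data, roughly how the text opclass
+ * uses this action): an inner tuple with prefix "quadrant" could be split
+ * into a prefix tuple with prefix "quad" and a single node labeled 'r',
+ * whose downlink leads to a postfix tuple with prefix "ant" that keeps
+ * all of the original tuple's nodes.
+ */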
+
+/*
+ * Insert one item into the index.
+ *
+ * Returns true on success, false if we failed to complete the insertion
+ * (typically because of conflict with a concurrent insert). In the latter
+ * case, caller should re-call spgdoinsert() with the same args.
+ */
+bool
+spgdoinsert(Relation index, SpGistState *state,
+ ItemPointer heapPtr, Datum *datums, bool *isnulls)
+{
+ bool result = true;
+ TupleDesc leafDescriptor = state->leafTupDesc;
+ bool isnull = isnulls[spgKeyColumn];
+ int level = 0;
+ Datum leafDatums[INDEX_MAX_KEYS];
+ int leafSize;
+ int bestLeafSize;
+ int numNoProgressCycles = 0;
+ SPPageDesc current,
+ parent;
+ FmgrInfo *procinfo = NULL;
+
+ /*
+ * Look up FmgrInfo of the user-defined choose function once, to save
+ * cycles in the loop below.
+ */
+ if (!isnull)
+ procinfo = index_getprocinfo(index, 1, SPGIST_CHOOSE_PROC);
+
+ /*
+ * Prepare the leaf datum to insert.
+ *
+ * If an optional "compress" method is provided, then call it to form the
+ * leaf key datum from the input datum. Otherwise, store the input datum
+ * as is. Since we don't use index_form_tuple in this AM, we have to make
+ * sure the value to be inserted is not toasted; FormIndexDatum doesn't
+ * guarantee that. But we assume the "compress" method returns an
+ * untoasted value.
+ */
+ if (!isnull)
+ {
+ if (OidIsValid(index_getprocid(index, 1, SPGIST_COMPRESS_PROC)))
+ {
+ FmgrInfo *compressProcinfo = NULL;
+
+ compressProcinfo = index_getprocinfo(index, 1, SPGIST_COMPRESS_PROC);
+ leafDatums[spgKeyColumn] =
+ FunctionCall1Coll(compressProcinfo,
+ index->rd_indcollation[spgKeyColumn],
+ datums[spgKeyColumn]);
+ }
+ else
+ {
+ Assert(state->attLeafType.type == state->attType.type);
+
+ if (state->attType.attlen == -1)
+ leafDatums[spgKeyColumn] =
+ PointerGetDatum(PG_DETOAST_DATUM(datums[spgKeyColumn]));
+ else
+ leafDatums[spgKeyColumn] = datums[spgKeyColumn];
+ }
+ }
+ else
+ leafDatums[spgKeyColumn] = (Datum) 0;
+
+ /* Likewise, ensure that any INCLUDE values are not toasted */
+ for (int i = spgFirstIncludeColumn; i < leafDescriptor->natts; i++)
+ {
+ if (!isnulls[i])
+ {
+ if (TupleDescAttr(leafDescriptor, i)->attlen == -1)
+ leafDatums[i] = PointerGetDatum(PG_DETOAST_DATUM(datums[i]));
+ else
+ leafDatums[i] = datums[i];
+ }
+ else
+ leafDatums[i] = (Datum) 0;
+ }
+
+ /*
+ * Compute space needed for a leaf tuple containing the given data.
+ */
+ leafSize = SpGistGetLeafTupleSize(leafDescriptor, leafDatums, isnulls);
+ /* Account for an item pointer, too */
+ leafSize += sizeof(ItemIdData);
+
+ /*
+ * If it isn't going to fit, and the opclass can't reduce the datum size by
+ * suffixing, bail out now rather than doing a lot of useless work.
+ */
+ if (leafSize > SPGIST_PAGE_CAPACITY &&
+ (isnull || !state->config.longValuesOK))
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+ leafSize - sizeof(ItemIdData),
+ SPGIST_PAGE_CAPACITY - sizeof(ItemIdData),
+ RelationGetRelationName(index)),
+ errhint("Values larger than a buffer page cannot be indexed.")));
+ bestLeafSize = leafSize;
+
+ /* Initialize "current" to the appropriate root page */
+ current.blkno = isnull ? SPGIST_NULL_BLKNO : SPGIST_ROOT_BLKNO;
+ current.buffer = InvalidBuffer;
+ current.page = NULL;
+ current.offnum = FirstOffsetNumber;
+ current.node = -1;
+
+ /* "parent" is invalid for the moment */
+ parent.blkno = InvalidBlockNumber;
+ parent.buffer = InvalidBuffer;
+ parent.page = NULL;
+ parent.offnum = InvalidOffsetNumber;
+ parent.node = -1;
+
+ /*
+ * Before entering the loop, try to clear any pending interrupt condition.
+ * If a query cancel is pending, we might as well accept it now rather
+ * than later; whereas if a non-canceling condition is pending, servicing
+ * it here avoids having to restart the insertion and redo all the work
+ * so far.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ for (;;)
+ {
+ bool isNew = false;
+
+ /*
+ * Bail out if query cancel is pending. We must have this somewhere
+ * in the loop since a broken opclass could produce an infinite
+ * picksplit loop. However, because we'll be holding buffer lock(s)
+ * after the first iteration, ProcessInterrupts() wouldn't be able to
+ * throw a cancel error here. Hence, if we see that an interrupt is
+ * pending, break out of the loop and deal with the situation below.
+ * Set result = false because we must restart the insertion if the
+ * interrupt isn't a query-cancel-or-die case.
+ */
+ if (INTERRUPTS_PENDING_CONDITION())
+ {
+ result = false;
+ break;
+ }
+
+ if (current.blkno == InvalidBlockNumber)
+ {
+ /*
+ * Create a leaf page. If leafSize is too large to fit on a page,
+ * we won't actually use the page yet, but it simplifies the API
+ * for doPickSplit to always have a leaf page at hand; so just
+ * quietly limit our request to a page size.
+ */
+ current.buffer =
+ SpGistGetBuffer(index,
+ GBUF_LEAF | (isnull ? GBUF_NULLS : 0),
+ Min(leafSize, SPGIST_PAGE_CAPACITY),
+ &isNew);
+ current.blkno = BufferGetBlockNumber(current.buffer);
+ }
+ else if (parent.buffer == InvalidBuffer)
+ {
+ /* we hold no parent-page lock, so no deadlock is possible */
+ current.buffer = ReadBuffer(index, current.blkno);
+ LockBuffer(current.buffer, BUFFER_LOCK_EXCLUSIVE);
+ }
+ else if (current.blkno != parent.blkno)
+ {
+ /* descend to a new child page */
+ current.buffer = ReadBuffer(index, current.blkno);
+
+ /*
+ * Attempt to acquire lock on child page. We must beware of
+ * deadlock against another insertion process descending from that
+ * page to our parent page (see README). If we fail to get lock,
+ * abandon the insertion and tell our caller to start over.
+ *
+ * XXX this could be improved, because failing to get lock on a
+ * buffer is not proof of a deadlock situation; the lock might be
+ * held by a reader, or even just background writer/checkpointer
+ * process. Perhaps it'd be worth retrying after sleeping a bit?
+ */
+ if (!ConditionalLockBuffer(current.buffer))
+ {
+ ReleaseBuffer(current.buffer);
+ UnlockReleaseBuffer(parent.buffer);
+ return false;
+ }
+ }
+ else
+ {
+ /* inner tuple can be stored on the same page as the parent one */
+ current.buffer = parent.buffer;
+ }
+ current.page = BufferGetPage(current.buffer);
+
+ /* should not arrive at a page of the wrong type */
+ if (isnull ? !SpGistPageStoresNulls(current.page) :
+ SpGistPageStoresNulls(current.page))
+ elog(ERROR, "SPGiST index page %u has wrong nulls flag",
+ current.blkno);
+
+ if (SpGistPageIsLeaf(current.page))
+ {
+ SpGistLeafTuple leafTuple;
+ int nToSplit,
+ sizeToSplit;
+
+ leafTuple = spgFormLeafTuple(state, heapPtr, leafDatums, isnulls);
+ if (leafTuple->size + sizeof(ItemIdData) <=
+ SpGistPageGetFreeSpace(current.page, 1))
+ {
+ /* it fits on page, so insert it and we're done */
+ addLeafTuple(index, state, leafTuple,
+ &current, &parent, isnull, isNew);
+ break;
+ }
+ else if ((sizeToSplit =
+ checkSplitConditions(index, state, &current,
+ &nToSplit)) < SPGIST_PAGE_CAPACITY / 2 &&
+ nToSplit < 64 &&
+ leafTuple->size + sizeof(ItemIdData) + sizeToSplit <= SPGIST_PAGE_CAPACITY)
+ {
+ /*
+ * the amount of data is pretty small, so just move the whole
+ * chain to another leaf page rather than splitting it.
+ */
+ Assert(!isNew);
+ moveLeafs(index, state, &current, &parent, leafTuple, isnull);
+ break; /* we're done */
+ }
+ else
+ {
+ /* picksplit */
+ if (doPickSplit(index, state, &current, &parent,
+ leafTuple, level, isnull, isNew))
+ break; /* doPickSplit installed new tuples */
+
+ /* leaf tuple will not be inserted yet */
+ pfree(leafTuple);
+
+ /*
+ * current now describes new inner tuple, go insert into it
+ */
+ Assert(!SpGistPageIsLeaf(current.page));
+ goto process_inner_tuple;
+ }
+ }
+ else /* non-leaf page */
+ {
+ /*
+ * Apply the opclass choose function to figure out how to insert
+ * the given datum into the current inner tuple.
+ */
+ SpGistInnerTuple innerTuple;
+ spgChooseIn in;
+ spgChooseOut out;
+
+ /*
+ * spgAddNode and spgSplitTuple cases will loop back to here to
+ * complete the insertion operation. Just in case the choose
+ * function is broken and produces add or split requests
+ * repeatedly, check for query cancel (see comments above).
+ */
+ process_inner_tuple:
+ if (INTERRUPTS_PENDING_CONDITION())
+ {
+ result = false;
+ break;
+ }
+
+ innerTuple = (SpGistInnerTuple) PageGetItem(current.page,
+ PageGetItemId(current.page, current.offnum));
+
+ in.datum = datums[spgKeyColumn];
+ in.leafDatum = leafDatums[spgKeyColumn];
+ in.level = level;
+ in.allTheSame = innerTuple->allTheSame;
+ in.hasPrefix = (innerTuple->prefixSize > 0);
+ in.prefixDatum = SGITDATUM(innerTuple, state);
+ in.nNodes = innerTuple->nNodes;
+ in.nodeLabels = spgExtractNodeLabels(state, innerTuple);
+
+ memset(&out, 0, sizeof(out));
+
+ if (!isnull)
+ {
+ /* use user-defined choose method */
+ FunctionCall2Coll(procinfo,
+ index->rd_indcollation[0],
+ PointerGetDatum(&in),
+ PointerGetDatum(&out));
+ }
+ else
+ {
+ /* force "match" action (to insert into a random subnode) */
+ out.resultType = spgMatchNode;
+ }
+
+ if (innerTuple->allTheSame)
+ {
+ /*
+ * It's not allowed to do an AddNode at an allTheSame tuple.
+ * Opclass must say "match", in which case we choose a random
+ * one of the nodes to descend into, or "split".
+ */
+ if (out.resultType == spgAddNode)
+ elog(ERROR, "cannot add a node to an allTheSame inner tuple");
+ else if (out.resultType == spgMatchNode)
+ out.result.matchNode.nodeN =
+ pg_prng_uint64_range(&pg_global_prng_state,
+ 0, innerTuple->nNodes - 1);
+ }
+
+ switch (out.resultType)
+ {
+ case spgMatchNode:
+ /* Descend to N'th child node */
+ spgMatchNodeAction(index, state, innerTuple,
+ &current, &parent,
+ out.result.matchNode.nodeN);
+ /* Adjust level as per opclass request */
+ level += out.result.matchNode.levelAdd;
+ /* Replace leafDatum and recompute leafSize */
+ if (!isnull)
+ {
+ leafDatums[spgKeyColumn] = out.result.matchNode.restDatum;
+ leafSize = SpGistGetLeafTupleSize(leafDescriptor,
+ leafDatums, isnulls);
+ leafSize += sizeof(ItemIdData);
+ }
+
+ /*
+ * Check new tuple size; fail if it can't fit, unless the
+ * opclass says it can handle the situation by suffixing.
+ *
+ * However, the opclass can only shorten the leaf datum,
+ * which may not be enough to ever make the tuple fit,
+ * since INCLUDE columns might alone use more than a page.
+ * Depending on the opclass' behavior, that could lead to
+ * an infinite loop --- spgtextproc.c, for example, will
+ * just repeatedly generate an empty-string leaf datum
+ * once it runs out of data. Actual bugs in opclasses
+ * might cause infinite looping, too. To detect such a
+ * loop, check to see if we are making progress by
+ * reducing the leafSize in each pass. This is a bit
+ * tricky though. Because of alignment considerations,
+ * the total tuple size might not decrease on every pass.
+ * Also, there are edge cases where the choose method
+ * might seem to not make progress for a cycle or two.
+ * Somewhat arbitrarily, we allow up to 10 no-progress
+ * iterations before failing. (This limit should be more
+ * than MAXALIGN, to accommodate opclasses that trim one
+ * byte from the leaf datum per pass.)
+ */
+ if (leafSize > SPGIST_PAGE_CAPACITY)
+ {
+ bool ok = false;
+
+ if (state->config.longValuesOK && !isnull)
+ {
+ if (leafSize < bestLeafSize)
+ {
+ ok = true;
+ bestLeafSize = leafSize;
+ numNoProgressCycles = 0;
+ }
+ else if (++numNoProgressCycles < 10)
+ ok = true;
+ }
+ if (!ok)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+ leafSize - sizeof(ItemIdData),
+ SPGIST_PAGE_CAPACITY - sizeof(ItemIdData),
+ RelationGetRelationName(index)),
+ errhint("Values larger than a buffer page cannot be indexed.")));
+ }
+
+ /*
+ * Loop around and attempt to insert the new leafDatum at
+ * "current" (which might reference an existing child
+ * tuple, or might be invalid to force us to find a new
+ * page for the tuple).
+ */
+ break;
+ case spgAddNode:
+ /* AddNode is not sensible if nodes don't have labels */
+ if (in.nodeLabels == NULL)
+ elog(ERROR, "cannot add a node to an inner tuple without node labels");
+ /* Add node to inner tuple, per request */
+ spgAddNodeAction(index, state, innerTuple,
+ &current, &parent,
+ out.result.addNode.nodeN,
+ out.result.addNode.nodeLabel);
+
+ /*
+ * Retry insertion into the enlarged node. We assume that
+ * we'll get a MatchNode result this time.
+ */
+ goto process_inner_tuple;
+ break;
+ case spgSplitTuple:
+ /* Split inner tuple, per request */
+ spgSplitNodeAction(index, state, innerTuple,
+ &current, &out);
+
+ /* Retry insertion into the split node */
+ goto process_inner_tuple;
+ break;
+ default:
+ elog(ERROR, "unrecognized SPGiST choose result: %d",
+ (int) out.resultType);
+ break;
+ }
+ }
+ } /* end loop */
+
+ /*
+ * Release any buffers we're still holding. Beware of possibility that
+ * current and parent reference same buffer.
+ */
+ if (current.buffer != InvalidBuffer)
+ {
+ SpGistSetLastUsedPage(index, current.buffer);
+ UnlockReleaseBuffer(current.buffer);
+ }
+ if (parent.buffer != InvalidBuffer &&
+ parent.buffer != current.buffer)
+ {
+ SpGistSetLastUsedPage(index, parent.buffer);
+ UnlockReleaseBuffer(parent.buffer);
+ }
+
+ /*
+ * We do not support being called while some outer function is holding a
+ * buffer lock (or any other reason to postpone query cancels). If that
+ * were the case, telling the caller to retry would create an infinite
+ * loop.
+ */
+ Assert(INTERRUPTS_CAN_BE_PROCESSED());
+
+ /*
+ * Finally, check for interrupts again. If there was a query cancel,
+ * ProcessInterrupts() will be able to throw the error here. If it was
+ * some other kind of interrupt that can just be cleared, return false to
+ * tell our caller to retry.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ return result;
+}
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
new file mode 100644
index 0000000..bfb7404
--- /dev/null
+++ b/src/backend/access/spgist/spginsert.c
@@ -0,0 +1,243 @@
+/*-------------------------------------------------------------------------
+ *
+ * spginsert.c
+ * Externally visible index creation/insertion routines
+ *
+ * All the actual insertion logic is in spgdoinsert.c.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spginsert.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/spgist_private.h"
+#include "access/spgxlog.h"
+#include "access/tableam.h"
+#include "access/xlog.h"
+#include "access/xloginsert.h"
+#include "catalog/index.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/smgr.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+
+typedef struct
+{
+ SpGistState spgstate; /* SPGiST's working state */
+ int64 indtuples; /* total number of tuples indexed */
+ MemoryContext tmpCtx; /* per-tuple temporary context */
+} SpGistBuildState;
+
+
+/* Callback to process one heap tuple during table_index_build_scan */
+static void
+spgistBuildCallback(Relation index, ItemPointer tid, Datum *values,
+ bool *isnull, bool tupleIsAlive, void *state)
+{
+ SpGistBuildState *buildstate = (SpGistBuildState *) state;
+ MemoryContext oldCtx;
+
+ /* Work in temp context, and reset it after each tuple */
+ oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
+
+ /*
+ * Even though no concurrent insertions can be happening, we still might
+ * get a buffer-locking failure due to bgwriter or checkpointer taking a
+ * lock on some buffer. So we need to be willing to retry. We can flush
+ * any temp data when retrying.
+ */
+ while (!spgdoinsert(index, &buildstate->spgstate, tid,
+ values, isnull))
+ {
+ MemoryContextReset(buildstate->tmpCtx);
+ }
+
+ /* Update total tuple count */
+ buildstate->indtuples += 1;
+
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(buildstate->tmpCtx);
+}
+
+/*
+ * Build an SP-GiST index.
+ */
+IndexBuildResult *
+spgbuild(Relation heap, Relation index, IndexInfo *indexInfo)
+{
+ IndexBuildResult *result;
+ double reltuples;
+ SpGistBuildState buildstate;
+ Buffer metabuffer,
+ rootbuffer,
+ nullbuffer;
+
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "index \"%s\" already contains data",
+ RelationGetRelationName(index));
+
+ /*
+ * Initialize the meta page and root pages
+ */
+ metabuffer = SpGistNewBuffer(index);
+ rootbuffer = SpGistNewBuffer(index);
+ nullbuffer = SpGistNewBuffer(index);
+
+ Assert(BufferGetBlockNumber(metabuffer) == SPGIST_METAPAGE_BLKNO);
+ Assert(BufferGetBlockNumber(rootbuffer) == SPGIST_ROOT_BLKNO);
+ Assert(BufferGetBlockNumber(nullbuffer) == SPGIST_NULL_BLKNO);
+
+ START_CRIT_SECTION();
+
+ SpGistInitMetapage(BufferGetPage(metabuffer));
+ MarkBufferDirty(metabuffer);
+ SpGistInitBuffer(rootbuffer, SPGIST_LEAF);
+ MarkBufferDirty(rootbuffer);
+ SpGistInitBuffer(nullbuffer, SPGIST_LEAF | SPGIST_NULLS);
+ MarkBufferDirty(nullbuffer);
+
+ END_CRIT_SECTION();
+
+ UnlockReleaseBuffer(metabuffer);
+ UnlockReleaseBuffer(rootbuffer);
+ UnlockReleaseBuffer(nullbuffer);
+
+ /*
+ * Now insert all the heap data into the index
+ */
+ initSpGistState(&buildstate.spgstate, index);
+ buildstate.spgstate.isBuild = true;
+ buildstate.indtuples = 0;
+
+ buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "SP-GiST build temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+
+ reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
+ spgistBuildCallback, (void *) &buildstate,
+ NULL);
+
+ MemoryContextDelete(buildstate.tmpCtx);
+
+ SpGistUpdateMetaPage(index);
+
+ /*
+ * We didn't write WAL records as we built the index, so if WAL-logging is
+ * required, write all pages to the WAL now.
+ */
+ if (RelationNeedsWAL(index))
+ {
+ log_newpage_range(index, MAIN_FORKNUM,
+ 0, RelationGetNumberOfBlocks(index),
+ true);
+ }
+
+ result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult));
+ result->heap_tuples = reltuples;
+ result->index_tuples = buildstate.indtuples;
+
+ return result;
+}
+
+/*
+ * Build an empty SPGiST index in the initialization fork
+ */
+void
+spgbuildempty(Relation index)
+{
+ Page page;
+
+ /* Construct metapage. */
+ page = (Page) palloc(BLCKSZ);
+ SpGistInitMetapage(page);
+
+ /*
+ * Write the page and log it unconditionally. This is important
+ * particularly for indexes created on tablespaces and databases whose
+ * creation happened after the last redo pointer, as recovery removes any
+ * of their existing content when the corresponding create records are
+ * replayed.
+ */
+ PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO);
+ smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_METAPAGE_BLKNO,
+ (char *) page, true);
+ log_newpage(&(RelationGetSmgr(index))->smgr_rnode.node, INIT_FORKNUM,
+ SPGIST_METAPAGE_BLKNO, page, true);
+
+ /* Likewise for the root page. */
+ SpGistInitPage(page, SPGIST_LEAF);
+
+ PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO);
+ smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_ROOT_BLKNO,
+ (char *) page, true);
+ log_newpage(&(RelationGetSmgr(index))->smgr_rnode.node, INIT_FORKNUM,
+ SPGIST_ROOT_BLKNO, page, true);
+
+ /* Likewise for the null-tuples root page. */
+ SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS);
+
+ PageSetChecksumInplace(page, SPGIST_NULL_BLKNO);
+ smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_NULL_BLKNO,
+ (char *) page, true);
+ log_newpage(&(RelationGetSmgr(index))->smgr_rnode.node, INIT_FORKNUM,
+ SPGIST_NULL_BLKNO, page, true);
+
+ /*
+ * An immediate sync is required even if we xlog'd the pages, because the
+ * writes did not go through shared buffers and therefore a concurrent
+ * checkpoint may have moved the redo pointer past our xlog record.
+ */
+ smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM);
+}
+
+/*
+ * Insert one new tuple into an SPGiST index.
+ */
+bool
+spginsert(Relation index, Datum *values, bool *isnull,
+ ItemPointer ht_ctid, Relation heapRel,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ IndexInfo *indexInfo)
+{
+ SpGistState spgstate;
+ MemoryContext oldCtx;
+ MemoryContext insertCtx;
+
+ insertCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "SP-GiST insert temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+ oldCtx = MemoryContextSwitchTo(insertCtx);
+
+ initSpGistState(&spgstate, index);
+
+ /*
+ * We might have to repeat spgdoinsert() multiple times, if conflicts
+ * occur with concurrent insertions. If so, reset the insertCtx each time
+ * to avoid cumulative memory consumption. That means we also have to
+ * redo initSpGistState(), but it's cheap enough not to matter.
+ */
+ while (!spgdoinsert(index, &spgstate, ht_ctid, values, isnull))
+ {
+ MemoryContextReset(insertCtx);
+ initSpGistState(&spgstate, index);
+ }
+
+ SpGistUpdateMetaPage(index);
+
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextDelete(insertCtx);
+
+ /* return false since we've not done any unique check */
+ return false;
+}
diff --git a/src/backend/access/spgist/spgkdtreeproc.c b/src/backend/access/spgist/spgkdtreeproc.c
new file mode 100644
index 0000000..d6bf675
--- /dev/null
+++ b/src/backend/access/spgist/spgkdtreeproc.c
@@ -0,0 +1,349 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgkdtreeproc.c
+ * implementation of k-d tree over points for SP-GiST
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgkdtreeproc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/spgist.h"
+#include "access/spgist_private.h"
+#include "access/stratnum.h"
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/float.h"
+#include "utils/geo_decls.h"
+
+
+Datum
+spg_kd_config(PG_FUNCTION_ARGS)
+{
+ /* spgConfigIn *cfgin = (spgConfigIn *) PG_GETARG_POINTER(0); */
+ spgConfigOut *cfg = (spgConfigOut *) PG_GETARG_POINTER(1);
+
+ cfg->prefixType = FLOAT8OID;
+ cfg->labelType = VOIDOID; /* we don't need node labels */
+ cfg->canReturnData = true;
+ cfg->longValuesOK = false;
+ PG_RETURN_VOID();
+}
+
+static int
+getSide(double coord, bool isX, Point *tst)
+{
+ double tstcoord = (isX) ? tst->x : tst->y;
+
+ if (coord == tstcoord)
+ return 0;
+ else if (coord > tstcoord)
+ return 1;
+ else
+ return -1;
+}
+
+Datum
+spg_kd_choose(PG_FUNCTION_ARGS)
+{
+ spgChooseIn *in = (spgChooseIn *) PG_GETARG_POINTER(0);
+ spgChooseOut *out = (spgChooseOut *) PG_GETARG_POINTER(1);
+ Point *inPoint = DatumGetPointP(in->datum);
+ double coord;
+
+ if (in->allTheSame)
+ elog(ERROR, "allTheSame should not occur for k-d trees");
+
+ Assert(in->hasPrefix);
+ coord = DatumGetFloat8(in->prefixDatum);
+
+ Assert(in->nNodes == 2);
+
+ out->resultType = spgMatchNode;
+ out->result.matchNode.nodeN =
+ (getSide(coord, in->level % 2, inPoint) > 0) ? 0 : 1;
+ out->result.matchNode.levelAdd = 1;
+ out->result.matchNode.restDatum = PointPGetDatum(inPoint);
+
+ PG_RETURN_VOID();
+}
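+
+/*
+ * Worked example (hypothetical values): at an odd level the discriminating
+ * axis is x. With prefix coord 5.0 and an incoming point (3.0, 9.0),
+ * getSide() returns 1 because 5.0 > 3.0, so the point descends into
+ * node 0, the low side; a point with x >= 5.0 would descend into node 1.
+ */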
+
+typedef struct SortedPoint
+{
+ Point *p;
+ int i;
+} SortedPoint;
+
+static int
+x_cmp(const void *a, const void *b)
+{
+ SortedPoint *pa = (SortedPoint *) a;
+ SortedPoint *pb = (SortedPoint *) b;
+
+ if (pa->p->x == pb->p->x)
+ return 0;
+ return (pa->p->x > pb->p->x) ? 1 : -1;
+}
+
+static int
+y_cmp(const void *a, const void *b)
+{
+ SortedPoint *pa = (SortedPoint *) a;
+ SortedPoint *pb = (SortedPoint *) b;
+
+ if (pa->p->y == pb->p->y)
+ return 0;
+ return (pa->p->y > pb->p->y) ? 1 : -1;
+}
+
+
+Datum
+spg_kd_picksplit(PG_FUNCTION_ARGS)
+{
+ spgPickSplitIn *in = (spgPickSplitIn *) PG_GETARG_POINTER(0);
+ spgPickSplitOut *out = (spgPickSplitOut *) PG_GETARG_POINTER(1);
+ int i;
+ int middle;
+ SortedPoint *sorted;
+ double coord;
+
+ sorted = palloc(sizeof(*sorted) * in->nTuples);
+ for (i = 0; i < in->nTuples; i++)
+ {
+ sorted[i].p = DatumGetPointP(in->datums[i]);
+ sorted[i].i = i;
+ }
+
+ qsort(sorted, in->nTuples, sizeof(*sorted),
+ (in->level % 2) ? x_cmp : y_cmp);
+ middle = in->nTuples >> 1;
+ coord = (in->level % 2) ? sorted[middle].p->x : sorted[middle].p->y;
+
+ out->hasPrefix = true;
+ out->prefixDatum = Float8GetDatum(coord);
+
+ out->nNodes = 2;
+ out->nodeLabels = NULL; /* we don't need node labels */
+
+ out->mapTuplesToNodes = palloc(sizeof(int) * in->nTuples);
+ out->leafTupleDatums = palloc(sizeof(Datum) * in->nTuples);
+
+ /*
+ * Note: points that have coordinates exactly equal to coord may get
+ * classified into either node, depending on where they happen to fall in
+ * the sorted list. This is okay as long as the inner_consistent function
+ * descends into both sides for such cases. This is better than the
+ * alternative of trying to have an exact boundary, because it keeps the
+ * tree balanced even when we have many instances of the same point value.
+ * So we should never trigger the allTheSame logic.
+ */
+ for (i = 0; i < in->nTuples; i++)
+ {
+ Point *p = sorted[i].p;
+ int n = sorted[i].i;
+
+ out->mapTuplesToNodes[n] = (i < middle) ? 0 : 1;
+ out->leafTupleDatums[n] = PointPGetDatum(p);
+ }
+
+ PG_RETURN_VOID();
+}
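+
+/*
+ * Worked example (hypothetical values): five points whose x coordinates
+ * are {1, 2, 3, 4, 5}, split at an odd level, sort by x; middle is
+ * 5 >> 1 = 2, so the prefix becomes coord = 3.0. The points at sorted
+ * positions 0 and 1 (x = 1, 2) map to node 0, the rest (x = 3, 4, 5) to
+ * node 1.
+ */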
+
+Datum
+spg_kd_inner_consistent(PG_FUNCTION_ARGS)
+{
+ spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0);
+ spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1);
+ double coord;
+ int which;
+ int i;
+ BOX bboxes[2];
+
+ Assert(in->hasPrefix);
+ coord = DatumGetFloat8(in->prefixDatum);
+
+ if (in->allTheSame)
+ elog(ERROR, "allTheSame should not occur for k-d trees");
+
+ Assert(in->nNodes == 2);
+
+ /* "which" is a bitmask of children that satisfy all constraints */
+ which = (1 << 1) | (1 << 2);
+
+ for (i = 0; i < in->nkeys; i++)
+ {
+ Point *query = DatumGetPointP(in->scankeys[i].sk_argument);
+ BOX *boxQuery;
+
+ switch (in->scankeys[i].sk_strategy)
+ {
+ case RTLeftStrategyNumber:
+ if ((in->level % 2) != 0 && FPlt(query->x, coord))
+ which &= (1 << 1);
+ break;
+ case RTRightStrategyNumber:
+ if ((in->level % 2) != 0 && FPgt(query->x, coord))
+ which &= (1 << 2);
+ break;
+ case RTSameStrategyNumber:
+ if ((in->level % 2) != 0)
+ {
+ if (FPlt(query->x, coord))
+ which &= (1 << 1);
+ else if (FPgt(query->x, coord))
+ which &= (1 << 2);
+ }
+ else
+ {
+ if (FPlt(query->y, coord))
+ which &= (1 << 1);
+ else if (FPgt(query->y, coord))
+ which &= (1 << 2);
+ }
+ break;
+ case RTBelowStrategyNumber:
+ case RTOldBelowStrategyNumber:
+ if ((in->level % 2) == 0 && FPlt(query->y, coord))
+ which &= (1 << 1);
+ break;
+ case RTAboveStrategyNumber:
+ case RTOldAboveStrategyNumber:
+ if ((in->level % 2) == 0 && FPgt(query->y, coord))
+ which &= (1 << 2);
+ break;
+ case RTContainedByStrategyNumber:
+
+ /*
+ * For this operator, the query is a box not a point. We
+ * cheat to the extent of assuming that DatumGetPointP won't
+ * do anything that would be bad for a pointer-to-box.
+ */
+ boxQuery = DatumGetBoxP(in->scankeys[i].sk_argument);
+
+ if ((in->level % 2) != 0)
+ {
+ if (FPlt(boxQuery->high.x, coord))
+ which &= (1 << 1);
+ else if (FPgt(boxQuery->low.x, coord))
+ which &= (1 << 2);
+ }
+ else
+ {
+ if (FPlt(boxQuery->high.y, coord))
+ which &= (1 << 1);
+ else if (FPgt(boxQuery->low.y, coord))
+ which &= (1 << 2);
+ }
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d",
+ in->scankeys[i].sk_strategy);
+ break;
+ }
+
+ if (which == 0)
+ break; /* no need to consider remaining conditions */
+ }
+
+ /* We must descend into the children identified by which */
+ out->nNodes = 0;
+
+ /* Fast-path for no matching children */
+ if (!which)
+ PG_RETURN_VOID();
+
+ out->nodeNumbers = (int *) palloc(sizeof(int) * 2);
+
+ /*
+ * When ordering scan keys are specified, we have to calculate distances
+ * for them. In order to do that, we need to calculate bounding boxes for
+ * both child nodes. Calculating those bounding boxes at a non-zero level
+ * requires knowing the bounding box of the upper node, so we save the
+ * bounding boxes in traversalValues.
+ */
+ if (in->norderbys > 0)
+ {
+ BOX infArea;
+ BOX *area;
+
+ out->distances = (double **) palloc(sizeof(double *) * in->nNodes);
+ out->traversalValues = (void **) palloc(sizeof(void *) * in->nNodes);
+
+ if (in->level == 0)
+ {
+ float8 inf = get_float8_infinity();
+
+ infArea.high.x = inf;
+ infArea.high.y = inf;
+ infArea.low.x = -inf;
+ infArea.low.y = -inf;
+ area = &infArea;
+ }
+ else
+ {
+ area = (BOX *) in->traversalValue;
+ Assert(area);
+ }
+
+ bboxes[0].low = area->low;
+ bboxes[1].high = area->high;
+
+ if (in->level % 2)
+ {
+ /* split box by x */
+ bboxes[0].high.x = bboxes[1].low.x = coord;
+ bboxes[0].high.y = area->high.y;
+ bboxes[1].low.y = area->low.y;
+ }
+ else
+ {
+ /* split box by y */
+ bboxes[0].high.y = bboxes[1].low.y = coord;
+ bboxes[0].high.x = area->high.x;
+ bboxes[1].low.x = area->low.x;
+ }
+ }
+
+ for (i = 1; i <= 2; i++)
+ {
+ if (which & (1 << i))
+ {
+ out->nodeNumbers[out->nNodes] = i - 1;
+
+ if (in->norderbys > 0)
+ {
+ MemoryContext oldCtx = MemoryContextSwitchTo(in->traversalMemoryContext);
+ BOX *box = box_copy(&bboxes[i - 1]);
+
+ MemoryContextSwitchTo(oldCtx);
+
+ out->traversalValues[out->nNodes] = box;
+
+ out->distances[out->nNodes] = spg_key_orderbys_distances(BoxPGetDatum(box), false,
+ in->orderbys, in->norderbys);
+ }
+
+ out->nNodes++;
+ }
+ }
+
+ /* Set up level increments, too */
+ out->levelAdds = (int *) palloc(sizeof(int) * 2);
+ out->levelAdds[0] = 1;
+ out->levelAdds[1] = 1;
+
+ PG_RETURN_VOID();
+}
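+
+/*
+ * Worked example of the "which" bitmask (hypothetical values): bits 1 and
+ * 2 stand for child nodes 0 (low side) and 1 (high side). At an x-split
+ * level with coord = 5.0, an RTLeftStrategyNumber key with query point
+ * (3.0, 0.0) satisfies FPlt(query->x, coord), so which is reduced from
+ * (1 << 1) | (1 << 2) to (1 << 1) and only node 0 is descended into.
+ */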
+
+/*
+ * spg_kd_leaf_consistent() is the same as spg_quad_leaf_consistent(),
+ * since we support the same operators and the same leaf data type.
+ * So we just borrow that function.
+ */
diff --git a/src/backend/access/spgist/spgproc.c b/src/backend/access/spgist/spgproc.c
new file mode 100644
index 0000000..4cfb675
--- /dev/null
+++ b/src/backend/access/spgist/spgproc.c
@@ -0,0 +1,88 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgproc.c
+ * Common supporting procedures for SP-GiST opclasses.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgproc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/spgist_private.h"
+#include "utils/builtins.h"
+#include "utils/float.h"
+#include "utils/geo_decls.h"
+
+#define point_point_distance(p1,p2) \
+ DatumGetFloat8(DirectFunctionCall2(point_distance, \
+ PointPGetDatum(p1), PointPGetDatum(p2)))
+
+/* Point-box distance, on the assumption that the box is axis-aligned */
+static double
+point_box_distance(Point *point, BOX *box)
+{
+ double dx,
+ dy;
+
+ if (isnan(point->x) || isnan(box->low.x) ||
+ isnan(point->y) || isnan(box->low.y))
+ return get_float8_nan();
+
+ if (point->x < box->low.x)
+ dx = box->low.x - point->x;
+ else if (point->x > box->high.x)
+ dx = point->x - box->high.x;
+ else
+ dx = 0.0;
+
+ if (point->y < box->low.y)
+ dy = box->low.y - point->y;
+ else if (point->y > box->high.y)
+ dy = point->y - box->high.y;
+ else
+ dy = 0.0;
+
+ return HYPOT(dx, dy);
+}
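+
+/*
+ * Worked example (hypothetical values): for point (3, 4) and the box with
+ * low = (0, 0) and high = (1, 1), the point lies beyond the box's high
+ * corner, so dx = 3 - 1 = 2 and dy = 4 - 1 = 3, giving HYPOT(2, 3) =
+ * sqrt(13). A point inside the box yields dx = dy = 0, i.e. distance 0.
+ */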
+
+/*
+ * Returns distances from the given key to an array of ordering scan keys.
+ * A leaf key is expected to be a point, a non-leaf key a box. Scan key
+ * arguments are expected to be points.
+ */
+double *
+spg_key_orderbys_distances(Datum key, bool isLeaf,
+ ScanKey orderbys, int norderbys)
+{
+ int sk_num;
+ double *distances = (double *) palloc(norderbys * sizeof(double)),
+ *distance = distances;
+
+ for (sk_num = 0; sk_num < norderbys; ++sk_num, ++orderbys, ++distance)
+ {
+ Point *point = DatumGetPointP(orderbys->sk_argument);
+
+ *distance = isLeaf ? point_point_distance(point, DatumGetPointP(key))
+ : point_box_distance(point, DatumGetBoxP(key));
+ }
+
+ return distances;
+}
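+
+/*
+ * Minimal usage sketch (hypothetical caller), mirroring what a
+ * leaf_consistent function might do when ordering keys are present:
+ *
+ *     out->distances = spg_key_orderbys_distances(in->leafDatum, true,
+ *                                                 in->orderbys,
+ *                                                 in->norderbys);
+ *
+ * where "in" and "out" are the leaf_consistent input/output structs and
+ * the leaf datum is a point.
+ */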
+
+BOX *
+box_copy(BOX *orig)
+{
+ BOX *result = palloc(sizeof(BOX));
+
+ *result = *orig;
+ return result;
+}
diff --git a/src/backend/access/spgist/spgquadtreeproc.c b/src/backend/access/spgist/spgquadtreeproc.c
new file mode 100644
index 0000000..ce6464f
--- /dev/null
+++ b/src/backend/access/spgist/spgquadtreeproc.c
@@ -0,0 +1,471 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgquadtreeproc.c
+ * implementation of quad tree over points for SP-GiST
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgquadtreeproc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/spgist.h"
+#include "access/spgist_private.h"
+#include "access/stratnum.h"
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/float.h"
+#include "utils/geo_decls.h"
+
+Datum
+spg_quad_config(PG_FUNCTION_ARGS)
+{
+ /* spgConfigIn *cfgin = (spgConfigIn *) PG_GETARG_POINTER(0); */
+ spgConfigOut *cfg = (spgConfigOut *) PG_GETARG_POINTER(1);
+
+ cfg->prefixType = POINTOID;
+ cfg->labelType = VOIDOID; /* we don't need node labels */
+ cfg->canReturnData = true;
+ cfg->longValuesOK = false;
+ PG_RETURN_VOID();
+}
+
+#define SPTEST(f, x, y) \
+ DatumGetBool(DirectFunctionCall2(f, PointPGetDatum(x), PointPGetDatum(y)))
+
+/*
+ * Determine which quadrant a point falls into, relative to the centroid.
+ *
+ * Quadrants are identified like this:
+ *
+ * 4 | 1
+ * ----+-----
+ * 3 | 2
+ *
+ * Points on one of the axes are taken to lie in the lowest-numbered
+ * adjacent quadrant.
+ */
+static int16
+getQuadrant(Point *centroid, Point *tst)
+{
+ if ((SPTEST(point_above, tst, centroid) ||
+ SPTEST(point_horiz, tst, centroid)) &&
+ (SPTEST(point_right, tst, centroid) ||
+ SPTEST(point_vert, tst, centroid)))
+ return 1;
+
+ if (SPTEST(point_below, tst, centroid) &&
+ (SPTEST(point_right, tst, centroid) ||
+ SPTEST(point_vert, tst, centroid)))
+ return 2;
+
+ if ((SPTEST(point_below, tst, centroid) ||
+ SPTEST(point_horiz, tst, centroid)) &&
+ SPTEST(point_left, tst, centroid))
+ return 3;
+
+ if (SPTEST(point_above, tst, centroid) &&
+ SPTEST(point_left, tst, centroid))
+ return 4;
+
+ elog(ERROR, "getQuadrant: impossible case");
+ return 0;
+}
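+
+/*
+ * Worked example (hypothetical values): with centroid (0, 0), point (2, 3)
+ * is above and to the right, hence quadrant 1; (2, -3) is below and to the
+ * right, quadrant 2; and the on-axis point (2, 0) is horizontally level
+ * with the centroid but to its right, so it falls into quadrant 1, the
+ * lowest-numbered adjacent quadrant.
+ */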
+
+/* Returns bounding box of a given quadrant inside given bounding box */
+static BOX *
+getQuadrantArea(BOX *bbox, Point *centroid, int quadrant)
+{
+ BOX *result = (BOX *) palloc(sizeof(BOX));
+
+ switch (quadrant)
+ {
+ case 1:
+ result->high = bbox->high;
+ result->low = *centroid;
+ break;
+ case 2:
+ result->high.x = bbox->high.x;
+ result->high.y = centroid->y;
+ result->low.x = centroid->x;
+ result->low.y = bbox->low.y;
+ break;
+ case 3:
+ result->high = *centroid;
+ result->low = bbox->low;
+ break;
+ case 4:
+ result->high.x = centroid->x;
+ result->high.y = bbox->high.y;
+ result->low.x = bbox->low.x;
+ result->low.y = centroid->y;
+ break;
+ }
+
+ return result;
+}
+
+Datum
+spg_quad_choose(PG_FUNCTION_ARGS)
+{
+ spgChooseIn *in = (spgChooseIn *) PG_GETARG_POINTER(0);
+ spgChooseOut *out = (spgChooseOut *) PG_GETARG_POINTER(1);
+ Point *inPoint = DatumGetPointP(in->datum),
+ *centroid;
+
+ if (in->allTheSame)
+ {
+ out->resultType = spgMatchNode;
+ /* nodeN will be set by core */
+ out->result.matchNode.levelAdd = 0;
+ out->result.matchNode.restDatum = PointPGetDatum(inPoint);
+ PG_RETURN_VOID();
+ }
+
+ Assert(in->hasPrefix);
+ centroid = DatumGetPointP(in->prefixDatum);
+
+ Assert(in->nNodes == 4);
+
+ out->resultType = spgMatchNode;
+ out->result.matchNode.nodeN = getQuadrant(centroid, inPoint) - 1;
+ out->result.matchNode.levelAdd = 0;
+ out->result.matchNode.restDatum = PointPGetDatum(inPoint);
+
+ PG_RETURN_VOID();
+}
+
+#ifdef USE_MEDIAN
+static int
+x_cmp(const void *a, const void *b, void *arg)
+{
+ Point *pa = *(Point **) a;
+ Point *pb = *(Point **) b;
+
+ if (pa->x == pb->x)
+ return 0;
+ return (pa->x > pb->x) ? 1 : -1;
+}
+
+static int
+y_cmp(const void *a, const void *b, void *arg)
+{
+ Point *pa = *(Point **) a;
+ Point *pb = *(Point **) b;
+
+ if (pa->y == pb->y)
+ return 0;
+ return (pa->y > pb->y) ? 1 : -1;
+}
+#endif
+
+Datum
+spg_quad_picksplit(PG_FUNCTION_ARGS)
+{
+ spgPickSplitIn *in = (spgPickSplitIn *) PG_GETARG_POINTER(0);
+ spgPickSplitOut *out = (spgPickSplitOut *) PG_GETARG_POINTER(1);
+ int i;
+ Point *centroid;
+
+#ifdef USE_MEDIAN
+ /* Use the median values of x and y as the centroid point */
+ Point **sorted;
+
+ sorted = palloc(sizeof(*sorted) * in->nTuples);
+ for (i = 0; i < in->nTuples; i++)
+ sorted[i] = DatumGetPointP(in->datums[i]);
+
+ centroid = palloc(sizeof(*centroid));
+
+ qsort(sorted, in->nTuples, sizeof(*sorted), x_cmp);
+ centroid->x = sorted[in->nTuples >> 1]->x;
+ qsort(sorted, in->nTuples, sizeof(*sorted), y_cmp);
+ centroid->y = sorted[in->nTuples >> 1]->y;
+#else
+ /* Use the average values of x and y as the centroid point */
+ centroid = palloc0(sizeof(*centroid));
+
+ for (i = 0; i < in->nTuples; i++)
+ {
+ centroid->x += DatumGetPointP(in->datums[i])->x;
+ centroid->y += DatumGetPointP(in->datums[i])->y;
+ }
+
+ centroid->x /= in->nTuples;
+ centroid->y /= in->nTuples;
+#endif
+
+ out->hasPrefix = true;
+ out->prefixDatum = PointPGetDatum(centroid);
+
+ out->nNodes = 4;
+ out->nodeLabels = NULL; /* we don't need node labels */
+
+ out->mapTuplesToNodes = palloc(sizeof(int) * in->nTuples);
+ out->leafTupleDatums = palloc(sizeof(Datum) * in->nTuples);
+
+ for (i = 0; i < in->nTuples; i++)
+ {
+ Point *p = DatumGetPointP(in->datums[i]);
+ int quadrant = getQuadrant(centroid, p) - 1;
+
+ out->leafTupleDatums[i] = PointPGetDatum(p);
+ out->mapTuplesToNodes[i] = quadrant;
+ }
+
+ PG_RETURN_VOID();
+}
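+
+/*
+ * Worked example (illustrative): for input points (0,0), (2,2), (4,0)
+ * and (2,-2), the average centroid is (2,0); getQuadrant() then assigns
+ * (2,2) and (4,0) to node 0 (quadrant 1), (2,-2) to node 1, and (0,0)
+ * to node 2.
+ */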
+
+
+Datum
+spg_quad_inner_consistent(PG_FUNCTION_ARGS)
+{
+ spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0);
+ spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1);
+ Point *centroid;
+ BOX infbbox;
+ BOX *bbox = NULL;
+ int which;
+ int i;
+
+ Assert(in->hasPrefix);
+ centroid = DatumGetPointP(in->prefixDatum);
+
+	/*
+	 * When ordering scan keys are specified, we have to calculate distances
+	 * for them, which in turn requires bounding boxes for all child nodes.
+	 * Calculating a bounding box at a non-zero level requires the bounding
+	 * box of the parent node, so we save the boxes in traversalValues.
+	 */
+ if (in->norderbys > 0)
+ {
+ out->distances = (double **) palloc(sizeof(double *) * in->nNodes);
+ out->traversalValues = (void **) palloc(sizeof(void *) * in->nNodes);
+
+ if (in->level == 0)
+ {
+ double inf = get_float8_infinity();
+
+ infbbox.high.x = inf;
+ infbbox.high.y = inf;
+ infbbox.low.x = -inf;
+ infbbox.low.y = -inf;
+ bbox = &infbbox;
+ }
+ else
+ {
+ bbox = in->traversalValue;
+ Assert(bbox);
+ }
+ }
+
+ if (in->allTheSame)
+ {
+ /* Report that all nodes should be visited */
+ out->nNodes = in->nNodes;
+ out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes);
+ for (i = 0; i < in->nNodes; i++)
+ {
+ out->nodeNumbers[i] = i;
+
+ if (in->norderbys > 0)
+ {
+ MemoryContext oldCtx = MemoryContextSwitchTo(in->traversalMemoryContext);
+
+ /* Use parent quadrant box as traversalValue */
+ BOX *quadrant = box_copy(bbox);
+
+ MemoryContextSwitchTo(oldCtx);
+
+ out->traversalValues[i] = quadrant;
+ out->distances[i] = spg_key_orderbys_distances(BoxPGetDatum(quadrant), false,
+ in->orderbys, in->norderbys);
+ }
+ }
+ PG_RETURN_VOID();
+ }
+
+ Assert(in->nNodes == 4);
+
+ /* "which" is a bitmask of quadrants that satisfy all constraints */
+ which = (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4);
+
+ for (i = 0; i < in->nkeys; i++)
+ {
+ Point *query = DatumGetPointP(in->scankeys[i].sk_argument);
+ BOX *boxQuery;
+
+ switch (in->scankeys[i].sk_strategy)
+ {
+ case RTLeftStrategyNumber:
+ if (SPTEST(point_right, centroid, query))
+ which &= (1 << 3) | (1 << 4);
+ break;
+ case RTRightStrategyNumber:
+ if (SPTEST(point_left, centroid, query))
+ which &= (1 << 1) | (1 << 2);
+ break;
+ case RTSameStrategyNumber:
+ which &= (1 << getQuadrant(centroid, query));
+ break;
+ case RTBelowStrategyNumber:
+ case RTOldBelowStrategyNumber:
+ if (SPTEST(point_above, centroid, query))
+ which &= (1 << 2) | (1 << 3);
+ break;
+ case RTAboveStrategyNumber:
+ case RTOldAboveStrategyNumber:
+ if (SPTEST(point_below, centroid, query))
+ which &= (1 << 1) | (1 << 4);
+ break;
+ case RTContainedByStrategyNumber:
+
+ /*
+ * For this operator, the query is a box not a point. We
+ * cheat to the extent of assuming that DatumGetPointP won't
+ * do anything that would be bad for a pointer-to-box.
+ */
+ boxQuery = DatumGetBoxP(in->scankeys[i].sk_argument);
+
+ if (DatumGetBool(DirectFunctionCall2(box_contain_pt,
+ PointerGetDatum(boxQuery),
+ PointerGetDatum(centroid))))
+ {
+ /* centroid is in box, so all quadrants are OK */
+ }
+ else
+ {
+ /* identify quadrant(s) containing all corners of box */
+ Point p;
+ int r = 0;
+
+ p = boxQuery->low;
+ r |= 1 << getQuadrant(centroid, &p);
+ p.y = boxQuery->high.y;
+ r |= 1 << getQuadrant(centroid, &p);
+ p = boxQuery->high;
+ r |= 1 << getQuadrant(centroid, &p);
+ p.x = boxQuery->low.x;
+ r |= 1 << getQuadrant(centroid, &p);
+
+ which &= r;
+ }
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d",
+ in->scankeys[i].sk_strategy);
+ break;
+ }
+
+ if (which == 0)
+ break; /* no need to consider remaining conditions */
+ }
+
+ out->levelAdds = palloc(sizeof(int) * 4);
+ for (i = 0; i < 4; ++i)
+ out->levelAdds[i] = 1;
+
+ /* We must descend into the quadrant(s) identified by which */
+ out->nodeNumbers = (int *) palloc(sizeof(int) * 4);
+ out->nNodes = 0;
+
+ for (i = 1; i <= 4; i++)
+ {
+ if (which & (1 << i))
+ {
+ out->nodeNumbers[out->nNodes] = i - 1;
+
+ if (in->norderbys > 0)
+ {
+ MemoryContext oldCtx = MemoryContextSwitchTo(in->traversalMemoryContext);
+ BOX *quadrant = getQuadrantArea(bbox, centroid, i);
+
+ MemoryContextSwitchTo(oldCtx);
+
+ out->traversalValues[out->nNodes] = quadrant;
+
+ out->distances[out->nNodes] = spg_key_orderbys_distances(BoxPGetDatum(quadrant), false,
+ in->orderbys, in->norderbys);
+ }
+
+ out->nNodes++;
+ }
+ }
+
+ PG_RETURN_VOID();
+}
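+
+/*
+ * Illustrative example of the pruning above: for a scankey "p << (5,5)"
+ * (strictly left-of), a centroid at (7,0) lies right of the query point,
+ * so "which" is masked down to quadrants 3 and 4 and the two right-hand
+ * subtrees are never visited.
+ */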
+
+
+Datum
+spg_quad_leaf_consistent(PG_FUNCTION_ARGS)
+{
+ spgLeafConsistentIn *in = (spgLeafConsistentIn *) PG_GETARG_POINTER(0);
+ spgLeafConsistentOut *out = (spgLeafConsistentOut *) PG_GETARG_POINTER(1);
+ Point *datum = DatumGetPointP(in->leafDatum);
+ bool res;
+ int i;
+
+ /* all tests are exact */
+ out->recheck = false;
+
+ /* leafDatum is what it is... */
+ out->leafValue = in->leafDatum;
+
+ /* Perform the required comparison(s) */
+ res = true;
+ for (i = 0; i < in->nkeys; i++)
+ {
+ Point *query = DatumGetPointP(in->scankeys[i].sk_argument);
+
+ switch (in->scankeys[i].sk_strategy)
+ {
+ case RTLeftStrategyNumber:
+ res = SPTEST(point_left, datum, query);
+ break;
+ case RTRightStrategyNumber:
+ res = SPTEST(point_right, datum, query);
+ break;
+ case RTSameStrategyNumber:
+ res = SPTEST(point_eq, datum, query);
+ break;
+ case RTBelowStrategyNumber:
+ case RTOldBelowStrategyNumber:
+ res = SPTEST(point_below, datum, query);
+ break;
+ case RTAboveStrategyNumber:
+ case RTOldAboveStrategyNumber:
+ res = SPTEST(point_above, datum, query);
+ break;
+ case RTContainedByStrategyNumber:
+
+ /*
+ * For this operator, the query is a box not a point. We
+ * cheat to the extent of assuming that DatumGetPointP won't
+ * do anything that would be bad for a pointer-to-box.
+ */
+ res = SPTEST(box_contain_pt, query, datum);
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d",
+ in->scankeys[i].sk_strategy);
+ break;
+ }
+
+ if (!res)
+ break;
+ }
+
+ if (res && in->norderbys > 0)
+ /* ok, it passes -> let's compute the distances */
+ out->distances = spg_key_orderbys_distances(in->leafDatum, true,
+ in->orderbys, in->norderbys);
+
+ PG_RETURN_BOOL(res);
+}
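+
+/*
+ * Usage sketch (illustrative, not part of this file's API): these support
+ * functions back the quad_point_ops opclass, serving queries such as
+ *
+ *    CREATE INDEX pts_idx ON pts USING spgist (p);
+ *    SELECT * FROM pts WHERE p << point '(5,5)';                -- left-of
+ *    SELECT * FROM pts ORDER BY p <-> point '(0,0)' LIMIT 10;   -- KNN
+ */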
diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c
new file mode 100644
index 0000000..87a345d
--- /dev/null
+++ b/src/backend/access/spgist/spgscan.c
@@ -0,0 +1,1097 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgscan.c
+ * routines for scanning SP-GiST indexes
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/relscan.h"
+#include "access/spgist_private.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "utils/datum.h"
+#include "utils/float.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+typedef void (*storeRes_func) (SpGistScanOpaque so, ItemPointer heapPtr,
+ Datum leafValue, bool isNull,
+ SpGistLeafTuple leafTuple, bool recheck,
+ bool recheckDistances, double *distances);
+
+/*
+ * Pairing heap comparison function for the SpGistSearchItem queue.
+ * KNN searches currently support only NULLS LAST, so we preserve that
+ * ordering here.
+ */
+static int
+pairingheap_SpGistSearchItem_cmp(const pairingheap_node *a,
+ const pairingheap_node *b, void *arg)
+{
+ const SpGistSearchItem *sa = (const SpGistSearchItem *) a;
+ const SpGistSearchItem *sb = (const SpGistSearchItem *) b;
+ SpGistScanOpaque so = (SpGistScanOpaque) arg;
+ int i;
+
+ if (sa->isNull)
+ {
+ if (!sb->isNull)
+ return -1;
+ }
+ else if (sb->isNull)
+ {
+ return 1;
+ }
+ else
+ {
+ /* Order according to distance comparison */
+ for (i = 0; i < so->numberOfNonNullOrderBys; i++)
+ {
+ if (isnan(sa->distances[i]) && isnan(sb->distances[i]))
+ continue; /* NaN == NaN */
+ if (isnan(sa->distances[i]))
+ return -1; /* NaN > number */
+ if (isnan(sb->distances[i]))
+ return 1; /* number < NaN */
+ if (sa->distances[i] != sb->distances[i])
+ return (sa->distances[i] < sb->distances[i]) ? 1 : -1;
+ }
+ }
+
+ /* Leaf items go before inner pages, to ensure a depth-first search */
+ if (sa->isLeaf && !sb->isLeaf)
+ return 1;
+ if (!sa->isLeaf && sb->isLeaf)
+ return -1;
+
+ return 0;
+}
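+
+/*
+ * Explanatory note (a sketch of the queue semantics): the pairing heap
+ * pops the item this comparator ranks highest, so returning 1 for the
+ * smaller distance yields nearest items first, while NULL items, ranked
+ * lowest above, come out last -- the NULLS LAST behavior promised in the
+ * comment preceding the function.
+ */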
+
+static void
+spgFreeSearchItem(SpGistScanOpaque so, SpGistSearchItem *item)
+{
+ /* value is of type attType if isLeaf, else of type attLeafType */
+ /* (no, that is not backwards; yes, it's confusing) */
+ if (!(item->isLeaf ? so->state.attType.attbyval :
+ so->state.attLeafType.attbyval) &&
+ DatumGetPointer(item->value) != NULL)
+ pfree(DatumGetPointer(item->value));
+
+ if (item->leafTuple)
+ pfree(item->leafTuple);
+
+ if (item->traversalValue)
+ pfree(item->traversalValue);
+
+ pfree(item);
+}
+
+/*
+ * Add SpGistSearchItem to queue
+ *
+ * Called in queue context
+ */
+static void
+spgAddSearchItemToQueue(SpGistScanOpaque so, SpGistSearchItem *item)
+{
+ pairingheap_add(so->scanQueue, &item->phNode);
+}
+
+static SpGistSearchItem *
+spgAllocSearchItem(SpGistScanOpaque so, bool isnull, double *distances)
+{
+ /* allocate distance array only for non-NULL items */
+ SpGistSearchItem *item =
+ palloc(SizeOfSpGistSearchItem(isnull ? 0 : so->numberOfNonNullOrderBys));
+
+ item->isNull = isnull;
+
+ if (!isnull && so->numberOfNonNullOrderBys > 0)
+ memcpy(item->distances, distances,
+ sizeof(item->distances[0]) * so->numberOfNonNullOrderBys);
+
+ return item;
+}
+
+static void
+spgAddStartItem(SpGistScanOpaque so, bool isnull)
+{
+ SpGistSearchItem *startEntry =
+ spgAllocSearchItem(so, isnull, so->zeroDistances);
+
+ ItemPointerSet(&startEntry->heapPtr,
+ isnull ? SPGIST_NULL_BLKNO : SPGIST_ROOT_BLKNO,
+ FirstOffsetNumber);
+ startEntry->isLeaf = false;
+ startEntry->level = 0;
+ startEntry->value = (Datum) 0;
+ startEntry->leafTuple = NULL;
+ startEntry->traversalValue = NULL;
+ startEntry->recheck = false;
+ startEntry->recheckDistances = false;
+
+ spgAddSearchItemToQueue(so, startEntry);
+}
+
+/*
+ * Initialize queue to search the root page, resetting
+ * any previously active scan
+ */
+static void
+resetSpGistScanOpaque(SpGistScanOpaque so)
+{
+ MemoryContext oldCtx;
+
+ MemoryContextReset(so->traversalCxt);
+
+ oldCtx = MemoryContextSwitchTo(so->traversalCxt);
+
+	/* initialize the search queue; it is used for all scan types */
+ so->scanQueue = pairingheap_allocate(pairingheap_SpGistSearchItem_cmp, so);
+
+ if (so->searchNulls)
+ /* Add a work item to scan the null index entries */
+ spgAddStartItem(so, true);
+
+ if (so->searchNonNulls)
+ /* Add a work item to scan the non-null index entries */
+ spgAddStartItem(so, false);
+
+ MemoryContextSwitchTo(oldCtx);
+
+ if (so->numberOfOrderBys > 0)
+ {
+ /* Must pfree distances to avoid memory leak */
+ int i;
+
+ for (i = 0; i < so->nPtrs; i++)
+ if (so->distances[i])
+ pfree(so->distances[i]);
+ }
+
+ if (so->want_itup)
+ {
+ /* Must pfree reconstructed tuples to avoid memory leak */
+ int i;
+
+ for (i = 0; i < so->nPtrs; i++)
+ pfree(so->reconTups[i]);
+ }
+ so->iPtr = so->nPtrs = 0;
+}
+
+/*
+ * Prepare scan keys in SpGistScanOpaque from caller-given scan keys
+ *
+ * Sets searchNulls, searchNonNulls, numberOfKeys, keyData fields of *so.
+ *
+ * The point here is to eliminate null-related considerations from what the
+ * opclass consistent functions need to deal with. We assume all SPGiST-
+ * indexable operators are strict, so any null RHS value makes the scan
+ * condition unsatisfiable. We also pull out any IS NULL/IS NOT NULL
+ * conditions; their effect is reflected into searchNulls/searchNonNulls.
+ */
+static void
+spgPrepareScanKeys(IndexScanDesc scan)
+{
+ SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque;
+ bool qual_ok;
+ bool haveIsNull;
+ bool haveNotNull;
+ int nkeys;
+ int i;
+
+ so->numberOfOrderBys = scan->numberOfOrderBys;
+ so->orderByData = scan->orderByData;
+
+ if (so->numberOfOrderBys <= 0)
+ so->numberOfNonNullOrderBys = 0;
+ else
+ {
+ int j = 0;
+
+ /*
+ * Remove all NULL keys, but remember their offsets in the original
+ * array.
+ */
+ for (i = 0; i < scan->numberOfOrderBys; i++)
+ {
+ ScanKey skey = &so->orderByData[i];
+
+ if (skey->sk_flags & SK_ISNULL)
+ so->nonNullOrderByOffsets[i] = -1;
+ else
+ {
+ if (i != j)
+ so->orderByData[j] = *skey;
+
+ so->nonNullOrderByOffsets[i] = j++;
+ }
+ }
+
+ so->numberOfNonNullOrderBys = j;
+ }
+
+ if (scan->numberOfKeys <= 0)
+ {
+ /* If no quals, whole-index scan is required */
+ so->searchNulls = true;
+ so->searchNonNulls = true;
+ so->numberOfKeys = 0;
+ return;
+ }
+
+ /* Examine the given quals */
+ qual_ok = true;
+ haveIsNull = haveNotNull = false;
+ nkeys = 0;
+ for (i = 0; i < scan->numberOfKeys; i++)
+ {
+ ScanKey skey = &scan->keyData[i];
+
+ if (skey->sk_flags & SK_SEARCHNULL)
+ haveIsNull = true;
+ else if (skey->sk_flags & SK_SEARCHNOTNULL)
+ haveNotNull = true;
+ else if (skey->sk_flags & SK_ISNULL)
+ {
+ /* ordinary qual with null argument - unsatisfiable */
+ qual_ok = false;
+ break;
+ }
+ else
+ {
+ /* ordinary qual, propagate into so->keyData */
+ so->keyData[nkeys++] = *skey;
+ /* this effectively creates a not-null requirement */
+ haveNotNull = true;
+ }
+ }
+
+ /* IS NULL in combination with something else is unsatisfiable */
+ if (haveIsNull && haveNotNull)
+ qual_ok = false;
+
+ /* Emit results */
+ if (qual_ok)
+ {
+ so->searchNulls = haveIsNull;
+ so->searchNonNulls = haveNotNull;
+ so->numberOfKeys = nkeys;
+ }
+ else
+ {
+ so->searchNulls = false;
+ so->searchNonNulls = false;
+ so->numberOfKeys = 0;
+ }
+}
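+
+/*
+ * Illustrative examples: a lone "WHERE p IS NULL" yields searchNulls =
+ * true and searchNonNulls = false; combining IS NULL with an ordinary
+ * qual is rejected as unsatisfiable, since any ordinary qual implies a
+ * not-null requirement; a scan with no quals sets both flags, forcing a
+ * whole-index scan.
+ */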
+
+IndexScanDesc
+spgbeginscan(Relation rel, int keysz, int orderbysz)
+{
+ IndexScanDesc scan;
+ SpGistScanOpaque so;
+ int i;
+
+ scan = RelationGetIndexScan(rel, keysz, orderbysz);
+
+ so = (SpGistScanOpaque) palloc0(sizeof(SpGistScanOpaqueData));
+ if (keysz > 0)
+ so->keyData = (ScanKey) palloc(sizeof(ScanKeyData) * keysz);
+ else
+ so->keyData = NULL;
+ initSpGistState(&so->state, scan->indexRelation);
+
+ so->tempCxt = AllocSetContextCreate(CurrentMemoryContext,
+ "SP-GiST search temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+ so->traversalCxt = AllocSetContextCreate(CurrentMemoryContext,
+ "SP-GiST traversal-value context",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * Set up reconTupDesc and xs_hitupdesc in case it's an index-only scan,
+ * making sure that the key column is shown as being of type attType.
+ * (It's rather annoying to do this work when it might be wasted, but for
+ * most opclasses we can re-use the index reldesc instead of making one.)
+ */
+ so->reconTupDesc = scan->xs_hitupdesc =
+ getSpGistTupleDesc(rel, &so->state.attType);
+
+ /* Allocate various arrays needed for order-by scans */
+ if (scan->numberOfOrderBys > 0)
+ {
+ /* This will be filled in spgrescan, but allocate the space here */
+ so->orderByTypes = (Oid *)
+ palloc(sizeof(Oid) * scan->numberOfOrderBys);
+ so->nonNullOrderByOffsets = (int *)
+ palloc(sizeof(int) * scan->numberOfOrderBys);
+
+ /* These arrays have constant contents, so we can fill them now */
+ so->zeroDistances = (double *)
+ palloc(sizeof(double) * scan->numberOfOrderBys);
+ so->infDistances = (double *)
+ palloc(sizeof(double) * scan->numberOfOrderBys);
+
+ for (i = 0; i < scan->numberOfOrderBys; i++)
+ {
+ so->zeroDistances[i] = 0.0;
+ so->infDistances[i] = get_float8_infinity();
+ }
+
+ scan->xs_orderbyvals = (Datum *)
+ palloc0(sizeof(Datum) * scan->numberOfOrderBys);
+ scan->xs_orderbynulls = (bool *)
+ palloc(sizeof(bool) * scan->numberOfOrderBys);
+ memset(scan->xs_orderbynulls, true,
+ sizeof(bool) * scan->numberOfOrderBys);
+ }
+
+ fmgr_info_copy(&so->innerConsistentFn,
+ index_getprocinfo(rel, 1, SPGIST_INNER_CONSISTENT_PROC),
+ CurrentMemoryContext);
+
+ fmgr_info_copy(&so->leafConsistentFn,
+ index_getprocinfo(rel, 1, SPGIST_LEAF_CONSISTENT_PROC),
+ CurrentMemoryContext);
+
+ so->indexCollation = rel->rd_indcollation[0];
+
+ scan->opaque = so;
+
+ return scan;
+}
+
+void
+spgrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
+ ScanKey orderbys, int norderbys)
+{
+ SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque;
+
+ /* copy scankeys into local storage */
+ if (scankey && scan->numberOfKeys > 0)
+ memmove(scan->keyData, scankey,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+
+ /* initialize order-by data if needed */
+ if (orderbys && scan->numberOfOrderBys > 0)
+ {
+ int i;
+
+ memmove(scan->orderByData, orderbys,
+ scan->numberOfOrderBys * sizeof(ScanKeyData));
+
+ for (i = 0; i < scan->numberOfOrderBys; i++)
+ {
+ ScanKey skey = &scan->orderByData[i];
+
+ /*
+ * Look up the datatype returned by the original ordering
+ * operator. SP-GiST always uses a float8 for the distance
+ * function, but the ordering operator could be anything else.
+ *
+ * XXX: The distance function is only allowed to be lossy if the
+ * ordering operator's result type is float4 or float8. Otherwise
+ * we don't know how to return the distance to the executor. But
+ * we cannot check that here, as we won't know if the distance
+ * function is lossy until it returns *recheck = true for the
+ * first time.
+ */
+ so->orderByTypes[i] = get_func_rettype(skey->sk_func.fn_oid);
+ }
+ }
+
+ /* preprocess scankeys, set up the representation in *so */
+ spgPrepareScanKeys(scan);
+
+ /* set up starting queue entries */
+ resetSpGistScanOpaque(so);
+
+ /* count an indexscan for stats */
+ pgstat_count_index_scan(scan->indexRelation);
+}
+
+void
+spgendscan(IndexScanDesc scan)
+{
+ SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque;
+
+ MemoryContextDelete(so->tempCxt);
+ MemoryContextDelete(so->traversalCxt);
+
+ if (so->keyData)
+ pfree(so->keyData);
+
+ if (so->state.leafTupDesc &&
+ so->state.leafTupDesc != RelationGetDescr(so->state.index))
+ FreeTupleDesc(so->state.leafTupDesc);
+
+ if (so->state.deadTupleStorage)
+ pfree(so->state.deadTupleStorage);
+
+ if (scan->numberOfOrderBys > 0)
+ {
+ pfree(so->orderByTypes);
+ pfree(so->nonNullOrderByOffsets);
+ pfree(so->zeroDistances);
+ pfree(so->infDistances);
+ pfree(scan->xs_orderbyvals);
+ pfree(scan->xs_orderbynulls);
+ }
+
+ pfree(so);
+}
+
+/*
+ * Leaf SpGistSearchItem constructor, called in queue context
+ */
+static SpGistSearchItem *
+spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple,
+ Datum leafValue, bool recheck, bool recheckDistances,
+ bool isnull, double *distances)
+{
+ SpGistSearchItem *item = spgAllocSearchItem(so, isnull, distances);
+
+ item->level = level;
+ item->heapPtr = leafTuple->heapPtr;
+
+ /*
+ * If we need the reconstructed value, copy it to queue cxt out of tmp
+ * cxt. Caution: the leaf_consistent method may not have supplied a value
+ * if we didn't ask it to, and mildly-broken methods might supply one of
+ * the wrong type. The correct leafValue type is attType not leafType.
+ */
+ if (so->want_itup)
+ {
+ item->value = isnull ? (Datum) 0 :
+ datumCopy(leafValue, so->state.attType.attbyval,
+ so->state.attType.attlen);
+
+ /*
+ * If we're going to need to reconstruct INCLUDE attributes, store the
+ * whole leaf tuple so we can get the INCLUDE attributes out of it.
+ */
+ if (so->state.leafTupDesc->natts > 1)
+ {
+ item->leafTuple = palloc(leafTuple->size);
+ memcpy(item->leafTuple, leafTuple, leafTuple->size);
+ }
+ else
+ item->leafTuple = NULL;
+ }
+ else
+ {
+ item->value = (Datum) 0;
+ item->leafTuple = NULL;
+ }
+ item->traversalValue = NULL;
+ item->isLeaf = true;
+ item->recheck = recheck;
+ item->recheckDistances = recheckDistances;
+
+ return item;
+}
+
+/*
+ * Test whether a leaf tuple satisfies all the scan keys
+ *
+ * *reportedSome is set to true if:
+ * the scan is not ordered AND the item satisfies the scankeys
+ */
+static bool
+spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item,
+ SpGistLeafTuple leafTuple, bool isnull,
+ bool *reportedSome, storeRes_func storeRes)
+{
+ Datum leafValue;
+ double *distances;
+ bool result;
+ bool recheck;
+ bool recheckDistances;
+
+ if (isnull)
+ {
+ /* Should not have arrived on a nulls page unless nulls are wanted */
+ Assert(so->searchNulls);
+ leafValue = (Datum) 0;
+ distances = NULL;
+ recheck = false;
+ recheckDistances = false;
+ result = true;
+ }
+ else
+ {
+ spgLeafConsistentIn in;
+ spgLeafConsistentOut out;
+
+ /* use temp context for calling leaf_consistent */
+ MemoryContext oldCxt = MemoryContextSwitchTo(so->tempCxt);
+
+ in.scankeys = so->keyData;
+ in.nkeys = so->numberOfKeys;
+ in.orderbys = so->orderByData;
+ in.norderbys = so->numberOfNonNullOrderBys;
+ Assert(!item->isLeaf); /* else reconstructedValue would be wrong type */
+ in.reconstructedValue = item->value;
+ in.traversalValue = item->traversalValue;
+ in.level = item->level;
+ in.returnData = so->want_itup;
+ in.leafDatum = SGLTDATUM(leafTuple, &so->state);
+
+ out.leafValue = (Datum) 0;
+ out.recheck = false;
+ out.distances = NULL;
+ out.recheckDistances = false;
+
+ result = DatumGetBool(FunctionCall2Coll(&so->leafConsistentFn,
+ so->indexCollation,
+ PointerGetDatum(&in),
+ PointerGetDatum(&out)));
+ recheck = out.recheck;
+ recheckDistances = out.recheckDistances;
+ leafValue = out.leafValue;
+ distances = out.distances;
+
+ MemoryContextSwitchTo(oldCxt);
+ }
+
+ if (result)
+ {
+ /* item passes the scankeys */
+ if (so->numberOfNonNullOrderBys > 0)
+ {
+ /* the scan is ordered -> add the item to the queue */
+ MemoryContext oldCxt = MemoryContextSwitchTo(so->traversalCxt);
+ SpGistSearchItem *heapItem = spgNewHeapItem(so, item->level,
+ leafTuple,
+ leafValue,
+ recheck,
+ recheckDistances,
+ isnull,
+ distances);
+
+ spgAddSearchItemToQueue(so, heapItem);
+
+ MemoryContextSwitchTo(oldCxt);
+ }
+ else
+ {
+ /* non-ordered scan, so report the item right away */
+ Assert(!recheckDistances);
+ storeRes(so, &leafTuple->heapPtr, leafValue, isnull,
+ leafTuple, recheck, false, NULL);
+ *reportedSome = true;
+ }
+ }
+
+ return result;
+}
+
+/* A bundle initializer for inner_consistent methods */
+static void
+spgInitInnerConsistentIn(spgInnerConsistentIn *in,
+ SpGistScanOpaque so,
+ SpGistSearchItem *item,
+ SpGistInnerTuple innerTuple)
+{
+ in->scankeys = so->keyData;
+ in->orderbys = so->orderByData;
+ in->nkeys = so->numberOfKeys;
+ in->norderbys = so->numberOfNonNullOrderBys;
+ Assert(!item->isLeaf); /* else reconstructedValue would be wrong type */
+ in->reconstructedValue = item->value;
+ in->traversalMemoryContext = so->traversalCxt;
+ in->traversalValue = item->traversalValue;
+ in->level = item->level;
+ in->returnData = so->want_itup;
+ in->allTheSame = innerTuple->allTheSame;
+ in->hasPrefix = (innerTuple->prefixSize > 0);
+ in->prefixDatum = SGITDATUM(innerTuple, &so->state);
+ in->nNodes = innerTuple->nNodes;
+ in->nodeLabels = spgExtractNodeLabels(&so->state, innerTuple);
+}
+
+static SpGistSearchItem *
+spgMakeInnerItem(SpGistScanOpaque so,
+ SpGistSearchItem *parentItem,
+ SpGistNodeTuple tuple,
+ spgInnerConsistentOut *out, int i, bool isnull,
+ double *distances)
+{
+ SpGistSearchItem *item = spgAllocSearchItem(so, isnull, distances);
+
+ item->heapPtr = tuple->t_tid;
+ item->level = out->levelAdds ? parentItem->level + out->levelAdds[i]
+ : parentItem->level;
+
+ /* Must copy value out of temp context */
+ /* (recall that reconstructed values are of type leafType) */
+ item->value = out->reconstructedValues
+ ? datumCopy(out->reconstructedValues[i],
+ so->state.attLeafType.attbyval,
+ so->state.attLeafType.attlen)
+ : (Datum) 0;
+
+ item->leafTuple = NULL;
+
+ /*
+ * Elements of out.traversalValues should be allocated in
+	 * in.traversalMemoryContext, which is actually a long-lived context of
+	 * the index scan.
+ */
+ item->traversalValue =
+ out->traversalValues ? out->traversalValues[i] : NULL;
+
+ item->isLeaf = false;
+ item->recheck = false;
+ item->recheckDistances = false;
+
+ return item;
+}
+
+static void
+spgInnerTest(SpGistScanOpaque so, SpGistSearchItem *item,
+ SpGistInnerTuple innerTuple, bool isnull)
+{
+ MemoryContext oldCxt = MemoryContextSwitchTo(so->tempCxt);
+ spgInnerConsistentOut out;
+ int nNodes = innerTuple->nNodes;
+ int i;
+
+ memset(&out, 0, sizeof(out));
+
+ if (!isnull)
+ {
+ spgInnerConsistentIn in;
+
+ spgInitInnerConsistentIn(&in, so, item, innerTuple);
+
+ /* use user-defined inner consistent method */
+ FunctionCall2Coll(&so->innerConsistentFn,
+ so->indexCollation,
+ PointerGetDatum(&in),
+ PointerGetDatum(&out));
+ }
+ else
+ {
+ /* force all children to be visited */
+ out.nNodes = nNodes;
+ out.nodeNumbers = (int *) palloc(sizeof(int) * nNodes);
+ for (i = 0; i < nNodes; i++)
+ out.nodeNumbers[i] = i;
+ }
+
+	/* If allTheSame, all of the nodes should match, or none of them */
+ if (innerTuple->allTheSame && out.nNodes != 0 && out.nNodes != nNodes)
+ elog(ERROR, "inconsistent inner_consistent results for allTheSame inner tuple");
+
+ if (out.nNodes)
+ {
+ /* collect node pointers */
+ SpGistNodeTuple node;
+ SpGistNodeTuple *nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * nNodes);
+
+ SGITITERATE(innerTuple, i, node)
+ {
+ nodes[i] = node;
+ }
+
+ MemoryContextSwitchTo(so->traversalCxt);
+
+ for (i = 0; i < out.nNodes; i++)
+ {
+ int nodeN = out.nodeNumbers[i];
+ SpGistSearchItem *innerItem;
+ double *distances;
+
+ Assert(nodeN >= 0 && nodeN < nNodes);
+
+ node = nodes[nodeN];
+
+ if (!ItemPointerIsValid(&node->t_tid))
+ continue;
+
+ /*
+ * Use infinity distances if innerConsistentFn() failed to return
+			 * them or if this is a NULL item (the distances are really unused).
+ */
+ distances = out.distances ? out.distances[i] : so->infDistances;
+
+ innerItem = spgMakeInnerItem(so, item, node, &out, i, isnull,
+ distances);
+
+ spgAddSearchItemToQueue(so, innerItem);
+ }
+ }
+
+ MemoryContextSwitchTo(oldCxt);
+}
+
+/* Returns the next item in an (ordered) scan, or NULL if the index is exhausted */
+static SpGistSearchItem *
+spgGetNextQueueItem(SpGistScanOpaque so)
+{
+ if (pairingheap_is_empty(so->scanQueue))
+		return NULL;			/* Done when the queue is empty */
+
+ /* Return item; caller is responsible to pfree it */
+ return (SpGistSearchItem *) pairingheap_remove_first(so->scanQueue);
+}
+
+enum SpGistSpecialOffsetNumbers
+{
+ SpGistBreakOffsetNumber = InvalidOffsetNumber,
+ SpGistRedirectOffsetNumber = MaxOffsetNumber + 1,
+ SpGistErrorOffsetNumber = MaxOffsetNumber + 2
+};
+
+static OffsetNumber
+spgTestLeafTuple(SpGistScanOpaque so,
+ SpGistSearchItem *item,
+ Page page, OffsetNumber offset,
+ bool isnull, bool isroot,
+ bool *reportedSome,
+ storeRes_func storeRes)
+{
+ SpGistLeafTuple leafTuple = (SpGistLeafTuple)
+ PageGetItem(page, PageGetItemId(page, offset));
+
+ if (leafTuple->tupstate != SPGIST_LIVE)
+ {
+ if (!isroot) /* all tuples on root should be live */
+ {
+ if (leafTuple->tupstate == SPGIST_REDIRECT)
+ {
+ /* redirection tuple should be first in chain */
+ Assert(offset == ItemPointerGetOffsetNumber(&item->heapPtr));
+ /* transfer attention to redirect point */
+ item->heapPtr = ((SpGistDeadTuple) leafTuple)->pointer;
+ Assert(ItemPointerGetBlockNumber(&item->heapPtr) != SPGIST_METAPAGE_BLKNO);
+ return SpGistRedirectOffsetNumber;
+ }
+
+ if (leafTuple->tupstate == SPGIST_DEAD)
+ {
+ /* dead tuple should be first in chain */
+ Assert(offset == ItemPointerGetOffsetNumber(&item->heapPtr));
+ /* No live entries on this page */
+ Assert(SGLT_GET_NEXTOFFSET(leafTuple) == InvalidOffsetNumber);
+ return SpGistBreakOffsetNumber;
+ }
+ }
+
+ /* We should not arrive at a placeholder */
+ elog(ERROR, "unexpected SPGiST tuple state: %d", leafTuple->tupstate);
+ return SpGistErrorOffsetNumber;
+ }
+
+ Assert(ItemPointerIsValid(&leafTuple->heapPtr));
+
+ spgLeafTest(so, item, leafTuple, isnull, reportedSome, storeRes);
+
+ return SGLT_GET_NEXTOFFSET(leafTuple);
+}
+
+/*
+ * Walk the tree and report all tuples passing the scan quals to the storeRes
+ * subroutine.
+ *
+ * If scanWholeIndex is true, we'll do just that. If not, we'll stop at the
+ * next page boundary once we have reported at least one tuple.
+ */
+static void
+spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex,
+ storeRes_func storeRes, Snapshot snapshot)
+{
+ Buffer buffer = InvalidBuffer;
+ bool reportedSome = false;
+
+ while (scanWholeIndex || !reportedSome)
+ {
+ SpGistSearchItem *item = spgGetNextQueueItem(so);
+
+ if (item == NULL)
+ break; /* No more items in queue -> done */
+
+redirect:
+ /* Check for interrupts, just in case of infinite loop */
+ CHECK_FOR_INTERRUPTS();
+
+ if (item->isLeaf)
+ {
+			/* We store heap items in the queue only for ordered searches */
+ Assert(so->numberOfNonNullOrderBys > 0);
+ storeRes(so, &item->heapPtr, item->value, item->isNull,
+ item->leafTuple, item->recheck,
+ item->recheckDistances, item->distances);
+ reportedSome = true;
+ }
+ else
+ {
+ BlockNumber blkno = ItemPointerGetBlockNumber(&item->heapPtr);
+ OffsetNumber offset = ItemPointerGetOffsetNumber(&item->heapPtr);
+ Page page;
+ bool isnull;
+
+ if (buffer == InvalidBuffer)
+ {
+ buffer = ReadBuffer(index, blkno);
+ LockBuffer(buffer, BUFFER_LOCK_SHARE);
+ }
+ else if (blkno != BufferGetBlockNumber(buffer))
+ {
+ UnlockReleaseBuffer(buffer);
+ buffer = ReadBuffer(index, blkno);
+ LockBuffer(buffer, BUFFER_LOCK_SHARE);
+ }
+
+ /* else new pointer points to the same page, no work needed */
+
+ page = BufferGetPage(buffer);
+ TestForOldSnapshot(snapshot, index, page);
+
+ isnull = SpGistPageStoresNulls(page) ? true : false;
+
+ if (SpGistPageIsLeaf(page))
+ {
+				/* Page is a leaf - that is, all its tuples are heap items */
+ OffsetNumber max = PageGetMaxOffsetNumber(page);
+
+ if (SpGistBlockIsRoot(blkno))
+ {
+ /* When root is a leaf, examine all its tuples */
+ for (offset = FirstOffsetNumber; offset <= max; offset++)
+ (void) spgTestLeafTuple(so, item, page, offset,
+ isnull, true,
+ &reportedSome, storeRes);
+ }
+ else
+ {
+ /* Normal case: just examine the chain we arrived at */
+ while (offset != InvalidOffsetNumber)
+ {
+ Assert(offset >= FirstOffsetNumber && offset <= max);
+ offset = spgTestLeafTuple(so, item, page, offset,
+ isnull, false,
+ &reportedSome, storeRes);
+ if (offset == SpGistRedirectOffsetNumber)
+ goto redirect;
+ }
+ }
+ }
+ else /* page is inner */
+ {
+ SpGistInnerTuple innerTuple = (SpGistInnerTuple)
+ PageGetItem(page, PageGetItemId(page, offset));
+
+ if (innerTuple->tupstate != SPGIST_LIVE)
+ {
+ if (innerTuple->tupstate == SPGIST_REDIRECT)
+ {
+ /* transfer attention to redirect point */
+ item->heapPtr = ((SpGistDeadTuple) innerTuple)->pointer;
+ Assert(ItemPointerGetBlockNumber(&item->heapPtr) !=
+ SPGIST_METAPAGE_BLKNO);
+ goto redirect;
+ }
+ elog(ERROR, "unexpected SPGiST tuple state: %d",
+ innerTuple->tupstate);
+ }
+
+ spgInnerTest(so, item, innerTuple, isnull);
+ }
+ }
+
+ /* done with this scan item */
+ spgFreeSearchItem(so, item);
+ /* clear temp context before proceeding to the next one */
+ MemoryContextReset(so->tempCxt);
+ }
+
+ if (buffer != InvalidBuffer)
+ UnlockReleaseBuffer(buffer);
+}
+
+
+/* storeRes subroutine for getbitmap case */
+static void
+storeBitmap(SpGistScanOpaque so, ItemPointer heapPtr,
+ Datum leafValue, bool isnull,
+ SpGistLeafTuple leafTuple, bool recheck,
+ bool recheckDistances, double *distances)
+{
+ Assert(!recheckDistances && !distances);
+ tbm_add_tuples(so->tbm, heapPtr, 1, recheck);
+ so->ntids++;
+}
+
+int64
+spggetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
+{
+ SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque;
+
+ /* Copy want_itup to *so so we don't need to pass it around separately */
+ so->want_itup = false;
+
+ so->tbm = tbm;
+ so->ntids = 0;
+
+ spgWalk(scan->indexRelation, so, true, storeBitmap, scan->xs_snapshot);
+
+ return so->ntids;
+}
+
+/* storeRes subroutine for gettuple case */
+static void
+storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr,
+ Datum leafValue, bool isnull,
+ SpGistLeafTuple leafTuple, bool recheck,
+ bool recheckDistances, double *nonNullDistances)
+{
+ Assert(so->nPtrs < MaxIndexTuplesPerPage);
+ so->heapPtrs[so->nPtrs] = *heapPtr;
+ so->recheck[so->nPtrs] = recheck;
+ so->recheckDistances[so->nPtrs] = recheckDistances;
+
+ if (so->numberOfOrderBys > 0)
+ {
+ if (isnull || so->numberOfNonNullOrderBys <= 0)
+ so->distances[so->nPtrs] = NULL;
+ else
+ {
+ IndexOrderByDistance *distances =
+ palloc(sizeof(distances[0]) * so->numberOfOrderBys);
+ int i;
+
+ for (i = 0; i < so->numberOfOrderBys; i++)
+ {
+ int offset = so->nonNullOrderByOffsets[i];
+
+ if (offset >= 0)
+ {
+ /* Copy non-NULL distance value */
+ distances[i].value = nonNullDistances[offset];
+ distances[i].isnull = false;
+ }
+ else
+ {
+ /* Set distance's NULL flag. */
+ distances[i].value = 0.0;
+ distances[i].isnull = true;
+ }
+ }
+
+ so->distances[so->nPtrs] = distances;
+ }
+ }
+
+ if (so->want_itup)
+ {
+ /*
+ * Reconstruct index data. We have to copy the datum out of the temp
+ * context anyway, so we may as well create the tuple here.
+ */
+ Datum leafDatums[INDEX_MAX_KEYS];
+ bool leafIsnulls[INDEX_MAX_KEYS];
+
+ /* We only need to deform the old tuple if it has INCLUDE attributes */
+ if (so->state.leafTupDesc->natts > 1)
+ spgDeformLeafTuple(leafTuple, so->state.leafTupDesc,
+ leafDatums, leafIsnulls, isnull);
+
+ leafDatums[spgKeyColumn] = leafValue;
+ leafIsnulls[spgKeyColumn] = isnull;
+
+ so->reconTups[so->nPtrs] = heap_form_tuple(so->reconTupDesc,
+ leafDatums,
+ leafIsnulls);
+ }
+ so->nPtrs++;
+}
+
+bool
+spggettuple(IndexScanDesc scan, ScanDirection dir)
+{
+ SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque;
+
+ if (dir != ForwardScanDirection)
+ elog(ERROR, "SP-GiST only supports forward scan direction");
+
+ /* Copy want_itup to *so so we don't need to pass it around separately */
+ so->want_itup = scan->xs_want_itup;
+
+ for (;;)
+ {
+ if (so->iPtr < so->nPtrs)
+ {
+ /* continuing to return reported tuples */
+ scan->xs_heaptid = so->heapPtrs[so->iPtr];
+ scan->xs_recheck = so->recheck[so->iPtr];
+ scan->xs_hitup = so->reconTups[so->iPtr];
+
+ if (so->numberOfOrderBys > 0)
+ index_store_float8_orderby_distances(scan, so->orderByTypes,
+ so->distances[so->iPtr],
+ so->recheckDistances[so->iPtr]);
+ so->iPtr++;
+ return true;
+ }
+
+ if (so->numberOfOrderBys > 0)
+ {
+ /* Must pfree distances to avoid memory leak */
+ int i;
+
+ for (i = 0; i < so->nPtrs; i++)
+ if (so->distances[i])
+ pfree(so->distances[i]);
+ }
+
+ if (so->want_itup)
+ {
+ /* Must pfree reconstructed tuples to avoid memory leak */
+ int i;
+
+ for (i = 0; i < so->nPtrs; i++)
+ pfree(so->reconTups[i]);
+ }
+ so->iPtr = so->nPtrs = 0;
+
+ spgWalk(scan->indexRelation, so, false, storeGettuple,
+ scan->xs_snapshot);
+
+ if (so->nPtrs == 0)
+ break; /* must have completed scan */
+ }
+
+ return false;
+}
+
+bool
+spgcanreturn(Relation index, int attno)
+{
+ SpGistCache *cache;
+
+ /* INCLUDE attributes can always be fetched for index-only scans */
+ if (attno > 1)
+ return true;
+
+ /* We can do it if the opclass config function says so */
+ cache = spgGetCache(index);
+
+ return cache->config.canReturnData;
+}
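+
+/*
+ * For example (illustrative): the point and text opclasses in this
+ * directory set canReturnData = true, enabling index-only scans on the
+ * key column, whereas an opclass that stores only a lossy form of the
+ * key would have to report false here for attno 1.
+ */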
diff --git a/src/backend/access/spgist/spgtextproc.c b/src/backend/access/spgist/spgtextproc.c
new file mode 100644
index 0000000..199d921
--- /dev/null
+++ b/src/backend/access/spgist/spgtextproc.c
@@ -0,0 +1,699 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgtextproc.c
+ * implementation of radix tree (compressed trie) over text
+ *
+ * In a text_ops SPGiST index, inner tuples can have a prefix which is the
+ * common prefix of all strings indexed under that tuple. The node labels
+ * represent the next byte of the string(s) after the prefix. Assuming we
+ * always use the longest possible prefix, we will get more than one node
+ * label unless the prefix length is restricted by SPGIST_MAX_PREFIX_LENGTH.
+ *
+ * To reconstruct the indexed string for any index entry, concatenate the
+ * inner-tuple prefixes and node labels starting at the root and working
+ * down to the leaf entry, then append the datum in the leaf entry.
+ * (While descending the tree, "level" is the number of bytes reconstructed
+ * so far.)
+ *
+ * However, there are two special cases for node labels: -1 indicates that
+ * there are no more bytes after the prefix-so-far, and -2 indicates that we
+ * had to split an existing allTheSame tuple (in such a case we have to create
+ * a node label that doesn't correspond to any string byte). In either case,
+ * the node label does not contribute anything to the reconstructed string.
+ *
+ * Previously, we used a node label of zero for both special cases, but
+ * this was problematic because one can't tell whether a string ending at
+ * the current level can be pushed down into such a child node. For
+ * backwards compatibility, we still support such node labels for reading;
+ * but no new entries will ever be pushed down into a zero-labeled child.
+ * No new entries ever get pushed into a -2-labeled child, either.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgtextproc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/spgist.h"
+#include "catalog/pg_type.h"
+#include "mb/pg_wchar.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/pg_locale.h"
+#include "utils/varlena.h"
+
+
+/*
+ * In the worst case, an inner tuple in a text radix tree could have as many
+ * as 258 nodes (one for each possible byte value, plus the two special
+ * cases). Each node can take 16 bytes on MAXALIGN=8 machines. The inner
+ * tuple must fit on an index page of size BLCKSZ. Rather than assuming we
+ * know the exact amount of overhead imposed by page headers, tuple headers,
+ * etc, we leave 100 bytes for that (the actual overhead should be no more
+ * than 56 bytes at this writing, so there is slop in this number).
+ * So we can safely create prefixes up to BLCKSZ - 258 * 16 - 100 bytes long.
+ * Unfortunately, because 258 * 16 is over 4K, there is no safe prefix length
+ * when BLCKSZ is less than 8K; it is always possible to get "SPGiST inner
+ * tuple size exceeds maximum" if there are too many distinct next-byte values
+ * at a given place in the tree. Since use of nonstandard block sizes appears
+ * to be negligible in the field, we just live with that fact for now,
+ * choosing a max prefix size of 32 bytes when BLCKSZ is configured smaller
+ * than default.
+ */
+#define SPGIST_MAX_PREFIX_LENGTH Max((int) (BLCKSZ - 258 * 16 - 100), 32)
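+
+/*
+ * With the default BLCKSZ of 8192, this works out to 8192 - 4128 - 100 =
+ * 3964 bytes; with BLCKSZ of 4096 or less the first branch of the Max()
+ * is negative, so the 32-byte fallback applies.
+ */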
+
+/*
+ * Strategy for collation aware operator on text is equal to btree strategy
+ * plus value of 10.
+ *
+ * Current collation aware strategies and their corresponding btree strategies:
+ * 11 BTLessStrategyNumber
+ * 12 BTLessEqualStrategyNumber
+ * 14 BTGreaterEqualStrategyNumber
+ * 15 BTGreaterStrategyNumber
+ */
+#define SPG_STRATEGY_ADDITION (10)
+#define SPG_IS_COLLATION_AWARE_STRATEGY(s) ((s) > SPG_STRATEGY_ADDITION \
+ && (s) != RTPrefixStrategyNumber)
+
+/* Struct for sorting values in picksplit */
+typedef struct spgNodePtr
+{
+ Datum d;
+ int i;
+ int16 c;
+} spgNodePtr;
+
+
+Datum
+spg_text_config(PG_FUNCTION_ARGS)
+{
+ /* spgConfigIn *cfgin = (spgConfigIn *) PG_GETARG_POINTER(0); */
+ spgConfigOut *cfg = (spgConfigOut *) PG_GETARG_POINTER(1);
+
+ cfg->prefixType = TEXTOID;
+ cfg->labelType = INT2OID;
+ cfg->canReturnData = true;
+ cfg->longValuesOK = true; /* suffixing will shorten long values */
+ PG_RETURN_VOID();
+}
+
+/*
+ * Form a text datum from the given not-necessarily-null-terminated string,
+ * using short varlena header format if possible
+ */
+static Datum
+formTextDatum(const char *data, int datalen)
+{
+ char *p;
+
+ p = (char *) palloc(datalen + VARHDRSZ);
+
+ if (datalen + VARHDRSZ_SHORT <= VARATT_SHORT_MAX)
+ {
+ SET_VARSIZE_SHORT(p, datalen + VARHDRSZ_SHORT);
+ if (datalen)
+ memcpy(p + VARHDRSZ_SHORT, data, datalen);
+ }
+ else
+ {
+ SET_VARSIZE(p, datalen + VARHDRSZ);
+ memcpy(p + VARHDRSZ, data, datalen);
+ }
+
+ return PointerGetDatum(p);
+}
+
+/*
+ * Find the length of the common prefix of a and b
+ */
+static int
+commonPrefix(const char *a, const char *b, int lena, int lenb)
+{
+ int i = 0;
+
+ while (i < lena && i < lenb && *a == *b)
+ {
+ a++;
+ b++;
+ i++;
+ }
+
+ return i;
+}
+
+/*
+ * Binary search an array of int16 datums for a match to c
+ *
+ * On success, *i gets the match location; on failure, it gets where to insert
+ */
+static bool
+searchChar(Datum *nodeLabels, int nNodes, int16 c, int *i)
+{
+ int StopLow = 0,
+ StopHigh = nNodes;
+
+ while (StopLow < StopHigh)
+ {
+ int StopMiddle = (StopLow + StopHigh) >> 1;
+ int16 middle = DatumGetInt16(nodeLabels[StopMiddle]);
+
+ if (c < middle)
+ StopHigh = StopMiddle;
+ else if (c > middle)
+ StopLow = StopMiddle + 1;
+ else
+ {
+ *i = StopMiddle;
+ return true;
+ }
+ }
+
+ *i = StopHigh;
+ return false;
+}
+
+Datum
+spg_text_choose(PG_FUNCTION_ARGS)
+{
+ spgChooseIn *in = (spgChooseIn *) PG_GETARG_POINTER(0);
+ spgChooseOut *out = (spgChooseOut *) PG_GETARG_POINTER(1);
+ text *inText = DatumGetTextPP(in->datum);
+ char *inStr = VARDATA_ANY(inText);
+ int inSize = VARSIZE_ANY_EXHDR(inText);
+ char *prefixStr = NULL;
+ int prefixSize = 0;
+ int commonLen = 0;
+ int16 nodeChar = 0;
+ int i = 0;
+
+ /* Check for prefix match, set nodeChar to first byte after prefix */
+ if (in->hasPrefix)
+ {
+ text *prefixText = DatumGetTextPP(in->prefixDatum);
+
+ prefixStr = VARDATA_ANY(prefixText);
+ prefixSize = VARSIZE_ANY_EXHDR(prefixText);
+
+ commonLen = commonPrefix(inStr + in->level,
+ prefixStr,
+ inSize - in->level,
+ prefixSize);
+
+ if (commonLen == prefixSize)
+ {
+ if (inSize - in->level > commonLen)
+ nodeChar = *(unsigned char *) (inStr + in->level + commonLen);
+ else
+ nodeChar = -1;
+ }
+ else
+ {
+ /* Must split tuple because incoming value doesn't match prefix */
+ out->resultType = spgSplitTuple;
+
+ if (commonLen == 0)
+ {
+ out->result.splitTuple.prefixHasPrefix = false;
+ }
+ else
+ {
+ out->result.splitTuple.prefixHasPrefix = true;
+ out->result.splitTuple.prefixPrefixDatum =
+ formTextDatum(prefixStr, commonLen);
+ }
+ out->result.splitTuple.prefixNNodes = 1;
+ out->result.splitTuple.prefixNodeLabels =
+ (Datum *) palloc(sizeof(Datum));
+ out->result.splitTuple.prefixNodeLabels[0] =
+ Int16GetDatum(*(unsigned char *) (prefixStr + commonLen));
+
+ out->result.splitTuple.childNodeN = 0;
+
+ if (prefixSize - commonLen == 1)
+ {
+ out->result.splitTuple.postfixHasPrefix = false;
+ }
+ else
+ {
+ out->result.splitTuple.postfixHasPrefix = true;
+ out->result.splitTuple.postfixPrefixDatum =
+ formTextDatum(prefixStr + commonLen + 1,
+ prefixSize - commonLen - 1);
+ }
+
+ PG_RETURN_VOID();
+ }
+ }
+ else if (inSize > in->level)
+ {
+ nodeChar = *(unsigned char *) (inStr + in->level);
+ }
+ else
+ {
+ nodeChar = -1;
+ }
+
+ /* Look up nodeChar in the node label array */
+ if (searchChar(in->nodeLabels, in->nNodes, nodeChar, &i))
+ {
+ /*
+ * Descend to existing node. (If in->allTheSame, the core code will
+ * ignore our nodeN specification here, but that's OK. We still have
+ * to provide the correct levelAdd and restDatum values, and those are
+ * the same regardless of which node gets chosen by core.)
+ */
+ int levelAdd;
+
+ out->resultType = spgMatchNode;
+ out->result.matchNode.nodeN = i;
+ levelAdd = commonLen;
+ if (nodeChar >= 0)
+ levelAdd++;
+ out->result.matchNode.levelAdd = levelAdd;
+ if (inSize - in->level - levelAdd > 0)
+ out->result.matchNode.restDatum =
+ formTextDatum(inStr + in->level + levelAdd,
+ inSize - in->level - levelAdd);
+ else
+ out->result.matchNode.restDatum =
+ formTextDatum(NULL, 0);
+ }
+ else if (in->allTheSame)
+ {
+ /*
+ * Can't use AddNode action, so split the tuple. The upper tuple has
+ * the same prefix as before and uses a dummy node label -2 for the
+ * lower tuple. The lower tuple has no prefix and the same node
+ * labels as the original tuple.
+ *
+ * Note: it might seem tempting to shorten the upper tuple's prefix,
+ * if it has one, then use its last byte as label for the lower tuple.
+ * But that doesn't win since we know the incoming value matches the
+ * whole prefix: we'd just end up splitting the lower tuple again.
+ */
+ out->resultType = spgSplitTuple;
+ out->result.splitTuple.prefixHasPrefix = in->hasPrefix;
+ out->result.splitTuple.prefixPrefixDatum = in->prefixDatum;
+ out->result.splitTuple.prefixNNodes = 1;
+ out->result.splitTuple.prefixNodeLabels = (Datum *) palloc(sizeof(Datum));
+ out->result.splitTuple.prefixNodeLabels[0] = Int16GetDatum(-2);
+ out->result.splitTuple.childNodeN = 0;
+ out->result.splitTuple.postfixHasPrefix = false;
+ }
+ else
+ {
+ /* Add a node for the not-previously-seen nodeChar value */
+ out->resultType = spgAddNode;
+ out->result.addNode.nodeLabel = Int16GetDatum(nodeChar);
+ out->result.addNode.nodeN = i;
+ }
+
+ PG_RETURN_VOID();
+}
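+
+/*
+ * Worked example (illustrative): suppose an inner tuple has prefix "cat"
+ * and the incoming value at level 0 is "car".  commonPrefix() returns 2,
+ * less than the prefix length, so we split: the upper tuple keeps prefix
+ * "ca" with one node labeled 't' pointing at the lower tuple, and since
+ * exactly one prefix byte remains, the lower tuple gets no prefix.  The
+ * core code then retries the insertion, which this time falls through to
+ * the spgAddNode case to add a node labeled 'r'.
+ */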
+
+/* qsort comparator to sort spgNodePtr structs by "c" */
+static int
+cmpNodePtr(const void *a, const void *b)
+{
+ const spgNodePtr *aa = (const spgNodePtr *) a;
+ const spgNodePtr *bb = (const spgNodePtr *) b;
+
+ return aa->c - bb->c;
+}
+
+Datum
+spg_text_picksplit(PG_FUNCTION_ARGS)
+{
+ spgPickSplitIn *in = (spgPickSplitIn *) PG_GETARG_POINTER(0);
+ spgPickSplitOut *out = (spgPickSplitOut *) PG_GETARG_POINTER(1);
+ text *text0 = DatumGetTextPP(in->datums[0]);
+ int i,
+ commonLen;
+ spgNodePtr *nodes;
+
+ /* Identify longest common prefix, if any */
+ commonLen = VARSIZE_ANY_EXHDR(text0);
+ for (i = 1; i < in->nTuples && commonLen > 0; i++)
+ {
+ text *texti = DatumGetTextPP(in->datums[i]);
+ int tmp = commonPrefix(VARDATA_ANY(text0),
+ VARDATA_ANY(texti),
+ VARSIZE_ANY_EXHDR(text0),
+ VARSIZE_ANY_EXHDR(texti));
+
+ if (tmp < commonLen)
+ commonLen = tmp;
+ }
+
+ /*
+ * Limit the prefix length, if necessary, to ensure that the resulting
+ * inner tuple will fit on a page.
+ */
+ commonLen = Min(commonLen, SPGIST_MAX_PREFIX_LENGTH);
+
+ /* Set node prefix to be that string, if it's not empty */
+ if (commonLen == 0)
+ {
+ out->hasPrefix = false;
+ }
+ else
+ {
+ out->hasPrefix = true;
+ out->prefixDatum = formTextDatum(VARDATA_ANY(text0), commonLen);
+ }
+
+ /* Extract the node label (first non-common byte) from each value */
+ nodes = (spgNodePtr *) palloc(sizeof(spgNodePtr) * in->nTuples);
+
+ for (i = 0; i < in->nTuples; i++)
+ {
+ text *texti = DatumGetTextPP(in->datums[i]);
+
+ if (commonLen < VARSIZE_ANY_EXHDR(texti))
+ nodes[i].c = *(unsigned char *) (VARDATA_ANY(texti) + commonLen);
+ else
+ nodes[i].c = -1; /* use -1 if string is all common */
+ nodes[i].i = i;
+ nodes[i].d = in->datums[i];
+ }
+
+ /*
+ * Sort by label values so that we can group the values into nodes. This
+ * also ensures that the nodes are ordered by label value, allowing the
+ * use of binary search in searchChar.
+ */
+ qsort(nodes, in->nTuples, sizeof(*nodes), cmpNodePtr);
+
+ /* And emit results */
+ out->nNodes = 0;
+ out->nodeLabels = (Datum *) palloc(sizeof(Datum) * in->nTuples);
+ out->mapTuplesToNodes = (int *) palloc(sizeof(int) * in->nTuples);
+ out->leafTupleDatums = (Datum *) palloc(sizeof(Datum) * in->nTuples);
+
+ for (i = 0; i < in->nTuples; i++)
+ {
+ text *texti = DatumGetTextPP(nodes[i].d);
+ Datum leafD;
+
+ if (i == 0 || nodes[i].c != nodes[i - 1].c)
+ {
+ out->nodeLabels[out->nNodes] = Int16GetDatum(nodes[i].c);
+ out->nNodes++;
+ }
+
+ if (commonLen < VARSIZE_ANY_EXHDR(texti))
+ leafD = formTextDatum(VARDATA_ANY(texti) + commonLen + 1,
+ VARSIZE_ANY_EXHDR(texti) - commonLen - 1);
+ else
+ leafD = formTextDatum(NULL, 0);
+
+ out->leafTupleDatums[nodes[i].i] = leafD;
+ out->mapTuplesToNodes[nodes[i].i] = out->nNodes - 1;
+ }
+
+ PG_RETURN_VOID();
+}
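+
+/*
+ * Worked example (illustrative): for the values "catalog", "cathode" and
+ * "cattle", the longest common prefix is "cat" (commonLen = 3), so the
+ * inner tuple gets prefix "cat" with three nodes labeled 'a', 'h' and
+ * 't', and the leaf datums become the remainders "log", "ode" and "le".
+ */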
+
+Datum
+spg_text_inner_consistent(PG_FUNCTION_ARGS)
+{
+ spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0);
+ spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1);
+ bool collate_is_c = lc_collate_is_c(PG_GET_COLLATION());
+ text *reconstructedValue;
+ text *reconstrText;
+ int maxReconstrLen;
+ text *prefixText = NULL;
+ int prefixSize = 0;
+ int i;
+
+ /*
+ * Reconstruct values represented at this tuple, including parent data,
+ * prefix of this tuple if any, and the node label if it's non-dummy.
+ * in->level should be the length of the previously reconstructed value,
+ * and the number of bytes added here is prefixSize or prefixSize + 1.
+ *
+ * Note: we assume that in->reconstructedValue isn't toasted and doesn't
+ * have a short varlena header. This is okay because it must have been
+ * created by a previous invocation of this routine, and we always emit
+ * long-format reconstructed values.
+ */
+ reconstructedValue = (text *) DatumGetPointer(in->reconstructedValue);
+ Assert(reconstructedValue == NULL ? in->level == 0 :
+ VARSIZE_ANY_EXHDR(reconstructedValue) == in->level);
+
+ maxReconstrLen = in->level + 1;
+ if (in->hasPrefix)
+ {
+ prefixText = DatumGetTextPP(in->prefixDatum);
+ prefixSize = VARSIZE_ANY_EXHDR(prefixText);
+ maxReconstrLen += prefixSize;
+ }
+
+ reconstrText = palloc(VARHDRSZ + maxReconstrLen);
+ SET_VARSIZE(reconstrText, VARHDRSZ + maxReconstrLen);
+
+ if (in->level)
+ memcpy(VARDATA(reconstrText),
+ VARDATA(reconstructedValue),
+ in->level);
+ if (prefixSize)
+ memcpy(((char *) VARDATA(reconstrText)) + in->level,
+ VARDATA_ANY(prefixText),
+ prefixSize);
+ /* last byte of reconstrText will be filled in below */
+
+ /*
+ * Scan the child nodes. For each one, complete the reconstructed value
+ * and see if it's consistent with the query. If so, emit an entry into
+ * the output arrays.
+ */
+ out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes);
+ out->levelAdds = (int *) palloc(sizeof(int) * in->nNodes);
+ out->reconstructedValues = (Datum *) palloc(sizeof(Datum) * in->nNodes);
+ out->nNodes = 0;
+
+ for (i = 0; i < in->nNodes; i++)
+ {
+ int16 nodeChar = DatumGetInt16(in->nodeLabels[i]);
+ int thisLen;
+ bool res = true;
+ int j;
+
+ /* If nodeChar is a dummy value, don't include it in data */
+ if (nodeChar <= 0)
+ thisLen = maxReconstrLen - 1;
+ else
+ {
+ ((unsigned char *) VARDATA(reconstrText))[maxReconstrLen - 1] = nodeChar;
+ thisLen = maxReconstrLen;
+ }
+
+ for (j = 0; j < in->nkeys; j++)
+ {
+ StrategyNumber strategy = in->scankeys[j].sk_strategy;
+ text *inText;
+ int inSize;
+ int r;
+
+ /*
+ * If it's a collation-aware operator, but the collation is C, we
+ * can treat it as non-collation-aware. With non-C collation we
+			 * need to traverse the whole tree :-( so there's no point in making
+ * any check here. (Note also that our reconstructed value may
+ * well end with a partial multibyte character, so that applying
+ * any encoding-sensitive test to it would be risky anyhow.)
+ */
+ if (SPG_IS_COLLATION_AWARE_STRATEGY(strategy))
+ {
+ if (collate_is_c)
+ strategy -= SPG_STRATEGY_ADDITION;
+ else
+ continue;
+ }
+
+ inText = DatumGetTextPP(in->scankeys[j].sk_argument);
+ inSize = VARSIZE_ANY_EXHDR(inText);
+
+ r = memcmp(VARDATA(reconstrText), VARDATA_ANY(inText),
+ Min(inSize, thisLen));
+
+ switch (strategy)
+ {
+ case BTLessStrategyNumber:
+ case BTLessEqualStrategyNumber:
+ if (r > 0)
+ res = false;
+ break;
+ case BTEqualStrategyNumber:
+ if (r != 0 || inSize < thisLen)
+ res = false;
+ break;
+ case BTGreaterEqualStrategyNumber:
+ case BTGreaterStrategyNumber:
+ if (r < 0)
+ res = false;
+ break;
+ case RTPrefixStrategyNumber:
+ if (r != 0)
+ res = false;
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d",
+ in->scankeys[j].sk_strategy);
+ break;
+ }
+
+ if (!res)
+ break; /* no need to consider remaining conditions */
+ }
+
+ if (res)
+ {
+ out->nodeNumbers[out->nNodes] = i;
+ out->levelAdds[out->nNodes] = thisLen - in->level;
+ SET_VARSIZE(reconstrText, VARHDRSZ + thisLen);
+ out->reconstructedValues[out->nNodes] =
+ datumCopy(PointerGetDatum(reconstrText), false, -1);
+ out->nNodes++;
+ }
+ }
+
+ PG_RETURN_VOID();
+}
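+
+/*
+ * Illustrative example of the pruning above: if the value reconstructed
+ * so far is "ca" and a node is labeled 't', the candidate becomes "cat"
+ * (thisLen = 3); an equality scankey of "car" then fails the memcmp()
+ * test, so that child subtree is skipped entirely.
+ */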
+
+Datum
+spg_text_leaf_consistent(PG_FUNCTION_ARGS)
+{
+ spgLeafConsistentIn *in = (spgLeafConsistentIn *) PG_GETARG_POINTER(0);
+ spgLeafConsistentOut *out = (spgLeafConsistentOut *) PG_GETARG_POINTER(1);
+ int level = in->level;
+ text *leafValue,
+ *reconstrValue = NULL;
+ char *fullValue;
+ int fullLen;
+ bool res;
+ int j;
+
+ /* all tests are exact */
+ out->recheck = false;
+
+ leafValue = DatumGetTextPP(in->leafDatum);
+
+ /* As above, in->reconstructedValue isn't toasted or short. */
+ if (DatumGetPointer(in->reconstructedValue))
+ reconstrValue = (text *) DatumGetPointer(in->reconstructedValue);
+
+ Assert(reconstrValue == NULL ? level == 0 :
+ VARSIZE_ANY_EXHDR(reconstrValue) == level);
+
+ /* Reconstruct the full string represented by this leaf tuple */
+ fullLen = level + VARSIZE_ANY_EXHDR(leafValue);
+ if (VARSIZE_ANY_EXHDR(leafValue) == 0 && level > 0)
+ {
+ fullValue = VARDATA(reconstrValue);
+ out->leafValue = PointerGetDatum(reconstrValue);
+ }
+ else
+ {
+ text *fullText = palloc(VARHDRSZ + fullLen);
+
+ SET_VARSIZE(fullText, VARHDRSZ + fullLen);
+ fullValue = VARDATA(fullText);
+ if (level)
+ memcpy(fullValue, VARDATA(reconstrValue), level);
+ if (VARSIZE_ANY_EXHDR(leafValue) > 0)
+ memcpy(fullValue + level, VARDATA_ANY(leafValue),
+ VARSIZE_ANY_EXHDR(leafValue));
+ out->leafValue = PointerGetDatum(fullText);
+ }
+
+ /* Perform the required comparison(s) */
+ res = true;
+ for (j = 0; j < in->nkeys; j++)
+ {
+ StrategyNumber strategy = in->scankeys[j].sk_strategy;
+ text *query = DatumGetTextPP(in->scankeys[j].sk_argument);
+ int queryLen = VARSIZE_ANY_EXHDR(query);
+ int r;
+
+ if (strategy == RTPrefixStrategyNumber)
+ {
+ /*
+			 * If level >= the length of the query, then reconstrValue must
+			 * begin with the query (prefix) string, so we don't need to
+			 * check it again.
+ */
+ res = (level >= queryLen) ||
+ DatumGetBool(DirectFunctionCall2Coll(text_starts_with,
+ PG_GET_COLLATION(),
+ out->leafValue,
+ PointerGetDatum(query)));
+
+ if (!res) /* no need to consider remaining conditions */
+ break;
+
+ continue;
+ }
+
+ if (SPG_IS_COLLATION_AWARE_STRATEGY(strategy))
+ {
+ /* Collation-aware comparison */
+ strategy -= SPG_STRATEGY_ADDITION;
+
+ /* If asserts enabled, verify encoding of reconstructed string */
+ Assert(pg_verifymbstr(fullValue, fullLen, false));
+
+ r = varstr_cmp(fullValue, fullLen,
+ VARDATA_ANY(query), queryLen,
+ PG_GET_COLLATION());
+ }
+ else
+ {
+ /* Non-collation-aware comparison */
+ r = memcmp(fullValue, VARDATA_ANY(query), Min(queryLen, fullLen));
+
+ if (r == 0)
+ {
+ if (queryLen > fullLen)
+ r = -1;
+ else if (queryLen < fullLen)
+ r = 1;
+ }
+ }
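+
+ /*
+ * Worked example (illustrative): with query 'ca' (queryLen = 2) and
+ * full value 'cat' (fullLen = 3), memcmp over Min(2, 3) = 2 bytes
+ * returns 0 and the length tiebreak above sets r = 1; the switch below
+ * then accepts the leaf for BTGreaterStrategyNumber ('cat' > 'ca') but
+ * rejects it for BTEqualStrategyNumber.
+ */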
+
+ switch (strategy)
+ {
+ case BTLessStrategyNumber:
+ res = (r < 0);
+ break;
+ case BTLessEqualStrategyNumber:
+ res = (r <= 0);
+ break;
+ case BTEqualStrategyNumber:
+ res = (r == 0);
+ break;
+ case BTGreaterEqualStrategyNumber:
+ res = (r >= 0);
+ break;
+ case BTGreaterStrategyNumber:
+ res = (r > 0);
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d",
+ in->scankeys[j].sk_strategy);
+ res = false;
+ break;
+ }
+
+ if (!res)
+ break; /* no need to consider remaining conditions */
+ }
+
+ PG_RETURN_BOOL(res);
+}
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
new file mode 100644
index 0000000..f2da02e
--- /dev/null
+++ b/src/backend/access/spgist/spgutils.c
@@ -0,0 +1,1349 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgutils.c
+ * various support functions for SP-GiST
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgutils.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/amvalidate.h"
+#include "access/htup_details.h"
+#include "access/reloptions.h"
+#include "access/spgist_private.h"
+#include "access/toast_compression.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "catalog/pg_amop.h"
+#include "commands/vacuum.h"
+#include "nodes/nodeFuncs.h"
+#include "parser/parse_coerce.h"
+#include "storage/bufmgr.h"
+#include "storage/indexfsm.h"
+#include "storage/lmgr.h"
+#include "utils/builtins.h"
+#include "utils/catcache.h"
+#include "utils/index_selfuncs.h"
+#include "utils/lsyscache.h"
+#include "utils/syscache.h"
+
+
+/*
+ * SP-GiST handler function: return IndexAmRoutine with access method parameters
+ * and callbacks.
+ */
+Datum
+spghandler(PG_FUNCTION_ARGS)
+{
+ IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
+
+ amroutine->amstrategies = 0;
+ amroutine->amsupport = SPGISTNProc;
+ amroutine->amoptsprocnum = SPGIST_OPTIONS_PROC;
+ amroutine->amcanorder = false;
+ amroutine->amcanorderbyop = true;
+ amroutine->amcanbackward = false;
+ amroutine->amcanunique = false;
+ amroutine->amcanmulticol = false;
+ amroutine->amoptionalkey = true;
+ amroutine->amsearcharray = false;
+ amroutine->amsearchnulls = true;
+ amroutine->amstorage = true;
+ amroutine->amclusterable = false;
+ amroutine->ampredlocks = false;
+ amroutine->amcanparallel = false;
+ amroutine->amcaninclude = true;
+ amroutine->amusemaintenanceworkmem = false;
+ amroutine->amparallelvacuumoptions =
+ VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
+ amroutine->amkeytype = InvalidOid;
+
+ amroutine->ambuild = spgbuild;
+ amroutine->ambuildempty = spgbuildempty;
+ amroutine->aminsert = spginsert;
+ amroutine->ambulkdelete = spgbulkdelete;
+ amroutine->amvacuumcleanup = spgvacuumcleanup;
+ amroutine->amcanreturn = spgcanreturn;
+ amroutine->amcostestimate = spgcostestimate;
+ amroutine->amoptions = spgoptions;
+ amroutine->amproperty = spgproperty;
+ amroutine->ambuildphasename = NULL;
+ amroutine->amvalidate = spgvalidate;
+ amroutine->amadjustmembers = spgadjustmembers;
+ amroutine->ambeginscan = spgbeginscan;
+ amroutine->amrescan = spgrescan;
+ amroutine->amgettuple = spggettuple;
+ amroutine->amgetbitmap = spggetbitmap;
+ amroutine->amendscan = spgendscan;
+ amroutine->ammarkpos = NULL;
+ amroutine->amrestrpos = NULL;
+ amroutine->amestimateparallelscan = NULL;
+ amroutine->aminitparallelscan = NULL;
+ amroutine->amparallelrescan = NULL;
+
+ PG_RETURN_POINTER(amroutine);
+}
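+
+/*
+ * Illustrative consequences of the flags above: amcanmulticol = false with
+ * amcaninclude = true means an SP-GiST index may have only one key column
+ * but may carry additional INCLUDE columns, and amcanorderbyop = true
+ * allows distance-ordered (nearest-neighbor) scans whenever the opclass
+ * provides a suitable distance operator (see spgproperty below).
+ */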
+
+/*
+ * GetIndexInputType
+ * Determine the nominal input data type for an index column
+ *
+ * We define the "nominal" input type as the associated opclass's opcintype,
+ * or if that is a polymorphic type, the base type of the heap column or
+ * expression that is the index's input. The reason for preferring the
+ * opcintype is that non-polymorphic opclasses probably don't want to hear
+ * about binary-compatible input types. For instance, if a text opclass
+ * is being used with a varchar heap column, we want to report "text" not
+ * "varchar". Likewise, opclasses don't want to hear about domain types,
+ * so if we do consult the actual input type, we make sure to flatten domains.
+ *
+ * At some point maybe this should go somewhere else, but it's not clear
+ * if any other index AMs have a use for it.
+ */
+static Oid
+GetIndexInputType(Relation index, AttrNumber indexcol)
+{
+ Oid opcintype;
+ AttrNumber heapcol;
+ List *indexprs;
+ ListCell *indexpr_item;
+
+ Assert(index->rd_index != NULL);
+ Assert(indexcol > 0 && indexcol <= index->rd_index->indnkeyatts);
+ opcintype = index->rd_opcintype[indexcol - 1];
+ if (!IsPolymorphicType(opcintype))
+ return opcintype;
+ heapcol = index->rd_index->indkey.values[indexcol - 1];
+ if (heapcol != 0) /* Simple index column? */
+ return getBaseType(get_atttype(index->rd_index->indrelid, heapcol));
+
+ /*
+ * If the index expressions are already cached, skip calling
+ * RelationGetIndexExpressions, as it will make a copy which is overkill.
+ * We're not going to modify the trees, and we're not going to do anything
+ * that would invalidate the relcache entry before we're done.
+ */
+ if (index->rd_indexprs)
+ indexprs = index->rd_indexprs;
+ else
+ indexprs = RelationGetIndexExpressions(index);
+ indexpr_item = list_head(indexprs);
+ for (int i = 1; i <= index->rd_index->indnkeyatts; i++)
+ {
+ if (index->rd_index->indkey.values[i - 1] == 0)
+ {
+ /* expression column */
+ if (indexpr_item == NULL)
+ elog(ERROR, "wrong number of index expressions");
+ if (i == indexcol)
+ return getBaseType(exprType((Node *) lfirst(indexpr_item)));
+ indexpr_item = lnext(indexprs, indexpr_item);
+ }
+ }
+ elog(ERROR, "wrong number of index expressions");
+ return InvalidOid; /* keep compiler quiet */
+}
+
+/* Fill in a SpGistTypeDesc struct with info about the specified data type */
+static void
+fillTypeDesc(SpGistTypeDesc *desc, Oid type)
+{
+ HeapTuple tp;
+ Form_pg_type typtup;
+
+ desc->type = type;
+ tp = SearchSysCache1(TYPEOID, ObjectIdGetDatum(type));
+ if (!HeapTupleIsValid(tp))
+ elog(ERROR, "cache lookup failed for type %u", type);
+ typtup = (Form_pg_type) GETSTRUCT(tp);
+ desc->attlen = typtup->typlen;
+ desc->attbyval = typtup->typbyval;
+ desc->attalign = typtup->typalign;
+ desc->attstorage = typtup->typstorage;
+ ReleaseSysCache(tp);
+}
+
+/*
+ * Fetch local cache of AM-specific info about the index, initializing it
+ * if necessary
+ */
+SpGistCache *
+spgGetCache(Relation index)
+{
+ SpGistCache *cache;
+
+ if (index->rd_amcache == NULL)
+ {
+ Oid atttype;
+ spgConfigIn in;
+ FmgrInfo *procinfo;
+ Buffer metabuffer;
+ SpGistMetaPageData *metadata;
+
+ cache = MemoryContextAllocZero(index->rd_indexcxt,
+ sizeof(SpGistCache));
+
+ /* SPGiST must have one key column and can also have INCLUDE columns */
+ Assert(IndexRelationGetNumberOfKeyAttributes(index) == 1);
+ Assert(IndexRelationGetNumberOfAttributes(index) <= INDEX_MAX_KEYS);
+
+ /*
+ * Get the actual (well, nominal) data type of the key column. We
+ * pass this to the opclass config function so that polymorphic
+ * opclasses are possible.
+ */
+ atttype = GetIndexInputType(index, spgKeyColumn + 1);
+
+ /* Call the config function to get config info for the opclass */
+ in.attType = atttype;
+
+ procinfo = index_getprocinfo(index, 1, SPGIST_CONFIG_PROC);
+ FunctionCall2Coll(procinfo,
+ index->rd_indcollation[spgKeyColumn],
+ PointerGetDatum(&in),
+ PointerGetDatum(&cache->config));
+
+ /*
+ * If leafType isn't specified, use the declared index column type,
+ * which index.c will have derived from the opclass's opcintype.
+ * (Although we now make spgvalidate.c warn if these aren't the same,
+ * old user-defined opclasses may not set the STORAGE parameter
+ * correctly, so believe leafType if it's given.)
+ */
+ if (!OidIsValid(cache->config.leafType))
+ {
+ cache->config.leafType =
+ TupleDescAttr(RelationGetDescr(index), spgKeyColumn)->atttypid;
+
+ /*
+ * If index column type is binary-coercible to atttype (for
+ * example, it's a domain over atttype), treat it as plain atttype
+ * to avoid thinking we need to compress.
+ */
+ if (cache->config.leafType != atttype &&
+ IsBinaryCoercible(cache->config.leafType, atttype))
+ cache->config.leafType = atttype;
+ }
+
+ /* Get the information we need about each relevant datatype */
+ fillTypeDesc(&cache->attType, atttype);
+
+ if (cache->config.leafType != atttype)
+ {
+ if (!OidIsValid(index_getprocid(index, 1, SPGIST_COMPRESS_PROC)))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("compress method must be defined when leaf type is different from input type")));
+
+ fillTypeDesc(&cache->attLeafType, cache->config.leafType);
+ }
+ else
+ {
+ /* Save lookups in this common case */
+ cache->attLeafType = cache->attType;
+ }
+
+ fillTypeDesc(&cache->attPrefixType, cache->config.prefixType);
+ fillTypeDesc(&cache->attLabelType, cache->config.labelType);
+
+ /* Last, get the lastUsedPages data from the metapage */
+ metabuffer = ReadBuffer(index, SPGIST_METAPAGE_BLKNO);
+ LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
+
+ metadata = SpGistPageGetMeta(BufferGetPage(metabuffer));
+
+ if (metadata->magicNumber != SPGIST_MAGIC_NUMBER)
+ elog(ERROR, "index \"%s\" is not an SP-GiST index",
+ RelationGetRelationName(index));
+
+ cache->lastUsedPages = metadata->lastUsedPages;
+
+ UnlockReleaseBuffer(metabuffer);
+
+ index->rd_amcache = (void *) cache;
+ }
+ else
+ {
+ /* assume it's up to date */
+ cache = (SpGistCache *) index->rd_amcache;
+ }
+
+ return cache;
+}
+
+/*
+ * Compute a tuple descriptor for leaf tuples or index-only-scan result tuples.
+ *
+ * We can use the relcache's tupdesc as-is in many cases, and it's always
+ * OK so far as any INCLUDE columns are concerned. However, the entry for
+ * the key column has to match leafType in the first case or attType in the
+ * second case. While the relcache's tupdesc *should* show leafType, this
+ * might not hold for legacy user-defined opclasses, since before v14 they
+ * were not allowed to declare their true storage type in CREATE OPCLASS.
+ * Also, attType can be different from what is in the relcache.
+ *
+ * This function gives back either a pointer to the relcache's tupdesc
+ * if that is suitable, or a palloc'd copy that's been adjusted to match
+ * the specified key column type. We can avoid doing any catalog lookups
+ * here by insisting that the caller pass an SpGistTypeDesc not just an OID.
+ */
+TupleDesc
+getSpGistTupleDesc(Relation index, SpGistTypeDesc *keyType)
+{
+ TupleDesc outTupDesc;
+ Form_pg_attribute att;
+
+ if (keyType->type ==
+ TupleDescAttr(RelationGetDescr(index), spgKeyColumn)->atttypid)
+ outTupDesc = RelationGetDescr(index);
+ else
+ {
+ outTupDesc = CreateTupleDescCopy(RelationGetDescr(index));
+ att = TupleDescAttr(outTupDesc, spgKeyColumn);
+ /* It's sufficient to update the type-dependent fields of the column */
+ att->atttypid = keyType->type;
+ att->atttypmod = -1;
+ att->attlen = keyType->attlen;
+ att->attbyval = keyType->attbyval;
+ att->attalign = keyType->attalign;
+ att->attstorage = keyType->attstorage;
+ /* We shouldn't need to bother with making these valid: */
+ att->attcompression = InvalidCompressionMethod;
+ att->attcollation = InvalidOid;
+ /* In case we changed typlen, we'd better reset following offsets */
+ for (int i = spgFirstIncludeColumn; i < outTupDesc->natts; i++)
+ TupleDescAttr(outTupDesc, i)->attcacheoff = -1;
+ }
+ return outTupDesc;
+}
+
+/* Initialize SpGistState for working with the given index */
+void
+initSpGistState(SpGistState *state, Relation index)
+{
+ SpGistCache *cache;
+
+ state->index = index;
+
+ /* Get cached static information about index */
+ cache = spgGetCache(index);
+
+ state->config = cache->config;
+ state->attType = cache->attType;
+ state->attLeafType = cache->attLeafType;
+ state->attPrefixType = cache->attPrefixType;
+ state->attLabelType = cache->attLabelType;
+
+ /* Ensure we have a valid descriptor for leaf tuples */
+ state->leafTupDesc = getSpGistTupleDesc(state->index, &state->attLeafType);
+
+ /* Make workspace for constructing dead tuples */
+ state->deadTupleStorage = palloc0(SGDTSIZE);
+
+ /* Set XID to use in redirection tuples */
+ state->myXid = GetTopTransactionIdIfAny();
+
+ /* Assume we're not in an index build (spgbuild will override) */
+ state->isBuild = false;
+}
+
+/*
+ * Allocate a new page (either by recycling, or by extending the index file).
+ *
+ * The returned buffer is already pinned and exclusive-locked.
+ * Caller is responsible for initializing the page by calling SpGistInitBuffer.
+ */
+Buffer
+SpGistNewBuffer(Relation index)
+{
+ Buffer buffer;
+ bool needLock;
+
+ /* First, try to get a page from FSM */
+ for (;;)
+ {
+ BlockNumber blkno = GetFreeIndexPage(index);
+
+ if (blkno == InvalidBlockNumber)
+ break; /* nothing known to FSM */
+
+ /*
+ * The fixed pages shouldn't ever be listed in FSM, but just in case
+ * one is, ignore it.
+ */
+ if (SpGistBlockIsFixed(blkno))
+ continue;
+
+ buffer = ReadBuffer(index, blkno);
+
+ /*
+ * We have to guard against the possibility that someone else already
+ * recycled this page; the buffer may be locked if so.
+ */
+ if (ConditionalLockBuffer(buffer))
+ {
+ Page page = BufferGetPage(buffer);
+
+ if (PageIsNew(page))
+ return buffer; /* OK to use, if never initialized */
+
+ if (SpGistPageIsDeleted(page) || PageIsEmpty(page))
+ return buffer; /* OK to use */
+
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ }
+
+ /* Can't use it, so release buffer and try again */
+ ReleaseBuffer(buffer);
+ }
+
+ /* Must extend the file */
+ needLock = !RELATION_IS_LOCAL(index);
+ if (needLock)
+ LockRelationForExtension(index, ExclusiveLock);
+
+ buffer = ReadBuffer(index, P_NEW);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ if (needLock)
+ UnlockRelationForExtension(index, ExclusiveLock);
+
+ return buffer;
+}
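+
+/*
+ * Illustrative sketch of the expected calling pattern (assumed caller):
+ *
+ *   buffer = SpGistNewBuffer(index);
+ *   SpGistInitBuffer(buffer, SPGIST_LEAF);
+ *   ... place tuples and MarkBufferDirty(buffer) ...
+ *   UnlockReleaseBuffer(buffer);
+ *
+ * allocNewBuffer() below follows exactly this pattern.
+ */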
+
+/*
+ * Update index metapage's lastUsedPages info from local cache, if possible
+ *
+ * Updating the metapage isn't critical for index correctness, so we:
+ * 1. use ConditionalLockBuffer to improve concurrency, and
+ * 2. skip WAL-logging of metabuffer changes to decrease WAL traffic.
+ */
+void
+SpGistUpdateMetaPage(Relation index)
+{
+ SpGistCache *cache = (SpGistCache *) index->rd_amcache;
+
+ if (cache != NULL)
+ {
+ Buffer metabuffer;
+
+ metabuffer = ReadBuffer(index, SPGIST_METAPAGE_BLKNO);
+
+ if (ConditionalLockBuffer(metabuffer))
+ {
+ Page metapage = BufferGetPage(metabuffer);
+ SpGistMetaPageData *metadata = SpGistPageGetMeta(metapage);
+
+ metadata->lastUsedPages = cache->lastUsedPages;
+
+ /*
+ * Set pd_lower just past the end of the metadata. This is
+ * essential, because without doing so, metadata will be lost if
+ * xlog.c compresses the page. (We must do this here because
+ * pre-v11 versions of PG did not set the metapage's pd_lower
+ * correctly, so a pg_upgraded index might contain the wrong
+ * value.)
+ */
+ ((PageHeader) metapage)->pd_lower =
+ ((char *) metadata + sizeof(SpGistMetaPageData)) - (char *) metapage;
+
+ MarkBufferDirty(metabuffer);
+ UnlockReleaseBuffer(metabuffer);
+ }
+ else
+ {
+ ReleaseBuffer(metabuffer);
+ }
+ }
+}
+
+/* Macro to select proper element of lastUsedPages cache depending on flags */
+/* Masking flags with SPGIST_CACHED_PAGES is just for paranoia's sake */
+#define GET_LUP(c, f) (&(c)->lastUsedPages.cachedPage[((unsigned int) (f)) % SPGIST_CACHED_PAGES])
+
+/*
+ * Allocate and initialize a new buffer of the type and parity specified by
+ * flags. The returned buffer is already pinned and exclusive-locked.
+ *
+ * When requesting an inner page, if we get one with the wrong parity,
+ * we just release the buffer and try again. We will get a different page
+ * because GetFreeIndexPage will have marked the page used in FSM. The page
+ * is entered in our local lastUsedPages cache, so there's some hope of
+ * making use of it later in this session, but otherwise we rely on VACUUM
+ * to eventually re-enter the page in FSM, making it available for recycling.
+ * Note that such a page does not get marked dirty here, so unless it's used
+ * fairly soon, the buffer will just get discarded and the page will remain
+ * as it was on disk.
+ *
+ * When we return a buffer to the caller, the page is *not* entered into
+ * the lastUsedPages cache; we expect the caller will do so after it's taken
+ * whatever space it will use.  This is because after the caller has used up
+ * some space, the page might have less free space than whatever was cached
+ * already, so we'd rather not trash the old cache entry.
+ */
+static Buffer
+allocNewBuffer(Relation index, int flags)
+{
+ SpGistCache *cache = spgGetCache(index);
+ uint16 pageflags = 0;
+
+ if (GBUF_REQ_LEAF(flags))
+ pageflags |= SPGIST_LEAF;
+ if (GBUF_REQ_NULLS(flags))
+ pageflags |= SPGIST_NULLS;
+
+ for (;;)
+ {
+ Buffer buffer;
+
+ buffer = SpGistNewBuffer(index);
+ SpGistInitBuffer(buffer, pageflags);
+
+ if (pageflags & SPGIST_LEAF)
+ {
+ /* Leaf pages have no parity concerns, so just use it */
+ return buffer;
+ }
+ else
+ {
+ BlockNumber blkno = BufferGetBlockNumber(buffer);
+ int blkFlags = GBUF_INNER_PARITY(blkno);
+
+ if ((flags & GBUF_PARITY_MASK) == blkFlags)
+ {
+ /* Page has right parity, use it */
+ return buffer;
+ }
+ else
+ {
+ /* Page has wrong parity, record it in cache and try again */
+ if (pageflags & SPGIST_NULLS)
+ blkFlags |= GBUF_NULLS;
+ cache->lastUsedPages.cachedPage[blkFlags].blkno = blkno;
+ cache->lastUsedPages.cachedPage[blkFlags].freeSpace =
+ PageGetExactFreeSpace(BufferGetPage(buffer));
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+ }
+}
+
+/*
+ * Get a buffer of the type and parity specified by flags, having at least
+ * as much free space as indicated by needSpace. We use the lastUsedPages
+ * cache to assign the same buffer previously requested when possible.
+ * The returned buffer is already pinned and exclusive-locked.
+ *
+ * *isNew is set true if the page was initialized here, false if it was
+ * already valid.
+ */
+Buffer
+SpGistGetBuffer(Relation index, int flags, int needSpace, bool *isNew)
+{
+ SpGistCache *cache = spgGetCache(index);
+ SpGistLastUsedPage *lup;
+
+ /* Bail out if even an empty page wouldn't meet the demand */
+ if (needSpace > SPGIST_PAGE_CAPACITY)
+ elog(ERROR, "desired SPGiST tuple size is too big");
+
+ /*
+ * If possible, increase the space request to include relation's
+ * fillfactor. This ensures that when we add unrelated tuples to a page,
+ * we try to keep 100-fillfactor% available for adding tuples that are
+ * related to the ones already on it. But fillfactor mustn't cause an
+ * error for requests that would otherwise be legal.
+ */
+ needSpace += SpGistGetTargetPageFreeSpace(index);
+ needSpace = Min(needSpace, SPGIST_PAGE_CAPACITY);
+
+ /* Get the cache entry for this flags setting */
+ lup = GET_LUP(cache, flags);
+
+ /* If we have nothing cached, just turn it over to allocNewBuffer */
+ if (lup->blkno == InvalidBlockNumber)
+ {
+ *isNew = true;
+ return allocNewBuffer(index, flags);
+ }
+
+ /* fixed pages should never be in cache */
+ Assert(!SpGistBlockIsFixed(lup->blkno));
+
+ /* If cached freeSpace isn't enough, don't bother looking at the page */
+ if (lup->freeSpace >= needSpace)
+ {
+ Buffer buffer;
+ Page page;
+
+ buffer = ReadBuffer(index, lup->blkno);
+
+ if (!ConditionalLockBuffer(buffer))
+ {
+ /* Buffer is locked by another process, so return a new buffer instead */
+ ReleaseBuffer(buffer);
+ *isNew = true;
+ return allocNewBuffer(index, flags);
+ }
+
+ page = BufferGetPage(buffer);
+
+ if (PageIsNew(page) || SpGistPageIsDeleted(page) || PageIsEmpty(page))
+ {
+ /* OK to initialize the page */
+ uint16 pageflags = 0;
+
+ if (GBUF_REQ_LEAF(flags))
+ pageflags |= SPGIST_LEAF;
+ if (GBUF_REQ_NULLS(flags))
+ pageflags |= SPGIST_NULLS;
+ SpGistInitBuffer(buffer, pageflags);
+ lup->freeSpace = PageGetExactFreeSpace(page) - needSpace;
+ *isNew = true;
+ return buffer;
+ }
+
+ /*
+ * Check that page is of right type and has enough space. We must
+ * recheck this since our cache isn't necessarily up to date.
+ */
+ if ((GBUF_REQ_LEAF(flags) ? SpGistPageIsLeaf(page) : !SpGistPageIsLeaf(page)) &&
+ (GBUF_REQ_NULLS(flags) ? SpGistPageStoresNulls(page) : !SpGistPageStoresNulls(page)))
+ {
+ int freeSpace = PageGetExactFreeSpace(page);
+
+ if (freeSpace >= needSpace)
+ {
+ /* Success, update freespace info and return the buffer */
+ lup->freeSpace = freeSpace - needSpace;
+ *isNew = false;
+ return buffer;
+ }
+ }
+
+ /* Fall back to allocating a new buffer */
+ UnlockReleaseBuffer(buffer);
+ }
+
+ /* No success with cache, so return a new buffer */
+ *isNew = true;
+ return allocNewBuffer(index, flags);
+}
+
+/*
+ * Update lastUsedPages cache when done modifying a page.
+ *
+ * We update the appropriate cache entry if it already contained this page
+ * (its freeSpace is likely obsolete), or if this page has more space than
+ * whatever we had cached.
+ */
+void
+SpGistSetLastUsedPage(Relation index, Buffer buffer)
+{
+ SpGistCache *cache = spgGetCache(index);
+ SpGistLastUsedPage *lup;
+ int freeSpace;
+ Page page = BufferGetPage(buffer);
+ BlockNumber blkno = BufferGetBlockNumber(buffer);
+ int flags;
+
+ /* Never enter fixed pages (root pages) in cache, though */
+ if (SpGistBlockIsFixed(blkno))
+ return;
+
+ if (SpGistPageIsLeaf(page))
+ flags = GBUF_LEAF;
+ else
+ flags = GBUF_INNER_PARITY(blkno);
+ if (SpGistPageStoresNulls(page))
+ flags |= GBUF_NULLS;
+
+ lup = GET_LUP(cache, flags);
+
+ freeSpace = PageGetExactFreeSpace(page);
+ if (lup->blkno == InvalidBlockNumber || lup->blkno == blkno ||
+ lup->freeSpace < freeSpace)
+ {
+ lup->blkno = blkno;
+ lup->freeSpace = freeSpace;
+ }
+}
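+
+/*
+ * Illustrative sketch of the intended lastUsedPages cycle (assumed caller):
+ *
+ *   buffer = SpGistGetBuffer(index, GBUF_LEAF, size, &isNew);
+ *   ... add tuple(s) totalling "size" bytes to the page ...
+ *   SpGistSetLastUsedPage(index, buffer);
+ *   UnlockReleaseBuffer(buffer);
+ *
+ * Reporting the page back refreshes the cached freeSpace, which
+ * SpGistGetBuffer had decremented optimistically.
+ */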
+
+/*
+ * Initialize an SPGiST page to empty, with specified flags
+ */
+void
+SpGistInitPage(Page page, uint16 f)
+{
+ SpGistPageOpaque opaque;
+
+ PageInit(page, BLCKSZ, sizeof(SpGistPageOpaqueData));
+ opaque = SpGistPageGetOpaque(page);
+ opaque->flags = f;
+ opaque->spgist_page_id = SPGIST_PAGE_ID;
+}
+
+/*
+ * Initialize a buffer's page to empty, with specified flags
+ */
+void
+SpGistInitBuffer(Buffer b, uint16 f)
+{
+ Assert(BufferGetPageSize(b) == BLCKSZ);
+ SpGistInitPage(BufferGetPage(b), f);
+}
+
+/*
+ * Initialize metadata page
+ */
+void
+SpGistInitMetapage(Page page)
+{
+ SpGistMetaPageData *metadata;
+ int i;
+
+ SpGistInitPage(page, SPGIST_META);
+ metadata = SpGistPageGetMeta(page);
+ memset(metadata, 0, sizeof(SpGistMetaPageData));
+ metadata->magicNumber = SPGIST_MAGIC_NUMBER;
+
+ /* initialize last-used-page cache to empty */
+ for (i = 0; i < SPGIST_CACHED_PAGES; i++)
+ metadata->lastUsedPages.cachedPage[i].blkno = InvalidBlockNumber;
+
+ /*
+ * Set pd_lower just past the end of the metadata. This is essential,
+ * because without doing so, metadata will be lost if xlog.c compresses
+ * the page.
+ */
+ ((PageHeader) page)->pd_lower =
+ ((char *) metadata + sizeof(SpGistMetaPageData)) - (char *) page;
+}
+
+/*
+ * reloptions processing for SPGiST
+ */
+bytea *
+spgoptions(Datum reloptions, bool validate)
+{
+ static const relopt_parse_elt tab[] = {
+ {"fillfactor", RELOPT_TYPE_INT, offsetof(SpGistOptions, fillfactor)},
+ };
+
+ return (bytea *) build_reloptions(reloptions, validate,
+ RELOPT_KIND_SPGIST,
+ sizeof(SpGistOptions),
+ tab, lengthof(tab));
+}
+
+/*
+ * Get the space needed to store a non-null datum of the indicated type
+ * in an inner tuple (that is, as a prefix or node label).
+ * Note the result is already rounded up to a MAXALIGN boundary.
+ * Here we follow the convention that pass-by-val types are just stored
+ * in their Datum representation (compare memcpyInnerDatum).
+ */
+unsigned int
+SpGistGetInnerTypeSize(SpGistTypeDesc *att, Datum datum)
+{
+ unsigned int size;
+
+ if (att->attbyval)
+ size = sizeof(Datum);
+ else if (att->attlen > 0)
+ size = att->attlen;
+ else
+ size = VARSIZE_ANY(datum);
+
+ return MAXALIGN(size);
+}
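+
+/*
+ * Worked example (illustrative, assuming a 64-bit build where MAXALIGN
+ * rounds up to a multiple of 8): for a pass-by-value label such as int4,
+ * the result is MAXALIGN(sizeof(Datum)) = 8; for a 10-byte varlena value
+ * (header included), VARSIZE_ANY returns 10 and the result is
+ * MAXALIGN(10) = 16.
+ */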
+
+/*
+ * Copy the given non-null datum to *target, in the inner-tuple case
+ */
+static void
+memcpyInnerDatum(void *target, SpGistTypeDesc *att, Datum datum)
+{
+ unsigned int size;
+
+ if (att->attbyval)
+ {
+ memcpy(target, &datum, sizeof(Datum));
+ }
+ else
+ {
+ size = (att->attlen > 0) ? att->attlen : VARSIZE_ANY(datum);
+ memcpy(target, DatumGetPointer(datum), size);
+ }
+}
+
+/*
+ * Compute space required for a leaf tuple holding the given data.
+ *
+ * This must match the size-calculation portion of spgFormLeafTuple.
+ */
+Size
+SpGistGetLeafTupleSize(TupleDesc tupleDescriptor,
+ Datum *datums, bool *isnulls)
+{
+ Size size;
+ Size data_size;
+ bool needs_null_mask = false;
+ int natts = tupleDescriptor->natts;
+
+ /*
+ * Decide whether we need a nulls bitmask.
+ *
+ * If there is only a key attribute (natts == 1), never use a bitmask, for
+ * compatibility with the pre-v14 layout of leaf tuples. Otherwise, we
+ * need one if any attribute is null.
+ */
+ if (natts > 1)
+ {
+ for (int i = 0; i < natts; i++)
+ {
+ if (isnulls[i])
+ {
+ needs_null_mask = true;
+ break;
+ }
+ }
+ }
+
+ /*
+ * Calculate size of the data part; same as for heap tuples.
+ */
+ data_size = heap_compute_data_size(tupleDescriptor, datums, isnulls);
+
+ /*
+ * Compute total size.
+ */
+ size = SGLTHDRSZ(needs_null_mask);
+ size += data_size;
+ size = MAXALIGN(size);
+
+ /*
+ * Ensure that we can replace the tuple with a dead tuple later.  This
+ * test is unnecessary when there are any non-null attributes, but let's
+ * be safe.
+ */
+ if (size < SGDTSIZE)
+ size = SGDTSIZE;
+
+ return size;
+}
+
+/*
+ * Construct a leaf tuple containing the given heap TID and datum values
+ */
+SpGistLeafTuple
+spgFormLeafTuple(SpGistState *state, ItemPointer heapPtr,
+ Datum *datums, bool *isnulls)
+{
+ SpGistLeafTuple tup;
+ TupleDesc tupleDescriptor = state->leafTupDesc;
+ Size size;
+ Size hoff;
+ Size data_size;
+ bool needs_null_mask = false;
+ int natts = tupleDescriptor->natts;
+ char *tp; /* ptr to tuple data */
+ uint16 tupmask = 0; /* unused heap_fill_tuple output */
+
+ /*
+ * Decide whether we need a nulls bitmask.
+ *
+ * If there is only a key attribute (natts == 1), never use a bitmask, for
+ * compatibility with the pre-v14 layout of leaf tuples. Otherwise, we
+ * need one if any attribute is null.
+ */
+ if (natts > 1)
+ {
+ for (int i = 0; i < natts; i++)
+ {
+ if (isnulls[i])
+ {
+ needs_null_mask = true;
+ break;
+ }
+ }
+ }
+
+ /*
+ * Calculate size of the data part; same as for heap tuples.
+ */
+ data_size = heap_compute_data_size(tupleDescriptor, datums, isnulls);
+
+ /*
+ * Compute total size.
+ */
+ hoff = SGLTHDRSZ(needs_null_mask);
+ size = hoff + data_size;
+ size = MAXALIGN(size);
+
+ /*
+ * Ensure that we can replace the tuple with a dead tuple later.  This
+ * test is unnecessary when there are any non-null attributes, but let's
+ * be safe.
+ */
+ if (size < SGDTSIZE)
+ size = SGDTSIZE;
+
+ /* OK, form the tuple */
+ tup = (SpGistLeafTuple) palloc0(size);
+
+ tup->size = size;
+ SGLT_SET_NEXTOFFSET(tup, InvalidOffsetNumber);
+ tup->heapPtr = *heapPtr;
+
+ tp = (char *) tup + hoff;
+
+ if (needs_null_mask)
+ {
+ bits8 *bp; /* ptr to null bitmap in tuple */
+
+ /* Set nullmask presence bit in SpGistLeafTuple header */
+ SGLT_SET_HASNULLMASK(tup, true);
+ /* Fill the data area and null mask */
+ bp = (bits8 *) ((char *) tup + sizeof(SpGistLeafTupleData));
+ heap_fill_tuple(tupleDescriptor, datums, isnulls, tp, data_size,
+ &tupmask, bp);
+ }
+ else if (natts > 1 || !isnulls[spgKeyColumn])
+ {
+ /* Fill data area only */
+ heap_fill_tuple(tupleDescriptor, datums, isnulls, tp, data_size,
+ &tupmask, (bits8 *) NULL);
+ }
+ /* otherwise we have no data, nor a bitmap, to fill */
+
+ return tup;
+}
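+
+/*
+ * Illustrative sketch (assumed caller): forming a leaf tuple for a
+ * single-column index might look like
+ *
+ *   Datum datums[INDEX_MAX_KEYS];
+ *   bool isnulls[INDEX_MAX_KEYS];
+ *
+ *   datums[spgKeyColumn] = leafDatum;
+ *   isnulls[spgKeyColumn] = false;
+ *   leafTuple = spgFormLeafTuple(state, heapPtr, datums, isnulls);
+ *
+ * where "leafDatum" and "heapPtr" come from the insertion path.
+ */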
+
+/*
+ * Construct a node (to go into an inner tuple) containing the given label
+ *
+ * Note that the node's downlink is just set invalid here. Caller will fill
+ * it in later.
+ */
+SpGistNodeTuple
+spgFormNodeTuple(SpGistState *state, Datum label, bool isnull)
+{
+ SpGistNodeTuple tup;
+ unsigned int size;
+ unsigned short infomask = 0;
+
+ /* compute space needed (note result is already maxaligned) */
+ size = SGNTHDRSZ;
+ if (!isnull)
+ size += SpGistGetInnerTypeSize(&state->attLabelType, label);
+
+ /*
+ * Here we make sure that the size will fit in the field reserved for it
+ * in t_info.
+ */
+ if ((size & INDEX_SIZE_MASK) != size)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("index row requires %zu bytes, maximum size is %zu",
+ (Size) size, (Size) INDEX_SIZE_MASK)));
+
+ tup = (SpGistNodeTuple) palloc0(size);
+
+ if (isnull)
+ infomask |= INDEX_NULL_MASK;
+ /* we don't bother setting the INDEX_VAR_MASK bit */
+ infomask |= size;
+ tup->t_info = infomask;
+
+ /* The TID field will be filled in later */
+ ItemPointerSetInvalid(&tup->t_tid);
+
+ if (!isnull)
+ memcpyInnerDatum(SGNTDATAPTR(tup), &state->attLabelType, label);
+
+ return tup;
+}
+
+/*
+ * Construct an inner tuple containing the given prefix and node array
+ */
+SpGistInnerTuple
+spgFormInnerTuple(SpGistState *state, bool hasPrefix, Datum prefix,
+ int nNodes, SpGistNodeTuple *nodes)
+{
+ SpGistInnerTuple tup;
+ unsigned int size;
+ unsigned int prefixSize;
+ int i;
+ char *ptr;
+
+ /* Compute size needed */
+ if (hasPrefix)
+ prefixSize = SpGistGetInnerTypeSize(&state->attPrefixType, prefix);
+ else
+ prefixSize = 0;
+
+ size = SGITHDRSZ + prefixSize;
+
+ /* Note: we rely on node tuple sizes to be maxaligned already */
+ for (i = 0; i < nNodes; i++)
+ size += IndexTupleSize(nodes[i]);
+
+ /*
+ * Ensure that we can replace the tuple with a dead tuple later. This
+ * test is unnecessary given current tuple layouts, but let's be safe.
+ */
+ if (size < SGDTSIZE)
+ size = SGDTSIZE;
+
+ /*
+ * Inner tuple should be small enough to fit on a page
+ */
+ if (size > SPGIST_PAGE_CAPACITY - sizeof(ItemIdData))
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("SP-GiST inner tuple size %zu exceeds maximum %zu",
+ (Size) size,
+ SPGIST_PAGE_CAPACITY - sizeof(ItemIdData)),
+ errhint("Values larger than a buffer page cannot be indexed.")));
+
+ /*
+ * Check for overflow of header fields --- probably can't fail if the
+ * above succeeded, but let's be paranoid
+ */
+ if (size > SGITMAXSIZE ||
+ prefixSize > SGITMAXPREFIXSIZE ||
+ nNodes > SGITMAXNNODES)
+ elog(ERROR, "SPGiST inner tuple header field is too small");
+
+ /* OK, form the tuple */
+ tup = (SpGistInnerTuple) palloc0(size);
+
+ tup->nNodes = nNodes;
+ tup->prefixSize = prefixSize;
+ tup->size = size;
+
+ if (hasPrefix)
+ memcpyInnerDatum(SGITDATAPTR(tup), &state->attPrefixType, prefix);
+
+ ptr = (char *) SGITNODEPTR(tup);
+
+ for (i = 0; i < nNodes; i++)
+ {
+ SpGistNodeTuple node = nodes[i];
+
+ memcpy(ptr, node, IndexTupleSize(node));
+ ptr += IndexTupleSize(node);
+ }
+
+ return tup;
+}
+
+/*
+ * Construct a "dead" tuple to replace a tuple being deleted.
+ *
+ * The state can be SPGIST_REDIRECT, SPGIST_DEAD, or SPGIST_PLACEHOLDER.
+ * For a REDIRECT tuple, a pointer (blkno+offset) must be supplied, and
+ * the xid field is filled in automatically.
+ *
+ * This is called in critical sections, so we don't use palloc; the tuple
+ * is built in preallocated storage. It should be copied before another
+ * call with different parameters can occur.
+ */
+SpGistDeadTuple
+spgFormDeadTuple(SpGistState *state, int tupstate,
+ BlockNumber blkno, OffsetNumber offnum)
+{
+ SpGistDeadTuple tuple = (SpGistDeadTuple) state->deadTupleStorage;
+
+ tuple->tupstate = tupstate;
+ tuple->size = SGDTSIZE;
+ SGLT_SET_NEXTOFFSET(tuple, InvalidOffsetNumber);
+
+ if (tupstate == SPGIST_REDIRECT)
+ {
+ ItemPointerSet(&tuple->pointer, blkno, offnum);
+ Assert(TransactionIdIsValid(state->myXid));
+ tuple->xid = state->myXid;
+ }
+ else
+ {
+ ItemPointerSetInvalid(&tuple->pointer);
+ tuple->xid = InvalidTransactionId;
+ }
+
+ return tuple;
+}
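+
+/*
+ * Illustrative call (assumed caller): replacing a tuple with a placeholder
+ * uses
+ *
+ *   spgFormDeadTuple(state, SPGIST_PLACEHOLDER,
+ *                    InvalidBlockNumber, InvalidOffsetNumber);
+ *
+ * while forming a redirect additionally supplies the target block and
+ * offset; the xid field is then filled in from state->myXid.
+ */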
+
+/*
+ * Convert an SPGiST leaf tuple into Datum/isnull arrays.
+ *
+ * The caller must allocate sufficient storage for the output arrays.
+ * (INDEX_MAX_KEYS entries should be enough.)
+ */
+void
+spgDeformLeafTuple(SpGistLeafTuple tup, TupleDesc tupleDescriptor,
+ Datum *datums, bool *isnulls, bool keyColumnIsNull)
+{
+ bool hasNullsMask = SGLT_GET_HASNULLMASK(tup);
+ char *tp; /* ptr to tuple data */
+ bits8 *bp; /* ptr to null bitmap in tuple */
+
+ if (keyColumnIsNull && tupleDescriptor->natts == 1)
+ {
+ /*
+ * Trivial case: there is only the key attribute and we're in a nulls
+ * tree. The hasNullsMask bit in the tuple header should not be set
+ * (and thus we can't use index_deform_tuple_internal), but
+ * nonetheless the result is NULL.
+ *
+ * Note: currently this is dead code, because nothing calls this function
+ * when there is only the key attribute. But we should cover the case
+ * anyway.
+ */
+ Assert(!hasNullsMask);
+
+ datums[spgKeyColumn] = (Datum) 0;
+ isnulls[spgKeyColumn] = true;
+ return;
+ }
+
+ tp = (char *) tup + SGLTHDRSZ(hasNullsMask);
+ bp = (bits8 *) ((char *) tup + sizeof(SpGistLeafTupleData));
+
+ index_deform_tuple_internal(tupleDescriptor,
+ datums, isnulls,
+ tp, bp, hasNullsMask);
+
+ /*
+ * Key column isnull value from the tuple should be consistent with
+ * keyColumnIsNull flag from the caller.
+ */
+ Assert(keyColumnIsNull == isnulls[spgKeyColumn]);
+}
+
+/*
+ * Extract the label datums of the nodes within innerTuple
+ *
+ * Returns NULL if label datums are NULLs
+ */
+Datum *
+spgExtractNodeLabels(SpGistState *state, SpGistInnerTuple innerTuple)
+{
+ Datum *nodeLabels;
+ int i;
+ SpGistNodeTuple node;
+
+ /* Either all the labels must be NULL, or none. */
+ node = SGITNODEPTR(innerTuple);
+ if (IndexTupleHasNulls(node))
+ {
+ SGITITERATE(innerTuple, i, node)
+ {
+ if (!IndexTupleHasNulls(node))
+ elog(ERROR, "some but not all node labels are null in SPGiST inner tuple");
+ }
+ /* They're all null, so just return NULL */
+ return NULL;
+ }
+ else
+ {
+ nodeLabels = (Datum *) palloc(sizeof(Datum) * innerTuple->nNodes);
+ SGITITERATE(innerTuple, i, node)
+ {
+ if (IndexTupleHasNulls(node))
+ elog(ERROR, "some but not all node labels are null in SPGiST inner tuple");
+ nodeLabels[i] = SGNTDATUM(node, state);
+ }
+ return nodeLabels;
+ }
+}
+
+/*
+ * Add a new item to the page, replacing a PLACEHOLDER item if possible.
+ * Return the location it's inserted at, or InvalidOffsetNumber on failure.
+ *
+ * If startOffset isn't NULL, we start searching for placeholders at
+ * *startOffset, and update that to the next place to search. This is just
+ * an optimization for repeated insertions.
+ *
+ * If errorOK is false, we throw error when there's not enough room,
+ * rather than returning InvalidOffsetNumber.
+ */
+OffsetNumber
+SpGistPageAddNewItem(SpGistState *state, Page page, Item item, Size size,
+ OffsetNumber *startOffset, bool errorOK)
+{
+ SpGistPageOpaque opaque = SpGistPageGetOpaque(page);
+ OffsetNumber i,
+ maxoff,
+ offnum;
+
+ if (opaque->nPlaceholder > 0 &&
+ PageGetExactFreeSpace(page) + SGDTSIZE >= MAXALIGN(size))
+ {
+ /* Try to replace a placeholder */
+ maxoff = PageGetMaxOffsetNumber(page);
+ offnum = InvalidOffsetNumber;
+
+ for (;;)
+ {
+ if (startOffset && *startOffset != InvalidOffsetNumber)
+ i = *startOffset;
+ else
+ i = FirstOffsetNumber;
+ for (; i <= maxoff; i++)
+ {
+ SpGistDeadTuple it = (SpGistDeadTuple) PageGetItem(page,
+ PageGetItemId(page, i));
+
+ if (it->tupstate == SPGIST_PLACEHOLDER)
+ {
+ offnum = i;
+ break;
+ }
+ }
+
+ /* Done if we found a placeholder */
+ if (offnum != InvalidOffsetNumber)
+ break;
+
+ if (startOffset && *startOffset != InvalidOffsetNumber)
+ {
+ /* Hint was no good, re-search from beginning */
+ *startOffset = InvalidOffsetNumber;
+ continue;
+ }
+
+ /* No placeholder found after all; fix the stale count and give up */
+ opaque->nPlaceholder = 0;
+ break;
+ }
+
+ if (offnum != InvalidOffsetNumber)
+ {
+ /* Replace the placeholder tuple */
+ PageIndexTupleDelete(page, offnum);
+
+ offnum = PageAddItem(page, item, size, offnum, false, false);
+
+ /*
+ * We should not have failed given the size check at the top of
+ * the function, but test anyway. If we did fail, we must PANIC
+ * because we've already deleted the placeholder tuple, and
+ * there's no other way to keep the damage from getting to disk.
+ */
+ if (offnum != InvalidOffsetNumber)
+ {
+ Assert(opaque->nPlaceholder > 0);
+ opaque->nPlaceholder--;
+ if (startOffset)
+ *startOffset = offnum + 1;
+ }
+ else
+ elog(PANIC, "failed to add item of size %zu to SPGiST index page",
+ size);
+
+ return offnum;
+ }
+ }
+
+ /* No luck in replacing a placeholder, so just add it to the page */
+ offnum = PageAddItem(page, item, size,
+ InvalidOffsetNumber, false, false);
+
+ if (offnum == InvalidOffsetNumber && !errorOK)
+ elog(ERROR, "failed to add item of size %zu to SPGiST index page",
+ size);
+
+ return offnum;
+}
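+
+/*
+ * Illustrative sketch (assumed caller): repeated insertions onto one page
+ * can reuse the placeholder-search hint, e.g.
+ *
+ *   OffsetNumber startOffset = InvalidOffsetNumber;
+ *
+ *   offnum = SpGistPageAddNewItem(state, page, (Item) tuple,
+ *                                 tuple->size, &startOffset, false);
+ *
+ * Passing errorOK = false turns an out-of-space condition into an ERROR
+ * instead of an InvalidOffsetNumber result.
+ */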
+
+/*
+ * spgproperty() -- Check boolean properties of indexes.
+ *
+ * This is optional for most AMs, but is required for SP-GiST because the core
+ * property code doesn't support AMPROP_DISTANCE_ORDERABLE.
+ */
+bool
+spgproperty(Oid index_oid, int attno,
+ IndexAMProperty prop, const char *propname,
+ bool *res, bool *isnull)
+{
+ Oid opclass,
+ opfamily,
+ opcintype;
+ CatCList *catlist;
+ int i;
+
+ /* Only answer column-level inquiries */
+ if (attno == 0)
+ return false;
+
+ switch (prop)
+ {
+ case AMPROP_DISTANCE_ORDERABLE:
+ break;
+ default:
+ return false;
+ }
+
+ /*
+ * Currently, SP-GiST distance-ordered scans require that there be a
+ * distance operator in the opclass with the default types. So we assume
+ * that if such an operator exists, then there's a reason for it.
+ */
+
+ /* First we need to know the column's opclass. */
+ opclass = get_index_column_opclass(index_oid, attno);
+ if (!OidIsValid(opclass))
+ {
+ *isnull = true;
+ return true;
+ }
+
+ /* Now look up the opclass family and input datatype. */
+ if (!get_opclass_opfamily_and_input_type(opclass, &opfamily, &opcintype))
+ {
+ *isnull = true;
+ return true;
+ }
+
+ /* And now we can check whether the operator is provided. */
+ catlist = SearchSysCacheList1(AMOPSTRATEGY,
+ ObjectIdGetDatum(opfamily));
+
+ *res = false;
+
+ for (i = 0; i < catlist->n_members; i++)
+ {
+ HeapTuple amoptup = &catlist->members[i]->tuple;
+ Form_pg_amop amopform = (Form_pg_amop) GETSTRUCT(amoptup);
+
+ if (amopform->amoppurpose == AMOP_ORDER &&
+ (amopform->amoplefttype == opcintype ||
+ amopform->amoprighttype == opcintype) &&
+ opfamily_can_sort_type(amopform->amopsortfamily,
+ get_op_rettype(amopform->amopopr)))
+ {
+ *res = true;
+ break;
+ }
+ }
+
+ ReleaseSysCacheList(catlist);
+
+ *isnull = false;
+
+ return true;
+}
diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c
new file mode 100644
index 0000000..0049630
--- /dev/null
+++ b/src/backend/access/spgist/spgvacuum.c
@@ -0,0 +1,975 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgvacuum.c
+ * vacuum for SP-GiST
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgvacuum.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/spgist_private.h"
+#include "access/spgxlog.h"
+#include "access/transam.h"
+#include "access/xloginsert.h"
+#include "catalog/storage_xlog.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/indexfsm.h"
+#include "storage/lmgr.h"
+#include "utils/snapmgr.h"
+
+
+/* Entry in pending-list of TIDs we need to revisit */
+typedef struct spgVacPendingItem
+{
+ ItemPointerData tid; /* redirection target to visit */
+ bool done; /* have we dealt with this? */
+ struct spgVacPendingItem *next; /* list link */
+} spgVacPendingItem;
+
+/* Local state for vacuum operations */
+typedef struct spgBulkDeleteState
+{
+ /* Parameters passed in to spgvacuumscan */
+ IndexVacuumInfo *info;
+ IndexBulkDeleteResult *stats;
+ IndexBulkDeleteCallback callback;
+ void *callback_state;
+
+ /* Additional working state */
+ SpGistState spgstate; /* for SPGiST operations that need one */
+ spgVacPendingItem *pendingList; /* TIDs we need to (re)visit */
+ TransactionId myXmin; /* for detecting newly-added redirects */
+ BlockNumber lastFilledBlock; /* last non-deletable block */
+} spgBulkDeleteState;
+
+
+/*
+ * Add TID to pendingList, but only if not already present.
+ *
+ * Note that new items are always appended at the end of the list; this
+ * ensures that scans of the list don't miss items added during the scan.
+ */
+static void
+spgAddPendingTID(spgBulkDeleteState *bds, ItemPointer tid)
+{
+ spgVacPendingItem *pitem;
+ spgVacPendingItem **listLink;
+
+ /* search the list for pre-existing entry */
+ listLink = &bds->pendingList;
+ while (*listLink != NULL)
+ {
+ pitem = *listLink;
+ if (ItemPointerEquals(tid, &pitem->tid))
+ return; /* already in list, do nothing */
+ listLink = &pitem->next;
+ }
+ /* not there, so append new entry */
+ pitem = (spgVacPendingItem *) palloc(sizeof(spgVacPendingItem));
+ pitem->tid = *tid;
+ pitem->done = false;
+ pitem->next = NULL;
+ *listLink = pitem;
+}
+
+/*
+ * Clear pendingList
+ */
+static void
+spgClearPendingList(spgBulkDeleteState *bds)
+{
+ spgVacPendingItem *pitem;
+ spgVacPendingItem *nitem;
+
+ for (pitem = bds->pendingList; pitem != NULL; pitem = nitem)
+ {
+ nitem = pitem->next;
+ /* All items in list should have been dealt with */
+ Assert(pitem->done);
+ pfree(pitem);
+ }
+ bds->pendingList = NULL;
+}
+
+/*
+ * Vacuum a regular (non-root) leaf page
+ *
+ * We must delete tuples that are targeted for deletion by the VACUUM,
+ * but not move any tuples that are referenced by outside links; we assume
+ * those are the ones that are heads of chains.
+ *
+ * If we find a REDIRECT that was made by a concurrently-running transaction,
+ * we must add its target TID to pendingList. (We don't try to visit the
+ * target immediately, first because we don't want VACUUM locking more than
+ * one buffer at a time, and second because the duplicate-filtering logic
+ * in spgAddPendingTID is useful to ensure we can't get caught in an infinite
+ * loop in the face of continuous concurrent insertions.)
+ *
+ * If forPending is true, we are examining the page as a consequence of
+ * chasing a redirect link, not as part of the normal sequential scan.
+ * We still vacuum the page normally, but we don't increment the stats
+ * about live tuples; else we'd double-count those tuples, since the page
+ * has been or will be visited in the sequential scan as well.
+ */
+static void
+vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer,
+ bool forPending)
+{
+ Page page = BufferGetPage(buffer);
+ spgxlogVacuumLeaf xlrec;
+ OffsetNumber toDead[MaxIndexTuplesPerPage];
+ OffsetNumber toPlaceholder[MaxIndexTuplesPerPage];
+ OffsetNumber moveSrc[MaxIndexTuplesPerPage];
+ OffsetNumber moveDest[MaxIndexTuplesPerPage];
+ OffsetNumber chainSrc[MaxIndexTuplesPerPage];
+ OffsetNumber chainDest[MaxIndexTuplesPerPage];
+ OffsetNumber predecessor[MaxIndexTuplesPerPage + 1];
+ bool deletable[MaxIndexTuplesPerPage + 1];
+ int nDeletable;
+ OffsetNumber i,
+ max = PageGetMaxOffsetNumber(page);
+
+ memset(predecessor, 0, sizeof(predecessor));
+ memset(deletable, 0, sizeof(deletable));
+ nDeletable = 0;
+
+ /* Scan page, identify tuples to delete, accumulate stats */
+ for (i = FirstOffsetNumber; i <= max; i++)
+ {
+ SpGistLeafTuple lt;
+
+ lt = (SpGistLeafTuple) PageGetItem(page,
+ PageGetItemId(page, i));
+ if (lt->tupstate == SPGIST_LIVE)
+ {
+ Assert(ItemPointerIsValid(&lt->heapPtr));
+
+ if (bds->callback(&lt->heapPtr, bds->callback_state))
+ {
+ bds->stats->tuples_removed += 1;
+ deletable[i] = true;
+ nDeletable++;
+ }
+ else
+ {
+ if (!forPending)
+ bds->stats->num_index_tuples += 1;
+ }
+
+ /* Form predecessor map, too */
+ if (SGLT_GET_NEXTOFFSET(lt) != InvalidOffsetNumber)
+ {
+ /* paranoia about corrupted chain links */
+ if (SGLT_GET_NEXTOFFSET(lt) < FirstOffsetNumber ||
+ SGLT_GET_NEXTOFFSET(lt) > max ||
+ predecessor[SGLT_GET_NEXTOFFSET(lt)] != InvalidOffsetNumber)
+ elog(ERROR, "inconsistent tuple chain links in page %u of index \"%s\"",
+ BufferGetBlockNumber(buffer),
+ RelationGetRelationName(index));
+ predecessor[SGLT_GET_NEXTOFFSET(lt)] = i;
+ }
+ }
+ else if (lt->tupstate == SPGIST_REDIRECT)
+ {
+ SpGistDeadTuple dt = (SpGistDeadTuple) lt;
+
+ Assert(SGLT_GET_NEXTOFFSET(dt) == InvalidOffsetNumber);
+ Assert(ItemPointerIsValid(&dt->pointer));
+
+ /*
+ * Add target TID to pending list if the redirection could have
+ * happened since VACUUM started.
+ *
+ * Note: we could make a tighter test by seeing if the xid is
+ * "running" according to the active snapshot; but snapmgr.c
+ * doesn't currently export a suitable API, and it's not entirely
+ * clear that a tighter test is worth the cycles anyway.
+ */
+ if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin))
+ spgAddPendingTID(bds, &dt->pointer);
+ }
+ else
+ {
+ Assert(SGLT_GET_NEXTOFFSET(lt) == InvalidOffsetNumber);
+ }
+ }
+
+ if (nDeletable == 0)
+ return; /* nothing more to do */
+
+ /*----------
+ * Figure out exactly what we have to do. We do this separately from
+ * actually modifying the page, mainly so that we have a representation
+ * that can be dumped into WAL and then the replay code can do exactly
+ * the same thing. The output of this step consists of six arrays
+ * describing four kinds of operations, to be performed in this order:
+ *
+ * toDead[]: tuple numbers to be replaced with DEAD tuples
+ * toPlaceholder[]: tuple numbers to be replaced with PLACEHOLDER tuples
+ * moveSrc[]: tuple numbers that need to be relocated to another offset
+ * (replacing the tuple there) and then replaced with PLACEHOLDER tuples
+ * moveDest[]: new locations for moveSrc tuples
+ * chainSrc[]: tuple numbers whose chain links (nextOffset) need updates
+ * chainDest[]: new values of nextOffset for chainSrc members
+ *
+ * It's easiest to figure out what we have to do by processing tuple
+ * chains, so we iterate over all the tuples (not just the deletable
+ * ones!) to identify chain heads, then chase down each chain and make
+ * work item entries for deletable tuples within the chain.
+ *----------
+ */
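+
+ /*
+ * Worked example (illustrative): for a chain 1 -> 2 -> 3 in which only
+ * tuple 2 is deletable, the loop below emits toPlaceholder = {2} and one
+ * chain update (chainSrc, chainDest) = (1, 3).  If instead the head
+ * tuple 1 is the deletable one, the first live tuple is relocated via
+ * (moveSrc, moveDest) = (2, 1), keeping the chain-head offset stable for
+ * links from outside the page.
+ */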
+ xlrec.nDead = xlrec.nPlaceholder = xlrec.nMove = xlrec.nChain = 0;
+
+ for (i = FirstOffsetNumber; i <= max; i++)
+ {
+ SpGistLeafTuple head;
+ bool interveningDeletable;
+ OffsetNumber prevLive;
+ OffsetNumber j;
+
+ head = (SpGistLeafTuple) PageGetItem(page,
+ PageGetItemId(page, i));
+ if (head->tupstate != SPGIST_LIVE)
+ continue; /* can't be a chain member */
+ if (predecessor[i] != 0)
+ continue; /* not a chain head */
+
+ /* initialize ... */
+ interveningDeletable = false;
+ prevLive = deletable[i] ? InvalidOffsetNumber : i;
+
+ /* scan down the chain ... */
+ j = SGLT_GET_NEXTOFFSET(head);
+ while (j != InvalidOffsetNumber)
+ {
+ SpGistLeafTuple lt;
+
+ lt = (SpGistLeafTuple) PageGetItem(page,
+ PageGetItemId(page, j));
+ if (lt->tupstate != SPGIST_LIVE)
+ {
+ /* all tuples in chain should be live */
+ elog(ERROR, "unexpected SPGiST tuple state: %d",
+ lt->tupstate);
+ }
+
+ if (deletable[j])
+ {
+ /* This tuple should be replaced by a placeholder */
+ toPlaceholder[xlrec.nPlaceholder] = j;
+ xlrec.nPlaceholder++;
+ /* previous live tuple's chain link will need an update */
+ interveningDeletable = true;
+ }
+ else if (prevLive == InvalidOffsetNumber)
+ {
+ /*
+ * This is the first live tuple in the chain. It has to move
+ * to the head position.
+ */
+ moveSrc[xlrec.nMove] = j;
+ moveDest[xlrec.nMove] = i;
+ xlrec.nMove++;
+ /* Chain updates will be applied after the move */
+ prevLive = i;
+ interveningDeletable = false;
+ }
+ else
+ {
+ /*
+ * Second or later live tuple. Arrange to re-chain it to the
+ * previous live one, if there was a gap.
+ */
+ if (interveningDeletable)
+ {
+ chainSrc[xlrec.nChain] = prevLive;
+ chainDest[xlrec.nChain] = j;
+ xlrec.nChain++;
+ }
+ prevLive = j;
+ interveningDeletable = false;
+ }
+
+ j = SGLT_GET_NEXTOFFSET(lt);
+ }
+
+ if (prevLive == InvalidOffsetNumber)
+ {
+ /* The chain is entirely removable, so we need a DEAD tuple */
+ toDead[xlrec.nDead] = i;
+ xlrec.nDead++;
+ }
+ else if (interveningDeletable)
+ {
+ /* One or more deletions at end of chain, so close it off */
+ chainSrc[xlrec.nChain] = prevLive;
+ chainDest[xlrec.nChain] = InvalidOffsetNumber;
+ xlrec.nChain++;
+ }
+ }
+
+ /* sanity check ... */
+ if (nDeletable != xlrec.nDead + xlrec.nPlaceholder + xlrec.nMove)
+ elog(ERROR, "inconsistent counts of deletable tuples");
+
+ /* Do the updates */
+ START_CRIT_SECTION();
+
+ spgPageIndexMultiDelete(&bds->spgstate, page,
+ toDead, xlrec.nDead,
+ SPGIST_DEAD, SPGIST_DEAD,
+ InvalidBlockNumber, InvalidOffsetNumber);
+
+ spgPageIndexMultiDelete(&bds->spgstate, page,
+ toPlaceholder, xlrec.nPlaceholder,
+ SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
+ InvalidBlockNumber, InvalidOffsetNumber);
+
+ /*
+ * We implement the move step by swapping the line pointers of the source
+ * and target tuples, then replacing the newly-source tuples with
+ * placeholders. This is perhaps unduly friendly with the page data
+ * representation, but it's fast and doesn't risk page overflow when a
+ * tuple to be relocated is large.
+ */
+ for (i = 0; i < xlrec.nMove; i++)
+ {
+ ItemId idSrc = PageGetItemId(page, moveSrc[i]);
+ ItemId idDest = PageGetItemId(page, moveDest[i]);
+ ItemIdData tmp;
+
+ tmp = *idSrc;
+ *idSrc = *idDest;
+ *idDest = tmp;
+ }
+
+ spgPageIndexMultiDelete(&bds->spgstate, page,
+ moveSrc, xlrec.nMove,
+ SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
+ InvalidBlockNumber, InvalidOffsetNumber);
+
+ for (i = 0; i < xlrec.nChain; i++)
+ {
+ SpGistLeafTuple lt;
+
+ lt = (SpGistLeafTuple) PageGetItem(page,
+ PageGetItemId(page, chainSrc[i]));
+ Assert(lt->tupstate == SPGIST_LIVE);
+ SGLT_SET_NEXTOFFSET(lt, chainDest[i]);
+ }
+
+ MarkBufferDirty(buffer);
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+
+ STORE_STATE(&bds->spgstate, xlrec.stateSrc);
+
+ XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumLeaf);
+ /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */
+ XLogRegisterData((char *) toDead, sizeof(OffsetNumber) * xlrec.nDead);
+ XLogRegisterData((char *) toPlaceholder, sizeof(OffsetNumber) * xlrec.nPlaceholder);
+ XLogRegisterData((char *) moveSrc, sizeof(OffsetNumber) * xlrec.nMove);
+ XLogRegisterData((char *) moveDest, sizeof(OffsetNumber) * xlrec.nMove);
+ XLogRegisterData((char *) chainSrc, sizeof(OffsetNumber) * xlrec.nChain);
+ XLogRegisterData((char *) chainDest, sizeof(OffsetNumber) * xlrec.nChain);
+
+ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_LEAF);
+
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+}
+
+/*
+ * Vacuum a root page when it is also a leaf
+ *
+ * On the root, we just delete any dead leaf tuples; no fancy business
+ */
+static void
+vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer)
+{
+ Page page = BufferGetPage(buffer);
+ spgxlogVacuumRoot xlrec;
+ OffsetNumber toDelete[MaxIndexTuplesPerPage];
+ OffsetNumber i,
+ max = PageGetMaxOffsetNumber(page);
+
+ xlrec.nDelete = 0;
+
+ /* Scan page, identify tuples to delete, accumulate stats */
+ for (i = FirstOffsetNumber; i <= max; i++)
+ {
+ SpGistLeafTuple lt;
+
+ lt = (SpGistLeafTuple) PageGetItem(page,
+ PageGetItemId(page, i));
+ if (lt->tupstate == SPGIST_LIVE)
+ {
+ Assert(ItemPointerIsValid(&lt->heapPtr));
+
+ if (bds->callback(&lt->heapPtr, bds->callback_state))
+ {
+ bds->stats->tuples_removed += 1;
+ toDelete[xlrec.nDelete] = i;
+ xlrec.nDelete++;
+ }
+ else
+ {
+ bds->stats->num_index_tuples += 1;
+ }
+ }
+ else
+ {
+ /* all tuples on root should be live */
+ elog(ERROR, "unexpected SPGiST tuple state: %d",
+ lt->tupstate);
+ }
+ }
+
+ if (xlrec.nDelete == 0)
+ return; /* nothing more to do */
+
+ /* Do the update */
+ START_CRIT_SECTION();
+
+ /* The tuple numbers are in order, so we can use PageIndexMultiDelete */
+ PageIndexMultiDelete(page, toDelete, xlrec.nDelete);
+
+ MarkBufferDirty(buffer);
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+
+ /* Prepare WAL record */
+ STORE_STATE(&bds->spgstate, xlrec.stateSrc);
+
+ XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRoot);
+ /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */
+ XLogRegisterData((char *) toDelete,
+ sizeof(OffsetNumber) * xlrec.nDelete);
+
+ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_ROOT);
+
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+}
+
+/*
+ * Clean up redirect and placeholder tuples on the given page
+ *
+ * Redirect tuples can be marked placeholder once they're old enough.
+ * Placeholder tuples can be removed if it won't change the offsets of
+ * non-placeholder ones.
+ *
+ * Unlike the routines above, this works on both leaf and inner pages.
+ */
+static void
+vacuumRedirectAndPlaceholder(Relation index, Buffer buffer)
+{
+ Page page = BufferGetPage(buffer);
+ SpGistPageOpaque opaque = SpGistPageGetOpaque(page);
+ OffsetNumber i,
+ max = PageGetMaxOffsetNumber(page),
+ firstPlaceholder = InvalidOffsetNumber;
+ bool hasNonPlaceholder = false;
+ bool hasUpdate = false;
+ OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPage];
+ OffsetNumber itemnos[MaxIndexTuplesPerPage];
+ spgxlogVacuumRedirect xlrec;
+ GlobalVisState *vistest;
+
+ xlrec.nToPlaceholder = 0;
+ xlrec.newestRedirectXid = InvalidTransactionId;
+
+ /* XXX: providing heap relation would allow more pruning */
+ vistest = GlobalVisTestFor(NULL);
+
+ START_CRIT_SECTION();
+
+ /*
+ * Scan backwards to convert old redirection tuples to placeholder tuples,
+ * and identify the location of the last non-placeholder tuple while we're at it.
+ */
+ for (i = max;
+ i >= FirstOffsetNumber &&
+ (opaque->nRedirection > 0 || !hasNonPlaceholder);
+ i--)
+ {
+ SpGistDeadTuple dt;
+
+ dt = (SpGistDeadTuple) PageGetItem(page, PageGetItemId(page, i));
+
+ if (dt->tupstate == SPGIST_REDIRECT &&
+ GlobalVisTestIsRemovableXid(vistest, dt->xid))
+ {
+ dt->tupstate = SPGIST_PLACEHOLDER;
+ Assert(opaque->nRedirection > 0);
+ opaque->nRedirection--;
+ opaque->nPlaceholder++;
+
+ /* remember newest XID among the removed redirects */
+ if (!TransactionIdIsValid(xlrec.newestRedirectXid) ||
+ TransactionIdPrecedes(xlrec.newestRedirectXid, dt->xid))
+ xlrec.newestRedirectXid = dt->xid;
+
+ ItemPointerSetInvalid(&dt->pointer);
+
+ itemToPlaceholder[xlrec.nToPlaceholder] = i;
+ xlrec.nToPlaceholder++;
+
+ hasUpdate = true;
+ }
+
+ if (dt->tupstate == SPGIST_PLACEHOLDER)
+ {
+ if (!hasNonPlaceholder)
+ firstPlaceholder = i;
+ }
+ else
+ {
+ hasNonPlaceholder = true;
+ }
+ }
+
+ /*
+ * Any placeholder tuples at the end of page can safely be removed. We
+ * can't remove ones before the last non-placeholder, though, because we
+ * can't alter the offset numbers of non-placeholder tuples.
+ */
+ if (firstPlaceholder != InvalidOffsetNumber)
+ {
+ /*
+ * We do not store this array in the WAL record (rdata), because it is
+ * easy to recreate at replay.
+ */
+ for (i = firstPlaceholder; i <= max; i++)
+ itemnos[i - firstPlaceholder] = i;
+
+ i = max - firstPlaceholder + 1;
+ Assert(opaque->nPlaceholder >= i);
+ opaque->nPlaceholder -= i;
+
+ /* The array is surely sorted, so can use PageIndexMultiDelete */
+ PageIndexMultiDelete(page, itemnos, i);
+
+ hasUpdate = true;
+ }
+
+ xlrec.firstPlaceholder = firstPlaceholder;
+
+ if (hasUpdate)
+ MarkBufferDirty(buffer);
+
+ if (hasUpdate && RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+
+ XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRedirect);
+ XLogRegisterData((char *) itemToPlaceholder,
+ sizeof(OffsetNumber) * xlrec.nToPlaceholder);
+
+ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_REDIRECT);
+
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+}
+
+/*
+ * Process one page during a bulkdelete scan
+ */
+static void
+spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
+{
+ Relation index = bds->info->index;
+ Buffer buffer;
+ Page page;
+
+ /* call vacuum_delay_point while not holding any buffer lock */
+ vacuum_delay_point();
+
+ buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
+ RBM_NORMAL, bds->info->strategy);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ page = (Page) BufferGetPage(buffer);
+
+ if (PageIsNew(page))
+ {
+ /*
+ * We found an all-zero page, which could happen if the database
+ * crashed just after extending the file. Recycle it.
+ */
+ }
+ else if (PageIsEmpty(page))
+ {
+ /* nothing to do */
+ }
+ else if (SpGistPageIsLeaf(page))
+ {
+ if (SpGistBlockIsRoot(blkno))
+ {
+ vacuumLeafRoot(bds, index, buffer);
+ /*
+ * no need for vacuumRedirectAndPlaceholder: a root leaf page never
+ * contains redirect or placeholder tuples
+ */
+ }
+ else
+ {
+ vacuumLeafPage(bds, index, buffer, false);
+ vacuumRedirectAndPlaceholder(index, buffer);
+ }
+ }
+ else
+ {
+ /* inner page */
+ vacuumRedirectAndPlaceholder(index, buffer);
+ }
+
+ /*
+ * The root pages must never be deleted, nor marked as available in FSM,
+ * because we don't want them ever returned by a search for a place to put
+ * a new tuple. Otherwise, check for empty page, and make sure the FSM
+ * knows about it.
+ */
+ if (!SpGistBlockIsRoot(blkno))
+ {
+ if (PageIsNew(page) || PageIsEmpty(page))
+ {
+ RecordFreeIndexPage(index, blkno);
+ bds->stats->pages_deleted++;
+ }
+ else
+ {
+ SpGistSetLastUsedPage(index, buffer);
+ bds->lastFilledBlock = blkno;
+ }
+ }
+
+ UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * Process the pending-TID list between pages of the main scan
+ */
+static void
+spgprocesspending(spgBulkDeleteState *bds)
+{
+ Relation index = bds->info->index;
+ spgVacPendingItem *pitem;
+ spgVacPendingItem *nitem;
+ BlockNumber blkno;
+ Buffer buffer;
+ Page page;
+
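+ /*
+ * Each pending item is an index TID found in a redirect tuple or an
+ * inner-tuple downlink; chase it so that leaf tuples moved to
+ * already-scanned pages still get visited.
+ */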
+ for (pitem = bds->pendingList; pitem != NULL; pitem = pitem->next)
+ {
+ if (pitem->done)
+ continue; /* ignore already-done items */
+
+ /* call vacuum_delay_point while not holding any buffer lock */
+ vacuum_delay_point();
+
+ /* examine the referenced page */
+ blkno = ItemPointerGetBlockNumber(&pitem->tid);
+ buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
+ RBM_NORMAL, bds->info->strategy);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ page = (Page) BufferGetPage(buffer);
+
+ if (PageIsNew(page) || SpGistPageIsDeleted(page))
+ {
+ /* Probably shouldn't happen, but ignore it */
+ }
+ else if (SpGistPageIsLeaf(page))
+ {
+ if (SpGistBlockIsRoot(blkno))
+ {
+ /* this should definitely not happen */
+ elog(ERROR, "redirection leads to root page of index \"%s\"",
+ RelationGetRelationName(index));
+ }
+
+ /* deal with any deletable tuples */
+ vacuumLeafPage(bds, index, buffer, true);
+ /* might as well do this while we are here */
+ vacuumRedirectAndPlaceholder(index, buffer);
+
+ SpGistSetLastUsedPage(index, buffer);
+
+ /*
+ * We can mark as done not only this item, but any later ones
+ * pointing at the same page, since we vacuumed the whole page.
+ */
+ pitem->done = true;
+ for (nitem = pitem->next; nitem != NULL; nitem = nitem->next)
+ {
+ if (ItemPointerGetBlockNumber(&nitem->tid) == blkno)
+ nitem->done = true;
+ }
+ }
+ else
+ {
+ /*
+ * On an inner page, visit the referenced inner tuple and add all
+ * its downlinks to the pending list. We might have pending items
+ * for more than one inner tuple on the same page (in fact this is
+ * pretty likely given the way space allocation works), so get
+ * them all while we are here.
+ */
+ for (nitem = pitem; nitem != NULL; nitem = nitem->next)
+ {
+ if (nitem->done)
+ continue;
+ if (ItemPointerGetBlockNumber(&nitem->tid) == blkno)
+ {
+ OffsetNumber offset;
+ SpGistInnerTuple innerTuple;
+
+ offset = ItemPointerGetOffsetNumber(&nitem->tid);
+ innerTuple = (SpGistInnerTuple) PageGetItem(page,
+ PageGetItemId(page, offset));
+ if (innerTuple->tupstate == SPGIST_LIVE)
+ {
+ SpGistNodeTuple node;
+ int i;
+
+ SGITITERATE(innerTuple, i, node)
+ {
+ if (ItemPointerIsValid(&node->t_tid))
+ spgAddPendingTID(bds, &node->t_tid);
+ }
+ }
+ else if (innerTuple->tupstate == SPGIST_REDIRECT)
+ {
+ /* transfer attention to redirect point */
+ spgAddPendingTID(bds,
+ &((SpGistDeadTuple) innerTuple)->pointer);
+ }
+ else
+ elog(ERROR, "unexpected SPGiST tuple state: %d",
+ innerTuple->tupstate);
+
+ nitem->done = true;
+ }
+ }
+ }
+
+ UnlockReleaseBuffer(buffer);
+ }
+
+ spgClearPendingList(bds);
+}
+
+/*
+ * Perform a bulkdelete scan
+ */
+static void
+spgvacuumscan(spgBulkDeleteState *bds)
+{
+ Relation index = bds->info->index;
+ bool needLock;
+ BlockNumber num_pages,
+ blkno;
+
+ /* Finish setting up spgBulkDeleteState */
+ initSpGistState(&bds->spgstate, index);
+ bds->pendingList = NULL;
+ bds->myXmin = GetActiveSnapshot()->xmin;
+ bds->lastFilledBlock = SPGIST_LAST_FIXED_BLKNO;
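+
+ /*
+ * lastFilledBlock remembers the last nonempty page seen, for use by
+ * the (currently disabled) truncation logic at the end of the scan.
+ */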
+
+ /*
+ * Reset counts that will be incremented during the scan; needed in case
+ * of multiple scans during a single VACUUM command
+ */
+ bds->stats->estimated_count = false;
+ bds->stats->num_index_tuples = 0;
+ bds->stats->pages_deleted = 0;
+
+ /* We can skip locking for new or temp relations */
+ needLock = !RELATION_IS_LOCAL(index);
+
+ /*
+ * The outer loop iterates over all index pages except the metapage, in
+ * physical order (we hope the kernel will cooperate in providing
+ * read-ahead for speed). It is critical that we visit all leaf pages,
+ * including ones added after we start the scan, else we might fail to
+ * delete some deletable tuples. See more extensive comments about this
+ * in btvacuumscan().
+ */
+ blkno = SPGIST_METAPAGE_BLKNO + 1;
+ for (;;)
+ {
+ /* Get the current relation length */
+ if (needLock)
+ LockRelationForExtension(index, ExclusiveLock);
+ num_pages = RelationGetNumberOfBlocks(index);
+ if (needLock)
+ UnlockRelationForExtension(index, ExclusiveLock);
+
+ /* Quit if we've scanned the whole relation */
+ if (blkno >= num_pages)
+ break;
+ /* Iterate over pages, then loop back to recheck length */
+ for (; blkno < num_pages; blkno++)
+ {
+ spgvacuumpage(bds, blkno);
+ /* empty the pending-list after each page */
+ if (bds->pendingList != NULL)
+ spgprocesspending(bds);
+ }
+ }
+
+ /* Propagate local lastUsedPages cache to metablock */
+ SpGistUpdateMetaPage(index);
+
+ /*
+ * If we found any empty pages (and recorded them in the FSM), then
+ * forcibly update the upper-level FSM pages to ensure that searchers can
+ * find them. It's possible that the pages were also found during
+ * previous scans and so this is a waste of time, but it's cheap enough
+ * relative to scanning the index that it shouldn't matter much, and
+ * making sure that free pages are available sooner not later seems
+ * worthwhile.
+ *
+ * Note that if no empty pages exist, we don't bother vacuuming the FSM at
+ * all.
+ */
+ if (bds->stats->pages_deleted > 0)
+ IndexFreeSpaceMapVacuum(index);
+
+ /*
+ * Truncate index if possible
+ *
+ * XXX disabled because it's unsafe due to possible concurrent inserts.
+ * We'd have to rescan the pages to make sure they're still empty, and it
+ * doesn't seem worth it. Note that btree doesn't do this either.
+ *
+ * Another reason not to truncate is that it could invalidate the cached
+ * pages-with-freespace pointers in the metapage and other backends'
+ * relation caches, that is, leave them pointing to nonexistent pages.
+ * Adding RelationGetNumberOfBlocks calls to protect the places that use
+ * those pointers would be unduly expensive.
+ */
+#ifdef NOT_USED
+ if (num_pages > bds->lastFilledBlock + 1)
+ {
+ BlockNumber lastBlock = num_pages - 1;
+
+ num_pages = bds->lastFilledBlock + 1;
+ RelationTruncate(index, num_pages);
+ bds->stats->pages_removed += lastBlock - bds->lastFilledBlock;
+ bds->stats->pages_deleted -= lastBlock - bds->lastFilledBlock;
+ }
+#endif
+
+ /* Report final stats */
+ bds->stats->num_pages = num_pages;
+ bds->stats->pages_newly_deleted = bds->stats->pages_deleted;
+ bds->stats->pages_free = bds->stats->pages_deleted;
+}
+
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+spgbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state)
+{
+ spgBulkDeleteState bds;
+
+ /* allocate stats if first time through, else re-use existing struct */
+ if (stats == NULL)
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+ bds.info = info;
+ bds.stats = stats;
+ bds.callback = callback;
+ bds.callback_state = callback_state;
+
+ spgvacuumscan(&bds);
+
+ return stats;
+}
+
+/* Dummy callback to delete no tuples during spgvacuumcleanup */
+static bool
+dummy_callback(ItemPointer itemptr, void *state)
+{
+ return false;
+}
+
+/*
+ * Post-VACUUM cleanup.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+spgvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
+{
+ spgBulkDeleteState bds;
+
+ /* No-op in ANALYZE ONLY mode */
+ if (info->analyze_only)
+ return stats;
+
+ /*
+ * We don't need to scan the index if there was a preceding bulkdelete
+ * pass. Otherwise, make a pass that won't delete any live tuples, but
+ * might still accomplish useful stuff with redirect/placeholder cleanup
+ * and/or FSM housekeeping, and in any case will provide stats.
+ */
+ if (stats == NULL)
+ {
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+ bds.info = info;
+ bds.stats = stats;
+ bds.callback = dummy_callback;
+ bds.callback_state = NULL;
+
+ spgvacuumscan(&bds);
+ }
+
+ /*
+ * It's quite possible for us to be fooled by concurrent tuple moves into
+ * double-counting some index tuples, so disbelieve any total that exceeds
+ * the underlying heap's count ... if we know that accurately. Otherwise
+ * this might just make matters worse.
+ */
+ if (!info->estimated_count)
+ {
+ if (stats->num_index_tuples > info->num_heap_tuples)
+ stats->num_index_tuples = info->num_heap_tuples;
+ }
+
+ return stats;
+}
diff --git a/src/backend/access/spgist/spgvalidate.c b/src/backend/access/spgist/spgvalidate.c
new file mode 100644
index 0000000..82281f7
--- /dev/null
+++ b/src/backend/access/spgist/spgvalidate.c
@@ -0,0 +1,392 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgvalidate.c
+ * Opclass validator for SP-GiST.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgvalidate.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/amvalidate.h"
+#include "access/htup_details.h"
+#include "access/spgist_private.h"
+#include "catalog/pg_amop.h"
+#include "catalog/pg_amproc.h"
+#include "catalog/pg_opclass.h"
+#include "catalog/pg_opfamily.h"
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/regproc.h"
+#include "utils/syscache.h"
+
+
+/*
+ * Validator for an SP-GiST opclass.
+ *
+ * Some of the checks done here cover the whole opfamily, and therefore are
+ * redundant when checking each opclass in a family. But they don't run long
+ * enough to be much of a problem, so we accept the duplication rather than
+ * complicate the amvalidate API.
+ */
+bool
+spgvalidate(Oid opclassoid)
+{
+ bool result = true;
+ HeapTuple classtup;
+ Form_pg_opclass classform;
+ Oid opfamilyoid;
+ Oid opcintype;
+ Oid opckeytype;
+ char *opclassname;
+ HeapTuple familytup;
+ Form_pg_opfamily familyform;
+ char *opfamilyname;
+ CatCList *proclist,
+ *oprlist;
+ List *grouplist;
+ OpFamilyOpFuncGroup *opclassgroup;
+ int i;
+ ListCell *lc;
+ spgConfigIn configIn;
+ spgConfigOut configOut;
+ Oid configOutLefttype = InvalidOid;
+ Oid configOutRighttype = InvalidOid;
+ Oid configOutLeafType = InvalidOid;
+
+ /* Fetch opclass information */
+ classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid));
+ if (!HeapTupleIsValid(classtup))
+ elog(ERROR, "cache lookup failed for operator class %u", opclassoid);
+ classform = (Form_pg_opclass) GETSTRUCT(classtup);
+
+ opfamilyoid = classform->opcfamily;
+ opcintype = classform->opcintype;
+ opckeytype = classform->opckeytype;
+ opclassname = NameStr(classform->opcname);
+
+ /* Fetch opfamily information */
+ familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid));
+ if (!HeapTupleIsValid(familytup))
+ elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid);
+ familyform = (Form_pg_opfamily) GETSTRUCT(familytup);
+
+ opfamilyname = NameStr(familyform->opfname);
+
+ /* Fetch all operators and support functions of the opfamily */
+ oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid));
+ proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid));
+ grouplist = identify_opfamily_groups(oprlist, proclist);
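+
+ /* grouplist classifies the operators and functions by left/right input types */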
+
+ /* Check individual support functions */
+ for (i = 0; i < proclist->n_members; i++)
+ {
+ HeapTuple proctup = &proclist->members[i]->tuple;
+ Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup);
+ bool ok;
+
+ /*
+ * All SP-GiST support functions should be registered with matching
+ * left/right types
+ */
+ if (procform->amproclefttype != procform->amprocrighttype)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains support function %s with different left and right input types",
+ opfamilyname, "spgist",
+ format_procedure(procform->amproc))));
+ result = false;
+ }
+
+ /* Check procedure numbers and function signatures */
+ switch (procform->amprocnum)
+ {
+ case SPGIST_CONFIG_PROC:
+ ok = check_amproc_signature(procform->amproc, VOIDOID, true,
+ 2, 2, INTERNALOID, INTERNALOID);
+ configIn.attType = procform->amproclefttype;
+ memset(&configOut, 0, sizeof(configOut));
+
+ OidFunctionCall2(procform->amproc,
+ PointerGetDatum(&configIn),
+ PointerGetDatum(&configOut));
+
+ configOutLefttype = procform->amproclefttype;
+ configOutRighttype = procform->amprocrighttype;
+
+ /* Default leaf type is opckeytype or input type */
+ if (OidIsValid(opckeytype))
+ configOutLeafType = opckeytype;
+ else
+ configOutLeafType = procform->amproclefttype;
+
+ /* If some other leaf datum type is specified, warn */
+ if (OidIsValid(configOut.leafType) &&
+ configOutLeafType != configOut.leafType)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("SP-GiST leaf data type %s does not match declared type %s",
+ format_type_be(configOut.leafType),
+ format_type_be(configOutLeafType))));
+ result = false;
+ configOutLeafType = configOut.leafType;
+ }
+
+ /*
+ * When the leaf and attribute types are the same, the compress
+ * function is not required, so we set the corresponding bit in
+ * functionset for the later group consistency check.
+ */
+ if (configOutLeafType == configIn.attType)
+ {
+ foreach(lc, grouplist)
+ {
+ OpFamilyOpFuncGroup *group = lfirst(lc);
+
+ if (group->lefttype == procform->amproclefttype &&
+ group->righttype == procform->amprocrighttype)
+ {
+ group->functionset |=
+ ((uint64) 1) << SPGIST_COMPRESS_PROC;
+ break;
+ }
+ }
+ }
+ break;
+ case SPGIST_CHOOSE_PROC:
+ case SPGIST_PICKSPLIT_PROC:
+ case SPGIST_INNER_CONSISTENT_PROC:
+ ok = check_amproc_signature(procform->amproc, VOIDOID, true,
+ 2, 2, INTERNALOID, INTERNALOID);
+ break;
+ case SPGIST_LEAF_CONSISTENT_PROC:
+ ok = check_amproc_signature(procform->amproc, BOOLOID, true,
+ 2, 2, INTERNALOID, INTERNALOID);
+ break;
+ case SPGIST_COMPRESS_PROC:
+ if (configOutLefttype != procform->amproclefttype ||
+ configOutRighttype != procform->amprocrighttype)
+ ok = false;
+ else
+ ok = check_amproc_signature(procform->amproc,
+ configOutLeafType, true,
+ 1, 1, procform->amproclefttype);
+ break;
+ case SPGIST_OPTIONS_PROC:
+ ok = check_amoptsproc_signature(procform->amproc);
+ break;
+ default:
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d",
+ opfamilyname, "spgist",
+ format_procedure(procform->amproc),
+ procform->amprocnum)));
+ result = false;
+ continue; /* don't want additional message */
+ }
+
+ if (!ok)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d",
+ opfamilyname, "spgist",
+ format_procedure(procform->amproc),
+ procform->amprocnum)));
+ result = false;
+ }
+ }
+
+ /* Check individual operators */
+ for (i = 0; i < oprlist->n_members; i++)
+ {
+ HeapTuple oprtup = &oprlist->members[i]->tuple;
+ Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup);
+ Oid op_rettype;
+
+ /* TODO: Check that only allowed strategy numbers exist */
+ if (oprform->amopstrategy < 1 || oprform->amopstrategy > 63)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d",
+ opfamilyname, "spgist",
+ format_operator(oprform->amopopr),
+ oprform->amopstrategy)));
+ result = false;
+ }
+
+ /* spgist supports ORDER BY operators */
+ if (oprform->amoppurpose != AMOP_SEARCH)
+ {
+ /* ... and operator result must match the claimed btree opfamily */
+ op_rettype = get_op_rettype(oprform->amopopr);
+ if (!opfamily_can_sort_type(oprform->amopsortfamily, op_rettype))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s",
+ opfamilyname, "spgist",
+ format_operator(oprform->amopopr))));
+ result = false;
+ }
+ }
+ else
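+ /* search operators must return boolean */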
+ op_rettype = BOOLOID;
+
+ /* Check operator signature --- same for all spgist strategies */
+ if (!check_amop_signature(oprform->amopopr, op_rettype,
+ oprform->amoplefttype,
+ oprform->amoprighttype))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature",
+ opfamilyname, "spgist",
+ format_operator(oprform->amopopr))));
+ result = false;
+ }
+ }
+
+ /* Now check for inconsistent groups of operators/functions */
+ opclassgroup = NULL;
+ foreach(lc, grouplist)
+ {
+ OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc);
+
+ /* Remember the group exactly matching the test opclass */
+ if (thisgroup->lefttype == opcintype &&
+ thisgroup->righttype == opcintype)
+ opclassgroup = thisgroup;
+
+ /*
+ * Complain if there are any datatype pairs with functions but no
+ * operators. This is about the best we can do for now to detect
+ * missing operators.
+ */
+ if (thisgroup->operatorset == 0)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s is missing operator(s) for types %s and %s",
+ opfamilyname, "spgist",
+ format_type_be(thisgroup->lefttype),
+ format_type_be(thisgroup->righttype))));
+ result = false;
+ }
+
+ /*
+ * Complain if we're missing functions for any datatype, remembering
+ * that SP-GiST doesn't use cross-type support functions.
+ */
+ if (thisgroup->lefttype != thisgroup->righttype)
+ continue;
+
+ for (i = 1; i <= SPGISTNProc; i++)
+ {
+ if ((thisgroup->functionset & (((uint64) 1) << i)) != 0)
+ continue; /* got it */
+ if (i == SPGIST_OPTIONS_PROC)
+ continue; /* optional method */
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s is missing support function %d for type %s",
+ opfamilyname, "spgist", i,
+ format_type_be(thisgroup->lefttype))));
+ result = false;
+ }
+ }
+
+ /* Check that the originally-named opclass is supported */
+ /* (if group is there, we already checked it adequately above) */
+ if (!opclassgroup)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator class \"%s\" of access method %s is missing operator(s)",
+ opclassname, "spgist")));
+ result = false;
+ }
+
+ ReleaseCatCacheList(proclist);
+ ReleaseCatCacheList(oprlist);
+ ReleaseSysCache(familytup);
+ ReleaseSysCache(classtup);
+
+ return result;
+}
+
+/*
+ * Prechecking function for adding operators/functions to an SP-GiST opfamily.
+ */
+void
+spgadjustmembers(Oid opfamilyoid,
+ Oid opclassoid,
+ List *operators,
+ List *functions)
+{
+ ListCell *lc;
+
+ /*
+ * Operator members of an SP-GiST opfamily should never have hard
+ * dependencies, since their connection to the opfamily depends only on
+ * what the support functions think, and that can be altered. For
+ * consistency, we make all soft dependencies point to the opfamily,
+ * though a soft dependency on the opclass would work as well in the
+ * CREATE OPERATOR CLASS case.
+ */
+ foreach(lc, operators)
+ {
+ OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+ op->ref_is_hard = false;
+ op->ref_is_family = true;
+ op->refobjid = opfamilyoid;
+ }
+
+ /*
+ * Required support functions should have hard dependencies. Preferably
+ * those are just dependencies on the opclass, but if we're in ALTER
+ * OPERATOR FAMILY, we leave the dependency pointing at the whole
+ * opfamily. (Given that SP-GiST opclasses generally don't share
+ * opfamilies, it seems unlikely to be worth working harder.)
+ */
+ foreach(lc, functions)
+ {
+ OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+ switch (op->number)
+ {
+ case SPGIST_CONFIG_PROC:
+ case SPGIST_CHOOSE_PROC:
+ case SPGIST_PICKSPLIT_PROC:
+ case SPGIST_INNER_CONSISTENT_PROC:
+ case SPGIST_LEAF_CONSISTENT_PROC:
+ /* Required support function */
+ op->ref_is_hard = true;
+ break;
+ case SPGIST_COMPRESS_PROC:
+ case SPGIST_OPTIONS_PROC:
+ /* Optional, so force it to be a soft family dependency */
+ op->ref_is_hard = false;
+ op->ref_is_family = true;
+ op->refobjid = opfamilyoid;
+ break;
+ default:
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("support function number %d is invalid for access method %s",
+ op->number, "spgist")));
+ break;
+ }
+ }
+}
diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c
new file mode 100644
index 0000000..b500b2c
--- /dev/null
+++ b/src/backend/access/spgist/spgxlog.c
@@ -0,0 +1,1013 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgxlog.c
+ * WAL replay logic for SP-GiST
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgxlog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/bufmask.h"
+#include "access/spgist_private.h"
+#include "access/spgxlog.h"
+#include "access/transam.h"
+#include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "storage/standby.h"
+#include "utils/memutils.h"
+
+
+static MemoryContext opCtx; /* working memory for operations */
+
+
+/*
+ * Prepare a dummy SpGistState, with just the minimum info needed for replay.
+ *
+ * At present, all we need is enough info to support spgFormDeadTuple(),
+ * plus the isBuild flag.
+ */
+static void
+fillFakeState(SpGistState *state, spgxlogState stateSrc)
+{
+ memset(state, 0, sizeof(*state));
+
+ state->myXid = stateSrc.myXid;
+ state->isBuild = stateSrc.isBuild;
+ state->deadTupleStorage = palloc0(SGDTSIZE);
+}
+
+/*
+ * Add a leaf tuple, or replace an existing placeholder tuple. This is used
+ * to replay SpGistPageAddNewItem() operations. If the offset points at an
+ * existing tuple, it had better be a placeholder tuple.
+ */
+static void
+addOrReplaceTuple(Page page, Item tuple, int size, OffsetNumber offset)
+{
+ if (offset <= PageGetMaxOffsetNumber(page))
+ {
+ SpGistDeadTuple dt = (SpGistDeadTuple) PageGetItem(page,
+ PageGetItemId(page, offset));
+
+ if (dt->tupstate != SPGIST_PLACEHOLDER)
+ elog(ERROR, "SPGiST tuple to be replaced is not a placeholder");
+
+ Assert(SpGistPageGetOpaque(page)->nPlaceholder > 0);
+ SpGistPageGetOpaque(page)->nPlaceholder--;
+
+ PageIndexTupleDelete(page, offset);
+ }
+
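+ /* either way, offset is now at most one past the end of the page */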
+ Assert(offset <= PageGetMaxOffsetNumber(page) + 1);
+
+ if (PageAddItem(page, tuple, size, offset, false, false) != offset)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ size);
+}
+
+static void
+spgRedoAddLeaf(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ char *ptr = XLogRecGetData(record);
+ spgxlogAddLeaf *xldata = (spgxlogAddLeaf *) ptr;
+ char *leafTuple;
+ SpGistLeafTupleData leafTupleHdr;
+ Buffer buffer;
+ Page page;
+ XLogRedoAction action;
+
+ ptr += sizeof(spgxlogAddLeaf);
+ leafTuple = ptr;
+ /* the leaf tuple is unaligned, so make a copy to access its header */
+ memcpy(&leafTupleHdr, leafTuple, sizeof(SpGistLeafTupleData));
+
+ /*
+ * In normal operation we would have both current and parent pages locked
+ * simultaneously; but in WAL replay it should be safe to update the leaf
+ * page before updating the parent.
+ */
+ if (xldata->newPage)
+ {
+ buffer = XLogInitBufferForRedo(record, 0);
+ SpGistInitBuffer(buffer,
+ SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0));
+ action = BLK_NEEDS_REDO;
+ }
+ else
+ action = XLogReadBufferForRedo(record, 0, &buffer);
+
+ if (action == BLK_NEEDS_REDO)
+ {
+ page = BufferGetPage(buffer);
+
+ /* insert new tuple */
+ if (xldata->offnumLeaf != xldata->offnumHeadLeaf)
+ {
+ /* normal cases, tuple was added by SpGistPageAddNewItem */
+ addOrReplaceTuple(page, (Item) leafTuple, leafTupleHdr.size,
+ xldata->offnumLeaf);
+
+ /* update head tuple's chain link if needed */
+ if (xldata->offnumHeadLeaf != InvalidOffsetNumber)
+ {
+ SpGistLeafTuple head;
+
+ head = (SpGistLeafTuple) PageGetItem(page,
+ PageGetItemId(page, xldata->offnumHeadLeaf));
+ Assert(SGLT_GET_NEXTOFFSET(head) == SGLT_GET_NEXTOFFSET(&leafTupleHdr));
+ SGLT_SET_NEXTOFFSET(head, xldata->offnumLeaf);
+ }
+ }
+ else
+ {
+ /* replacing a DEAD tuple */
+ PageIndexTupleDelete(page, xldata->offnumLeaf);
+ if (PageAddItem(page,
+ (Item) leafTuple, leafTupleHdr.size,
+ xldata->offnumLeaf, false, false) != xldata->offnumLeaf)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ leafTupleHdr.size);
+ }
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+
+ /* update parent downlink if necessary */
+ if (xldata->offnumParent != InvalidOffsetNumber)
+ {
+ if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
+ {
+ SpGistInnerTuple tuple;
+ BlockNumber blknoLeaf;
+
+ XLogRecGetBlockTag(record, 0, NULL, NULL, &blknoLeaf);
+
+ page = BufferGetPage(buffer);
+
+ tuple = (SpGistInnerTuple) PageGetItem(page,
+ PageGetItemId(page, xldata->offnumParent));
+
+ spgUpdateNodeLink(tuple, xldata->nodeI,
+ blknoLeaf, xldata->offnumLeaf);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+ }
+}
+
+static void
+spgRedoMoveLeafs(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ char *ptr = XLogRecGetData(record);
+ spgxlogMoveLeafs *xldata = (spgxlogMoveLeafs *) ptr;
+ SpGistState state;
+ OffsetNumber *toDelete;
+ OffsetNumber *toInsert;
+ int nInsert;
+ Buffer buffer;
+ Page page;
+ XLogRedoAction action;
+ BlockNumber blknoDst;
+
+ XLogRecGetBlockTag(record, 1, NULL, NULL, &blknoDst);
+
+ fillFakeState(&state, xldata->stateSrc);
+
+ nInsert = xldata->replaceDead ? 1 : xldata->nMoves + 1;
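+
+ /*
+ * If we replaced a DEAD tuple, only the new leaf tuple was inserted;
+ * otherwise the moved tuples were reinserted along with the new one.
+ */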
+
+ ptr += SizeOfSpgxlogMoveLeafs;
+ toDelete = (OffsetNumber *) ptr;
+ ptr += sizeof(OffsetNumber) * xldata->nMoves;
+ toInsert = (OffsetNumber *) ptr;
+ ptr += sizeof(OffsetNumber) * nInsert;
+
+ /* now ptr points to the list of leaf tuples */
+
+ /*
+ * In normal operation we would have all three pages (source, dest, and
+ * parent) locked simultaneously; but in WAL replay it should be safe to
+ * update them one at a time, as long as we do it in the right order.
+ */
+
+ /* Insert tuples on the dest page (do first, so redirect is valid) */
+ if (xldata->newPage)
+ {
+ buffer = XLogInitBufferForRedo(record, 1);
+ SpGistInitBuffer(buffer,
+ SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0));
+ action = BLK_NEEDS_REDO;
+ }
+ else
+ action = XLogReadBufferForRedo(record, 1, &buffer);
+
+ if (action == BLK_NEEDS_REDO)
+ {
+ int i;
+
+ page = BufferGetPage(buffer);
+
+ for (i = 0; i < nInsert; i++)
+ {
+ char *leafTuple;
+ SpGistLeafTupleData leafTupleHdr;
+
+ /*
+ * the tuples are not aligned, so must copy to access the size
+ * field.
+ */
+ leafTuple = ptr;
+ memcpy(&leafTupleHdr, leafTuple,
+ sizeof(SpGistLeafTupleData));
+
+ addOrReplaceTuple(page, (Item) leafTuple,
+ leafTupleHdr.size, toInsert[i]);
+ ptr += leafTupleHdr.size;
+ }
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+
+ /* Delete tuples from the source page, inserting a redirection pointer */
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = BufferGetPage(buffer);
+
+ spgPageIndexMultiDelete(&state, page, toDelete, xldata->nMoves,
+ state.isBuild ? SPGIST_PLACEHOLDER : SPGIST_REDIRECT,
+ SPGIST_PLACEHOLDER,
+ blknoDst,
+ toInsert[nInsert - 1]);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+
+ /* And update the parent downlink */
+ if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO)
+ {
+ SpGistInnerTuple tuple;
+
+ page = BufferGetPage(buffer);
+
+ tuple = (SpGistInnerTuple) PageGetItem(page,
+ PageGetItemId(page, xldata->offnumParent));
+
+ spgUpdateNodeLink(tuple, xldata->nodeI,
+ blknoDst, toInsert[nInsert - 1]);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
+spgRedoAddNode(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ char *ptr = XLogRecGetData(record);
+ spgxlogAddNode *xldata = (spgxlogAddNode *) ptr;
+ char *innerTuple;
+ SpGistInnerTupleData innerTupleHdr;
+ SpGistState state;
+ Buffer buffer;
+ Page page;
+ XLogRedoAction action;
+
+ ptr += sizeof(spgxlogAddNode);
+ innerTuple = ptr;
+ /* the tuple is unaligned, so make a copy to access its header */
+ memcpy(&innerTupleHdr, innerTuple, sizeof(SpGistInnerTupleData));
+
+ fillFakeState(&state, xldata->stateSrc);
+
+ if (!XLogRecHasBlockRef(record, 1))
+ {
+ /* update in place */
+ Assert(xldata->parentBlk == -1);
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = BufferGetPage(buffer);
+
+ PageIndexTupleDelete(page, xldata->offnum);
+ if (PageAddItem(page, (Item) innerTuple, innerTupleHdr.size,
+ xldata->offnum,
+ false, false) != xldata->offnum)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ innerTupleHdr.size);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+ }
+ else
+ {
+ BlockNumber blkno;
+ BlockNumber blknoNew;
+
+ XLogRecGetBlockTag(record, 0, NULL, NULL, &blkno);
+ XLogRecGetBlockTag(record, 1, NULL, NULL, &blknoNew);
+
+ /*
+ * In normal operation we would have all three pages (source, dest,
+ * and parent) locked simultaneously; but in WAL replay it should be
+ * safe to update them one at a time, as long as we do it in the right
+ * order. We must insert the new tuple before replacing the old tuple
+ * with the redirect tuple.
+ */
+
+ /* Install new tuple first so redirect is valid */
+ if (xldata->newPage)
+ {
+ /* AddNode is not used for nulls pages */
+ buffer = XLogInitBufferForRedo(record, 1);
+ SpGistInitBuffer(buffer, 0);
+ action = BLK_NEEDS_REDO;
+ }
+ else
+ action = XLogReadBufferForRedo(record, 1, &buffer);
+ if (action == BLK_NEEDS_REDO)
+ {
+ page = BufferGetPage(buffer);
+
+ addOrReplaceTuple(page, (Item) innerTuple,
+ innerTupleHdr.size, xldata->offnumNew);
+
+ /*
+ * If parent is in this same page, update it now.
+ */
+ if (xldata->parentBlk == 1)
+ {
+ SpGistInnerTuple parentTuple;
+
+ parentTuple = (SpGistInnerTuple) PageGetItem(page,
+ PageGetItemId(page, xldata->offnumParent));
+
+ spgUpdateNodeLink(parentTuple, xldata->nodeI,
+ blknoNew, xldata->offnumNew);
+ }
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+
+ /* Delete old tuple, replacing it with redirect or placeholder tuple */
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ SpGistDeadTuple dt;
+
+ page = BufferGetPage(buffer);
+
+ if (state.isBuild)
+ dt = spgFormDeadTuple(&state, SPGIST_PLACEHOLDER,
+ InvalidBlockNumber,
+ InvalidOffsetNumber);
+ else
+ dt = spgFormDeadTuple(&state, SPGIST_REDIRECT,
+ blknoNew,
+ xldata->offnumNew);
+
+ PageIndexTupleDelete(page, xldata->offnum);
+ if (PageAddItem(page, (Item) dt, dt->size,
+ xldata->offnum,
+ false, false) != xldata->offnum)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ dt->size);
+
+ if (state.isBuild)
+ SpGistPageGetOpaque(page)->nPlaceholder++;
+ else
+ SpGistPageGetOpaque(page)->nRedirection++;
+
+ /*
+ * If parent is in this same page, update it now.
+ */
+ if (xldata->parentBlk == 0)
+ {
+ SpGistInnerTuple parentTuple;
+
+ parentTuple = (SpGistInnerTuple) PageGetItem(page,
+ PageGetItemId(page, xldata->offnumParent));
+
+ spgUpdateNodeLink(parentTuple, xldata->nodeI,
+ blknoNew, xldata->offnumNew);
+ }
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+
+ /*
+ * Update parent downlink (if we didn't do it as part of the source or
+ * destination page update already).
+ */
+ if (xldata->parentBlk == 2)
+ {
+ if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO)
+ {
+ SpGistInnerTuple parentTuple;
+
+ page = BufferGetPage(buffer);
+
+ parentTuple = (SpGistInnerTuple) PageGetItem(page,
+ PageGetItemId(page, xldata->offnumParent));
+
+ spgUpdateNodeLink(parentTuple, xldata->nodeI,
+ blknoNew, xldata->offnumNew);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+}
+
+static void
+spgRedoSplitTuple(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ char *ptr = XLogRecGetData(record);
+ spgxlogSplitTuple *xldata = (spgxlogSplitTuple *) ptr;
+ char *prefixTuple;
+ SpGistInnerTupleData prefixTupleHdr;
+ char *postfixTuple;
+ SpGistInnerTupleData postfixTupleHdr;
+ Buffer buffer;
+ Page page;
+ XLogRedoAction action;
+
+ ptr += sizeof(spgxlogSplitTuple);
+ prefixTuple = ptr;
+ /* the prefix tuple is unaligned, so make a copy to access its header */
+ memcpy(&prefixTupleHdr, prefixTuple, sizeof(SpGistInnerTupleData));
+ ptr += prefixTupleHdr.size;
+ postfixTuple = ptr;
+ /* postfix tuple is also unaligned */
+ memcpy(&postfixTupleHdr, postfixTuple, sizeof(SpGistInnerTupleData));
+
+ /*
+ * In normal operation we would have both pages locked simultaneously; but
+ * in WAL replay it should be safe to update them one at a time, as long
+ * as we do it in the right order.
+ */
+
+ /* insert postfix tuple first to avoid dangling link */
+ if (!xldata->postfixBlkSame)
+ {
+ if (xldata->newPage)
+ {
+ buffer = XLogInitBufferForRedo(record, 1);
+ /* SplitTuple is not used for nulls pages */
+ SpGistInitBuffer(buffer, 0);
+ action = BLK_NEEDS_REDO;
+ }
+ else
+ action = XLogReadBufferForRedo(record, 1, &buffer);
+ if (action == BLK_NEEDS_REDO)
+ {
+ page = BufferGetPage(buffer);
+
+ addOrReplaceTuple(page, (Item) postfixTuple,
+ postfixTupleHdr.size, xldata->offnumPostfix);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+ }
+
+ /* now handle the original page */
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = BufferGetPage(buffer);
+
+ PageIndexTupleDelete(page, xldata->offnumPrefix);
+ if (PageAddItem(page, (Item) prefixTuple, prefixTupleHdr.size,
+ xldata->offnumPrefix, false, false) != xldata->offnumPrefix)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ prefixTupleHdr.size);
+
+ if (xldata->postfixBlkSame)
+ addOrReplaceTuple(page, (Item) postfixTuple,
+ postfixTupleHdr.size,
+ xldata->offnumPostfix);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
+spgRedoPickSplit(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ char *ptr = XLogRecGetData(record);
+ spgxlogPickSplit *xldata = (spgxlogPickSplit *) ptr;
+ char *innerTuple;
+ SpGistInnerTupleData innerTupleHdr;
+ SpGistState state;
+ OffsetNumber *toDelete;
+ OffsetNumber *toInsert;
+ uint8 *leafPageSelect;
+ Buffer srcBuffer;
+ Buffer destBuffer;
+ Buffer innerBuffer;
+ Page srcPage;
+ Page destPage;
+ Page page;
+ int i;
+ BlockNumber blknoInner;
+ XLogRedoAction action;
+
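+ /* block refs: 0 = source leaf, 1 = dest leaf, 2 = new inner, 3 = parent */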
+ XLogRecGetBlockTag(record, 2, NULL, NULL, &blknoInner);
+
+ fillFakeState(&state, xldata->stateSrc);
+
+ ptr += SizeOfSpgxlogPickSplit;
+ toDelete = (OffsetNumber *) ptr;
+ ptr += sizeof(OffsetNumber) * xldata->nDelete;
+ toInsert = (OffsetNumber *) ptr;
+ ptr += sizeof(OffsetNumber) * xldata->nInsert;
+ leafPageSelect = (uint8 *) ptr;
+ ptr += sizeof(uint8) * xldata->nInsert;
+
+ innerTuple = ptr;
+ /* the inner tuple is unaligned, so make a copy to access its header */
+ memcpy(&innerTupleHdr, innerTuple, sizeof(SpGistInnerTupleData));
+ ptr += innerTupleHdr.size;
+
+ /* now ptr points to the list of leaf tuples */
+
+ if (xldata->isRootSplit)
+ {
+ /* when splitting root, we touch it only in the guise of new inner */
+ srcBuffer = InvalidBuffer;
+ srcPage = NULL;
+ }
+ else if (xldata->initSrc)
+ {
+ /* just re-init the source page */
+ srcBuffer = XLogInitBufferForRedo(record, 0);
+ srcPage = (Page) BufferGetPage(srcBuffer);
+
+ SpGistInitBuffer(srcBuffer,
+ SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0));
+ /* don't update LSN etc till we're done with it */
+ }
+ else
+ {
+ /*
+ * Delete the specified tuples from source page. (In case we're in
+ * Hot Standby, we need to hold lock on the page till we're done
+ * inserting leaf tuples and the new inner tuple, else the added
+ * redirect tuple will be a dangling link.)
+ */
+ srcPage = NULL;
+ if (XLogReadBufferForRedo(record, 0, &srcBuffer) == BLK_NEEDS_REDO)
+ {
+ srcPage = BufferGetPage(srcBuffer);
+
+ /*
+ * We have it a bit easier here than in doPickSplit(), because we
+ * know the inner tuple's location already, so we can inject the
+ * correct redirection tuple now.
+ */
+ if (!state.isBuild)
+ spgPageIndexMultiDelete(&state, srcPage,
+ toDelete, xldata->nDelete,
+ SPGIST_REDIRECT,
+ SPGIST_PLACEHOLDER,
+ blknoInner,
+ xldata->offnumInner);
+ else
+ spgPageIndexMultiDelete(&state, srcPage,
+ toDelete, xldata->nDelete,
+ SPGIST_PLACEHOLDER,
+ SPGIST_PLACEHOLDER,
+ InvalidBlockNumber,
+ InvalidOffsetNumber);
+
+ /* don't update LSN etc till we're done with it */
+ }
+ }
+
+ /* try to access dest page if any */
+ if (!XLogRecHasBlockRef(record, 1))
+ {
+ destBuffer = InvalidBuffer;
+ destPage = NULL;
+ }
+ else if (xldata->initDest)
+ {
+ /* just re-init the dest page */
+ destBuffer = XLogInitBufferForRedo(record, 1);
+ destPage = (Page) BufferGetPage(destBuffer);
+
+ SpGistInitBuffer(destBuffer,
+ SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0));
+ /* don't update LSN etc till we're done with it */
+ }
+ else
+ {
+ /*
+ * We could probably release the page lock immediately in the
+ * full-page-image case, but for safety let's hold it till later.
+ */
+ if (XLogReadBufferForRedo(record, 1, &destBuffer) == BLK_NEEDS_REDO)
+ destPage = (Page) BufferGetPage(destBuffer);
+ else
+ destPage = NULL; /* don't do any page updates */
+ }
+
+ /* restore leaf tuples to src and/or dest page */
+ for (i = 0; i < xldata->nInsert; i++)
+ {
+ char *leafTuple;
+ SpGistLeafTupleData leafTupleHdr;
+
+ /* the tuples are not aligned, so must copy to access the size field. */
+ leafTuple = ptr;
+ memcpy(&leafTupleHdr, leafTuple, sizeof(SpGistLeafTupleData));
+ ptr += leafTupleHdr.size;
+
+ page = leafPageSelect[i] ? destPage : srcPage;
+ if (page == NULL)
+ continue; /* no need to touch this page */
+
+ addOrReplaceTuple(page, (Item) leafTuple, leafTupleHdr.size,
+ toInsert[i]);
+ }
+
+ /* Now update src and dest page LSNs if needed */
+ if (srcPage != NULL)
+ {
+ PageSetLSN(srcPage, lsn);
+ MarkBufferDirty(srcBuffer);
+ }
+ if (destPage != NULL)
+ {
+ PageSetLSN(destPage, lsn);
+ MarkBufferDirty(destBuffer);
+ }
+
+ /* restore new inner tuple */
+ if (xldata->initInner)
+ {
+ innerBuffer = XLogInitBufferForRedo(record, 2);
+ SpGistInitBuffer(innerBuffer, (xldata->storesNulls ? SPGIST_NULLS : 0));
+ action = BLK_NEEDS_REDO;
+ }
+ else
+ action = XLogReadBufferForRedo(record, 2, &innerBuffer);
+
+ if (action == BLK_NEEDS_REDO)
+ {
+ page = BufferGetPage(innerBuffer);
+
+ addOrReplaceTuple(page, (Item) innerTuple, innerTupleHdr.size,
+ xldata->offnumInner);
+
+ /* if inner is also parent, update link while we're here */
+ if (xldata->innerIsParent)
+ {
+ SpGistInnerTuple parent;
+
+ parent = (SpGistInnerTuple) PageGetItem(page,
+ PageGetItemId(page, xldata->offnumParent));
+ spgUpdateNodeLink(parent, xldata->nodeI,
+ blknoInner, xldata->offnumInner);
+ }
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(innerBuffer);
+ }
+ if (BufferIsValid(innerBuffer))
+ UnlockReleaseBuffer(innerBuffer);
+
+ /*
+ * Now we can release the leaf-page locks. It's okay to do this before
+ * updating the parent downlink.
+ */
+ if (BufferIsValid(srcBuffer))
+ UnlockReleaseBuffer(srcBuffer);
+ if (BufferIsValid(destBuffer))
+ UnlockReleaseBuffer(destBuffer);
+
+ /* update parent downlink, unless we did it above */
+ if (XLogRecHasBlockRef(record, 3))
+ {
+ Buffer parentBuffer;
+
+ if (XLogReadBufferForRedo(record, 3, &parentBuffer) == BLK_NEEDS_REDO)
+ {
+ SpGistInnerTuple parent;
+
+ page = BufferGetPage(parentBuffer);
+
+ parent = (SpGistInnerTuple) PageGetItem(page,
+ PageGetItemId(page, xldata->offnumParent));
+ spgUpdateNodeLink(parent, xldata->nodeI,
+ blknoInner, xldata->offnumInner);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(parentBuffer);
+ }
+ if (BufferIsValid(parentBuffer))
+ UnlockReleaseBuffer(parentBuffer);
+ }
+ else
+ Assert(xldata->innerIsParent || xldata->isRootSplit);
+}
+
+static void
+spgRedoVacuumLeaf(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ char *ptr = XLogRecGetData(record);
+ spgxlogVacuumLeaf *xldata = (spgxlogVacuumLeaf *) ptr;
+ OffsetNumber *toDead;
+ OffsetNumber *toPlaceholder;
+ OffsetNumber *moveSrc;
+ OffsetNumber *moveDest;
+ OffsetNumber *chainSrc;
+ OffsetNumber *chainDest;
+ SpGistState state;
+ Buffer buffer;
+ Page page;
+ int i;
+
+ fillFakeState(&state, xldata->stateSrc);
+
+ ptr += SizeOfSpgxlogVacuumLeaf;
+ toDead = (OffsetNumber *) ptr;
+ ptr += sizeof(OffsetNumber) * xldata->nDead;
+ toPlaceholder = (OffsetNumber *) ptr;
+ ptr += sizeof(OffsetNumber) * xldata->nPlaceholder;
+ moveSrc = (OffsetNumber *) ptr;
+ ptr += sizeof(OffsetNumber) * xldata->nMove;
+ moveDest = (OffsetNumber *) ptr;
+ ptr += sizeof(OffsetNumber) * xldata->nMove;
+ chainSrc = (OffsetNumber *) ptr;
+ ptr += sizeof(OffsetNumber) * xldata->nChain;
+ chainDest = (OffsetNumber *) ptr;
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = BufferGetPage(buffer);
+
+ spgPageIndexMultiDelete(&state, page,
+ toDead, xldata->nDead,
+ SPGIST_DEAD, SPGIST_DEAD,
+ InvalidBlockNumber,
+ InvalidOffsetNumber);
+
+ spgPageIndexMultiDelete(&state, page,
+ toPlaceholder, xldata->nPlaceholder,
+ SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
+ InvalidBlockNumber,
+ InvalidOffsetNumber);
+
+ /* swap line pointers to replay the tuple moves; see comments in vacuumLeafPage() */
+ for (i = 0; i < xldata->nMove; i++)
+ {
+ ItemId idSrc = PageGetItemId(page, moveSrc[i]);
+ ItemId idDest = PageGetItemId(page, moveDest[i]);
+ ItemIdData tmp;
+
+ tmp = *idSrc;
+ *idSrc = *idDest;
+ *idDest = tmp;
+ }
+
+ spgPageIndexMultiDelete(&state, page,
+ moveSrc, xldata->nMove,
+ SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
+ InvalidBlockNumber,
+ InvalidOffsetNumber);
+
+ for (i = 0; i < xldata->nChain; i++)
+ {
+ SpGistLeafTuple lt;
+
+ lt = (SpGistLeafTuple) PageGetItem(page,
+ PageGetItemId(page, chainSrc[i]));
+ Assert(lt->tupstate == SPGIST_LIVE);
+ SGLT_SET_NEXTOFFSET(lt, chainDest[i]);
+ }
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
+spgRedoVacuumRoot(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ char *ptr = XLogRecGetData(record);
+ spgxlogVacuumRoot *xldata = (spgxlogVacuumRoot *) ptr;
+ OffsetNumber *toDelete;
+ Buffer buffer;
+ Page page;
+
+ toDelete = xldata->offsets;
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = BufferGetPage(buffer);
+
+ /* The tuple numbers are in order */
+ PageIndexMultiDelete(page, toDelete, xldata->nDelete);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
+spgRedoVacuumRedirect(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ char *ptr = XLogRecGetData(record);
+ spgxlogVacuumRedirect *xldata = (spgxlogVacuumRedirect *) ptr;
+ OffsetNumber *itemToPlaceholder;
+ Buffer buffer;
+
+ itemToPlaceholder = xldata->offsets;
+
+ /*
+ * If any redirection tuples are being removed, make sure there are no
+ * live Hot Standby transactions that might need to see them.
+ */
+ if (InHotStandby)
+ {
+ if (TransactionIdIsValid(xldata->newestRedirectXid))
+ {
+ RelFileNode node;
+
+ XLogRecGetBlockTag(record, 0, &node, NULL, NULL);
+ ResolveRecoveryConflictWithSnapshot(xldata->newestRedirectXid,
+ node);
+ }
+ }
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ Page page = BufferGetPage(buffer);
+ SpGistPageOpaque opaque = SpGistPageGetOpaque(page);
+ int i;
+
+ /* Convert redirect pointers to plain placeholders */
+ for (i = 0; i < xldata->nToPlaceholder; i++)
+ {
+ SpGistDeadTuple dt;
+
+ dt = (SpGistDeadTuple) PageGetItem(page,
+ PageGetItemId(page, itemToPlaceholder[i]));
+ Assert(dt->tupstate == SPGIST_REDIRECT);
+ dt->tupstate = SPGIST_PLACEHOLDER;
+ ItemPointerSetInvalid(&dt->pointer);
+ }
+
+ Assert(opaque->nRedirection >= xldata->nToPlaceholder);
+ opaque->nRedirection -= xldata->nToPlaceholder;
+ opaque->nPlaceholder += xldata->nToPlaceholder;
+
+ /* Remove placeholder tuples at end of page */
+ if (xldata->firstPlaceholder != InvalidOffsetNumber)
+ {
+ int max = PageGetMaxOffsetNumber(page);
+ OffsetNumber *toDelete;
+
+ toDelete = palloc(sizeof(OffsetNumber) * max);
+
+ for (i = xldata->firstPlaceholder; i <= max; i++)
+ toDelete[i - xldata->firstPlaceholder] = i;
+
+ i = max - xldata->firstPlaceholder + 1;
+ Assert(opaque->nPlaceholder >= i);
+ opaque->nPlaceholder -= i;
+
+ /* The array is sorted, so can use PageIndexMultiDelete */
+ PageIndexMultiDelete(page, toDelete, i);
+
+ pfree(toDelete);
+ }
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+void
+spg_redo(XLogReaderState *record)
+{
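+ /* extract the SP-GiST opcode, ignoring generic xlog info bits */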
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+ MemoryContext oldCxt;
+
+ oldCxt = MemoryContextSwitchTo(opCtx);
+ switch (info)
+ {
+ case XLOG_SPGIST_ADD_LEAF:
+ spgRedoAddLeaf(record);
+ break;
+ case XLOG_SPGIST_MOVE_LEAFS:
+ spgRedoMoveLeafs(record);
+ break;
+ case XLOG_SPGIST_ADD_NODE:
+ spgRedoAddNode(record);
+ break;
+ case XLOG_SPGIST_SPLIT_TUPLE:
+ spgRedoSplitTuple(record);
+ break;
+ case XLOG_SPGIST_PICKSPLIT:
+ spgRedoPickSplit(record);
+ break;
+ case XLOG_SPGIST_VACUUM_LEAF:
+ spgRedoVacuumLeaf(record);
+ break;
+ case XLOG_SPGIST_VACUUM_ROOT:
+ spgRedoVacuumRoot(record);
+ break;
+ case XLOG_SPGIST_VACUUM_REDIRECT:
+ spgRedoVacuumRedirect(record);
+ break;
+ default:
+ elog(PANIC, "spg_redo: unknown op code %u", info);
+ }
+
+ MemoryContextSwitchTo(oldCxt);
+ MemoryContextReset(opCtx);
+}
+
+void
+spg_xlog_startup(void)
+{
+ opCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "SP-GiST temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+}
+
+void
+spg_xlog_cleanup(void)
+{
+ MemoryContextDelete(opCtx);
+ opCtx = NULL;
+}
+
+/*
+ * Mask an SP-GiST page before performing consistency checks on it.
+ */
+void
+spg_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+ PageHeader pagehdr = (PageHeader) page;
+
+ mask_page_lsn_and_checksum(page);
+
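+ /* hint bits are not WAL-logged and may differ between primary and standby */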
+ mask_page_hint_bits(page);
+
+ /*
+ * Mask the unused space, but only if the page's pd_lower appears to have
+ * been set correctly.
+ */
+ if (pagehdr->pd_lower >= SizeOfPageHeaderData)
+ mask_unused_space(page);
+}