author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000
commit    | 46651ce6fe013220ed397add242004d764fc0153 (patch)
tree      | 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/access/gist
parent    | Initial commit. (diff)
Adding upstream version 14.5.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat
-rw-r--r-- | src/backend/access/gist/Makefile           |   28
-rw-r--r-- | src/backend/access/gist/README             |  467
-rw-r--r-- | src/backend/access/gist/gist.c             | 1713
-rw-r--r-- | src/backend/access/gist/gistbuild.c        | 1566
-rw-r--r-- | src/backend/access/gist/gistbuildbuffers.c |  775
-rw-r--r-- | src/backend/access/gist/gistget.c          |  803
-rw-r--r-- | src/backend/access/gist/gistproc.c         | 1777
-rw-r--r-- | src/backend/access/gist/gistscan.c         |  358
-rw-r--r-- | src/backend/access/gist/gistsplit.c        |  779
-rw-r--r-- | src/backend/access/gist/gistutil.c         | 1066
-rw-r--r-- | src/backend/access/gist/gistvacuum.c       |  668
-rw-r--r-- | src/backend/access/gist/gistvalidate.c     |  355
-rw-r--r-- | src/backend/access/gist/gistxlog.c         |  696
13 files changed, 11051 insertions, 0 deletions
diff --git a/src/backend/access/gist/Makefile b/src/backend/access/gist/Makefile new file mode 100644 index 0000000..1aca8bc --- /dev/null +++ b/src/backend/access/gist/Makefile @@ -0,0 +1,28 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/gist +# +# IDENTIFICATION +# src/backend/access/gist/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/gist +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + gist.o \ + gistbuild.o \ + gistbuildbuffers.o \ + gistget.o \ + gistproc.o \ + gistscan.o \ + gistsplit.o \ + gistutil.o \ + gistvacuum.o \ + gistvalidate.o \ + gistxlog.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README new file mode 100644 index 0000000..25cab00 --- /dev/null +++ b/src/backend/access/gist/README @@ -0,0 +1,467 @@ +src/backend/access/gist/README + +GiST Indexing +============= + +This directory contains an implementation of GiST indexing for Postgres. + +GiST stands for Generalized Search Tree. It was introduced in the seminal paper +"Generalized Search Trees for Database Systems", 1995, Joseph M. Hellerstein, +Jeffrey F. Naughton, Avi Pfeffer: + + http://www.sai.msu.su/~megera/postgres/gist/papers/gist.ps + https://dsf.berkeley.edu/papers/sigmod97-gist.pdf + +and implemented by J. Hellerstein and P. Aoki in an early version of +PostgreSQL (more details are available from The GiST Indexing Project +at Berkeley at http://gist.cs.berkeley.edu/). As a "university" +project it had a limited number of features and was in rare use. + +The current implementation of GiST supports: + + * Variable length keys + * Composite keys (multi-key) + * Ordered search (nearest-neighbor search) + * provides NULL-safe interface to GiST core + * Concurrency + * Recovery support via WAL logging + * Buffering build algorithm + +The support for concurrency implemented in PostgreSQL was developed based on +the paper "Access Methods for Next-Generation Database Systems" by +Marcel Kornacker: + + http://www.sai.msu.su/~megera/postgres/gist/papers/concurrency/access-methods-for-next-generation.pdf.gz + +Buffering build algorithm for GiST was developed based on the paper "Efficient +Bulk Operations on Dynamic R-trees" by Lars Arge, Klaus Hinrichs, Jan Vahrenhold +and Jeffrey Scott Vitter. + + http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.135.9894&rep=rep1&type=pdf + +The original algorithms were modified in several ways: + +* They had to be adapted to PostgreSQL conventions. For example, the SEARCH + algorithm was considerably changed, because in PostgreSQL the search function + should return one tuple (next), not all tuples at once. Also, it should + release page locks between calls. +* Since we added support for variable length keys, it's not possible to + guarantee enough free space for all keys on pages after splitting. User + defined function picksplit doesn't have information about size of tuples + (each tuple may contain several keys as in multicolumn index while picksplit + could work with only one key) and pages. +* We modified original INSERT algorithm for performance reasons. In particular, + it is now a single-pass algorithm. +* Since the papers were theoretical, some details were omitted and we + had to find out ourself how to solve some specific problems. 
+ +Because of the above reasons, we have revised the interaction of GiST +core and PostgreSQL WAL system. Moreover, we encountered (and solved) +a problem of uncompleted insertions when recovering after crash, which +was not touched in the paper. + +Search Algorithm +---------------- + +The search code maintains a queue of unvisited items, where an "item" is +either a heap tuple known to satisfy the search conditions, or an index +page that is consistent with the search conditions according to inspection +of its parent page's downlink item. Initially the root page is searched +to find unvisited items in it. Then we pull items from the queue. A +heap tuple pointer is just returned immediately; an index page entry +causes that page to be searched, generating more queue entries. + +The queue is kept ordered with heap tuple items at the front, then +index page entries, with any newly-added index page entry inserted +before existing index page entries. This ensures depth-first traversal +of the index, and in particular causes the first few heap tuples to be +returned as soon as possible. That is helpful in case there is a LIMIT +that requires only a few tuples to be produced. + +To implement nearest-neighbor search, the queue entries are augmented +with distance data: heap tuple entries are labeled with exact distance +from the search argument, while index-page entries must be labeled with +the minimum distance that any of their children could have. Then, +queue entries are retrieved in smallest-distance-first order, with +entries having identical distances managed as stated in the previous +paragraph. + +The search algorithm keeps an index page locked only long enough to scan +its entries and queue those that satisfy the search conditions. Since +insertions can occur concurrently with searches, it is possible for an +index child page to be split between the time we make a queue entry for it +(while visiting its parent page) and the time we actually reach and scan +the child page. To avoid missing the entries that were moved to the right +sibling, we detect whether a split has occurred by comparing the child +page's NSN (node sequence number, a special-purpose LSN) to the LSN that +the parent had when visited. If it did, the sibling page is immediately +added to the front of the queue, ensuring that its items will be scanned +in the same order as if they were still on the original child page. + +As is usual in Postgres, the search algorithm only guarantees to find index +entries that existed before the scan started; index entries added during +the scan might or might not be visited. This is okay as long as all +searches use MVCC snapshot rules to reject heap tuples newer than the time +of scan start. In particular, this means that we need not worry about +cases where a parent page's downlink key is "enlarged" after we look at it. +Any such enlargement would be to add child items that we aren't interested +in returning anyway. + + +Insert Algorithm +---------------- + +INSERT guarantees that the GiST tree remains balanced. User defined key method +Penalty is used for choosing a subtree to insert; method PickSplit is used for +the node splitting algorithm; method Union is used for propagating changes +upward to maintain the tree properties. + +To insert a tuple, we first have to find a suitable leaf page to insert to. +The algorithm walks down the tree, starting from the root, along the path +of smallest Penalty. At each step: + +1. Has this page been split since we looked at the parent? 
If so, it's +possible that we should be inserting to the other half instead, so retreat +back to the parent. +2. If this is a leaf node, we've found our target node. +3. Otherwise use Penalty to pick a new target subtree. +4. Check the key representing the target subtree. If it doesn't already cover +the key we're inserting, replace it with the Union of the old downlink key +and the key being inserted. (Actually, we always call Union, and just skip +the replacement if the Unioned key is the same as the existing key) +5. Replacing the key in step 4 might cause the page to be split. In that case, +propagate the change upwards and restart the algorithm from the first parent +that didn't need to be split. +6. Walk down to the target subtree, and goto 1. + +This differs from the insertion algorithm in the original paper. In the +original paper, you first walk down the tree until you reach a leaf page, and +then you adjust the downlink in the parent, and propagate the adjustment up, +all the way up to the root in the worst case. But we adjust the downlinks to +cover the new key already when we walk down, so that when we reach the leaf +page, we don't need to update the parents anymore, except to insert the +downlinks if we have to split the page. This makes crash recovery simpler: +after inserting a key to the page, the tree is immediately self-consistent +without having to update the parents. Even if we split a page and crash before +inserting the downlink to the parent, the tree is self-consistent because the +right half of the split is accessible via the rightlink of the left page +(which replaced the original page). + +Note that the algorithm can walk up and down the tree before reaching a leaf +page, if internal pages need to split while adjusting the downlinks for the +new key. Eventually, you should reach the bottom, and proceed with the +insertion of the new tuple. + +Once we've found the target page to insert to, we check if there's room +for the new tuple. If there is, the tuple is inserted, and we're done. +If it doesn't fit, however, the page needs to be split. Note that it is +possible that a page needs to be split into more than two pages, if keys have +different lengths or more than one key is being inserted at a time (which can +happen when inserting downlinks for a page split that resulted in more than +two pages at the lower level). After splitting a page, the parent page needs +to be updated. The downlink for the new page needs to be inserted, and the +downlink for the old page, which became the left half of the split, needs to +be updated to only cover those tuples that stayed on the left page. Inserting +the downlink in the parent can again lead to a page split, recursing up to the +root page in the worst case. + +gistplacetopage is the workhorse function that performs one step of the +insertion. If the tuple fits, it inserts it to the given page, otherwise +it splits the page, and constructs the new downlink tuples for the split +pages. The caller must then call gistplacetopage() on the parent page to +insert the downlink tuples. The parent page that holds the downlink to +the child might have migrated as a result of concurrent splits of the +parent, gistFindCorrectParent() is used to find the parent page. + +Splitting the root page works slightly differently. At root split, +gistplacetopage() allocates the new child pages and replaces the old root +page with the new root containing downlinks to the new children, all in one +operation. 
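
[Editor's illustration] To make the Penalty/Union interplay in steps 3-4 of the descent concrete, here is a minimal, self-contained C sketch using toy one-dimensional interval keys. It is an illustration only: the names Interval, penalty and key_union are hypothetical and do not correspond to the GiST opclass API or to gistchoose()/gistgetadjusted() in gist.c.

/*
 * Toy sketch of one descent step from the insertion algorithm above:
 * pick the child whose key needs the least enlargement (Penalty), and
 * widen that child's downlink key (Union) if it does not already cover
 * the new key.  Uses 1-D intervals instead of real GiST opclass calls.
 */
#include <stdio.h>

typedef struct { double lo, hi; } Interval;

/* Penalty: how much 'key' must grow to cover 'newkey' */
static double penalty(Interval key, Interval newkey)
{
    double lo = key.lo < newkey.lo ? key.lo : newkey.lo;
    double hi = key.hi > newkey.hi ? key.hi : newkey.hi;
    return (hi - lo) - (key.hi - key.lo);
}

/* Union: smallest interval covering both inputs */
static Interval key_union(Interval a, Interval b)
{
    Interval u;
    u.lo = a.lo < b.lo ? a.lo : b.lo;
    u.hi = a.hi > b.hi ? a.hi : b.hi;
    return u;
}

int main(void)
{
    /* downlink keys of an internal page's children */
    Interval downlinks[] = { {0, 10}, {12, 20}, {25, 40} };
    int      nchildren = 3;
    Interval newkey = {18, 22};     /* key being inserted */

    /* step 3: choose the subtree with minimum penalty */
    int    best = 0;
    double bestpen = penalty(downlinks[0], newkey);
    for (int i = 1; i < nchildren; i++)
    {
        double p = penalty(downlinks[i], newkey);
        if (p < bestpen)
        {
            best = i;
            bestpen = p;
        }
    }

    /*
     * step 4: widen the chosen downlink before descending, so the tree
     * stays self-consistent even if we crash before reaching the leaf
     */
    Interval adjusted = key_union(downlinks[best], newkey);
    if (adjusted.lo != downlinks[best].lo || adjusted.hi != downlinks[best].hi)
        downlinks[best] = adjusted;

    printf("descend into child %d, downlink now [%g, %g]\n",
           best, downlinks[best].lo, downlinks[best].hi);
    return 0;
}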
+ + +findPath is a subroutine of findParent, used when the correct parent page +can't be found by following the rightlinks at the parent level: + +findPath( stack item ) + push stack, [root, 0, 0] // page, LSN, parent + while( stack ) + ptr = top of stack + latch( ptr->page, S-mode ) + if ( ptr->parent->page->lsn < ptr->page->nsn ) + push stack, [ ptr->page->rightlink, 0, ptr->parent ] + end + for( each tuple on page ) + if ( tuple->pagepointer == item->page ) + return stack + else + add to stack at the end [tuple->pagepointer,0, ptr] + end + end + unlatch( ptr->page ) + pop stack + end + + +gistFindCorrectParent is used to re-find the parent of a page during +insertion. It might have migrated to the right since we traversed down the +tree because of page splits. + +findParent( stack item ) + parent = item->parent + if ( parent->page->lsn != parent->lsn ) + while(true) + search parent tuple on parent->page, if found the return + rightlink = parent->page->rightlink + unlatch( parent->page ) + if ( rightlink is incorrect ) + break loop + end + parent->page = rightlink + latch( parent->page, X-mode ) + end + newstack = findPath( item->parent ) + replace part of stack to new one + latch( parent->page, X-mode ) + return findParent( item ) + end + +pageSplit function decides how to distribute keys to the new pages after +page split: + +pageSplit(page, allkeys) + (lkeys, rkeys) = pickSplit( allkeys ) + if ( page is root ) + lpage = new page + else + lpage = page + rpage = new page + if ( no space left on rpage ) + newkeys = pageSplit( rpage, rkeys ) + else + push newkeys, union(rkeys) + end + if ( no space left on lpage ) + push newkeys, pageSplit( lpage, lkeys ) + else + push newkeys, union(lkeys) + end + return newkeys + + + +Concurrency control +------------------- +As a rule of thumb, if you need to hold a lock on multiple pages at the +same time, the locks should be acquired in the following order: child page +before parent, and left-to-right at the same level. Always acquiring the +locks in the same order avoids deadlocks. + +The search algorithm only looks at and locks one page at a time. Consequently +there's a race condition between a search and a page split. A page split +happens in two phases: 1. The page is split 2. The downlink is inserted to the +parent. If a search looks at the parent page between those steps, before the +downlink is inserted, it will still find the new right half by following the +rightlink on the left half. But it must not follow the rightlink if it saw the +downlink in the parent, or the page will be visited twice! + +A split initially marks the left page with the F_FOLLOW_RIGHT flag. If a scan +sees that flag set, it knows that the right page is missing the downlink, and +should be visited too. When split inserts the downlink to the parent, it +clears the F_FOLLOW_RIGHT flag in the child, and sets the NSN field in the +child page header to match the LSN of the insertion on the parent. If the +F_FOLLOW_RIGHT flag is not set, a scan compares the NSN on the child and the +LSN it saw in the parent. If NSN < LSN, the scan looked at the parent page +before the downlink was inserted, so it should follow the rightlink. Otherwise +the scan saw the downlink in the parent page, and will/did follow that as +usual. + +A scan can't normally see a page with the F_FOLLOW_RIGHT flag set, because +a page split keeps the child pages locked until the downlink has been inserted +to the parent and the flag cleared again. 
But if a crash happens in the middle +of a page split, before the downlinks are inserted into the parent, that will +leave a page with F_FOLLOW_RIGHT in the tree. Scans handle that just fine, +but we'll eventually want to fix that for performance reasons. And more +importantly, dealing with pages with missing downlink pointers in the parent +would complicate the insertion algorithm. So when an insertion sees a page +with F_FOLLOW_RIGHT set, it immediately tries to bring the split that +crashed in the middle to completion by adding the downlink in the parent. + +Buffering build algorithm +------------------------- + +In the buffering index build algorithm, some or all internal nodes have a +buffer attached to them. When a tuple is inserted at the top, the descend down +the tree is stopped as soon as a buffer is reached, and the tuple is pushed to +the buffer. When a buffer gets too full, all the tuples in it are flushed to +the lower level, where they again hit lower level buffers or leaf pages. This +makes the insertions happen in more of a breadth-first than depth-first order, +which greatly reduces the amount of random I/O required. + +In the algorithm, levels are numbered so that leaf pages have level zero, +and internal node levels count up from 1. This numbering ensures that a page's +level number never changes, even when the root page is split. + +Level Tree + +3 * + / \ +2 * * + / | \ / | \ +1 * * * * * * + / \ / \ / \ / \ / \ / \ +0 o o o o o o o o o o o o + +* - internal page +o - leaf page + +Internal pages that belong to certain levels have buffers associated with +them. Leaf pages never have buffers. Which levels have buffers is controlled +by "level step" parameter: level numbers that are multiples of level_step +have buffers, while others do not. For example, if level_step = 2, then +pages on levels 2, 4, 6, ... have buffers. If level_step = 1 then every +internal page has a buffer. + +Level Tree (level_step = 1) Tree (level_step = 2) + +3 * * + / \ / \ +2 *(b) *(b) *(b) *(b) + / | \ / | \ / | \ / | \ +1 *(b) *(b) *(b) *(b) *(b) *(b) * * * * * * + / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ +0 o o o o o o o o o o o o o o o o o o o o o o o o + +(b) - buffer + +Logically, a buffer is just bunch of tuples. Physically, it is divided in +pages, backed by a temporary file. Each buffer can be in one of two states: +a) Last page of the buffer is kept in main memory. A node buffer is +automatically switched to this state when a new index tuple is added to it, +or a tuple is removed from it. +b) All pages of the buffer are swapped out to disk. When a buffer becomes too +full, and we start to flush it, all other buffers are switched to this state. + +When an index tuple is inserted, its initial processing can end in one of the +following points: +1) Leaf page, if the depth of the index <= level_step, meaning that + none of the internal pages have buffers associated with them. +2) Buffer of topmost level page that has buffers. + +New index tuples are processed until one of the buffers in the topmost +buffered level becomes half-full. When a buffer becomes half-full, it's added +to the emptying queue, and will be emptied before a new tuple is processed. + +Buffer emptying process means that index tuples from the buffer are moved +into buffers at a lower level, or leaf pages. First, all the other buffers are +swapped to disk to free up the memory. Then tuples are popped from the buffer +one by one, and cascaded down the tree to the next buffer or leaf page below +the buffered node. 
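
[Editor's illustration] As a rough sketch of the bookkeeping just described (which levels carry buffers for a given level_step, and the half-full rule that schedules a buffer for emptying), here is a small standalone C program. The names NodeBuffer, level_has_buffer, buffer_add and BUFFER_CAPACITY are invented for this example; they are not the structures used by gistbuild.c or gistbuildbuffers.c.

/*
 * Toy sketch of the buffering-build rules above: levels whose number is
 * a positive multiple of level_step carry a buffer, and a buffer is put
 * on the emptying queue as soon as it becomes half-full.
 */
#include <stdbool.h>
#include <stdio.h>

#define BUFFER_CAPACITY 8          /* nominal buffer size, in tuples */

typedef struct
{
    int  level;                    /* 0 = leaf level */
    int  ntuples;                  /* tuples currently in the buffer */
    bool queued;                   /* already on the emptying queue? */
} NodeBuffer;

/* does an internal page on this level get a buffer? */
static bool level_has_buffer(int level, int level_step)
{
    return level > 0 && level % level_step == 0;
}

/* add one tuple; report whether the buffer must now be queued for emptying */
static bool buffer_add(NodeBuffer *buf)
{
    buf->ntuples++;
    if (!buf->queued && buf->ntuples >= BUFFER_CAPACITY / 2)
    {
        buf->queued = true;
        return true;               /* caller puts it on the emptying queue */
    }
    return false;
}

int main(void)
{
    int level_step = 2;

    for (int level = 0; level <= 6; level++)
        printf("level %d: %s\n", level,
               level_has_buffer(level, level_step) ? "buffered" : "no buffer");

    NodeBuffer buf = {.level = 2, .ntuples = 0, .queued = false};
    for (int i = 0; i < 5; i++)
        if (buffer_add(&buf))
            printf("buffer on level %d reached %d tuples -> schedule emptying\n",
                   buf.level, buf.ntuples);
    return 0;
}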
+ +Emptying a buffer has the interesting dynamic property that any intermediate +pages between the buffer being emptied, and the next buffered or leaf level +below it, become cached. If there are no more buffers below the node, the leaf +pages where the tuples finally land on get cached too. If there are, the last +buffer page of each buffer below is kept in memory. This is illustrated in +the figures below: + + Buffer being emptied to + lower-level buffers Buffer being emptied to leaf pages + + +(fb) +(fb) + / \ / \ + + + + + + / \ / \ / \ / \ + *(ab) *(ab) *(ab) *(ab) x x x x + ++ - cached internal page +x - cached leaf page +* - non-cached internal page +(fb) - buffer being emptied +(ab) - buffers being appended to, with last page in memory + +In the beginning of the index build, the level-step is chosen so that all those +pages involved in emptying one buffer fit in cache, so after each of those +pages have been accessed once and cached, emptying a buffer doesn't involve +any more I/O. This locality is where the speedup of the buffering algorithm +comes from. + +Emptying one buffer can fill up one or more of the lower-level buffers, +triggering emptying of them as well. Whenever a buffer becomes too full, it's +added to the emptying queue, and will be emptied after the current buffer has +been processed. + +To keep the size of each buffer limited even in the worst case, buffer emptying +is scheduled as soon as a buffer becomes half-full, and emptying it continues +until 1/2 of the nominal buffer size worth of tuples has been emptied. This +guarantees that when buffer emptying begins, all the lower-level buffers +are at most half-full. In the worst case that all the tuples are cascaded down +to the same lower-level buffer, that buffer therefore has enough space to +accommodate all the tuples emptied from the upper-level buffer. There is no +hard size limit in any of the data structures used, though, so this only needs +to be approximate; small overfilling of some buffers doesn't matter. + +If an internal page that has a buffer associated with it is split, the buffer +needs to be split too. All tuples in the buffer are scanned through and +relocated to the correct sibling buffers, using the penalty function to decide +which buffer each tuple should go to. + +After all tuples from the heap have been processed, there are still some index +tuples in the buffers. At this point, final buffer emptying starts. All buffers +are emptied in top-down order. This is slightly complicated by the fact that +new buffers can be allocated during the emptying, due to page splits. However, +the new buffers will always be siblings of buffers that haven't been fully +emptied yet; tuples never move upwards in the tree. The final emptying loops +through buffers at a given level until all buffers at that level have been +emptied, and then moves down to the next level. + +Bulk delete algorithm (VACUUM) +------------------------------ + +VACUUM works in two stages: + +In the first stage, we scan the whole index in physical order. To make sure +that we don't miss any dead tuples because a concurrent page split moved them, +we check the F_FOLLOW_RIGHT flags and NSN on each page, to detect if the +page has been concurrently split. If a concurrent page split is detected, and +one half of the page was moved to a position that we already scanned, we +"jump backwards" to scan the page again. 
This is the same mechanism that +B-tree VACUUM uses, but because we already have NSNs on pages, to detect page +splits during searches, we don't need a "vacuum cycle ID" concept for that +like B-tree does. + +While we scan all the pages, we also make note of any completely empty leaf +pages. We will try to unlink them from the tree after the scan. We also record +the block numbers of all internal pages; they are needed to locate parents of +the empty pages while unlinking them. + +We try to unlink any empty leaf pages from the tree, so that their space can +be reused. In order to delete an empty page, its downlink must be removed from +the parent. We scan all the internal pages, whose block numbers we memorized +in the first stage, and look for downlinks to pages that we have memorized as +being empty. Whenever we find one, we acquire a lock on the parent and child +page, re-check that the child page is still empty. Then, we remove the +downlink and mark the child as deleted, and release the locks. + +The insertion algorithm would get confused, if an internal page was completely +empty. So we never delete the last child of an internal page, even if it's +empty. Currently, we only support deleting leaf pages. + +This page deletion algorithm works on a best-effort basis. It might fail to +find a downlink, if a concurrent page split moved it after the first stage. +In that case, we won't be able to remove all empty pages. That's OK, it's +not expected to happen very often, and hopefully the next VACUUM will clean +it up. + +When we have deleted a page, it's possible that an in-progress search will +still descend on the page, if it saw the downlink before we removed it. The +search will see that it is deleted, and ignore it, but as long as that can +happen, we cannot reuse the page. To "wait out" any in-progress searches, when +a page is deleted, it's labeled with the current next-transaction counter +value. The page is not recycled, until that XID is no longer visible to +anyone. That's much more conservative than necessary, but let's keep it +simple. + + +Authors: + Teodor Sigaev <teodor@sigaev.ru> + Oleg Bartunov <oleg@sai.msu.su> diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c new file mode 100644 index 0000000..0683f42 --- /dev/null +++ b/src/backend/access/gist/gist.c @@ -0,0 +1,1713 @@ +/*------------------------------------------------------------------------- + * + * gist.c + * interface routines for the postgres GiST index access method. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gist.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/gist_private.h" +#include "access/gistscan.h" +#include "catalog/pg_collation.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "utils/builtins.h" +#include "utils/index_selfuncs.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* non-export function prototypes */ +static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate); +static bool gistinserttuple(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum); +static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, + IndexTuple *tuples, int ntup, OffsetNumber oldoffnum, + Buffer leftchild, Buffer rightchild, + bool unlockbuf, bool unlockleftchild); +static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, List *splitinfo, bool unlockbuf); +static void gistprunepage(Relation rel, Page page, Buffer buffer, + Relation heapRel); + + +#define ROTATEDIST(d) do { \ + SplitedPageLayout *tmp=(SplitedPageLayout*)palloc0(sizeof(SplitedPageLayout)); \ + tmp->block.blkno = InvalidBlockNumber; \ + tmp->buffer = InvalidBuffer; \ + tmp->next = (d); \ + (d)=tmp; \ +} while(0) + + +/* + * GiST handler function: return IndexAmRoutine with access method parameters + * and callbacks. + */ +Datum +gisthandler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = 0; + amroutine->amsupport = GISTNProcs; + amroutine->amoptsprocnum = GIST_OPTIONS_PROC; + amroutine->amcanorder = false; + amroutine->amcanorderbyop = true; + amroutine->amcanbackward = false; + amroutine->amcanunique = false; + amroutine->amcanmulticol = true; + amroutine->amoptionalkey = true; + amroutine->amsearcharray = false; + amroutine->amsearchnulls = true; + amroutine->amstorage = true; + amroutine->amclusterable = true; + amroutine->ampredlocks = true; + amroutine->amcanparallel = false; + amroutine->amcaninclude = true; + amroutine->amusemaintenanceworkmem = false; + amroutine->amparallelvacuumoptions = + VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP; + amroutine->amkeytype = InvalidOid; + + amroutine->ambuild = gistbuild; + amroutine->ambuildempty = gistbuildempty; + amroutine->aminsert = gistinsert; + amroutine->ambulkdelete = gistbulkdelete; + amroutine->amvacuumcleanup = gistvacuumcleanup; + amroutine->amcanreturn = gistcanreturn; + amroutine->amcostestimate = gistcostestimate; + amroutine->amoptions = gistoptions; + amroutine->amproperty = gistproperty; + amroutine->ambuildphasename = NULL; + amroutine->amvalidate = gistvalidate; + amroutine->amadjustmembers = gistadjustmembers; + amroutine->ambeginscan = gistbeginscan; + amroutine->amrescan = gistrescan; + amroutine->amgettuple = gistgettuple; + amroutine->amgetbitmap = gistgetbitmap; + amroutine->amendscan = gistendscan; + amroutine->ammarkpos = NULL; + amroutine->amrestrpos = NULL; + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; + + PG_RETURN_POINTER(amroutine); +} + +/* + * Create 
and return a temporary memory context for use by GiST. We + * _always_ invoke user-provided methods in a temporary memory + * context, so that memory leaks in those functions cannot cause + * problems. Also, we use some additional temporary contexts in the + * GiST code itself, to avoid the need to do some awkward manual + * memory management. + */ +MemoryContext +createTempGistContext(void) +{ + return AllocSetContextCreate(CurrentMemoryContext, + "GiST temporary context", + ALLOCSET_DEFAULT_SIZES); +} + +/* + * gistbuildempty() -- build an empty gist index in the initialization fork + */ +void +gistbuildempty(Relation index) +{ + Buffer buffer; + + /* Initialize the root page */ + buffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* Initialize and xlog buffer */ + START_CRIT_SECTION(); + GISTInitBuffer(buffer, F_LEAF); + MarkBufferDirty(buffer); + log_newpage_buffer(buffer, true); + END_CRIT_SECTION(); + + /* Unlock and release the buffer */ + UnlockReleaseBuffer(buffer); +} + +/* + * gistinsert -- wrapper for GiST tuple insertion. + * + * This is the public interface routine for tuple insertion in GiSTs. + * It doesn't do any work; just locks the relation and passes the buck. + */ +bool +gistinsert(Relation r, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo *indexInfo) +{ + GISTSTATE *giststate = (GISTSTATE *) indexInfo->ii_AmCache; + IndexTuple itup; + MemoryContext oldCxt; + + /* Initialize GISTSTATE cache if first call in this statement */ + if (giststate == NULL) + { + oldCxt = MemoryContextSwitchTo(indexInfo->ii_Context); + giststate = initGISTstate(r); + giststate->tempCxt = createTempGistContext(); + indexInfo->ii_AmCache = (void *) giststate; + MemoryContextSwitchTo(oldCxt); + } + + oldCxt = MemoryContextSwitchTo(giststate->tempCxt); + + itup = gistFormTuple(giststate, r, + values, isnull, true /* size is currently bogus */ ); + itup->t_tid = *ht_ctid; + + gistdoinsert(r, itup, 0, giststate, heapRel, false); + + /* cleanup */ + MemoryContextSwitchTo(oldCxt); + MemoryContextReset(giststate->tempCxt); + + return false; +} + + +/* + * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple + * at that offset is atomically removed along with inserting the new tuples. + * This is used to replace a tuple with a new one. + * + * If 'leftchildbuf' is valid, we're inserting the downlink for the page + * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'. + * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set. + * + * If 'markfollowright' is true and the page is split, the left child is + * marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered + * index build, however, there is no concurrent access and the page splitting + * is done in a slightly simpler fashion, and false is passed. + * + * If there is not enough room on the page, it is split. All the split + * pages are kept pinned and locked and returned in *splitinfo, the caller + * is responsible for inserting the downlinks for them. However, if + * 'buffer' is the root page and it needs to be split, gistplacetopage() + * performs the split as one atomic operation, and *splitinfo is set to NIL. + * In that case, we continue to hold the root page locked, and the child + * pages are released; note that new tuple(s) are *not* on the root page + * but in one of the new child pages. 
+ * + * If 'newblkno' is not NULL, returns the block number of page the first + * new/updated tuple was inserted to. Usually it's the given page, but could + * be its right sibling if the page was split. + * + * Returns 'true' if the page was split, 'false' otherwise. + */ +bool +gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, + Buffer buffer, + IndexTuple *itup, int ntup, OffsetNumber oldoffnum, + BlockNumber *newblkno, + Buffer leftchildbuf, + List **splitinfo, + bool markfollowright, + Relation heapRel, + bool is_build) +{ + BlockNumber blkno = BufferGetBlockNumber(buffer); + Page page = BufferGetPage(buffer); + bool is_leaf = (GistPageIsLeaf(page)) ? true : false; + XLogRecPtr recptr; + int i; + bool is_split; + + /* + * Refuse to modify a page that's incompletely split. This should not + * happen because we finish any incomplete splits while we walk down the + * tree. However, it's remotely possible that another concurrent inserter + * splits a parent page, and errors out before completing the split. We + * will just throw an error in that case, and leave any split we had in + * progress unfinished too. The next insert that comes along will clean up + * the mess. + */ + if (GistFollowRight(page)) + elog(ERROR, "concurrent GiST page split was incomplete"); + + /* should never try to insert to a deleted page */ + Assert(!GistPageIsDeleted(page)); + + *splitinfo = NIL; + + /* + * if isupdate, remove old key: This node's key has been modified, either + * because a child split occurred or because we needed to adjust our key + * for an insert in a child node. Therefore, remove the old version of + * this node's key. + * + * for WAL replay, in the non-split case we handle this by setting up a + * one-element todelete array; in the split case, it's handled implicitly + * because the tuple vector passed to gistSplit won't include this tuple. + */ + is_split = gistnospace(page, itup, ntup, oldoffnum, freespace); + + /* + * If leaf page is full, try at first to delete dead tuples. And then + * check again. + */ + if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page)) + { + gistprunepage(rel, page, buffer, heapRel); + is_split = gistnospace(page, itup, ntup, oldoffnum, freespace); + } + + if (is_split) + { + /* no space for insertion */ + IndexTuple *itvec; + int tlen; + SplitedPageLayout *dist = NULL, + *ptr; + BlockNumber oldrlink = InvalidBlockNumber; + GistNSN oldnsn = 0; + SplitedPageLayout rootpg; + bool is_rootsplit; + int npage; + + is_rootsplit = (blkno == GIST_ROOT_BLKNO); + + /* + * Form index tuples vector to split. If we're replacing an old tuple, + * remove the old version from the vector. + */ + itvec = gistextractpage(page, &tlen); + if (OffsetNumberIsValid(oldoffnum)) + { + /* on inner page we should remove old tuple */ + int pos = oldoffnum - FirstOffsetNumber; + + tlen--; + if (pos != tlen) + memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos)); + } + itvec = gistjoinvector(itvec, &tlen, itup, ntup); + dist = gistSplit(rel, page, itvec, tlen, giststate); + + /* + * Check that split didn't produce too many pages. + */ + npage = 0; + for (ptr = dist; ptr; ptr = ptr->next) + npage++; + /* in a root split, we'll add one more page to the list below */ + if (is_rootsplit) + npage++; + if (npage > GIST_MAX_SPLIT_PAGES) + elog(ERROR, "GiST page split into too many halves (%d, maximum %d)", + npage, GIST_MAX_SPLIT_PAGES); + + /* + * Set up pages to work with. Allocate new buffers for all but the + * leftmost page. 
The original page becomes the new leftmost page, and + * is just replaced with the new contents. + * + * For a root-split, allocate new buffers for all child pages, the + * original page is overwritten with new root page containing + * downlinks to the new child pages. + */ + ptr = dist; + if (!is_rootsplit) + { + /* save old rightlink and NSN */ + oldrlink = GistPageGetOpaque(page)->rightlink; + oldnsn = GistPageGetNSN(page); + + dist->buffer = buffer; + dist->block.blkno = BufferGetBlockNumber(buffer); + dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer)); + + /* clean all flags except F_LEAF */ + GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0; + + ptr = ptr->next; + } + for (; ptr; ptr = ptr->next) + { + /* Allocate new page */ + ptr->buffer = gistNewBuffer(rel); + GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0); + ptr->page = BufferGetPage(ptr->buffer); + ptr->block.blkno = BufferGetBlockNumber(ptr->buffer); + PredicateLockPageSplit(rel, + BufferGetBlockNumber(buffer), + BufferGetBlockNumber(ptr->buffer)); + } + + /* + * Now that we know which blocks the new pages go to, set up downlink + * tuples to point to them. + */ + for (ptr = dist; ptr; ptr = ptr->next) + { + ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno); + GistTupleSetValid(ptr->itup); + } + + /* + * If this is a root split, we construct the new root page with the + * downlinks here directly, instead of requiring the caller to insert + * them. Add the new root page to the list along with the child pages. + */ + if (is_rootsplit) + { + IndexTuple *downlinks; + int ndownlinks = 0; + int i; + + rootpg.buffer = buffer; + rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer)); + GistPageGetOpaque(rootpg.page)->flags = 0; + + /* Prepare a vector of all the downlinks */ + for (ptr = dist; ptr; ptr = ptr->next) + ndownlinks++; + downlinks = palloc(sizeof(IndexTuple) * ndownlinks); + for (i = 0, ptr = dist; ptr; ptr = ptr->next) + downlinks[i++] = ptr->itup; + + rootpg.block.blkno = GIST_ROOT_BLKNO; + rootpg.block.num = ndownlinks; + rootpg.list = gistfillitupvec(downlinks, ndownlinks, + &(rootpg.lenlist)); + rootpg.itup = NULL; + + rootpg.next = dist; + dist = &rootpg; + } + else + { + /* Prepare split-info to be returned to caller */ + for (ptr = dist; ptr; ptr = ptr->next) + { + GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); + + si->buf = ptr->buffer; + si->downlink = ptr->itup; + *splitinfo = lappend(*splitinfo, si); + } + } + + /* + * Fill all pages. All the pages are new, ie. freshly allocated empty + * pages, or a temporary copy of the old page. + */ + for (ptr = dist; ptr; ptr = ptr->next) + { + char *data = (char *) (ptr->list); + + for (i = 0; i < ptr->block.num; i++) + { + IndexTuple thistup = (IndexTuple) data; + + if (PageAddItem(ptr->page, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel)); + + /* + * If this is the first inserted/updated tuple, let the caller + * know which page it landed on. 
+ */ + if (newblkno && ItemPointerEquals(&thistup->t_tid, &(*itup)->t_tid)) + *newblkno = ptr->block.blkno; + + data += IndexTupleSize(thistup); + } + + /* Set up rightlinks */ + if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO) + GistPageGetOpaque(ptr->page)->rightlink = + ptr->next->block.blkno; + else + GistPageGetOpaque(ptr->page)->rightlink = oldrlink; + + /* + * Mark the all but the right-most page with the follow-right + * flag. It will be cleared as soon as the downlink is inserted + * into the parent, but this ensures that if we error out before + * that, the index is still consistent. (in buffering build mode, + * any error will abort the index build anyway, so this is not + * needed.) + */ + if (ptr->next && !is_rootsplit && markfollowright) + GistMarkFollowRight(ptr->page); + else + GistClearFollowRight(ptr->page); + + /* + * Copy the NSN of the original page to all pages. The + * F_FOLLOW_RIGHT flags ensure that scans will follow the + * rightlinks until the downlinks are inserted. + */ + GistPageSetNSN(ptr->page, oldnsn); + } + + /* + * gistXLogSplit() needs to WAL log a lot of pages, prepare WAL + * insertion for that. NB: The number of pages and data segments + * specified here must match the calculations in gistXLogSplit()! + */ + if (!is_build && RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(npage, 1 + npage * 2); + + START_CRIT_SECTION(); + + /* + * Must mark buffers dirty before XLogInsert, even though we'll still + * be changing their opaque fields below. + */ + for (ptr = dist; ptr; ptr = ptr->next) + MarkBufferDirty(ptr->buffer); + if (BufferIsValid(leftchildbuf)) + MarkBufferDirty(leftchildbuf); + + /* + * The first page in the chain was a temporary working copy meant to + * replace the old page. Copy it over the old page. + */ + PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer)); + dist->page = BufferGetPage(dist->buffer); + + /* + * Write the WAL record. + * + * If we're building a new index, however, we don't WAL-log changes + * yet. The LSN-NSN interlock between parent and child requires that + * LSNs never move backwards, so set the LSNs to a value that's + * smaller than any real or fake unlogged LSN that might be generated + * later. (There can't be any concurrent scans during index build, so + * we don't need to be able to detect concurrent splits yet.) + */ + if (is_build) + recptr = GistBuildLSN; + else + { + if (RelationNeedsWAL(rel)) + recptr = gistXLogSplit(is_leaf, + dist, oldrlink, oldnsn, leftchildbuf, + markfollowright); + else + recptr = gistGetFakeLSN(rel); + } + + for (ptr = dist; ptr; ptr = ptr->next) + PageSetLSN(ptr->page, recptr); + + /* + * Return the new child buffers to the caller. + * + * If this was a root split, we've already inserted the downlink + * pointers, in the form of a new root page. Therefore we can release + * all the new buffers, and keep just the root page locked. + */ + if (is_rootsplit) + { + for (ptr = dist->next; ptr; ptr = ptr->next) + UnlockReleaseBuffer(ptr->buffer); + } + } + else + { + /* + * Enough space. We always get here if ntup==0. + */ + START_CRIT_SECTION(); + + /* + * Delete old tuple if any, then insert new tuple(s) if any. If + * possible, use the fast path of PageIndexTupleOverwrite. 
+ */ + if (OffsetNumberIsValid(oldoffnum)) + { + if (ntup == 1) + { + /* One-for-one replacement, so use PageIndexTupleOverwrite */ + if (!PageIndexTupleOverwrite(page, oldoffnum, (Item) *itup, + IndexTupleSize(*itup))) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(rel)); + } + else + { + /* Delete old, then append new tuple(s) to page */ + PageIndexTupleDelete(page, oldoffnum); + gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); + } + } + else + { + /* Just append new tuples at the end of the page */ + gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); + } + + MarkBufferDirty(buffer); + + if (BufferIsValid(leftchildbuf)) + MarkBufferDirty(leftchildbuf); + + if (is_build) + recptr = GistBuildLSN; + else + { + if (RelationNeedsWAL(rel)) + { + OffsetNumber ndeloffs = 0, + deloffs[1]; + + if (OffsetNumberIsValid(oldoffnum)) + { + deloffs[0] = oldoffnum; + ndeloffs = 1; + } + + recptr = gistXLogUpdate(buffer, + deloffs, ndeloffs, itup, ntup, + leftchildbuf); + } + else + recptr = gistGetFakeLSN(rel); + } + PageSetLSN(page, recptr); + + if (newblkno) + *newblkno = blkno; + } + + /* + * If we inserted the downlink for a child page, set NSN and clear + * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to + * follow the rightlink if and only if they looked at the parent page + * before we inserted the downlink. + * + * Note that we do this *after* writing the WAL record. That means that + * the possible full page image in the WAL record does not include these + * changes, and they must be replayed even if the page is restored from + * the full page image. There's a chicken-and-egg problem: if we updated + * the child pages first, we wouldn't know the recptr of the WAL record + * we're about to write. + */ + if (BufferIsValid(leftchildbuf)) + { + Page leftpg = BufferGetPage(leftchildbuf); + + GistPageSetNSN(leftpg, recptr); + GistClearFollowRight(leftpg); + + PageSetLSN(leftpg, recptr); + } + + END_CRIT_SECTION(); + + return is_split; +} + +/* + * Workhouse routine for doing insertion into a GiST index. Note that + * this routine assumes it is invoked in a short-lived memory context, + * so it does not bother releasing palloc'd allocations. + */ +void +gistdoinsert(Relation r, IndexTuple itup, Size freespace, + GISTSTATE *giststate, Relation heapRel, bool is_build) +{ + ItemId iid; + IndexTuple idxtuple; + GISTInsertStack firststack; + GISTInsertStack *stack; + GISTInsertState state; + bool xlocked = false; + + memset(&state, 0, sizeof(GISTInsertState)); + state.freespace = freespace; + state.r = r; + state.heapRel = heapRel; + state.is_build = is_build; + + /* Start from the root */ + firststack.blkno = GIST_ROOT_BLKNO; + firststack.lsn = 0; + firststack.retry_from_parent = false; + firststack.parent = NULL; + firststack.downlinkoffnum = InvalidOffsetNumber; + state.stack = stack = &firststack; + + /* + * Walk down along the path of smallest penalty, updating the parent + * pointers with the key we're inserting as we go. If we crash in the + * middle, the tree is consistent, although the possible parent updates + * were a waste. + */ + for (;;) + { + /* + * If we split an internal page while descending the tree, we have to + * retry at the parent. (Normally, the LSN-NSN interlock below would + * also catch this and cause us to retry. But LSNs are not updated + * during index build.) 
+ */ + while (stack->retry_from_parent) + { + if (xlocked) + LockBuffer(stack->buffer, GIST_UNLOCK); + xlocked = false; + ReleaseBuffer(stack->buffer); + state.stack = stack = stack->parent; + } + + if (XLogRecPtrIsInvalid(stack->lsn)) + stack->buffer = ReadBuffer(state.r, stack->blkno); + + /* + * Be optimistic and grab shared lock first. Swap it for an exclusive + * lock later if we need to update the page. + */ + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_SHARE); + gistcheckpage(state.r, stack->buffer); + } + + stack->page = (Page) BufferGetPage(stack->buffer); + stack->lsn = xlocked ? + PageGetLSN(stack->page) : BufferGetLSNAtomic(stack->buffer); + Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn)); + + /* + * If this page was split but the downlink was never inserted to the + * parent because the inserting backend crashed before doing that, fix + * that now. + */ + if (GistFollowRight(stack->page)) + { + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_UNLOCK); + LockBuffer(stack->buffer, GIST_EXCLUSIVE); + xlocked = true; + /* someone might've completed the split when we unlocked */ + if (!GistFollowRight(stack->page)) + continue; + } + gistfixsplit(&state, giststate); + + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } + + if ((stack->blkno != GIST_ROOT_BLKNO && + stack->parent->lsn < GistPageGetNSN(stack->page)) || + GistPageIsDeleted(stack->page)) + { + /* + * Concurrent split or page deletion detected. There's no + * guarantee that the downlink for this page is consistent with + * the tuple we're inserting anymore, so go back to parent and + * rechoose the best child. + */ + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } + + if (!GistPageIsLeaf(stack->page)) + { + /* + * This is an internal page so continue to walk down the tree. + * Find the child node that has the minimum insertion penalty. + */ + BlockNumber childblkno; + IndexTuple newtup; + GISTInsertStack *item; + OffsetNumber downlinkoffnum; + + downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate); + iid = PageGetItemId(stack->page, downlinkoffnum); + idxtuple = (IndexTuple) PageGetItem(stack->page, iid); + childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + + /* + * Check that it's not a leftover invalid tuple from pre-9.1 + */ + if (GistTupleIsInvalid(idxtuple)) + ereport(ERROR, + (errmsg("index \"%s\" contains an inner tuple marked as invalid", + RelationGetRelationName(r)), + errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), + errhint("Please REINDEX it."))); + + /* + * Check that the key representing the target child node is + * consistent with the key we're inserting. Update it if it's not. + */ + newtup = gistgetadjusted(state.r, idxtuple, itup, giststate); + if (newtup) + { + /* + * Swap shared lock for an exclusive one. Beware, the page may + * change while we unlock/lock the page... + */ + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_UNLOCK); + LockBuffer(stack->buffer, GIST_EXCLUSIVE); + xlocked = true; + stack->page = (Page) BufferGetPage(stack->buffer); + + if (PageGetLSN(stack->page) != stack->lsn) + { + /* the page was changed while we unlocked it, retry */ + continue; + } + } + + /* + * Update the tuple. + * + * We still hold the lock after gistinserttuple(), but it + * might have to split the page to make the updated tuple fit. 
+ * In that case the updated tuple might migrate to the other + * half of the split, so we have to go back to the parent and + * descend back to the half that's a better fit for the new + * tuple. + */ + if (gistinserttuple(&state, stack, giststate, newtup, + downlinkoffnum)) + { + /* + * If this was a root split, the root page continues to be + * the parent and the updated tuple went to one of the + * child pages, so we just need to retry from the root + * page. + */ + if (stack->blkno != GIST_ROOT_BLKNO) + { + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + } + continue; + } + } + LockBuffer(stack->buffer, GIST_UNLOCK); + xlocked = false; + + /* descend to the chosen child */ + item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + item->blkno = childblkno; + item->parent = stack; + item->downlinkoffnum = downlinkoffnum; + state.stack = stack = item; + } + else + { + /* + * Leaf page. Insert the new key. We've already updated all the + * parents on the way down, but we might have to split the page if + * it doesn't fit. gistinserttuple() will take care of that. + */ + + /* + * Swap shared lock for an exclusive one. Be careful, the page may + * change while we unlock/lock the page... + */ + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_UNLOCK); + LockBuffer(stack->buffer, GIST_EXCLUSIVE); + xlocked = true; + stack->page = (Page) BufferGetPage(stack->buffer); + stack->lsn = PageGetLSN(stack->page); + + if (stack->blkno == GIST_ROOT_BLKNO) + { + /* + * the only page that can become inner instead of leaf is + * the root page, so for root we should recheck it + */ + if (!GistPageIsLeaf(stack->page)) + { + /* + * very rare situation: during unlock/lock index with + * number of pages = 1 was increased + */ + LockBuffer(stack->buffer, GIST_UNLOCK); + xlocked = false; + continue; + } + + /* + * we don't need to check root split, because checking + * leaf/inner is enough to recognize split for root + */ + } + else if ((GistFollowRight(stack->page) || + stack->parent->lsn < GistPageGetNSN(stack->page)) || + GistPageIsDeleted(stack->page)) + { + /* + * The page was split or deleted while we momentarily + * unlocked the page. Go back to parent. + */ + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } + } + + /* now state.stack->(page, buffer and blkno) points to leaf page */ + + gistinserttuple(&state, stack, giststate, itup, + InvalidOffsetNumber); + LockBuffer(stack->buffer, GIST_UNLOCK); + + /* Release any pins we might still hold before exiting */ + for (; stack; stack = stack->parent) + ReleaseBuffer(stack->buffer); + break; + } + } +} + +/* + * Traverse the tree to find path from root page to specified "child" block. + * + * returns a new insertion stack, starting from the parent of "child", up + * to the root. *downlinkoffnum is set to the offset of the downlink in the + * direct parent of child. + * + * To prevent deadlocks, this should lock only one page at a time. 
+ */ +static GISTInsertStack * +gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum) +{ + Page page; + Buffer buffer; + OffsetNumber i, + maxoff; + ItemId iid; + IndexTuple idxtuple; + List *fifo; + GISTInsertStack *top, + *ptr; + BlockNumber blkno; + + top = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + top->blkno = GIST_ROOT_BLKNO; + top->downlinkoffnum = InvalidOffsetNumber; + + fifo = list_make1(top); + while (fifo != NIL) + { + /* Get next page to visit */ + top = linitial(fifo); + fifo = list_delete_first(fifo); + + buffer = ReadBuffer(r, top->blkno); + LockBuffer(buffer, GIST_SHARE); + gistcheckpage(r, buffer); + page = (Page) BufferGetPage(buffer); + + if (GistPageIsLeaf(page)) + { + /* + * Because we scan the index top-down, all the rest of the pages + * in the queue must be leaf pages as well. + */ + UnlockReleaseBuffer(buffer); + break; + } + + /* currently, internal pages are never deleted */ + Assert(!GistPageIsDeleted(page)); + + top->lsn = BufferGetLSNAtomic(buffer); + + /* + * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a + * downlink. This should not normally happen.. + */ + if (GistFollowRight(page)) + elog(ERROR, "concurrent GiST page split was incomplete"); + + if (top->parent && top->parent->lsn < GistPageGetNSN(page) && + GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ ) + { + /* + * Page was split while we looked elsewhere. We didn't see the + * downlink to the right page when we scanned the parent, so add + * it to the queue now. + * + * Put the right page ahead of the queue, so that we visit it + * next. That's important, because if this is the lowest internal + * level, just above leaves, we might already have queued up some + * leaf pages, and we assume that there can't be any non-leaf + * pages behind leaf pages. + */ + ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + ptr->blkno = GistPageGetOpaque(page)->rightlink; + ptr->downlinkoffnum = InvalidOffsetNumber; + ptr->parent = top->parent; + + fifo = lcons(ptr, fifo); + } + + maxoff = PageGetMaxOffsetNumber(page); + + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + iid = PageGetItemId(page, i); + idxtuple = (IndexTuple) PageGetItem(page, iid); + blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + if (blkno == child) + { + /* Found it! */ + UnlockReleaseBuffer(buffer); + *downlinkoffnum = i; + return top; + } + else + { + /* Append this child to the list of pages to visit later */ + ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + ptr->blkno = blkno; + ptr->downlinkoffnum = i; + ptr->parent = top; + + fifo = lappend(fifo, ptr); + } + } + + UnlockReleaseBuffer(buffer); + } + + elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u", + RelationGetRelationName(r), child); + return NULL; /* keep compiler quiet */ +} + +/* + * Updates the stack so that child->parent is the correct parent of the + * child. child->parent must be exclusively locked on entry, and will + * remain so at exit, but it might not be the same page anymore. 
+ */ +static void +gistFindCorrectParent(Relation r, GISTInsertStack *child) +{ + GISTInsertStack *parent = child->parent; + + gistcheckpage(r, parent->buffer); + parent->page = (Page) BufferGetPage(parent->buffer); + + /* here we don't need to distinguish between split and page update */ + if (child->downlinkoffnum == InvalidOffsetNumber || + parent->lsn != PageGetLSN(parent->page)) + { + /* parent is changed, look child in right links until found */ + OffsetNumber i, + maxoff; + ItemId iid; + IndexTuple idxtuple; + GISTInsertStack *ptr; + + while (true) + { + maxoff = PageGetMaxOffsetNumber(parent->page); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + iid = PageGetItemId(parent->page, i); + idxtuple = (IndexTuple) PageGetItem(parent->page, iid); + if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno) + { + /* yes!!, found */ + child->downlinkoffnum = i; + return; + } + } + + parent->blkno = GistPageGetOpaque(parent->page)->rightlink; + UnlockReleaseBuffer(parent->buffer); + if (parent->blkno == InvalidBlockNumber) + { + /* + * End of chain and still didn't find parent. It's a very-very + * rare situation when root splitted. + */ + break; + } + parent->buffer = ReadBuffer(r, parent->blkno); + LockBuffer(parent->buffer, GIST_EXCLUSIVE); + gistcheckpage(r, parent->buffer); + parent->page = (Page) BufferGetPage(parent->buffer); + } + + /* + * awful!!, we need search tree to find parent ... , but before we + * should release all old parent + */ + + ptr = child->parent->parent; /* child->parent already released + * above */ + while (ptr) + { + ReleaseBuffer(ptr->buffer); + ptr = ptr->parent; + } + + /* ok, find new path */ + ptr = parent = gistFindPath(r, child->blkno, &child->downlinkoffnum); + + /* read all buffers as expected by caller */ + /* note we don't lock them or gistcheckpage them here! */ + while (ptr) + { + ptr->buffer = ReadBuffer(r, ptr->blkno); + ptr->page = (Page) BufferGetPage(ptr->buffer); + ptr = ptr->parent; + } + + /* install new chain of parents to stack */ + child->parent = parent; + + /* make recursive call to normal processing */ + LockBuffer(child->parent->buffer, GIST_EXCLUSIVE); + gistFindCorrectParent(r, child); + } +} + +/* + * Form a downlink pointer for the page in 'buf'. + */ +static IndexTuple +gistformdownlink(Relation rel, Buffer buf, GISTSTATE *giststate, + GISTInsertStack *stack) +{ + Page page = BufferGetPage(buf); + OffsetNumber maxoff; + OffsetNumber offset; + IndexTuple downlink = NULL; + + maxoff = PageGetMaxOffsetNumber(page); + for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset)) + { + IndexTuple ituple = (IndexTuple) + PageGetItem(page, PageGetItemId(page, offset)); + + if (downlink == NULL) + downlink = CopyIndexTuple(ituple); + else + { + IndexTuple newdownlink; + + newdownlink = gistgetadjusted(rel, downlink, ituple, + giststate); + if (newdownlink) + downlink = newdownlink; + } + } + + /* + * If the page is completely empty, we can't form a meaningful downlink + * for it. But we have to insert a downlink for the page. Any key will do, + * as long as its consistent with the downlink of parent page, so that we + * can legally insert it to the parent. A minimal one that matches as few + * scans as possible would be best, to keep scans from doing useless work, + * but we don't know how to construct that. So we just use the downlink of + * the original page that was split - that's as far from optimal as it can + * get but will do.. 
+ */ + if (!downlink) + { + ItemId iid; + + LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE); + gistFindCorrectParent(rel, stack); + iid = PageGetItemId(stack->parent->page, stack->downlinkoffnum); + downlink = (IndexTuple) PageGetItem(stack->parent->page, iid); + downlink = CopyIndexTuple(downlink); + LockBuffer(stack->parent->buffer, GIST_UNLOCK); + } + + ItemPointerSetBlockNumber(&(downlink->t_tid), BufferGetBlockNumber(buf)); + GistTupleSetValid(downlink); + + return downlink; +} + + +/* + * Complete the incomplete split of state->stack->page. + */ +static void +gistfixsplit(GISTInsertState *state, GISTSTATE *giststate) +{ + GISTInsertStack *stack = state->stack; + Buffer buf; + Page page; + List *splitinfo = NIL; + + ereport(LOG, + (errmsg("fixing incomplete split in index \"%s\", block %u", + RelationGetRelationName(state->r), stack->blkno))); + + Assert(GistFollowRight(stack->page)); + Assert(OffsetNumberIsValid(stack->downlinkoffnum)); + + buf = stack->buffer; + + /* + * Read the chain of split pages, following the rightlinks. Construct a + * downlink tuple for each page. + */ + for (;;) + { + GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); + IndexTuple downlink; + + page = BufferGetPage(buf); + + /* Form the new downlink tuples to insert to parent */ + downlink = gistformdownlink(state->r, buf, giststate, stack); + + si->buf = buf; + si->downlink = downlink; + + splitinfo = lappend(splitinfo, si); + + if (GistFollowRight(page)) + { + /* lock next page */ + buf = ReadBuffer(state->r, GistPageGetOpaque(page)->rightlink); + LockBuffer(buf, GIST_EXCLUSIVE); + } + else + break; + } + + /* Insert the downlinks */ + gistfinishsplit(state, stack, giststate, splitinfo, false); +} + +/* + * Insert or replace a tuple in stack->buffer. If 'oldoffnum' is valid, the + * tuple at 'oldoffnum' is replaced, otherwise the tuple is inserted as new. + * 'stack' represents the path from the root to the page being updated. + * + * The caller must hold an exclusive lock on stack->buffer. The lock is still + * held on return, but the page might not contain the inserted tuple if the + * page was split. The function returns true if the page was split, false + * otherwise. + */ +static bool +gistinserttuple(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum) +{ + return gistinserttuples(state, stack, giststate, &tuple, 1, oldoffnum, + InvalidBuffer, InvalidBuffer, false, false); +} + +/* ---------------- + * An extended workhorse version of gistinserttuple(). This version allows + * inserting multiple tuples, or replacing a single tuple with multiple tuples. + * This is used to recursively update the downlinks in the parent when a page + * is split. + * + * If leftchild and rightchild are valid, we're inserting/replacing the + * downlink for rightchild, and leftchild is its left sibling. We clear the + * F_FOLLOW_RIGHT flag and update NSN on leftchild, atomically with the + * insertion of the downlink. + * + * To avoid holding locks for longer than necessary, when recursing up the + * tree to update the parents, the locking is a bit peculiar here. On entry, + * the caller must hold an exclusive lock on stack->buffer, as well as + * leftchild and rightchild if given. On return: + * + * - Lock on stack->buffer is released, if 'unlockbuf' is true. The page is + * always kept pinned, however. + * - Lock on 'leftchild' is released, if 'unlockleftchild' is true. The page + * is kept pinned. 
+ * - Lock and pin on 'rightchild' are always released. + * + * Returns 'true' if the page had to be split. Note that if the page was + * split, the inserted/updated tuples might've been inserted to a right + * sibling of stack->buffer instead of stack->buffer itself. + */ +static bool +gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, + IndexTuple *tuples, int ntup, OffsetNumber oldoffnum, + Buffer leftchild, Buffer rightchild, + bool unlockbuf, bool unlockleftchild) +{ + List *splitinfo; + bool is_split; + + /* + * Check for any rw conflicts (in serializable isolation level) just + * before we intend to modify the page + */ + CheckForSerializableConflictIn(state->r, NULL, BufferGetBlockNumber(stack->buffer)); + + /* Insert the tuple(s) to the page, splitting the page if necessary */ + is_split = gistplacetopage(state->r, state->freespace, giststate, + stack->buffer, + tuples, ntup, + oldoffnum, NULL, + leftchild, + &splitinfo, + true, + state->heapRel, + state->is_build); + + /* + * Before recursing up in case the page was split, release locks on the + * child pages. We don't need to keep them locked when updating the + * parent. + */ + if (BufferIsValid(rightchild)) + UnlockReleaseBuffer(rightchild); + if (BufferIsValid(leftchild) && unlockleftchild) + LockBuffer(leftchild, GIST_UNLOCK); + + /* + * If we had to split, insert/update the downlinks in the parent. If the + * caller requested us to release the lock on stack->buffer, tell + * gistfinishsplit() to do that as soon as it's safe to do so. If we + * didn't have to split, release it ourselves. + */ + if (splitinfo) + gistfinishsplit(state, stack, giststate, splitinfo, unlockbuf); + else if (unlockbuf) + LockBuffer(stack->buffer, GIST_UNLOCK); + + return is_split; +} + +/* + * Finish an incomplete split by inserting/updating the downlinks in parent + * page. 'splitinfo' contains all the child pages involved in the split, + * from left-to-right. + * + * On entry, the caller must hold a lock on stack->buffer and all the child + * pages in 'splitinfo'. If 'unlockbuf' is true, the lock on stack->buffer is + * released on return. The child pages are always unlocked and unpinned. + */ +static void +gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, List *splitinfo, bool unlockbuf) +{ + GISTPageSplitInfo *right; + GISTPageSplitInfo *left; + IndexTuple tuples[2]; + + /* A split always contains at least two halves */ + Assert(list_length(splitinfo) >= 2); + + /* + * We need to insert downlinks for each new page, and update the downlink + * for the original (leftmost) page in the split. Begin at the rightmost + * page, inserting one downlink at a time until there's only two pages + * left. Finally insert the downlink for the last new page and update the + * downlink for the original page as one operation. + */ + LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE); + + /* + * Insert downlinks for the siblings from right to left, until there are + * only two siblings left. + */ + for (int pos = list_length(splitinfo) - 1; pos > 1; pos--) + { + right = (GISTPageSplitInfo *) list_nth(splitinfo, pos); + left = (GISTPageSplitInfo *) list_nth(splitinfo, pos - 1); + + gistFindCorrectParent(state->r, stack); + if (gistinserttuples(state, stack->parent, giststate, + &right->downlink, 1, + InvalidOffsetNumber, + left->buf, right->buf, false, false)) + { + /* + * If the parent page was split, the existing downlink might have + * moved. 
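+ *
+ * Resetting downlinkoffnum to InvalidOffsetNumber below makes the next
+ * gistFindCorrectParent() call re-locate the downlink by scanning the
+ * (possibly new) parent page instead of trusting the remembered offset.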
+ */ + stack->downlinkoffnum = InvalidOffsetNumber; + } + /* gistinserttuples() released the lock on right->buf. */ + } + + right = (GISTPageSplitInfo *) lsecond(splitinfo); + left = (GISTPageSplitInfo *) linitial(splitinfo); + + /* + * Finally insert downlink for the remaining right page and update the + * downlink for the original page to not contain the tuples that were + * moved to the new pages. + */ + tuples[0] = left->downlink; + tuples[1] = right->downlink; + gistFindCorrectParent(state->r, stack); + if (gistinserttuples(state, stack->parent, giststate, + tuples, 2, + stack->downlinkoffnum, + left->buf, right->buf, + true, /* Unlock parent */ + unlockbuf /* Unlock stack->buffer if caller wants + * that */ + )) + { + /* + * If the parent page was split, the downlink might have moved. + */ + stack->downlinkoffnum = InvalidOffsetNumber; + } + + Assert(left->buf == stack->buffer); + + /* + * If we split the page because we had to adjust the downlink on an + * internal page, while descending the tree for inserting a new tuple, + * then this might no longer be the correct page for the new tuple. The + * downlink to this page might not cover the new tuple anymore, it might + * need to go to the newly-created right sibling instead. Tell the caller + * to walk back up the stack, to re-check at the parent which page to + * insert to. + * + * Normally, the LSN-NSN interlock during the tree descend would also + * detect that a concurrent split happened (by ourselves), and cause us to + * retry at the parent. But that mechanism doesn't work during index + * build, because we don't do WAL-logging, and don't update LSNs, during + * index build. + */ + stack->retry_from_parent = true; +} + +/* + * gistSplit -- split a page in the tree and fill struct + * used for XLOG and real writes buffers. Function is recursive, ie + * it will split page until keys will fit in every page. + */ +SplitedPageLayout * +gistSplit(Relation r, + Page page, + IndexTuple *itup, /* contains compressed entry */ + int len, + GISTSTATE *giststate) +{ + IndexTuple *lvectup, + *rvectup; + GistSplitVector v; + int i; + SplitedPageLayout *res = NULL; + + /* this should never recurse very deeply, but better safe than sorry */ + check_stack_depth(); + + /* there's no point in splitting an empty page */ + Assert(len > 0); + + /* + * If a single tuple doesn't fit on a page, no amount of splitting will + * help. 
+ */ + if (len == 1) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", + IndexTupleSize(itup[0]), GiSTPageSize, + RelationGetRelationName(r)))); + + memset(v.spl_lisnull, true, + sizeof(bool) * giststate->nonLeafTupdesc->natts); + memset(v.spl_risnull, true, + sizeof(bool) * giststate->nonLeafTupdesc->natts); + gistSplitByKey(r, page, itup, len, giststate, &v, 0); + + /* form left and right vector */ + lvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1)); + rvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1)); + + for (i = 0; i < v.splitVector.spl_nleft; i++) + lvectup[i] = itup[v.splitVector.spl_left[i] - 1]; + + for (i = 0; i < v.splitVector.spl_nright; i++) + rvectup[i] = itup[v.splitVector.spl_right[i] - 1]; + + /* finalize splitting (may need another split) */ + if (!gistfitpage(rvectup, v.splitVector.spl_nright)) + { + res = gistSplit(r, page, rvectup, v.splitVector.spl_nright, giststate); + } + else + { + ROTATEDIST(res); + res->block.num = v.splitVector.spl_nright; + res->list = gistfillitupvec(rvectup, v.splitVector.spl_nright, &(res->lenlist)); + res->itup = gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false); + } + + if (!gistfitpage(lvectup, v.splitVector.spl_nleft)) + { + SplitedPageLayout *resptr, + *subres; + + resptr = subres = gistSplit(r, page, lvectup, v.splitVector.spl_nleft, giststate); + + /* install on list's tail */ + while (resptr->next) + resptr = resptr->next; + + resptr->next = res; + res = subres; + } + else + { + ROTATEDIST(res); + res->block.num = v.splitVector.spl_nleft; + res->list = gistfillitupvec(lvectup, v.splitVector.spl_nleft, &(res->lenlist)); + res->itup = gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false); + } + + return res; +} + +/* + * Create a GISTSTATE and fill it with information about the index + */ +GISTSTATE * +initGISTstate(Relation index) +{ + GISTSTATE *giststate; + MemoryContext scanCxt; + MemoryContext oldCxt; + int i; + + /* safety check to protect fixed-size arrays in GISTSTATE */ + if (index->rd_att->natts > INDEX_MAX_KEYS) + elog(ERROR, "numberOfAttributes %d > %d", + index->rd_att->natts, INDEX_MAX_KEYS); + + /* Create the memory context that will hold the GISTSTATE */ + scanCxt = AllocSetContextCreate(CurrentMemoryContext, + "GiST scan context", + ALLOCSET_DEFAULT_SIZES); + oldCxt = MemoryContextSwitchTo(scanCxt); + + /* Create and fill in the GISTSTATE */ + giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE)); + + giststate->scanCxt = scanCxt; + giststate->tempCxt = scanCxt; /* caller must change this if needed */ + giststate->leafTupdesc = index->rd_att; + + /* + * The truncated tupdesc for non-leaf index tuples, which doesn't contain + * the INCLUDE attributes. + * + * It is used to form tuples during tuple adjustment and page split. + * B-tree creates shortened tuple descriptor for every truncated tuple, + * because it is doing this less often: it does not have to form truncated + * tuples during page split. Also, B-tree is not adjusting tuples on + * internal pages the way GiST does. 
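+ *
+ * (Illustrative example, not taken from the code: for an index created
+ * with something like "CREATE INDEX ON t USING gist (geom) INCLUDE (id)",
+ * leafTupdesc keeps both attributes, while nonLeafTupdesc->natts is
+ * truncated to 1 so that internal tuples carry only the key column.)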
+ */ + giststate->nonLeafTupdesc = CreateTupleDescCopyConstr(index->rd_att); + giststate->nonLeafTupdesc->natts = + IndexRelationGetNumberOfKeyAttributes(index); + + for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(index); i++) + { + fmgr_info_copy(&(giststate->consistentFn[i]), + index_getprocinfo(index, i + 1, GIST_CONSISTENT_PROC), + scanCxt); + fmgr_info_copy(&(giststate->unionFn[i]), + index_getprocinfo(index, i + 1, GIST_UNION_PROC), + scanCxt); + + /* opclasses are not required to provide a Compress method */ + if (OidIsValid(index_getprocid(index, i + 1, GIST_COMPRESS_PROC))) + fmgr_info_copy(&(giststate->compressFn[i]), + index_getprocinfo(index, i + 1, GIST_COMPRESS_PROC), + scanCxt); + else + giststate->compressFn[i].fn_oid = InvalidOid; + + /* opclasses are not required to provide a Decompress method */ + if (OidIsValid(index_getprocid(index, i + 1, GIST_DECOMPRESS_PROC))) + fmgr_info_copy(&(giststate->decompressFn[i]), + index_getprocinfo(index, i + 1, GIST_DECOMPRESS_PROC), + scanCxt); + else + giststate->decompressFn[i].fn_oid = InvalidOid; + + fmgr_info_copy(&(giststate->penaltyFn[i]), + index_getprocinfo(index, i + 1, GIST_PENALTY_PROC), + scanCxt); + fmgr_info_copy(&(giststate->picksplitFn[i]), + index_getprocinfo(index, i + 1, GIST_PICKSPLIT_PROC), + scanCxt); + fmgr_info_copy(&(giststate->equalFn[i]), + index_getprocinfo(index, i + 1, GIST_EQUAL_PROC), + scanCxt); + + /* opclasses are not required to provide a Distance method */ + if (OidIsValid(index_getprocid(index, i + 1, GIST_DISTANCE_PROC))) + fmgr_info_copy(&(giststate->distanceFn[i]), + index_getprocinfo(index, i + 1, GIST_DISTANCE_PROC), + scanCxt); + else + giststate->distanceFn[i].fn_oid = InvalidOid; + + /* opclasses are not required to provide a Fetch method */ + if (OidIsValid(index_getprocid(index, i + 1, GIST_FETCH_PROC))) + fmgr_info_copy(&(giststate->fetchFn[i]), + index_getprocinfo(index, i + 1, GIST_FETCH_PROC), + scanCxt); + else + giststate->fetchFn[i].fn_oid = InvalidOid; + + /* + * If the index column has a specified collation, we should honor that + * while doing comparisons. However, we may have a collatable storage + * type for a noncollatable indexed data type. If there's no index + * collation then specify default collation in case the support + * functions need collation. This is harmless if the support + * functions don't care about collation, so we just do it + * unconditionally. (We could alternatively call get_typcollation, + * but that seems like expensive overkill --- there aren't going to be + * any cases where a GiST storage type has a nondefault collation.) 
+ */ + if (OidIsValid(index->rd_indcollation[i])) + giststate->supportCollation[i] = index->rd_indcollation[i]; + else + giststate->supportCollation[i] = DEFAULT_COLLATION_OID; + } + + /* No opclass information for INCLUDE attributes */ + for (; i < index->rd_att->natts; i++) + { + giststate->consistentFn[i].fn_oid = InvalidOid; + giststate->unionFn[i].fn_oid = InvalidOid; + giststate->compressFn[i].fn_oid = InvalidOid; + giststate->decompressFn[i].fn_oid = InvalidOid; + giststate->penaltyFn[i].fn_oid = InvalidOid; + giststate->picksplitFn[i].fn_oid = InvalidOid; + giststate->equalFn[i].fn_oid = InvalidOid; + giststate->distanceFn[i].fn_oid = InvalidOid; + giststate->fetchFn[i].fn_oid = InvalidOid; + giststate->supportCollation[i] = InvalidOid; + } + + MemoryContextSwitchTo(oldCxt); + + return giststate; +} + +void +freeGISTstate(GISTSTATE *giststate) +{ + /* It's sufficient to delete the scanCxt */ + MemoryContextDelete(giststate->scanCxt); +} + +/* + * gistprunepage() -- try to remove LP_DEAD items from the given page. + * Function assumes that buffer is exclusively locked. + */ +static void +gistprunepage(Relation rel, Page page, Buffer buffer, Relation heapRel) +{ + OffsetNumber deletable[MaxIndexTuplesPerPage]; + int ndeletable = 0; + OffsetNumber offnum, + maxoff; + + Assert(GistPageIsLeaf(page)); + + /* + * Scan over all items to see which ones need to be deleted according to + * LP_DEAD flags. + */ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemId)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) + { + TransactionId latestRemovedXid = InvalidTransactionId; + + if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) + latestRemovedXid = + index_compute_xid_horizon_for_tuples(rel, heapRel, buffer, + deletable, ndeletable); + + START_CRIT_SECTION(); + + PageIndexMultiDelete(page, deletable, ndeletable); + + /* + * Mark the page as not containing any LP_DEAD items. This is not + * certainly true (there might be some that have recently been marked, + * but weren't included in our target-item list), but it will almost + * always be true and it doesn't seem worth an additional page scan to + * check it. Remember that F_HAS_GARBAGE is only a hint anyway. + */ + GistClearPageHasGarbage(page); + + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + recptr = gistXLogDelete(buffer, + deletable, ndeletable, + latestRemovedXid); + + PageSetLSN(page, recptr); + } + else + PageSetLSN(page, gistGetFakeLSN(rel)); + + END_CRIT_SECTION(); + } + + /* + * Note: if we didn't find any LP_DEAD items, then the page's + * F_HAS_GARBAGE hint bit is falsely set. We do not bother expending a + * separate write to clear it, however. We will clear it when we split + * the page. + */ +} diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c new file mode 100644 index 0000000..ec28bfe --- /dev/null +++ b/src/backend/access/gist/gistbuild.c @@ -0,0 +1,1566 @@ +/*------------------------------------------------------------------------- + * + * gistbuild.c + * build algorithm for GiST indexes implementation. + * + * There are two different strategies: + * + * 1. Sort all input tuples, pack them into GiST leaf pages in the sorted + * order, and create downlinks and internal pages as we go. 
This builds + * the index from the bottom up, similar to how B-tree index build + * works. + * + * 2. Start with an empty index, and insert all tuples one by one. + * + * The sorted method is used if the operator classes for all columns have + * a 'sortsupport' defined. Otherwise, we resort to the second strategy. + * + * The second strategy can optionally use buffers at different levels of + * the tree to reduce I/O, see "Buffering build algorithm" in the README + * for a more detailed explanation. It initially calls insert over and + * over, but switches to the buffered algorithm after a certain number of + * tuples (unless buffering mode is disabled). + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistbuild.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <math.h> + +#include "access/genam.h" +#include "access/gist_private.h" +#include "access/gistxlog.h" +#include "access/tableam.h" +#include "access/xloginsert.h" +#include "catalog/index.h" +#include "miscadmin.h" +#include "optimizer/optimizer.h" +#include "storage/bufmgr.h" +#include "storage/smgr.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/tuplesort.h" + +/* Step of index tuples for check whether to switch to buffering build mode */ +#define BUFFERING_MODE_SWITCH_CHECK_STEP 256 + +/* + * Number of tuples to process in the slow way before switching to buffering + * mode, when buffering is explicitly turned on. Also, the number of tuples + * to process between readjusting the buffer size parameter, while in + * buffering mode. + */ +#define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096 + +/* + * Strategy used to build the index. It can change between the + * GIST_BUFFERING_* modes on the fly, but if the Sorted method is used, + * that needs to be decided up-front and cannot be changed afterwards. + */ +typedef enum +{ + GIST_SORTED_BUILD, /* bottom-up build by sorting */ + GIST_BUFFERING_DISABLED, /* in regular build mode and aren't going to + * switch */ + GIST_BUFFERING_AUTO, /* in regular build mode, but will switch to + * buffering build mode if the index grows too + * big */ + GIST_BUFFERING_STATS, /* gathering statistics of index tuple size + * before switching to the buffering build + * mode */ + GIST_BUFFERING_ACTIVE /* in buffering build mode */ +} GistBuildMode; + +/* Working state for gistbuild and its callback */ +typedef struct +{ + Relation indexrel; + Relation heaprel; + GISTSTATE *giststate; + + Size freespace; /* amount of free space to leave on pages */ + + GistBuildMode buildMode; + + int64 indtuples; /* number of tuples indexed */ + + /* + * Extra data structures used during a buffering build. 'gfbb' contains + * information related to managing the build buffers. 'parentMap' is a + * lookup table of the parent of each internal page. + */ + int64 indtuplesSize; /* total size of all indexed tuples */ + GISTBuildBuffers *gfbb; + HTAB *parentMap; + + /* + * Extra data structures used during a sorting build. 
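+ *
+ * 'pages_allocated' and 'pages_written' track how many block numbers have
+ * been handed out and how many pages have actually been written so far,
+ * while the 'ready_*' fields batch up finished pages so that they can be
+ * written and WAL-logged together (see
+ * gist_indexsortbuild_flush_ready_pages).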
+ */ + Tuplesortstate *sortstate; /* state data for tuplesort.c */ + + BlockNumber pages_allocated; + BlockNumber pages_written; + + int ready_num_pages; + BlockNumber ready_blknos[XLR_MAX_BLOCK_ID]; + Page ready_pages[XLR_MAX_BLOCK_ID]; +} GISTBuildState; + +/* + * In sorted build, we use a stack of these structs, one for each level, + * to hold an in-memory buffer of the rightmost page at the level. When the + * page fills up, it is written out and a new page is allocated. + */ +typedef struct GistSortedBuildPageState +{ + Page page; + struct GistSortedBuildPageState *parent; /* Upper level, if any */ +} GistSortedBuildPageState; + +/* prototypes for private functions */ + +static void gistSortedBuildCallback(Relation index, ItemPointer tid, + Datum *values, bool *isnull, + bool tupleIsAlive, void *state); +static void gist_indexsortbuild(GISTBuildState *state); +static void gist_indexsortbuild_pagestate_add(GISTBuildState *state, + GistSortedBuildPageState *pagestate, + IndexTuple itup); +static void gist_indexsortbuild_pagestate_flush(GISTBuildState *state, + GistSortedBuildPageState *pagestate); +static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state); + +static void gistInitBuffering(GISTBuildState *buildstate); +static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep); +static void gistBuildCallback(Relation index, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state); +static void gistBufferingBuildInsert(GISTBuildState *buildstate, + IndexTuple itup); +static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, + BlockNumber startblkno, int startlevel); +static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate, + Buffer buffer, int level, + IndexTuple *itup, int ntup, OffsetNumber oldoffnum, + BlockNumber parentblk, OffsetNumber downlinkoffnum); +static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate, + BlockNumber childblkno, int level, + BlockNumber *parentblk, + OffsetNumber *downlinkoffnum); +static void gistProcessEmptyingQueue(GISTBuildState *buildstate); +static void gistEmptyAllBuffers(GISTBuildState *buildstate); +static int gistGetMaxLevel(Relation index); + +static void gistInitParentMap(GISTBuildState *buildstate); +static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, + BlockNumber parent); +static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent); +static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child); + + +/* + * Main entry point to GiST index build. + */ +IndexBuildResult * +gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) +{ + IndexBuildResult *result; + double reltuples; + GISTBuildState buildstate; + MemoryContext oldcxt = CurrentMemoryContext; + int fillfactor; + Oid SortSupportFnOids[INDEX_MAX_KEYS]; + GiSTOptions *options = (GiSTOptions *) index->rd_options; + + /* + * We expect to be called exactly once for any index relation. If that's + * not the case, big trouble's what we have. + */ + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + + buildstate.indexrel = index; + buildstate.heaprel = heap; + buildstate.sortstate = NULL; + buildstate.giststate = initGISTstate(index); + + /* + * Create a temporary memory context that is reset once for each tuple + * processed. 
(Note: we don't bother to make this a child of the + * giststate's scanCxt, so we have to delete it separately at the end.) + */ + buildstate.giststate->tempCxt = createTempGistContext(); + + /* + * Choose build strategy. First check whether the user specified to use + * buffering mode. (The use-case for that in the field is somewhat + * questionable perhaps, but it's important for testing purposes.) + */ + if (options) + { + if (options->buffering_mode == GIST_OPTION_BUFFERING_ON) + buildstate.buildMode = GIST_BUFFERING_STATS; + else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF) + buildstate.buildMode = GIST_BUFFERING_DISABLED; + else /* must be "auto" */ + buildstate.buildMode = GIST_BUFFERING_AUTO; + } + else + { + buildstate.buildMode = GIST_BUFFERING_AUTO; + } + + /* + * Unless buffering mode was forced, see if we can use sorting instead. + */ + if (buildstate.buildMode != GIST_BUFFERING_STATS) + { + bool hasallsortsupports = true; + int keyscount = IndexRelationGetNumberOfKeyAttributes(index); + + for (int i = 0; i < keyscount; i++) + { + SortSupportFnOids[i] = index_getprocid(index, i + 1, + GIST_SORTSUPPORT_PROC); + if (!OidIsValid(SortSupportFnOids[i])) + { + hasallsortsupports = false; + break; + } + } + if (hasallsortsupports) + buildstate.buildMode = GIST_SORTED_BUILD; + } + + /* + * Calculate target amount of free space to leave on pages. + */ + fillfactor = options ? options->fillfactor : GIST_DEFAULT_FILLFACTOR; + buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100; + + /* + * Build the index using the chosen strategy. + */ + buildstate.indtuples = 0; + buildstate.indtuplesSize = 0; + + if (buildstate.buildMode == GIST_SORTED_BUILD) + { + /* + * Sort all data, build the index from bottom up. + */ + buildstate.sortstate = tuplesort_begin_index_gist(heap, + index, + maintenance_work_mem, + NULL, + false); + + /* Scan the table, adding all tuples to the tuplesort */ + reltuples = table_index_build_scan(heap, index, indexInfo, true, true, + gistSortedBuildCallback, + (void *) &buildstate, NULL); + + /* + * Perform the sort and build index pages. + */ + tuplesort_performsort(buildstate.sortstate); + + gist_indexsortbuild(&buildstate); + + tuplesort_end(buildstate.sortstate); + } + else + { + /* + * Initialize an empty index and insert all tuples, possibly using + * buffers on intermediate levels. + */ + Buffer buffer; + Page page; + + /* initialize the root page */ + buffer = gistNewBuffer(index); + Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); + page = BufferGetPage(buffer); + + START_CRIT_SECTION(); + + GISTInitBuffer(buffer, F_LEAF); + + MarkBufferDirty(buffer); + PageSetLSN(page, GistBuildLSN); + + UnlockReleaseBuffer(buffer); + + END_CRIT_SECTION(); + + /* Scan the table, inserting all the tuples to the index. */ + reltuples = table_index_build_scan(heap, index, indexInfo, true, true, + gistBuildCallback, + (void *) &buildstate, NULL); + + /* + * If buffering was used, flush out all the tuples that are still in + * the buffers. + */ + if (buildstate.buildMode == GIST_BUFFERING_ACTIVE) + { + elog(DEBUG1, "all tuples processed, emptying buffers"); + gistEmptyAllBuffers(&buildstate); + gistFreeBuildBuffers(buildstate.gfbb); + } + + /* + * We didn't write WAL records as we built the index, so if + * WAL-logging is required, write all pages to the WAL now. 
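+ *
+ * log_newpage_range() emits full-page images for every block from 0 up to
+ * the current end of the index, which is typically much cheaper than
+ * WAL-logging each individual insertion and split performed during the
+ * build.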
+ */ + if (RelationNeedsWAL(index)) + { + log_newpage_range(index, MAIN_FORKNUM, + 0, RelationGetNumberOfBlocks(index), + true); + } + } + + /* okay, all heap tuples are indexed */ + MemoryContextSwitchTo(oldcxt); + MemoryContextDelete(buildstate.giststate->tempCxt); + + freeGISTstate(buildstate.giststate); + + /* + * Return statistics + */ + result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + + result->heap_tuples = reltuples; + result->index_tuples = (double) buildstate.indtuples; + + return result; +} + +/*------------------------------------------------------------------------- + * Routines for sorted build + *------------------------------------------------------------------------- + */ + +/* + * Per-tuple callback for table_index_build_scan. + */ +static void +gistSortedBuildCallback(Relation index, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state) +{ + GISTBuildState *buildstate = (GISTBuildState *) state; + MemoryContext oldCtx; + Datum compressed_values[INDEX_MAX_KEYS]; + + oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt); + + /* Form an index tuple and point it at the heap tuple */ + gistCompressValues(buildstate->giststate, index, + values, isnull, + true, compressed_values); + + tuplesort_putindextuplevalues(buildstate->sortstate, + buildstate->indexrel, + tid, + compressed_values, isnull); + + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(buildstate->giststate->tempCxt); + + /* Update tuple count. */ + buildstate->indtuples += 1; +} + +/* + * Build GiST index from bottom up from pre-sorted tuples. + */ +static void +gist_indexsortbuild(GISTBuildState *state) +{ + IndexTuple itup; + GistSortedBuildPageState *leafstate; + GistSortedBuildPageState *pagestate; + Page page; + + state->pages_allocated = 0; + state->pages_written = 0; + state->ready_num_pages = 0; + + /* + * Write an empty page as a placeholder for the root page. It will be + * replaced with the real root page at the end. + */ + page = palloc0(BLCKSZ); + RelationOpenSmgr(state->indexrel); + smgrextend(state->indexrel->rd_smgr, MAIN_FORKNUM, GIST_ROOT_BLKNO, + page, true); + state->pages_allocated++; + state->pages_written++; + + /* Allocate a temporary buffer for the first leaf page. */ + leafstate = palloc(sizeof(GistSortedBuildPageState)); + leafstate->page = page; + leafstate->parent = NULL; + gistinitpage(page, F_LEAF); + + /* + * Fill index pages with tuples in the sorted order. + */ + while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL) + { + gist_indexsortbuild_pagestate_add(state, leafstate, itup); + MemoryContextReset(state->giststate->tempCxt); + } + + /* + * Write out the partially full non-root pages. + * + * Keep in mind that flush can build a new root. 
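+ *
+ * Flushing a level's partially filled page inserts a downlink into its
+ * parent, which can in turn fill (or create) the parent's page; walking
+ * the chain from the leaf level upwards therefore leaves 'pagestate'
+ * pointing at the topmost page, which becomes the root.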
+ */ + pagestate = leafstate; + while (pagestate->parent != NULL) + { + GistSortedBuildPageState *parent; + + gist_indexsortbuild_pagestate_flush(state, pagestate); + parent = pagestate->parent; + pfree(pagestate->page); + pfree(pagestate); + pagestate = parent; + } + + gist_indexsortbuild_flush_ready_pages(state); + + /* Write out the root */ + RelationOpenSmgr(state->indexrel); + PageSetLSN(pagestate->page, GistBuildLSN); + PageSetChecksumInplace(pagestate->page, GIST_ROOT_BLKNO); + smgrwrite(state->indexrel->rd_smgr, MAIN_FORKNUM, GIST_ROOT_BLKNO, + pagestate->page, true); + if (RelationNeedsWAL(state->indexrel)) + log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, + pagestate->page, true); + + pfree(pagestate->page); + pfree(pagestate); + + /* + * When we WAL-logged index pages, we must nonetheless fsync index files. + * Since we're building outside shared buffers, a CHECKPOINT occurring + * during the build has no way to flush the previously written data to + * disk (indeed it won't know the index even exists). A crash later on + * would replay WAL from the checkpoint, therefore it wouldn't replay our + * earlier WAL entries. If we do not fsync those pages here, they might + * still not be on disk when the crash occurs. + */ + if (RelationNeedsWAL(state->indexrel)) + { + RelationOpenSmgr(state->indexrel); + smgrimmedsync(state->indexrel->rd_smgr, MAIN_FORKNUM); + } +} + +/* + * Add tuple to a page. If the pages is full, write it out and re-initialize + * a new page first. + */ +static void +gist_indexsortbuild_pagestate_add(GISTBuildState *state, + GistSortedBuildPageState *pagestate, + IndexTuple itup) +{ + Size sizeNeeded; + + /* Does the tuple fit? If not, flush */ + sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData) + state->freespace; + if (PageGetFreeSpace(pagestate->page) < sizeNeeded) + gist_indexsortbuild_pagestate_flush(state, pagestate); + + gistfillbuffer(pagestate->page, &itup, 1, InvalidOffsetNumber); +} + +static void +gist_indexsortbuild_pagestate_flush(GISTBuildState *state, + GistSortedBuildPageState *pagestate) +{ + GistSortedBuildPageState *parent; + IndexTuple *itvec; + IndexTuple union_tuple; + int vect_len; + bool isleaf; + BlockNumber blkno; + MemoryContext oldCtx; + + /* check once per page */ + CHECK_FOR_INTERRUPTS(); + + if (state->ready_num_pages == XLR_MAX_BLOCK_ID) + gist_indexsortbuild_flush_ready_pages(state); + + /* + * The page is now complete. Assign a block number to it, and add it to + * the list of finished pages. (We don't write it out immediately, because + * we want to WAL-log the pages in batches.) + */ + blkno = state->pages_allocated++; + state->ready_blknos[state->ready_num_pages] = blkno; + state->ready_pages[state->ready_num_pages] = pagestate->page; + state->ready_num_pages++; + + isleaf = GistPageIsLeaf(pagestate->page); + + /* + * Form a downlink tuple to represent all the tuples on the page. + */ + oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt); + itvec = gistextractpage(pagestate->page, &vect_len); + union_tuple = gistunion(state->indexrel, itvec, vect_len, + state->giststate); + ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno); + MemoryContextSwitchTo(oldCtx); + + /* + * Insert the downlink to the parent page. If this was the root, create a + * new page as the parent, which becomes the new root. 
+ */ + parent = pagestate->parent; + if (parent == NULL) + { + parent = palloc(sizeof(GistSortedBuildPageState)); + parent->page = (Page) palloc(BLCKSZ); + parent->parent = NULL; + gistinitpage(parent->page, 0); + + pagestate->parent = parent; + } + gist_indexsortbuild_pagestate_add(state, parent, union_tuple); + + /* Re-initialize the page buffer for next page on this level. */ + pagestate->page = palloc(BLCKSZ); + gistinitpage(pagestate->page, isleaf ? F_LEAF : 0); + + /* + * Set the right link to point to the previous page. This is just for + * debugging purposes: GiST only follows the right link if a page is split + * concurrently to a scan, and that cannot happen during index build. + * + * It's a bit counterintuitive that we set the right link on the new page + * to point to the previous page, and not the other way round. But GiST + * pages are not ordered like B-tree pages are, so as long as the + * right-links form a chain through all the pages in the same level, the + * order doesn't matter. + */ + GistPageGetOpaque(pagestate->page)->rightlink = blkno; +} + +static void +gist_indexsortbuild_flush_ready_pages(GISTBuildState *state) +{ + if (state->ready_num_pages == 0) + return; + + RelationOpenSmgr(state->indexrel); + + for (int i = 0; i < state->ready_num_pages; i++) + { + Page page = state->ready_pages[i]; + BlockNumber blkno = state->ready_blknos[i]; + + /* Currently, the blocks must be buffered in order. */ + if (blkno != state->pages_written) + elog(ERROR, "unexpected block number to flush GiST sorting build"); + + PageSetLSN(page, GistBuildLSN); + PageSetChecksumInplace(page, blkno); + smgrextend(state->indexrel->rd_smgr, MAIN_FORKNUM, blkno, page, true); + + state->pages_written++; + } + + if (RelationNeedsWAL(state->indexrel)) + log_newpages(&state->indexrel->rd_node, MAIN_FORKNUM, state->ready_num_pages, + state->ready_blknos, state->ready_pages, true); + + for (int i = 0; i < state->ready_num_pages; i++) + pfree(state->ready_pages[i]); + + state->ready_num_pages = 0; +} + + +/*------------------------------------------------------------------------- + * Routines for non-sorted build + *------------------------------------------------------------------------- + */ + +/* + * Attempt to switch to buffering mode. + * + * If there is not enough memory for buffering build, sets bufferingMode + * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch + * anymore. Otherwise initializes the build buffers, and sets bufferingMode to + * GIST_BUFFERING_ACTIVE. + */ +static void +gistInitBuffering(GISTBuildState *buildstate) +{ + Relation index = buildstate->indexrel; + int pagesPerBuffer; + Size pageFreeSpace; + Size itupAvgSize, + itupMinSize; + double avgIndexTuplesPerPage, + maxIndexTuplesPerPage; + int i; + int levelStep; + + /* Calc space of index page which is available for index tuples */ + pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData) + - sizeof(ItemIdData) + - buildstate->freespace; + + /* + * Calculate average size of already inserted index tuples using gathered + * statistics. + */ + itupAvgSize = (double) buildstate->indtuplesSize / + (double) buildstate->indtuples; + + /* + * Calculate minimal possible size of index tuple by index metadata. + * Minimal possible size of varlena is VARHDRSZ. + * + * XXX: that's not actually true, as a short varlen can be just 2 bytes. + * And we should take padding into account here. 
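+ *
+ * (Illustrative numbers, assuming a typical 64-bit build:
+ * sizeof(IndexTupleData) is 8 bytes, so a single-column index over a
+ * fixed-width 32-byte type gets itupMinSize = 8 + 32 = 40, while a single
+ * varlena column gets only 8 + VARHDRSZ = 12.)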
+ */ + itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData)); + for (i = 0; i < index->rd_att->natts; i++) + { + if (TupleDescAttr(index->rd_att, i)->attlen < 0) + itupMinSize += VARHDRSZ; + else + itupMinSize += TupleDescAttr(index->rd_att, i)->attlen; + } + + /* Calculate average and maximal number of index tuples which fit to page */ + avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize; + maxIndexTuplesPerPage = pageFreeSpace / itupMinSize; + + /* + * We need to calculate two parameters for the buffering algorithm: + * levelStep and pagesPerBuffer. + * + * levelStep determines the size of subtree that we operate on, while + * emptying a buffer. A higher value is better, as you need fewer buffer + * emptying steps to build the index. However, if you set it too high, the + * subtree doesn't fit in cache anymore, and you quickly lose the benefit + * of the buffers. + * + * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is + * the number of tuples on page (ie. fanout), and M is the amount of + * internal memory available. Curiously, they doesn't explain *why* that + * setting is optimal. We calculate it by taking the highest levelStep so + * that a subtree still fits in cache. For a small B, our way of + * calculating levelStep is very close to Arge et al's formula. For a + * large B, our formula gives a value that is 2x higher. + * + * The average size (in pages) of a subtree of depth n can be calculated + * as a geometric series: + * + * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B) + * + * where B is the average number of index tuples on page. The subtree is + * cached in the shared buffer cache and the OS cache, so we choose + * levelStep so that the subtree size is comfortably smaller than + * effective_cache_size, with a safety factor of 4. + * + * The estimate on the average number of index tuples on page is based on + * average tuple sizes observed before switching to buffered build, so the + * real subtree size can be somewhat larger. Also, it would selfish to + * gobble the whole cache for our index build. The safety factor of 4 + * should account for those effects. + * + * The other limiting factor for setting levelStep is that while + * processing a subtree, we need to hold one page for each buffer at the + * next lower buffered level. The max. number of buffers needed for that + * is maxIndexTuplesPerPage^levelStep. This is very conservative, but + * hopefully maintenance_work_mem is set high enough that you're + * constrained by effective_cache_size rather than maintenance_work_mem. + * + * XXX: the buffer hash table consumes a fair amount of memory too per + * buffer, but that is not currently taken into account. That scales on + * the total number of buffers used, ie. the index size and on levelStep. + * Note that a higher levelStep *reduces* the amount of memory needed for + * the hash table. + */ + levelStep = 1; + for (;;) + { + double subtreesize; + double maxlowestlevelpages; + + /* size of an average subtree at this levelStep (in pages). 
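+ * (For illustration: with avgIndexTuplesPerPage = 100 and levelStep = 2
+ * the series gives (1 - 100^3) / (1 - 100) = 10101 pages, i.e. roughly
+ * 79 MB with 8 kB blocks, which is then compared against
+ * effective_cache_size / 4 below.)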
*/ + subtreesize = + (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) / + (1 - avgIndexTuplesPerPage); + + /* max number of pages at the lowest level of a subtree */ + maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep); + + /* subtree must fit in cache (with safety factor of 4) */ + if (subtreesize > effective_cache_size / 4) + break; + + /* each node in the lowest level of a subtree has one page in memory */ + if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ) + break; + + /* Good, we can handle this levelStep. See if we can go one higher. */ + levelStep++; + } + + /* + * We just reached an unacceptable value of levelStep in previous loop. + * So, decrease levelStep to get last acceptable value. + */ + levelStep--; + + /* + * If there's not enough cache or maintenance_work_mem, fall back to plain + * inserts. + */ + if (levelStep <= 0) + { + elog(DEBUG1, "failed to switch to buffered GiST build"); + buildstate->buildMode = GIST_BUFFERING_DISABLED; + return; + } + + /* + * The second parameter to set is pagesPerBuffer, which determines the + * size of each buffer. We adjust pagesPerBuffer also during the build, + * which is why this calculation is in a separate function. + */ + pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep); + + /* Initialize GISTBuildBuffers with these parameters */ + buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep, + gistGetMaxLevel(index)); + + gistInitParentMap(buildstate); + + buildstate->buildMode = GIST_BUFFERING_ACTIVE; + + elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d", + levelStep, pagesPerBuffer); +} + +/* + * Calculate pagesPerBuffer parameter for the buffering algorithm. + * + * Buffer size is chosen so that assuming that tuples are distributed + * randomly, emptying half a buffer fills on average one page in every buffer + * at the next lower level. + */ +static int +calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep) +{ + double pagesPerBuffer; + double avgIndexTuplesPerPage; + double itupAvgSize; + Size pageFreeSpace; + + /* Calc space of index page which is available for index tuples */ + pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData) + - sizeof(ItemIdData) + - buildstate->freespace; + + /* + * Calculate average size of already inserted index tuples using gathered + * statistics. + */ + itupAvgSize = (double) buildstate->indtuplesSize / + (double) buildstate->indtuples; + + avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize; + + /* + * Recalculate required size of buffers. + */ + pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep); + + return (int) rint(pagesPerBuffer); +} + +/* + * Per-tuple callback for table_index_build_scan. + */ +static void +gistBuildCallback(Relation index, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state) +{ + GISTBuildState *buildstate = (GISTBuildState *) state; + IndexTuple itup; + MemoryContext oldCtx; + + oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt); + + /* form an index tuple and point it at the heap tuple */ + itup = gistFormTuple(buildstate->giststate, index, + values, isnull, + true); + itup->t_tid = *tid; + + if (buildstate->buildMode == GIST_BUFFERING_ACTIVE) + { + /* We have buffers, so use them. */ + gistBufferingBuildInsert(buildstate, itup); + } + else + { + /* + * There's no buffers (yet). Since we already have the index relation + * locked, we call gistdoinsert directly. 
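+ *
+ * This is the same code path an ordinary INSERT takes; the final 'true'
+ * argument marks the insertion as part of an index build, so no WAL is
+ * written here (the whole index is WAL-logged in one go at the end of
+ * the build, if needed).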
+ */ + gistdoinsert(index, itup, buildstate->freespace, + buildstate->giststate, buildstate->heaprel, true); + } + + /* Update tuple count and total size. */ + buildstate->indtuples += 1; + buildstate->indtuplesSize += IndexTupleSize(itup); + + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(buildstate->giststate->tempCxt); + + if (buildstate->buildMode == GIST_BUFFERING_ACTIVE && + buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0) + { + /* Adjust the target buffer size now */ + buildstate->gfbb->pagesPerBuffer = + calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep); + } + + /* + * In 'auto' mode, check if the index has grown too large to fit in cache, + * and switch to buffering mode if it has. + * + * To avoid excessive calls to smgrnblocks(), only check this every + * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples. + * + * In 'stats' state, switch as soon as we have seen enough tuples to have + * some idea of the average tuple size. + */ + if ((buildstate->buildMode == GIST_BUFFERING_AUTO && + buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 && + effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) || + (buildstate->buildMode == GIST_BUFFERING_STATS && + buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET)) + { + /* + * Index doesn't fit in effective cache anymore. Try to switch to + * buffering build mode. + */ + gistInitBuffering(buildstate); + } +} + +/* + * Insert function for buffering index build. + */ +static void +gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup) +{ + /* Insert the tuple to buffers. */ + gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel); + + /* If we filled up (half of a) buffer, process buffer emptying. */ + gistProcessEmptyingQueue(buildstate); +} + +/* + * Process an index tuple. Runs the tuple down the tree until we reach a leaf + * page or node buffer, and inserts the tuple there. Returns true if we have + * to stop buffer emptying process (because one of child buffers can't take + * index tuples anymore). + */ +static bool +gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, + BlockNumber startblkno, int startlevel) +{ + GISTSTATE *giststate = buildstate->giststate; + GISTBuildBuffers *gfbb = buildstate->gfbb; + Relation indexrel = buildstate->indexrel; + BlockNumber childblkno; + Buffer buffer; + bool result = false; + BlockNumber blkno; + int level; + OffsetNumber downlinkoffnum = InvalidOffsetNumber; + BlockNumber parentblkno = InvalidBlockNumber; + + CHECK_FOR_INTERRUPTS(); + + /* + * Loop until we reach a leaf page (level == 0) or a level with buffers + * (not including the level we start at, because we would otherwise make + * no progress). + */ + blkno = startblkno; + level = startlevel; + for (;;) + { + ItemId iid; + IndexTuple idxtuple, + newtup; + Page page; + OffsetNumber childoffnum; + + /* Have we reached a level with buffers? */ + if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel) + break; + + /* Have we reached a leaf page? */ + if (level == 0) + break; + + /* + * Nope. Descend down to the next level then. Choose a child to + * descend down to. 
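+ *
+ * gistchoose() applies the opclass penalty function and picks the
+ * downlink with the smallest penalty for absorbing the new tuple, just as
+ * in a normal insertion descent.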
+ */ + + buffer = ReadBuffer(indexrel, blkno); + LockBuffer(buffer, GIST_EXCLUSIVE); + + page = (Page) BufferGetPage(buffer); + childoffnum = gistchoose(indexrel, page, itup, giststate); + iid = PageGetItemId(page, childoffnum); + idxtuple = (IndexTuple) PageGetItem(page, iid); + childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + + if (level > 1) + gistMemorizeParent(buildstate, childblkno, blkno); + + /* + * Check that the key representing the target child node is consistent + * with the key we're inserting. Update it if it's not. + */ + newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate); + if (newtup) + { + blkno = gistbufferinginserttuples(buildstate, + buffer, + level, + &newtup, + 1, + childoffnum, + InvalidBlockNumber, + InvalidOffsetNumber); + /* gistbufferinginserttuples() released the buffer */ + } + else + UnlockReleaseBuffer(buffer); + + /* Descend to the child */ + parentblkno = blkno; + blkno = childblkno; + downlinkoffnum = childoffnum; + Assert(level > 0); + level--; + } + + if (LEVEL_HAS_BUFFERS(level, gfbb)) + { + /* + * We've reached level with buffers. Place the index tuple to the + * buffer, and add the buffer to the emptying queue if it overflows. + */ + GISTNodeBuffer *childNodeBuffer; + + /* Find the buffer or create a new one */ + childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level); + + /* Add index tuple to it */ + gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup); + + if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb)) + result = true; + } + else + { + /* + * We've reached a leaf page. Place the tuple here. + */ + Assert(level == 0); + buffer = ReadBuffer(indexrel, blkno); + LockBuffer(buffer, GIST_EXCLUSIVE); + gistbufferinginserttuples(buildstate, buffer, level, + &itup, 1, InvalidOffsetNumber, + parentblkno, downlinkoffnum); + /* gistbufferinginserttuples() released the buffer */ + } + + return result; +} + +/* + * Insert tuples to a given page. + * + * This is analogous with gistinserttuples() in the regular insertion code. + * + * Returns the block number of the page where the (first) new or updated tuple + * was inserted. Usually that's the original page, but might be a sibling page + * if the original page was split. + * + * Caller should hold a lock on 'buffer' on entry. This function will unlock + * and unpin it. + */ +static BlockNumber +gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level, + IndexTuple *itup, int ntup, OffsetNumber oldoffnum, + BlockNumber parentblk, OffsetNumber downlinkoffnum) +{ + GISTBuildBuffers *gfbb = buildstate->gfbb; + List *splitinfo; + bool is_split; + BlockNumber placed_to_blk = InvalidBlockNumber; + + is_split = gistplacetopage(buildstate->indexrel, + buildstate->freespace, + buildstate->giststate, + buffer, + itup, ntup, oldoffnum, &placed_to_blk, + InvalidBuffer, + &splitinfo, + false, + buildstate->heaprel, true); + + /* + * If this is a root split, update the root path item kept in memory. This + * ensures that all path stacks are always complete, including all parent + * nodes up to the root. That simplifies the algorithm to re-find correct + * parent. + */ + if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO) + { + Page page = BufferGetPage(buffer); + OffsetNumber off; + OffsetNumber maxoff; + + Assert(level == gfbb->rootlevel); + gfbb->rootlevel++; + + elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel); + + /* + * All the downlinks on the old root page are now on one of the child + * pages. 
Visit all the new child pages to memorize the parents of the + * grandchildren. + */ + if (gfbb->rootlevel > 1) + { + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + Buffer childbuf = ReadBuffer(buildstate->indexrel, childblkno); + + LockBuffer(childbuf, GIST_SHARE); + gistMemorizeAllDownlinks(buildstate, childbuf); + UnlockReleaseBuffer(childbuf); + + /* + * Also remember that the parent of the new child page is the + * root block. + */ + gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO); + } + } + } + + if (splitinfo) + { + /* + * Insert the downlinks to the parent. This is analogous with + * gistfinishsplit() in the regular insertion code, but the locking is + * simpler, and we have to maintain the buffers on internal nodes and + * the parent map. + */ + IndexTuple *downlinks; + int ndownlinks, + i; + Buffer parentBuffer; + ListCell *lc; + + /* Parent may have changed since we memorized this path. */ + parentBuffer = + gistBufferingFindCorrectParent(buildstate, + BufferGetBlockNumber(buffer), + level, + &parentblk, + &downlinkoffnum); + + /* + * If there's a buffer associated with this page, that needs to be + * split too. gistRelocateBuildBuffersOnSplit() will also adjust the + * downlinks in 'splitinfo', to make sure they're consistent not only + * with the tuples already on the pages, but also the tuples in the + * buffers that will eventually be inserted to them. + */ + gistRelocateBuildBuffersOnSplit(gfbb, + buildstate->giststate, + buildstate->indexrel, + level, + buffer, splitinfo); + + /* Create an array of all the downlink tuples */ + ndownlinks = list_length(splitinfo); + downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks); + i = 0; + foreach(lc, splitinfo) + { + GISTPageSplitInfo *splitinfo = lfirst(lc); + + /* + * Remember the parent of each new child page in our parent map. + * This assumes that the downlinks fit on the parent page. If the + * parent page is split, too, when we recurse up to insert the + * downlinks, the recursive gistbufferinginserttuples() call will + * update the map again. + */ + if (level > 0) + gistMemorizeParent(buildstate, + BufferGetBlockNumber(splitinfo->buf), + BufferGetBlockNumber(parentBuffer)); + + /* + * Also update the parent map for all the downlinks that got moved + * to a different page. (actually this also loops through the + * downlinks that stayed on the original page, but it does no + * harm). + */ + if (level > 1) + gistMemorizeAllDownlinks(buildstate, splitinfo->buf); + + /* + * Since there's no concurrent access, we can release the lower + * level buffers immediately. This includes the original page. + */ + UnlockReleaseBuffer(splitinfo->buf); + downlinks[i++] = splitinfo->downlink; + } + + /* Insert them into parent. */ + gistbufferinginserttuples(buildstate, parentBuffer, level + 1, + downlinks, ndownlinks, downlinkoffnum, + InvalidBlockNumber, InvalidOffsetNumber); + + list_free_deep(splitinfo); /* we don't need this anymore */ + } + else + UnlockReleaseBuffer(buffer); + + return placed_to_blk; +} + +/* + * Find the downlink pointing to a child page. + * + * 'childblkno' indicates the child page to find the parent for. 'level' is + * the level of the child. 
On entry, *parentblkno and *downlinkoffnum can + * point to a location where the downlink used to be - we will check that + * location first, and save some cycles if it hasn't moved. The function + * returns a buffer containing the downlink, exclusively-locked, and + * *parentblkno and *downlinkoffnum are set to the real location of the + * downlink. + * + * If the child page is a leaf (level == 0), the caller must supply a correct + * parentblkno. Otherwise we use the parent map hash table to find the parent + * block. + * + * This function serves the same purpose as gistFindCorrectParent() during + * normal index inserts, but this is simpler because we don't need to deal + * with concurrent inserts. + */ +static Buffer +gistBufferingFindCorrectParent(GISTBuildState *buildstate, + BlockNumber childblkno, int level, + BlockNumber *parentblkno, + OffsetNumber *downlinkoffnum) +{ + BlockNumber parent; + Buffer buffer; + Page page; + OffsetNumber maxoff; + OffsetNumber off; + + if (level > 0) + parent = gistGetParent(buildstate, childblkno); + else + { + /* + * For a leaf page, the caller must supply a correct parent block + * number. + */ + if (*parentblkno == InvalidBlockNumber) + elog(ERROR, "no parent buffer provided of child %u", childblkno); + parent = *parentblkno; + } + + buffer = ReadBuffer(buildstate->indexrel, parent); + page = BufferGetPage(buffer); + LockBuffer(buffer, GIST_EXCLUSIVE); + gistcheckpage(buildstate->indexrel, buffer); + maxoff = PageGetMaxOffsetNumber(page); + + /* Check if it was not moved */ + if (parent == *parentblkno && *parentblkno != InvalidBlockNumber && + *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff) + { + ItemId iid = PageGetItemId(page, *downlinkoffnum); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + + if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno) + { + /* Still there */ + return buffer; + } + } + + /* + * Downlink was not at the offset where it used to be. Scan the page to + * find it. During normal gist insertions, it might've moved to another + * page, to the right, but during a buffering build, we keep track of the + * parent of each page in the lookup table so we should always know what + * page it's on. + */ + for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off)) + { + ItemId iid = PageGetItemId(page, off); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + + if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno) + { + /* yes!!, found it */ + *downlinkoffnum = off; + return buffer; + } + } + + elog(ERROR, "failed to re-find parent for block %u", childblkno); + return InvalidBuffer; /* keep compiler quiet */ +} + +/* + * Process buffers emptying stack. Emptying of one buffer can cause emptying + * of other buffers. This function iterates until this cascading emptying + * process finished, e.g. until buffers emptying stack is empty. + */ +static void +gistProcessEmptyingQueue(GISTBuildState *buildstate) +{ + GISTBuildBuffers *gfbb = buildstate->gfbb; + + /* Iterate while we have elements in buffers emptying stack. */ + while (gfbb->bufferEmptyingQueue != NIL) + { + GISTNodeBuffer *emptyingNodeBuffer; + + /* Get node buffer from emptying stack. */ + emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue); + gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue); + emptyingNodeBuffer->queuedForEmptying = false; + + /* + * We are going to load last pages of buffers where emptying will be + * to. 
So let's unload any previously loaded buffers. + */ + gistUnloadNodeBuffers(gfbb); + + /* + * Pop tuples from the buffer and run them down to the buffers at + * lower level, or leaf pages. We continue until one of the lower + * level buffers fills up, or this buffer runs empty. + * + * In Arge et al's paper, the buffer emptying is stopped after + * processing 1/2 node buffer worth of tuples, to avoid overfilling + * any of the lower level buffers. However, it's more efficient to + * keep going until one of the lower level buffers actually fills up, + * so that's what we do. This doesn't need to be exact, if a buffer + * overfills by a few tuples, there's no harm done. + */ + while (true) + { + IndexTuple itup; + + /* Get next index tuple from the buffer */ + if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup)) + break; + + /* + * Run it down to the underlying node buffer or leaf page. + * + * Note: it's possible that the buffer we're emptying splits as a + * result of this call. If that happens, our emptyingNodeBuffer + * points to the left half of the split. After split, it's very + * likely that the new left buffer is no longer over the half-full + * threshold, but we might as well keep flushing tuples from it + * until we fill a lower-level buffer. + */ + if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level)) + { + /* + * A lower level buffer filled up. Stop emptying this buffer, + * to avoid overflowing the lower level buffer. + */ + break; + } + + /* Free all the memory allocated during index tuple processing */ + MemoryContextReset(buildstate->giststate->tempCxt); + } + } +} + +/* + * Empty all node buffers, from top to bottom. This is done at the end of + * index build to flush all remaining tuples to the index. + * + * Note: This destroys the buffersOnLevels lists, so the buffers should not + * be inserted to after this call. + */ +static void +gistEmptyAllBuffers(GISTBuildState *buildstate) +{ + GISTBuildBuffers *gfbb = buildstate->gfbb; + MemoryContext oldCtx; + int i; + + oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt); + + /* + * Iterate through the levels from top to bottom. + */ + for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--) + { + /* + * Empty all buffers on this level. Note that new buffers can pop up + * in the list during the processing, as a result of page splits, so a + * simple walk through the list won't work. We remove buffers from the + * list when we see them empty; a buffer can't become non-empty once + * it's been fully emptied. + */ + while (gfbb->buffersOnLevels[i] != NIL) + { + GISTNodeBuffer *nodeBuffer; + + nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]); + + if (nodeBuffer->blocksCount != 0) + { + /* + * Add this buffer to the emptying queue, and proceed to empty + * the queue. + */ + if (!nodeBuffer->queuedForEmptying) + { + MemoryContextSwitchTo(gfbb->context); + nodeBuffer->queuedForEmptying = true; + gfbb->bufferEmptyingQueue = + lcons(nodeBuffer, gfbb->bufferEmptyingQueue); + MemoryContextSwitchTo(buildstate->giststate->tempCxt); + } + gistProcessEmptyingQueue(buildstate); + } + else + gfbb->buffersOnLevels[i] = + list_delete_first(gfbb->buffersOnLevels[i]); + } + elog(DEBUG2, "emptied all buffers at level %d", i); + } + MemoryContextSwitchTo(oldCtx); +} + +/* + * Get the depth of the GiST index. 
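+ *
+ * Depth is counted in downlink hops from the root: an index whose root
+ * page is still a leaf has depth 0, a root whose children are leaves has
+ * depth 1, and so on. In outline (a sketch of the walk performed below,
+ * with the buffer reads and locking elided):
+ *
+ *     level = 0;
+ *     blkno = GIST_ROOT_BLKNO;
+ *     while (page at blkno is not a leaf)
+ *     {
+ *         blkno = block pointed to by the first downlink on the page;
+ *         level++;
+ *     }
+ *     return level;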
+ */ +static int +gistGetMaxLevel(Relation index) +{ + int maxLevel; + BlockNumber blkno; + + /* + * Traverse down the tree, starting from the root, until we hit the leaf + * level. + */ + maxLevel = 0; + blkno = GIST_ROOT_BLKNO; + while (true) + { + Buffer buffer; + Page page; + IndexTuple itup; + + buffer = ReadBuffer(index, blkno); + + /* + * There's no concurrent access during index build, so locking is just + * pro forma. + */ + LockBuffer(buffer, GIST_SHARE); + page = (Page) BufferGetPage(buffer); + + if (GistPageIsLeaf(page)) + { + /* We hit the bottom, so we're done. */ + UnlockReleaseBuffer(buffer); + break; + } + + /* + * Pick the first downlink on the page, and follow it. It doesn't + * matter which downlink we choose, the tree has the same depth + * everywhere, so we just pick the first one. + */ + itup = (IndexTuple) PageGetItem(page, + PageGetItemId(page, FirstOffsetNumber)); + blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + UnlockReleaseBuffer(buffer); + + /* + * We're going down on the tree. It means that there is yet one more + * level in the tree. + */ + maxLevel++; + } + return maxLevel; +} + + +/* + * Routines for managing the parent map. + * + * Whenever a page is split, we need to insert the downlinks into the parent. + * We need to somehow find the parent page to do that. In normal insertions, + * we keep a stack of nodes visited when we descend the tree. However, in + * buffering build, we can start descending the tree from any internal node, + * when we empty a buffer by cascading tuples to its children. So we don't + * have a full stack up to the root available at that time. + * + * So instead, we maintain a hash table to track the parent of every internal + * page. We don't need to track the parents of leaf nodes, however. Whenever + * we insert to a leaf, we've just descended down from its parent, so we know + * its immediate parent already. This helps a lot to limit the memory used + * by this hash table. + * + * Whenever an internal node is split, the parent map needs to be updated. + * the parent of the new child page needs to be recorded, and also the + * entries for all page whose downlinks are moved to a new page at the split + * needs to be updated. + * + * We also update the parent map whenever we descend the tree. That might seem + * unnecessary, because we maintain the map whenever a downlink is moved or + * created, but it is needed because we switch to buffering mode after + * creating a tree with regular index inserts. Any pages created before + * switching to buffering mode will not be present in the parent map initially, + * but will be added there the first time we visit them. + */ + +typedef struct +{ + BlockNumber childblkno; /* hash key */ + BlockNumber parentblkno; +} ParentMapEntry; + +static void +gistInitParentMap(GISTBuildState *buildstate) +{ + HASHCTL hashCtl; + + hashCtl.keysize = sizeof(BlockNumber); + hashCtl.entrysize = sizeof(ParentMapEntry); + hashCtl.hcxt = CurrentMemoryContext; + buildstate->parentMap = hash_create("gistbuild parent map", + 1024, + &hashCtl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); +} + +static void +gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent) +{ + ParentMapEntry *entry; + bool found; + + entry = (ParentMapEntry *) hash_search(buildstate->parentMap, + (const void *) &child, + HASH_ENTER, + &found); + entry->parentblkno = parent; +} + +/* + * Scan all downlinks on a page, and memorize their parent. 
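+ *
+ * For example (block numbers invented for illustration), if the internal
+ * page in 'parentbuf' lives in block 7 and carries downlinks to blocks
+ * 12, 13 and 14, calling this function is equivalent to:
+ *
+ *     gistMemorizeParent(buildstate, 12, 7);
+ *     gistMemorizeParent(buildstate, 13, 7);
+ *     gistMemorizeParent(buildstate, 14, 7);
+ *
+ * i.e. every child listed on the page gets (or overwrites) a parent map
+ * entry pointing back at this page.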
+ */ +static void +gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf) +{ + OffsetNumber maxoff; + OffsetNumber off; + BlockNumber parentblkno = BufferGetBlockNumber(parentbuf); + Page page = BufferGetPage(parentbuf); + + Assert(!GistPageIsLeaf(page)); + + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + + gistMemorizeParent(buildstate, childblkno, parentblkno); + } +} + +static BlockNumber +gistGetParent(GISTBuildState *buildstate, BlockNumber child) +{ + ParentMapEntry *entry; + bool found; + + /* Find node buffer in hash table */ + entry = (ParentMapEntry *) hash_search(buildstate->parentMap, + (const void *) &child, + HASH_FIND, + &found); + if (!found) + elog(ERROR, "could not find parent of block %u in lookup table", child); + + return entry->parentblkno; +} diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c new file mode 100644 index 0000000..95cc334 --- /dev/null +++ b/src/backend/access/gist/gistbuildbuffers.c @@ -0,0 +1,775 @@ +/*------------------------------------------------------------------------- + * + * gistbuildbuffers.c + * node buffer management functions for GiST buffering build algorithm. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistbuildbuffers.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/gist_private.h" +#include "catalog/index.h" +#include "miscadmin.h" +#include "storage/buffile.h" +#include "storage/bufmgr.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +static GISTNodeBufferPage *gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb); +static void gistAddLoadedBuffer(GISTBuildBuffers *gfbb, + GISTNodeBuffer *nodeBuffer); +static void gistLoadNodeBuffer(GISTBuildBuffers *gfbb, + GISTNodeBuffer *nodeBuffer); +static void gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, + GISTNodeBuffer *nodeBuffer); +static void gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, + IndexTuple item); +static void gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, + IndexTuple *item); +static long gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb); +static void gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum); + +static void ReadTempFileBlock(BufFile *file, long blknum, void *ptr); +static void WriteTempFileBlock(BufFile *file, long blknum, void *ptr); + + +/* + * Initialize GiST build buffers. + */ +GISTBuildBuffers * +gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel) +{ + GISTBuildBuffers *gfbb; + HASHCTL hashCtl; + + gfbb = palloc(sizeof(GISTBuildBuffers)); + gfbb->pagesPerBuffer = pagesPerBuffer; + gfbb->levelStep = levelStep; + + /* + * Create a temporary file to hold buffer pages that are swapped out of + * memory. + */ + gfbb->pfile = BufFileCreateTemp(false); + gfbb->nFileBlocks = 0; + + /* Initialize free page management. 
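+ * Freed temporary-file blocks are remembered in the freeBlocks array and
+ * handed out again by gistBuffersGetFreeBlock() in preference to extending
+ * the file, so the temp file never grows past the peak number of
+ * swapped-out buffer pages. The pattern, as used by gistUnloadNodeBuffer()
+ * and gistLoadNodeBuffer() below, is:
+ *
+ *     blkno = gistBuffersGetFreeBlock(gfbb);
+ *     WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer);
+ *     ... later, once the page has been read back in ...
+ *     gistBuffersReleaseBlock(gfbb, blkno);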
*/ + gfbb->nFreeBlocks = 0; + gfbb->freeBlocksLen = 32; + gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long)); + + /* + * Current memory context will be used for all in-memory data structures + * of buffers which are persistent during buffering build. + */ + gfbb->context = CurrentMemoryContext; + + /* + * nodeBuffersTab hash is association between index blocks and it's + * buffers. + */ + hashCtl.keysize = sizeof(BlockNumber); + hashCtl.entrysize = sizeof(GISTNodeBuffer); + hashCtl.hcxt = CurrentMemoryContext; + gfbb->nodeBuffersTab = hash_create("gistbuildbuffers", + 1024, + &hashCtl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + gfbb->bufferEmptyingQueue = NIL; + + /* + * Per-level node buffers lists for final buffers emptying process. Node + * buffers are inserted here when they are created. + */ + gfbb->buffersOnLevelsLen = 1; + gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) * + gfbb->buffersOnLevelsLen); + gfbb->buffersOnLevels[0] = NIL; + + /* + * Block numbers of node buffers which last pages are currently loaded + * into main memory. + */ + gfbb->loadedBuffersLen = 32; + gfbb->loadedBuffers = (GISTNodeBuffer **) palloc(gfbb->loadedBuffersLen * + sizeof(GISTNodeBuffer *)); + gfbb->loadedBuffersCount = 0; + + gfbb->rootlevel = maxLevel; + + return gfbb; +} + +/* + * Returns a node buffer for given block. The buffer is created if it + * doesn't exist yet. + */ +GISTNodeBuffer * +gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate, + BlockNumber nodeBlocknum, int level) +{ + GISTNodeBuffer *nodeBuffer; + bool found; + + /* Find node buffer in hash table */ + nodeBuffer = (GISTNodeBuffer *) hash_search(gfbb->nodeBuffersTab, + (const void *) &nodeBlocknum, + HASH_ENTER, + &found); + if (!found) + { + /* + * Node buffer wasn't found. Initialize the new buffer as empty. + */ + MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context); + + /* nodeBuffer->nodeBlocknum is the hash key and was filled in already */ + nodeBuffer->blocksCount = 0; + nodeBuffer->pageBlocknum = InvalidBlockNumber; + nodeBuffer->pageBuffer = NULL; + nodeBuffer->queuedForEmptying = false; + nodeBuffer->isTemp = false; + nodeBuffer->level = level; + + /* + * Add this buffer to the list of buffers on this level. Enlarge + * buffersOnLevels array if needed. + */ + if (level >= gfbb->buffersOnLevelsLen) + { + int i; + + gfbb->buffersOnLevels = + (List **) repalloc(gfbb->buffersOnLevels, + (level + 1) * sizeof(List *)); + + /* initialize the enlarged portion */ + for (i = gfbb->buffersOnLevelsLen; i <= level; i++) + gfbb->buffersOnLevels[i] = NIL; + gfbb->buffersOnLevelsLen = level + 1; + } + + /* + * Prepend the new buffer to the list of buffers on this level. It's + * not arbitrary that the new buffer is put to the beginning of the + * list: in the final emptying phase we loop through all buffers at + * each level, and flush them. If a page is split during the emptying, + * it's more efficient to flush the new splitted pages first, before + * moving on to pre-existing pages on the level. The buffers just + * created during the page split are likely still in cache, so + * flushing them immediately is more efficient than putting them to + * the end of the queue. + */ + gfbb->buffersOnLevels[level] = lcons(nodeBuffer, + gfbb->buffersOnLevels[level]); + + MemoryContextSwitchTo(oldcxt); + } + + return nodeBuffer; +} + +/* + * Allocate memory for a buffer page. 
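+ *
+ * Each buffer page is a BLCKSZ-sized chunk of memory laid out as
+ *
+ *     +--------------------+------------------+---------------------+
+ *     | header: prev link, |    free space    |    index tuples,    |
+ *     | free-space counter |                  |  packed at the end  |
+ *     +--------------------+------------------+---------------------+
+ *
+ * Tuples are copied to the start of the remaining free space, so the
+ * used area grows from the end of the page towards the header; the page
+ * is full when the free space cannot hold another MAXALIGN'd tuple.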
+ */ +static GISTNodeBufferPage * +gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb) +{ + GISTNodeBufferPage *pageBuffer; + + pageBuffer = (GISTNodeBufferPage *) MemoryContextAllocZero(gfbb->context, + BLCKSZ); + pageBuffer->prev = InvalidBlockNumber; + + /* Set page free space */ + PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET; + return pageBuffer; +} + +/* + * Add specified buffer into loadedBuffers array. + */ +static void +gistAddLoadedBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) +{ + /* Never add a temporary buffer to the array */ + if (nodeBuffer->isTemp) + return; + + /* Enlarge the array if needed */ + if (gfbb->loadedBuffersCount >= gfbb->loadedBuffersLen) + { + gfbb->loadedBuffersLen *= 2; + gfbb->loadedBuffers = (GISTNodeBuffer **) + repalloc(gfbb->loadedBuffers, + gfbb->loadedBuffersLen * sizeof(GISTNodeBuffer *)); + } + + gfbb->loadedBuffers[gfbb->loadedBuffersCount] = nodeBuffer; + gfbb->loadedBuffersCount++; +} + +/* + * Load last page of node buffer into main memory. + */ +static void +gistLoadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) +{ + /* Check if we really should load something */ + if (!nodeBuffer->pageBuffer && nodeBuffer->blocksCount > 0) + { + /* Allocate memory for page */ + nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb); + + /* Read block from temporary file */ + ReadTempFileBlock(gfbb->pfile, nodeBuffer->pageBlocknum, + nodeBuffer->pageBuffer); + + /* Mark file block as free */ + gistBuffersReleaseBlock(gfbb, nodeBuffer->pageBlocknum); + + /* Mark node buffer as loaded */ + gistAddLoadedBuffer(gfbb, nodeBuffer); + nodeBuffer->pageBlocknum = InvalidBlockNumber; + } +} + +/* + * Write last page of node buffer to the disk. + */ +static void +gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) +{ + /* Check if we have something to write */ + if (nodeBuffer->pageBuffer) + { + BlockNumber blkno; + + /* Get free file block */ + blkno = gistBuffersGetFreeBlock(gfbb); + + /* Write block to the temporary file */ + WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer); + + /* Free memory of that page */ + pfree(nodeBuffer->pageBuffer); + nodeBuffer->pageBuffer = NULL; + + /* Save block number */ + nodeBuffer->pageBlocknum = blkno; + } +} + +/* + * Write last pages of all node buffers to the disk. + */ +void +gistUnloadNodeBuffers(GISTBuildBuffers *gfbb) +{ + int i; + + /* Unload all the buffers that have a page loaded in memory. */ + for (i = 0; i < gfbb->loadedBuffersCount; i++) + gistUnloadNodeBuffer(gfbb, gfbb->loadedBuffers[i]); + + /* Now there are no node buffers with loaded last page */ + gfbb->loadedBuffersCount = 0; +} + +/* + * Add index tuple to buffer page. + */ +static void +gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, IndexTuple itup) +{ + Size itupsz = IndexTupleSize(itup); + char *ptr; + + /* There should be enough of space. */ + Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(itupsz)); + + /* Reduce free space value of page to reserve a spot for the tuple. */ + PAGE_FREE_SPACE(pageBuffer) -= MAXALIGN(itupsz); + + /* Get pointer to the spot we reserved (ie. end of free space). */ + ptr = (char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET + + PAGE_FREE_SPACE(pageBuffer); + + /* Copy the index tuple there. */ + memcpy(ptr, itup, itupsz); +} + +/* + * Get last item from buffer page and remove it from page. 
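+ *
+ * Note that tuples come back out in the reverse of the order they were
+ * pushed; each page, and the chain of pages, behaves as a stack. For
+ * example, with 'nbuf' a node buffer and 'a', 'b' two index tuples:
+ *
+ *     gistPushItupToNodeBuffer(gfbb, nbuf, a);
+ *     gistPushItupToNodeBuffer(gfbb, nbuf, b);
+ *     gistPopItupFromNodeBuffer(gfbb, nbuf, &itup);  -- itup is a copy of b
+ *
+ * That is harmless: the order in which buffered tuples trickle down to
+ * the lower levels does not affect the correctness of the final index.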
+ */ +static void +gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, IndexTuple *itup) +{ + IndexTuple ptr; + Size itupsz; + + Assert(!PAGE_IS_EMPTY(pageBuffer)); /* Page shouldn't be empty */ + + /* Get pointer to last index tuple */ + ptr = (IndexTuple) ((char *) pageBuffer + + BUFFER_PAGE_DATA_OFFSET + + PAGE_FREE_SPACE(pageBuffer)); + itupsz = IndexTupleSize(ptr); + + /* Make a copy of the tuple */ + *itup = (IndexTuple) palloc(itupsz); + memcpy(*itup, ptr, itupsz); + + /* Mark the space used by the tuple as free */ + PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(itupsz); +} + +/* + * Push an index tuple to node buffer. + */ +void +gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, + IndexTuple itup) +{ + /* + * Most part of memory operations will be in buffering build persistent + * context. So, let's switch to it. + */ + MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context); + + /* + * If the buffer is currently empty, create the first page. + */ + if (nodeBuffer->blocksCount == 0) + { + nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb); + nodeBuffer->blocksCount = 1; + gistAddLoadedBuffer(gfbb, nodeBuffer); + } + + /* Load last page of node buffer if it wasn't in memory already */ + if (!nodeBuffer->pageBuffer) + gistLoadNodeBuffer(gfbb, nodeBuffer); + + /* + * Check if there is enough space on the last page for the tuple. + */ + if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup)) + { + /* + * Nope. Swap previous block to disk and allocate a new one. + */ + BlockNumber blkno; + + /* Write filled page to the disk */ + blkno = gistBuffersGetFreeBlock(gfbb); + WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer); + + /* + * Reset the in-memory page as empty, and link the previous block to + * the new page by storing its block number in the prev-link. + */ + PAGE_FREE_SPACE(nodeBuffer->pageBuffer) = + BLCKSZ - MAXALIGN(offsetof(GISTNodeBufferPage, tupledata)); + nodeBuffer->pageBuffer->prev = blkno; + + /* We've just added one more page */ + nodeBuffer->blocksCount++; + } + + gistPlaceItupToPage(nodeBuffer->pageBuffer, itup); + + /* + * If the buffer just overflowed, add it to the emptying queue. + */ + if (BUFFER_HALF_FILLED(nodeBuffer, gfbb) && !nodeBuffer->queuedForEmptying) + { + gfbb->bufferEmptyingQueue = lcons(nodeBuffer, + gfbb->bufferEmptyingQueue); + nodeBuffer->queuedForEmptying = true; + } + + /* Restore memory context */ + MemoryContextSwitchTo(oldcxt); +} + +/* + * Removes one index tuple from node buffer. Returns true if success and false + * if node buffer is empty. + */ +bool +gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, + IndexTuple *itup) +{ + /* + * If node buffer is empty then return false. + */ + if (nodeBuffer->blocksCount <= 0) + return false; + + /* Load last page of node buffer if needed */ + if (!nodeBuffer->pageBuffer) + gistLoadNodeBuffer(gfbb, nodeBuffer); + + /* + * Get index tuple from last non-empty page. + */ + gistGetItupFromPage(nodeBuffer->pageBuffer, itup); + + /* + * If we just removed the last tuple from the page, fetch previous page on + * this node buffer (if any). + */ + if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer)) + { + BlockNumber prevblkno; + + /* + * blocksCount includes the page in pageBuffer, so decrease it now. + */ + nodeBuffer->blocksCount--; + + /* + * If there's more pages, fetch previous one. + */ + prevblkno = nodeBuffer->pageBuffer->prev; + if (prevblkno != InvalidBlockNumber) + { + /* There is a previous page. Fetch it. 
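+ *
+ * The pages of a node buffer form a singly-linked chain through each
+ * page's 'prev' field, newest page first; only the newest page is kept
+ * in memory, so once it is drained we swap in the next-older page from
+ * the temporary file and recycle the file block it occupied.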
*/ + Assert(nodeBuffer->blocksCount > 0); + ReadTempFileBlock(gfbb->pfile, prevblkno, nodeBuffer->pageBuffer); + + /* + * Now that we've read the block in memory, we can release its + * on-disk block for reuse. + */ + gistBuffersReleaseBlock(gfbb, prevblkno); + } + else + { + /* No more pages. Free memory. */ + Assert(nodeBuffer->blocksCount == 0); + pfree(nodeBuffer->pageBuffer); + nodeBuffer->pageBuffer = NULL; + } + } + return true; +} + +/* + * Select a currently unused block for writing to. + */ +static long +gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb) +{ + /* + * If there are multiple free blocks, we select the one appearing last in + * freeBlocks[]. If there are none, assign the next block at the end of + * the file (causing the file to be extended). + */ + if (gfbb->nFreeBlocks > 0) + return gfbb->freeBlocks[--gfbb->nFreeBlocks]; + else + return gfbb->nFileBlocks++; +} + +/* + * Return a block# to the freelist. + */ +static void +gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum) +{ + int ndx; + + /* Enlarge freeBlocks array if full. */ + if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen) + { + gfbb->freeBlocksLen *= 2; + gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks, + gfbb->freeBlocksLen * + sizeof(long)); + } + + /* Add blocknum to array */ + ndx = gfbb->nFreeBlocks++; + gfbb->freeBlocks[ndx] = blocknum; +} + +/* + * Free buffering build data structure. + */ +void +gistFreeBuildBuffers(GISTBuildBuffers *gfbb) +{ + /* Close buffers file. */ + BufFileClose(gfbb->pfile); + + /* All other things will be freed on memory context release */ +} + +/* + * Data structure representing information about node buffer for index tuples + * relocation from splitted node buffer. + */ +typedef struct +{ + GISTENTRY entry[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + GISTPageSplitInfo *splitinfo; + GISTNodeBuffer *nodeBuffer; +} RelocationBufferInfo; + +/* + * At page split, distribute tuples from the buffer of the split page to + * new buffers for the created page halves. This also adjusts the downlinks + * in 'splitinfo' to include the tuples in the buffers. + */ +void +gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate, + Relation r, int level, + Buffer buffer, List *splitinfo) +{ + RelocationBufferInfo *relocationBuffersInfos; + bool found; + GISTNodeBuffer *nodeBuffer; + BlockNumber blocknum; + IndexTuple itup; + int splitPagesCount = 0, + i; + GISTENTRY entry[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + GISTNodeBuffer oldBuf; + ListCell *lc; + + /* If the splitted page doesn't have buffers, we have nothing to do. */ + if (!LEVEL_HAS_BUFFERS(level, gfbb)) + return; + + /* + * Get the node buffer of the splitted page. + */ + blocknum = BufferGetBlockNumber(buffer); + nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum, + HASH_FIND, &found); + if (!found) + { + /* The page has no buffer, so we have nothing to do. */ + return; + } + + /* + * Make a copy of the old buffer, as we're going reuse it as the buffer + * for the new left page, which is on the same block as the old page. + * That's not true for the root page, but that's fine because we never + * have a buffer on the root page anyway. The original algorithm as + * described by Arge et al did, but it's of no use, as you might as well + * read the tuples straight from the heap instead of the root buffer. 
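+ *
+ * The copy is marked isTemp so that gistAddLoadedBuffer() never enters
+ * it into the loadedBuffers array; it exists only for the duration of
+ * this relocation and is completely drained before we return.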
+ */ + Assert(blocknum != GIST_ROOT_BLKNO); + memcpy(&oldBuf, nodeBuffer, sizeof(GISTNodeBuffer)); + oldBuf.isTemp = true; + + /* Reset the old buffer, used for the new left page from now on */ + nodeBuffer->blocksCount = 0; + nodeBuffer->pageBuffer = NULL; + nodeBuffer->pageBlocknum = InvalidBlockNumber; + + /* + * Allocate memory for information about relocation buffers. + */ + splitPagesCount = list_length(splitinfo); + relocationBuffersInfos = + (RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) * + splitPagesCount); + + /* + * Fill relocation buffers information for node buffers of pages produced + * by split. + */ + i = 0; + foreach(lc, splitinfo) + { + GISTPageSplitInfo *si = (GISTPageSplitInfo *) lfirst(lc); + GISTNodeBuffer *newNodeBuffer; + + /* Decompress parent index tuple of node buffer page. */ + gistDeCompressAtt(giststate, r, + si->downlink, NULL, (OffsetNumber) 0, + relocationBuffersInfos[i].entry, + relocationBuffersInfos[i].isnull); + + /* + * Create a node buffer for the page. The leftmost half is on the same + * block as the old page before split, so for the leftmost half this + * will return the original buffer. The tuples on the original buffer + * were relinked to the temporary buffer, so the original one is now + * empty. + */ + newNodeBuffer = gistGetNodeBuffer(gfbb, giststate, BufferGetBlockNumber(si->buf), level); + + relocationBuffersInfos[i].nodeBuffer = newNodeBuffer; + relocationBuffersInfos[i].splitinfo = si; + + i++; + } + + /* + * Loop through all index tuples in the buffer of the page being split, + * moving them to buffers for the new pages. We try to move each tuple to + * the page that will result in the lowest penalty for the leading column + * or, in the case of a tie, the lowest penalty for the earliest column + * that is not tied. + * + * The page searching logic is very similar to gistchoose(). + */ + while (gistPopItupFromNodeBuffer(gfbb, &oldBuf, &itup)) + { + float best_penalty[INDEX_MAX_KEYS]; + int i, + which; + IndexTuple newtup; + RelocationBufferInfo *targetBufferInfo; + + gistDeCompressAtt(giststate, r, + itup, NULL, (OffsetNumber) 0, entry, isnull); + + /* default to using first page (shouldn't matter) */ + which = 0; + + /* + * best_penalty[j] is the best penalty we have seen so far for column + * j, or -1 when we haven't yet examined column j. Array entries to + * the right of the first -1 are undefined. + */ + best_penalty[0] = -1; + + /* + * Loop over possible target pages, looking for one to move this tuple + * to. + */ + for (i = 0; i < splitPagesCount; i++) + { + RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i]; + bool zero_penalty; + int j; + + zero_penalty = true; + + /* Loop over index attributes. */ + for (j = 0; j < IndexRelationGetNumberOfKeyAttributes(r); j++) + { + float usize; + + /* Compute penalty for this column. */ + usize = gistpenalty(giststate, j, + &splitPageInfo->entry[j], + splitPageInfo->isnull[j], + &entry[j], isnull[j]); + if (usize > 0) + zero_penalty = false; + + if (best_penalty[j] < 0 || usize < best_penalty[j]) + { + /* + * New best penalty for column. Tentatively select this + * page as the target, and record the best penalty. Then + * reset the next column's penalty to "unknown" (and + * indirectly, the same for all the ones to its right). + * This will force us to adopt this page's penalty values + * as the best for all the remaining columns during + * subsequent loop iterations. 
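+ *
+ * For example, with two candidate pages and a two-column index,
+ * penalties (0.0, 5.0) and (0.0, 2.0) tie on the first column, so
+ * the decision falls through to the second column and the second
+ * page wins.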
+ */ + which = i; + best_penalty[j] = usize; + + if (j < IndexRelationGetNumberOfKeyAttributes(r) - 1) + best_penalty[j + 1] = -1; + } + else if (best_penalty[j] == usize) + { + /* + * The current page is exactly as good for this column as + * the best page seen so far. The next iteration of this + * loop will compare the next column. + */ + } + else + { + /* + * The current page is worse for this column than the best + * page seen so far. Skip the remaining columns and move + * on to the next page, if any. + */ + zero_penalty = false; /* so outer loop won't exit */ + break; + } + } + + /* + * If we find a page with zero penalty for all columns, there's no + * need to examine remaining pages; just break out of the loop and + * return it. + */ + if (zero_penalty) + break; + } + + /* OK, "which" is the page index to push the tuple to */ + targetBufferInfo = &relocationBuffersInfos[which]; + + /* Push item to selected node buffer */ + gistPushItupToNodeBuffer(gfbb, targetBufferInfo->nodeBuffer, itup); + + /* Adjust the downlink for this page, if needed. */ + newtup = gistgetadjusted(r, targetBufferInfo->splitinfo->downlink, + itup, giststate); + if (newtup) + { + gistDeCompressAtt(giststate, r, + newtup, NULL, (OffsetNumber) 0, + targetBufferInfo->entry, + targetBufferInfo->isnull); + + targetBufferInfo->splitinfo->downlink = newtup; + } + } + + pfree(relocationBuffersInfos); +} + + +/* + * Wrappers around BufFile operations. The main difference is that these + * wrappers report errors with ereport(), so that the callers don't need + * to check the return code. + */ + +static void +ReadTempFileBlock(BufFile *file, long blknum, void *ptr) +{ + size_t nread; + + if (BufFileSeekBlock(file, blknum) != 0) + elog(ERROR, "could not seek to block %ld in temporary file", blknum); + nread = BufFileRead(file, ptr, BLCKSZ); + if (nread != BLCKSZ) + elog(ERROR, "could not read temporary file: read only %zu of %zu bytes", + nread, (size_t) BLCKSZ); +} + +static void +WriteTempFileBlock(BufFile *file, long blknum, void *ptr) +{ + if (BufFileSeekBlock(file, blknum) != 0) + elog(ERROR, "could not seek to block %ld in temporary file", blknum); + BufFileWrite(file, ptr, BLCKSZ); +} diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c new file mode 100644 index 0000000..c8f7e78 --- /dev/null +++ b/src/backend/access/gist/gistget.c @@ -0,0 +1,803 @@ +/*------------------------------------------------------------------------- + * + * gistget.c + * fetch tuples from a GiST scan. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistget.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/gist_private.h" +#include "access/relscan.h" +#include "lib/pairingheap.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "utils/float.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* + * gistkillitems() -- set LP_DEAD state for items an indexscan caller has + * told us were killed. + * + * We re-read page here, so it's important to check page LSN. 
If the page + * has been modified since the last read (as determined by LSN), we cannot + * flag any entries because it is possible that the old entry was vacuumed + * away and the TID was re-used by a completely different heap tuple. + */ +static void +gistkillitems(IndexScanDesc scan) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId iid; + int i; + bool killedsomething = false; + + Assert(so->curBlkno != InvalidBlockNumber); + Assert(!XLogRecPtrIsInvalid(so->curPageLSN)); + Assert(so->killedItems != NULL); + + buffer = ReadBuffer(scan->indexRelation, so->curBlkno); + if (!BufferIsValid(buffer)) + return; + + LockBuffer(buffer, GIST_SHARE); + gistcheckpage(scan->indexRelation, buffer); + page = BufferGetPage(buffer); + + /* + * If page LSN differs it means that the page was modified since the last + * read. killedItems could be not valid so LP_DEAD hints applying is not + * safe. + */ + if (BufferGetLSNAtomic(buffer) != so->curPageLSN) + { + UnlockReleaseBuffer(buffer); + so->numKilled = 0; /* reset counter */ + return; + } + + Assert(GistPageIsLeaf(page)); + + /* + * Mark all killedItems as dead. We need no additional recheck, because, + * if page was modified, curPageLSN must have changed. + */ + for (i = 0; i < so->numKilled; i++) + { + offnum = so->killedItems[i]; + iid = PageGetItemId(page, offnum); + ItemIdMarkDead(iid); + killedsomething = true; + } + + if (killedsomething) + { + GistMarkPageHasGarbage(page); + MarkBufferDirtyHint(buffer, true); + } + + UnlockReleaseBuffer(buffer); + + /* + * Always reset the scan state, so we don't look for same items on other + * pages. + */ + so->numKilled = 0; +} + +/* + * gistindex_keytest() -- does this index tuple satisfy the scan key(s)? + * + * The index tuple might represent either a heap tuple or a lower index page, + * depending on whether the containing page is a leaf page or not. + * + * On success return for a heap tuple, *recheck_p is set to indicate whether + * the quals need to be rechecked. We recheck if any of the consistent() + * functions request it. recheck is not interesting when examining a non-leaf + * entry, since we must visit the lower index page if there's any doubt. + * Similarly, *recheck_distances_p is set to indicate whether the distances + * need to be rechecked, and it is also ignored for non-leaf entries. + * + * If we are doing an ordered scan, so->distances[] is filled with distance + * data from the distance() functions before returning success. + * + * We must decompress the key in the IndexTuple before passing it to the + * sk_funcs (which actually are the opclass Consistent or Distance methods). + * + * Note that this function is always invoked in a short-lived memory context, + * so we don't need to worry about cleaning up allocated memory, either here + * or in the implementation of any Consistent or Distance methods. + */ +static bool +gistindex_keytest(IndexScanDesc scan, + IndexTuple tuple, + Page page, + OffsetNumber offset, + bool *recheck_p, + bool *recheck_distances_p) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + GISTSTATE *giststate = so->giststate; + ScanKey key = scan->keyData; + int keySize = scan->numberOfKeys; + IndexOrderByDistance *distance_p; + Relation r = scan->indexRelation; + + *recheck_p = false; + *recheck_distances_p = false; + + /* + * If it's a leftover invalid tuple from pre-9.1, treat it as a match with + * minimum possible distances. This means we'll always follow it to the + * referenced page. 
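+ * ("Minimum possible distance" here means -infinity for every ORDER BY
+ * column, so in an ordered scan the referenced page sorts to the front
+ * of the search queue.)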
+ */ + if (GistTupleIsInvalid(tuple)) + { + int i; + + if (GistPageIsLeaf(page)) /* shouldn't happen */ + elog(ERROR, "invalid GiST tuple found on leaf page"); + for (i = 0; i < scan->numberOfOrderBys; i++) + { + so->distances[i].value = -get_float8_infinity(); + so->distances[i].isnull = false; + } + return true; + } + + /* Check whether it matches according to the Consistent functions */ + while (keySize > 0) + { + Datum datum; + bool isNull; + + datum = index_getattr(tuple, + key->sk_attno, + giststate->leafTupdesc, + &isNull); + + if (key->sk_flags & SK_ISNULL) + { + /* + * On non-leaf page we can't conclude that child hasn't NULL + * values because of assumption in GiST: union (VAL, NULL) is VAL. + * But if on non-leaf page key IS NULL, then all children are + * NULL. + */ + if (key->sk_flags & SK_SEARCHNULL) + { + if (GistPageIsLeaf(page) && !isNull) + return false; + } + else + { + Assert(key->sk_flags & SK_SEARCHNOTNULL); + if (isNull) + return false; + } + } + else if (isNull) + { + return false; + } + else + { + Datum test; + bool recheck; + GISTENTRY de; + + gistdentryinit(giststate, key->sk_attno - 1, &de, + datum, r, page, offset, + false, isNull); + + /* + * Call the Consistent function to evaluate the test. The + * arguments are the index datum (as a GISTENTRY*), the comparison + * datum, the comparison operator's strategy number and subtype + * from pg_amop, and the recheck flag. + * + * (Presently there's no need to pass the subtype since it'll + * always be zero, but might as well pass it for possible future + * use.) + * + * We initialize the recheck flag to true (the safest assumption) + * in case the Consistent function forgets to set it. + */ + recheck = true; + + test = FunctionCall5Coll(&key->sk_func, + key->sk_collation, + PointerGetDatum(&de), + key->sk_argument, + Int16GetDatum(key->sk_strategy), + ObjectIdGetDatum(key->sk_subtype), + PointerGetDatum(&recheck)); + + if (!DatumGetBool(test)) + return false; + *recheck_p |= recheck; + } + + key++; + keySize--; + } + + /* OK, it passes --- now let's compute the distances */ + key = scan->orderByData; + distance_p = so->distances; + keySize = scan->numberOfOrderBys; + while (keySize > 0) + { + Datum datum; + bool isNull; + + datum = index_getattr(tuple, + key->sk_attno, + giststate->leafTupdesc, + &isNull); + + if ((key->sk_flags & SK_ISNULL) || isNull) + { + /* Assume distance computes as null */ + distance_p->value = 0.0; + distance_p->isnull = true; + } + else + { + Datum dist; + bool recheck; + GISTENTRY de; + + gistdentryinit(giststate, key->sk_attno - 1, &de, + datum, r, page, offset, + false, isNull); + + /* + * Call the Distance function to evaluate the distance. The + * arguments are the index datum (as a GISTENTRY*), the comparison + * datum, the ordering operator's strategy number and subtype from + * pg_amop, and the recheck flag. + * + * (Presently there's no need to pass the subtype since it'll + * always be zero, but might as well pass it for possible future + * use.) + * + * If the function sets the recheck flag, the returned distance is + * a lower bound on the true distance and needs to be rechecked. + * We initialize the flag to 'false'. This flag was added in + * version 9.5; distance functions written before that won't know + * about the flag, but are expected to never be lossy. 
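+ *
+ * (For example, the polygon opclass stores only a bounding box, so its
+ * distance function can return no better than the box-to-query
+ * distance, a lower bound on the true polygon distance; it sets the
+ * flag, and the executor then re-sorts the returned rows by the exact
+ * distance.)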
+ */ + recheck = false; + dist = FunctionCall5Coll(&key->sk_func, + key->sk_collation, + PointerGetDatum(&de), + key->sk_argument, + Int16GetDatum(key->sk_strategy), + ObjectIdGetDatum(key->sk_subtype), + PointerGetDatum(&recheck)); + *recheck_distances_p |= recheck; + distance_p->value = DatumGetFloat8(dist); + distance_p->isnull = false; + } + + key++; + distance_p++; + keySize--; + } + + return true; +} + +/* + * Scan all items on the GiST index page identified by *pageItem, and insert + * them into the queue (or directly to output areas) + * + * scan: index scan we are executing + * pageItem: search queue item identifying an index page to scan + * myDistances: distances array associated with pageItem, or NULL at the root + * tbm: if not NULL, gistgetbitmap's output bitmap + * ntids: if not NULL, gistgetbitmap's output tuple counter + * + * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap + * tuples should be reported directly into the bitmap. If they are NULL, + * we're doing a plain or ordered indexscan. For a plain indexscan, heap + * tuple TIDs are returned into so->pageData[]. For an ordered indexscan, + * heap tuple TIDs are pushed into individual search queue items. In an + * index-only scan, reconstructed index tuples are returned along with the + * TIDs. + * + * If we detect that the index page has split since we saw its downlink + * in the parent, we push its new right sibling onto the queue so the + * sibling will be processed next. + */ +static void +gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, + IndexOrderByDistance *myDistances, TIDBitmap *tbm, int64 *ntids) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + GISTSTATE *giststate = so->giststate; + Relation r = scan->indexRelation; + Buffer buffer; + Page page; + GISTPageOpaque opaque; + OffsetNumber maxoff; + OffsetNumber i; + MemoryContext oldcxt; + + Assert(!GISTSearchItemIsHeap(*pageItem)); + + buffer = ReadBuffer(scan->indexRelation, pageItem->blkno); + LockBuffer(buffer, GIST_SHARE); + PredicateLockPage(r, BufferGetBlockNumber(buffer), scan->xs_snapshot); + gistcheckpage(scan->indexRelation, buffer); + page = BufferGetPage(buffer); + TestForOldSnapshot(scan->xs_snapshot, r, page); + opaque = GistPageGetOpaque(page); + + /* + * Check if we need to follow the rightlink. We need to follow it if the + * page was concurrently split since we visited the parent (in which case + * parentlsn < nsn), or if the system crashed after a page split but + * before the downlink was inserted into the parent. + */ + if (!XLogRecPtrIsInvalid(pageItem->data.parentlsn) && + (GistFollowRight(page) || + pageItem->data.parentlsn < GistPageGetNSN(page)) && + opaque->rightlink != InvalidBlockNumber /* sanity check */ ) + { + /* There was a page split, follow right link to add pages */ + GISTSearchItem *item; + + /* This can't happen when starting at the root */ + Assert(myDistances != NULL); + + oldcxt = MemoryContextSwitchTo(so->queueCxt); + + /* Create new GISTSearchItem for the right sibling index page */ + item = palloc(SizeOfGISTSearchItem(scan->numberOfOrderBys)); + item->blkno = opaque->rightlink; + item->data.parentlsn = pageItem->data.parentlsn; + + /* Insert it into the queue using same distances as for this page */ + memcpy(item->distances, myDistances, + sizeof(item->distances[0]) * scan->numberOfOrderBys); + + pairingheap_add(so->queue, &item->phNode); + + MemoryContextSwitchTo(oldcxt); + } + + /* + * Check if the page was deleted after we saw the downlink. 
There's + * nothing of interest on a deleted page. Note that we must do this after + * checking the NSN for concurrent splits! It's possible that the page + * originally contained some tuples that are visible to us, but was split + * so that all the visible tuples were moved to another page, and then + * this page was deleted. + */ + if (GistPageIsDeleted(page)) + { + UnlockReleaseBuffer(buffer); + return; + } + + so->nPageData = so->curPageData = 0; + scan->xs_hitup = NULL; /* might point into pageDataCxt */ + if (so->pageDataCxt) + MemoryContextReset(so->pageDataCxt); + + /* + * We save the LSN of the page as we read it, so that we know whether it + * safe to apply LP_DEAD hints to the page later. This allows us to drop + * the pin for MVCC scans, which allows vacuum to avoid blocking. + */ + so->curPageLSN = BufferGetLSNAtomic(buffer); + + /* + * check all tuples on page + */ + maxoff = PageGetMaxOffsetNumber(page); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + ItemId iid = PageGetItemId(page, i); + IndexTuple it; + bool match; + bool recheck; + bool recheck_distances; + + /* + * If the scan specifies not to return killed tuples, then we treat a + * killed tuple as not passing the qual. + */ + if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) + continue; + + it = (IndexTuple) PageGetItem(page, iid); + + /* + * Must call gistindex_keytest in tempCxt, and clean up any leftover + * junk afterward. + */ + oldcxt = MemoryContextSwitchTo(so->giststate->tempCxt); + + match = gistindex_keytest(scan, it, page, i, + &recheck, &recheck_distances); + + MemoryContextSwitchTo(oldcxt); + MemoryContextReset(so->giststate->tempCxt); + + /* Ignore tuple if it doesn't match */ + if (!match) + continue; + + if (tbm && GistPageIsLeaf(page)) + { + /* + * getbitmap scan, so just push heap tuple TIDs into the bitmap + * without worrying about ordering + */ + tbm_add_tuples(tbm, &it->t_tid, 1, recheck); + (*ntids)++; + } + else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page)) + { + /* + * Non-ordered scan, so report tuples in so->pageData[] + */ + so->pageData[so->nPageData].heapPtr = it->t_tid; + so->pageData[so->nPageData].recheck = recheck; + so->pageData[so->nPageData].offnum = i; + + /* + * In an index-only scan, also fetch the data from the tuple. The + * reconstructed tuples are stored in pageDataCxt. + */ + if (scan->xs_want_itup) + { + oldcxt = MemoryContextSwitchTo(so->pageDataCxt); + so->pageData[so->nPageData].recontup = + gistFetchTuple(giststate, r, it); + MemoryContextSwitchTo(oldcxt); + } + so->nPageData++; + } + else + { + /* + * Must push item into search queue. We get here for any lower + * index page, and also for heap tuples if doing an ordered + * search. + */ + GISTSearchItem *item; + int nOrderBys = scan->numberOfOrderBys; + + oldcxt = MemoryContextSwitchTo(so->queueCxt); + + /* Create new GISTSearchItem for this item */ + item = palloc(SizeOfGISTSearchItem(scan->numberOfOrderBys)); + + if (GistPageIsLeaf(page)) + { + /* Creating heap-tuple GISTSearchItem */ + item->blkno = InvalidBlockNumber; + item->data.heap.heapPtr = it->t_tid; + item->data.heap.recheck = recheck; + item->data.heap.recheckDistances = recheck_distances; + + /* + * In an index-only scan, also fetch the data from the tuple. 
+ */ + if (scan->xs_want_itup) + item->data.heap.recontup = gistFetchTuple(giststate, r, it); + } + else + { + /* Creating index-page GISTSearchItem */ + item->blkno = ItemPointerGetBlockNumber(&it->t_tid); + + /* + * LSN of current page is lsn of parent page for child. We + * only have a shared lock, so we need to get the LSN + * atomically. + */ + item->data.parentlsn = BufferGetLSNAtomic(buffer); + } + + /* Insert it into the queue using new distance data */ + memcpy(item->distances, so->distances, + sizeof(item->distances[0]) * nOrderBys); + + pairingheap_add(so->queue, &item->phNode); + + MemoryContextSwitchTo(oldcxt); + } + } + + UnlockReleaseBuffer(buffer); +} + +/* + * Extract next item (in order) from search queue + * + * Returns a GISTSearchItem or NULL. Caller must pfree item when done with it. + */ +static GISTSearchItem * +getNextGISTSearchItem(GISTScanOpaque so) +{ + GISTSearchItem *item; + + if (!pairingheap_is_empty(so->queue)) + { + item = (GISTSearchItem *) pairingheap_remove_first(so->queue); + } + else + { + /* Done when both heaps are empty */ + item = NULL; + } + + /* Return item; caller is responsible to pfree it */ + return item; +} + +/* + * Fetch next heap tuple in an ordered search + */ +static bool +getNextNearest(IndexScanDesc scan) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + bool res = false; + + if (scan->xs_hitup) + { + /* free previously returned tuple */ + pfree(scan->xs_hitup); + scan->xs_hitup = NULL; + } + + do + { + GISTSearchItem *item = getNextGISTSearchItem(so); + + if (!item) + break; + + if (GISTSearchItemIsHeap(*item)) + { + /* found a heap item at currently minimal distance */ + scan->xs_heaptid = item->data.heap.heapPtr; + scan->xs_recheck = item->data.heap.recheck; + + index_store_float8_orderby_distances(scan, so->orderByTypes, + item->distances, + item->data.heap.recheckDistances); + + /* in an index-only scan, also return the reconstructed tuple. 
*/ + if (scan->xs_want_itup) + scan->xs_hitup = item->data.heap.recontup; + res = true; + } + else + { + /* visit an index page, extract its items into queue */ + CHECK_FOR_INTERRUPTS(); + + gistScanPage(scan, item, item->distances, NULL, NULL); + } + + pfree(item); + } while (!res); + + return res; +} + +/* + * gistgettuple() -- Get the next tuple in the scan + */ +bool +gistgettuple(IndexScanDesc scan, ScanDirection dir) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + + if (dir != ForwardScanDirection) + elog(ERROR, "GiST only supports forward scan direction"); + + if (!so->qual_ok) + return false; + + if (so->firstCall) + { + /* Begin the scan by processing the root page */ + GISTSearchItem fakeItem; + + pgstat_count_index_scan(scan->indexRelation); + + so->firstCall = false; + so->curPageData = so->nPageData = 0; + scan->xs_hitup = NULL; + if (so->pageDataCxt) + MemoryContextReset(so->pageDataCxt); + + fakeItem.blkno = GIST_ROOT_BLKNO; + memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); + gistScanPage(scan, &fakeItem, NULL, NULL, NULL); + } + + if (scan->numberOfOrderBys > 0) + { + /* Must fetch tuples in strict distance order */ + return getNextNearest(scan); + } + else + { + /* Fetch tuples index-page-at-a-time */ + for (;;) + { + if (so->curPageData < so->nPageData) + { + if (scan->kill_prior_tuple && so->curPageData > 0) + { + + if (so->killedItems == NULL) + { + MemoryContext oldCxt = + MemoryContextSwitchTo(so->giststate->scanCxt); + + so->killedItems = + (OffsetNumber *) palloc(MaxIndexTuplesPerPage + * sizeof(OffsetNumber)); + + MemoryContextSwitchTo(oldCxt); + } + if (so->numKilled < MaxIndexTuplesPerPage) + so->killedItems[so->numKilled++] = + so->pageData[so->curPageData - 1].offnum; + } + /* continuing to return tuples from a leaf page */ + scan->xs_heaptid = so->pageData[so->curPageData].heapPtr; + scan->xs_recheck = so->pageData[so->curPageData].recheck; + + /* in an index-only scan, also return the reconstructed tuple */ + if (scan->xs_want_itup) + scan->xs_hitup = so->pageData[so->curPageData].recontup; + + so->curPageData++; + + return true; + } + + /* + * Check the last returned tuple and add it to killedItems if + * necessary + */ + if (scan->kill_prior_tuple + && so->curPageData > 0 + && so->curPageData == so->nPageData) + { + + if (so->killedItems == NULL) + { + MemoryContext oldCxt = + MemoryContextSwitchTo(so->giststate->scanCxt); + + so->killedItems = + (OffsetNumber *) palloc(MaxIndexTuplesPerPage + * sizeof(OffsetNumber)); + + MemoryContextSwitchTo(oldCxt); + } + if (so->numKilled < MaxIndexTuplesPerPage) + so->killedItems[so->numKilled++] = + so->pageData[so->curPageData - 1].offnum; + } + /* find and process the next index page */ + do + { + GISTSearchItem *item; + + if ((so->curBlkno != InvalidBlockNumber) && (so->numKilled > 0)) + gistkillitems(scan); + + item = getNextGISTSearchItem(so); + + if (!item) + return false; + + CHECK_FOR_INTERRUPTS(); + + /* save current item BlockNumber for next gistkillitems() call */ + so->curBlkno = item->blkno; + + /* + * While scanning a leaf page, ItemPointers of matching heap + * tuples are stored in so->pageData. If there are any on + * this page, we fall out of the inner "do" and loop around to + * return them. 
+ */ + gistScanPage(scan, item, item->distances, NULL, NULL); + + pfree(item); + } while (so->nPageData == 0); + } + } +} + +/* + * gistgetbitmap() -- Get a bitmap of all heap tuple locations + */ +int64 +gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + int64 ntids = 0; + GISTSearchItem fakeItem; + + if (!so->qual_ok) + return 0; + + pgstat_count_index_scan(scan->indexRelation); + + /* Begin the scan by processing the root page */ + so->curPageData = so->nPageData = 0; + scan->xs_hitup = NULL; + if (so->pageDataCxt) + MemoryContextReset(so->pageDataCxt); + + fakeItem.blkno = GIST_ROOT_BLKNO; + memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); + gistScanPage(scan, &fakeItem, NULL, tbm, &ntids); + + /* + * While scanning a leaf page, ItemPointers of matching heap tuples will + * be stored directly into tbm, so we don't need to deal with them here. + */ + for (;;) + { + GISTSearchItem *item = getNextGISTSearchItem(so); + + if (!item) + break; + + CHECK_FOR_INTERRUPTS(); + + gistScanPage(scan, item, item->distances, tbm, &ntids); + + pfree(item); + } + + return ntids; +} + +/* + * Can we do index-only scans on the given index column? + * + * Opclasses that implement a fetch function support index-only scans. + * Opclasses without compression functions also support index-only scans. + * Included attributes always can be fetched for index-only scans. + */ +bool +gistcanreturn(Relation index, int attno) +{ + if (attno > IndexRelationGetNumberOfKeyAttributes(index) || + OidIsValid(index_getprocid(index, attno, GIST_FETCH_PROC)) || + !OidIsValid(index_getprocid(index, attno, GIST_COMPRESS_PROC))) + return true; + else + return false; +} diff --git a/src/backend/access/gist/gistproc.c b/src/backend/access/gist/gistproc.c new file mode 100644 index 0000000..d474612 --- /dev/null +++ b/src/backend/access/gist/gistproc.c @@ -0,0 +1,1777 @@ +/*------------------------------------------------------------------------- + * + * gistproc.c + * Support procedures for GiSTs over 2-D objects (boxes, polygons, circles, + * points). + * + * This gives R-tree behavior, with Guttman's poly-time split algorithm. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistproc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <math.h> + +#include "access/gist.h" +#include "access/stratnum.h" +#include "utils/builtins.h" +#include "utils/float.h" +#include "utils/geo_decls.h" +#include "utils/sortsupport.h" + + +static bool gist_box_leaf_consistent(BOX *key, BOX *query, + StrategyNumber strategy); +static bool rtree_internal_consistent(BOX *key, BOX *query, + StrategyNumber strategy); + +static uint64 point_zorder_internal(float4 x, float4 y); +static uint64 part_bits32_by2(uint32 x); +static uint32 ieee_float32_to_uint32(float f); +static int gist_bbox_zorder_cmp(Datum a, Datum b, SortSupport ssup); +static Datum gist_bbox_zorder_abbrev_convert(Datum original, SortSupport ssup); +static int gist_bbox_zorder_cmp_abbrev(Datum z1, Datum z2, SortSupport ssup); +static bool gist_bbox_zorder_abbrev_abort(int memtupcount, SortSupport ssup); + + +/* Minimum accepted ratio of split */ +#define LIMIT_RATIO 0.3 + + +/************************************************** + * Box ops + **************************************************/ + +/* + * Calculates union of two boxes, a and b. The result is stored in *n. + */ +static void +rt_box_union(BOX *n, const BOX *a, const BOX *b) +{ + n->high.x = float8_max(a->high.x, b->high.x); + n->high.y = float8_max(a->high.y, b->high.y); + n->low.x = float8_min(a->low.x, b->low.x); + n->low.y = float8_min(a->low.y, b->low.y); +} + +/* + * Size of a BOX for penalty-calculation purposes. + * The result can be +Infinity, but not NaN. + */ +static float8 +size_box(const BOX *box) +{ + /* + * Check for zero-width cases. Note that we define the size of a zero- + * by-infinity box as zero. It's important to special-case this somehow, + * as naively multiplying infinity by zero will produce NaN. + * + * The less-than cases should not happen, but if they do, say "zero". + */ + if (float8_le(box->high.x, box->low.x) || + float8_le(box->high.y, box->low.y)) + return 0.0; + + /* + * We treat NaN as larger than +Infinity, so any distance involving a NaN + * and a non-NaN is infinite. Note the previous check eliminated the + * possibility that the low fields are NaNs. + */ + if (isnan(box->high.x) || isnan(box->high.y)) + return get_float8_infinity(); + return float8_mul(float8_mi(box->high.x, box->low.x), + float8_mi(box->high.y, box->low.y)); +} + +/* + * Return amount by which the union of the two boxes is larger than + * the original BOX's area. The result can be +Infinity, but not NaN. + */ +static float8 +box_penalty(const BOX *original, const BOX *new) +{ + BOX unionbox; + + rt_box_union(&unionbox, original, new); + return float8_mi(size_box(&unionbox), size_box(original)); +} + +/* + * The GiST Consistent method for boxes + * + * Should return false if for all data items x below entry, + * the predicate x op query must be false, where op is the oper + * corresponding to strategy in the pg_amop table. 
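+ *
+ * For example, with the overlap strategy (&&): if the query box does not
+ * overlap the union box stored in an internal entry, then no box indexed
+ * below that entry can overlap the query either, so the whole subtree is
+ * skipped. For a leaf entry the test is exact, which is why *recheck is
+ * always set to false here.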
+ */ +Datum +gist_box_consistent(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + BOX *query = PG_GETARG_BOX_P(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + + /* Oid subtype = PG_GETARG_OID(3); */ + bool *recheck = (bool *) PG_GETARG_POINTER(4); + + /* All cases served by this function are exact */ + *recheck = false; + + if (DatumGetBoxP(entry->key) == NULL || query == NULL) + PG_RETURN_BOOL(false); + + /* + * if entry is not leaf, use rtree_internal_consistent, else use + * gist_box_leaf_consistent + */ + if (GIST_LEAF(entry)) + PG_RETURN_BOOL(gist_box_leaf_consistent(DatumGetBoxP(entry->key), + query, + strategy)); + else + PG_RETURN_BOOL(rtree_internal_consistent(DatumGetBoxP(entry->key), + query, + strategy)); +} + +/* + * Increase BOX b to include addon. + */ +static void +adjustBox(BOX *b, const BOX *addon) +{ + if (float8_lt(b->high.x, addon->high.x)) + b->high.x = addon->high.x; + if (float8_gt(b->low.x, addon->low.x)) + b->low.x = addon->low.x; + if (float8_lt(b->high.y, addon->high.y)) + b->high.y = addon->high.y; + if (float8_gt(b->low.y, addon->low.y)) + b->low.y = addon->low.y; +} + +/* + * The GiST Union method for boxes + * + * returns the minimal bounding box that encloses all the entries in entryvec + */ +Datum +gist_box_union(PG_FUNCTION_ARGS) +{ + GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); + int *sizep = (int *) PG_GETARG_POINTER(1); + int numranges, + i; + BOX *cur, + *pageunion; + + numranges = entryvec->n; + pageunion = (BOX *) palloc(sizeof(BOX)); + cur = DatumGetBoxP(entryvec->vector[0].key); + memcpy((void *) pageunion, (void *) cur, sizeof(BOX)); + + for (i = 1; i < numranges; i++) + { + cur = DatumGetBoxP(entryvec->vector[i].key); + adjustBox(pageunion, cur); + } + *sizep = sizeof(BOX); + + PG_RETURN_POINTER(pageunion); +} + +/* + * We store boxes as boxes in GiST indexes, so we do not need + * compress, decompress, or fetch functions. 
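+ * (Contrast this with the point opclass further down in this file, which
+ * stores each point as a zero-area box and therefore does provide a
+ * compress function, gist_point_compress.)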
+ */ + +/* + * The GiST Penalty method for boxes (also used for points) + * + * As in the R-tree paper, we use change in area as our penalty metric + */ +Datum +gist_box_penalty(PG_FUNCTION_ARGS) +{ + GISTENTRY *origentry = (GISTENTRY *) PG_GETARG_POINTER(0); + GISTENTRY *newentry = (GISTENTRY *) PG_GETARG_POINTER(1); + float *result = (float *) PG_GETARG_POINTER(2); + BOX *origbox = DatumGetBoxP(origentry->key); + BOX *newbox = DatumGetBoxP(newentry->key); + + *result = (float) box_penalty(origbox, newbox); + PG_RETURN_POINTER(result); +} + +/* + * Trivial split: half of entries will be placed on one page + * and another half - to another + */ +static void +fallbackSplit(GistEntryVector *entryvec, GIST_SPLITVEC *v) +{ + OffsetNumber i, + maxoff; + BOX *unionL = NULL, + *unionR = NULL; + int nbytes; + + maxoff = entryvec->n - 1; + + nbytes = (maxoff + 2) * sizeof(OffsetNumber); + v->spl_left = (OffsetNumber *) palloc(nbytes); + v->spl_right = (OffsetNumber *) palloc(nbytes); + v->spl_nleft = v->spl_nright = 0; + + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + BOX *cur = DatumGetBoxP(entryvec->vector[i].key); + + if (i <= (maxoff - FirstOffsetNumber + 1) / 2) + { + v->spl_left[v->spl_nleft] = i; + if (unionL == NULL) + { + unionL = (BOX *) palloc(sizeof(BOX)); + *unionL = *cur; + } + else + adjustBox(unionL, cur); + + v->spl_nleft++; + } + else + { + v->spl_right[v->spl_nright] = i; + if (unionR == NULL) + { + unionR = (BOX *) palloc(sizeof(BOX)); + *unionR = *cur; + } + else + adjustBox(unionR, cur); + + v->spl_nright++; + } + } + + v->spl_ldatum = BoxPGetDatum(unionL); + v->spl_rdatum = BoxPGetDatum(unionR); +} + +/* + * Represents information about an entry that can be placed to either group + * without affecting overlap over selected axis ("common entry"). + */ +typedef struct +{ + /* Index of entry in the initial array */ + int index; + /* Delta between penalties of entry insertion into different groups */ + float8 delta; +} CommonEntry; + +/* + * Context for g_box_consider_split. Contains information about currently + * selected split and some general information. + */ +typedef struct +{ + int entriesCount; /* total number of entries being split */ + BOX boundingBox; /* minimum bounding box across all entries */ + + /* Information about currently selected split follows */ + + bool first; /* true if no split was selected yet */ + + float8 leftUpper; /* upper bound of left interval */ + float8 rightLower; /* lower bound of right interval */ + + float4 ratio; + float4 overlap; + int dim; /* axis of this split */ + float8 range; /* width of general MBR projection to the + * selected axis */ +} ConsiderSplitContext; + +/* + * Interval represents projection of box to axis. + */ +typedef struct +{ + float8 lower, + upper; +} SplitInterval; + +/* + * Interval comparison function by lower bound of the interval; + */ +static int +interval_cmp_lower(const void *i1, const void *i2) +{ + float8 lower1 = ((const SplitInterval *) i1)->lower, + lower2 = ((const SplitInterval *) i2)->lower; + + return float8_cmp_internal(lower1, lower2); +} + +/* + * Interval comparison function by upper bound of the interval; + */ +static int +interval_cmp_upper(const void *i1, const void *i2) +{ + float8 upper1 = ((const SplitInterval *) i1)->upper, + upper2 = ((const SplitInterval *) i2)->upper; + + return float8_cmp_internal(upper1, upper2); +} + +/* + * Replace negative (or NaN) value with zero. 
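+ * (A NaN fails the "val >= 0.0f" test below, so it is mapped to zero too.)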
+ */ +static inline float +non_negative(float val) +{ + if (val >= 0.0f) + return val; + else + return 0.0f; +} + +/* + * Consider replacement of currently selected split with the better one. + */ +static inline void +g_box_consider_split(ConsiderSplitContext *context, int dimNum, + float8 rightLower, int minLeftCount, + float8 leftUpper, int maxLeftCount) +{ + int leftCount, + rightCount; + float4 ratio, + overlap; + float8 range; + + /* + * Calculate entries distribution ratio assuming most uniform distribution + * of common entries. + */ + if (minLeftCount >= (context->entriesCount + 1) / 2) + { + leftCount = minLeftCount; + } + else + { + if (maxLeftCount <= context->entriesCount / 2) + leftCount = maxLeftCount; + else + leftCount = context->entriesCount / 2; + } + rightCount = context->entriesCount - leftCount; + + /* + * Ratio of split - quotient between size of lesser group and total + * entries count. + */ + ratio = float4_div(Min(leftCount, rightCount), context->entriesCount); + + if (ratio > LIMIT_RATIO) + { + bool selectthis = false; + + /* + * The ratio is acceptable, so compare current split with previously + * selected one. Between splits of one dimension we search for minimal + * overlap (allowing negative values) and minimal ration (between same + * overlaps. We switch dimension if find less overlap (non-negative) + * or less range with same overlap. + */ + if (dimNum == 0) + range = float8_mi(context->boundingBox.high.x, + context->boundingBox.low.x); + else + range = float8_mi(context->boundingBox.high.y, + context->boundingBox.low.y); + + overlap = float8_div(float8_mi(leftUpper, rightLower), range); + + /* If there is no previous selection, select this */ + if (context->first) + selectthis = true; + else if (context->dim == dimNum) + { + /* + * Within the same dimension, choose the new split if it has a + * smaller overlap, or same overlap but better ratio. + */ + if (overlap < context->overlap || + (overlap == context->overlap && ratio > context->ratio)) + selectthis = true; + } + else + { + /* + * Across dimensions, choose the new split if it has a smaller + * *non-negative* overlap, or same *non-negative* overlap but + * bigger range. This condition differs from the one described in + * the article. On the datasets where leaf MBRs don't overlap + * themselves, non-overlapping splits (i.e. splits which have zero + * *non-negative* overlap) are frequently possible. In this case + * splits tends to be along one dimension, because most distant + * non-overlapping splits (i.e. having lowest negative overlap) + * appears to be in the same dimension as in the previous split. + * Therefore MBRs appear to be very prolonged along another + * dimension, which leads to bad search performance. Using range + * as the second split criteria makes MBRs more quadratic. Using + * *non-negative* overlap instead of overlap as the first split + * criteria gives to range criteria a chance to matter, because + * non-overlapping splits are equivalent in this criteria. + */ + if (non_negative(overlap) < non_negative(context->overlap) || + (range > context->range && + non_negative(overlap) <= non_negative(context->overlap))) + selectthis = true; + } + + if (selectthis) + { + /* save information about selected split */ + context->first = false; + context->ratio = ratio; + context->range = range; + context->overlap = overlap; + context->rightLower = rightLower; + context->leftUpper = leftUpper; + context->dim = dimNum; + } + } +} + +/* + * Compare common entries by their deltas. 
+ */ +static int +common_entry_cmp(const void *i1, const void *i2) +{ + float8 delta1 = ((const CommonEntry *) i1)->delta, + delta2 = ((const CommonEntry *) i2)->delta; + + return float8_cmp_internal(delta1, delta2); +} + +/* + * -------------------------------------------------------------------------- + * Double sorting split algorithm. This is used for both boxes and points. + * + * The algorithm finds split of boxes by considering splits along each axis. + * Each entry is first projected as an interval on the X-axis, and different + * ways to split the intervals into two groups are considered, trying to + * minimize the overlap of the groups. Then the same is repeated for the + * Y-axis, and the overall best split is chosen. The quality of a split is + * determined by overlap along that axis and some other criteria (see + * g_box_consider_split). + * + * After that, all the entries are divided into three groups: + * + * 1) Entries which should be placed to the left group + * 2) Entries which should be placed to the right group + * 3) "Common entries" which can be placed to any of groups without affecting + * of overlap along selected axis. + * + * The common entries are distributed by minimizing penalty. + * + * For details see: + * "A new double sorting-based node splitting algorithm for R-tree", A. Korotkov + * http://syrcose.ispras.ru/2011/files/SYRCoSE2011_Proceedings.pdf#page=36 + * -------------------------------------------------------------------------- + */ +Datum +gist_box_picksplit(PG_FUNCTION_ARGS) +{ + GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); + GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1); + OffsetNumber i, + maxoff; + ConsiderSplitContext context; + BOX *box, + *leftBox, + *rightBox; + int dim, + commonEntriesCount; + SplitInterval *intervalsLower, + *intervalsUpper; + CommonEntry *commonEntries; + int nentries; + + memset(&context, 0, sizeof(ConsiderSplitContext)); + + maxoff = entryvec->n - 1; + nentries = context.entriesCount = maxoff - FirstOffsetNumber + 1; + + /* Allocate arrays for intervals along axes */ + intervalsLower = (SplitInterval *) palloc(nentries * sizeof(SplitInterval)); + intervalsUpper = (SplitInterval *) palloc(nentries * sizeof(SplitInterval)); + + /* + * Calculate the overall minimum bounding box over all the entries. + */ + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + box = DatumGetBoxP(entryvec->vector[i].key); + if (i == FirstOffsetNumber) + context.boundingBox = *box; + else + adjustBox(&context.boundingBox, box); + } + + /* + * Iterate over axes for optimal split searching. + */ + context.first = true; /* nothing selected yet */ + for (dim = 0; dim < 2; dim++) + { + float8 leftUpper, + rightLower; + int i1, + i2; + + /* Project each entry as an interval on the selected axis. */ + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + box = DatumGetBoxP(entryvec->vector[i].key); + if (dim == 0) + { + intervalsLower[i - FirstOffsetNumber].lower = box->low.x; + intervalsLower[i - FirstOffsetNumber].upper = box->high.x; + } + else + { + intervalsLower[i - FirstOffsetNumber].lower = box->low.y; + intervalsLower[i - FirstOffsetNumber].upper = box->high.y; + } + } + + /* + * Make two arrays of intervals: one sorted by lower bound and another + * sorted by upper bound. 
+ */ + memcpy(intervalsUpper, intervalsLower, + sizeof(SplitInterval) * nentries); + qsort(intervalsLower, nentries, sizeof(SplitInterval), + interval_cmp_lower); + qsort(intervalsUpper, nentries, sizeof(SplitInterval), + interval_cmp_upper); + + /*---- + * The goal is to form a left and right interval, so that every entry + * interval is contained by either left or right interval (or both). + * + * For example, with the intervals (0,1), (1,3), (2,3), (2,4): + * + * 0 1 2 3 4 + * +-+ + * +---+ + * +-+ + * +---+ + * + * The left and right intervals are of the form (0,a) and (b,4). + * We first consider splits where b is the lower bound of an entry. + * We iterate through all entries, and for each b, calculate the + * smallest possible a. Then we consider splits where a is the + * upper bound of an entry, and for each a, calculate the greatest + * possible b. + * + * In the above example, the first loop would consider splits: + * b=0: (0,1)-(0,4) + * b=1: (0,1)-(1,4) + * b=2: (0,3)-(2,4) + * + * And the second loop: + * a=1: (0,1)-(1,4) + * a=3: (0,3)-(2,4) + * a=4: (0,4)-(2,4) + */ + + /* + * Iterate over lower bound of right group, finding smallest possible + * upper bound of left group. + */ + i1 = 0; + i2 = 0; + rightLower = intervalsLower[i1].lower; + leftUpper = intervalsUpper[i2].lower; + while (true) + { + /* + * Find next lower bound of right group. + */ + while (i1 < nentries && + float8_eq(rightLower, intervalsLower[i1].lower)) + { + if (float8_lt(leftUpper, intervalsLower[i1].upper)) + leftUpper = intervalsLower[i1].upper; + i1++; + } + if (i1 >= nentries) + break; + rightLower = intervalsLower[i1].lower; + + /* + * Find count of intervals which anyway should be placed to the + * left group. + */ + while (i2 < nentries && + float8_le(intervalsUpper[i2].upper, leftUpper)) + i2++; + + /* + * Consider found split. + */ + g_box_consider_split(&context, dim, rightLower, i1, leftUpper, i2); + } + + /* + * Iterate over upper bound of left group finding greatest possible + * lower bound of right group. + */ + i1 = nentries - 1; + i2 = nentries - 1; + rightLower = intervalsLower[i1].upper; + leftUpper = intervalsUpper[i2].upper; + while (true) + { + /* + * Find next upper bound of left group. + */ + while (i2 >= 0 && float8_eq(leftUpper, intervalsUpper[i2].upper)) + { + if (float8_gt(rightLower, intervalsUpper[i2].lower)) + rightLower = intervalsUpper[i2].lower; + i2--; + } + if (i2 < 0) + break; + leftUpper = intervalsUpper[i2].upper; + + /* + * Find count of intervals which anyway should be placed to the + * right group. + */ + while (i1 >= 0 && float8_ge(intervalsLower[i1].lower, rightLower)) + i1--; + + /* + * Consider found split. + */ + g_box_consider_split(&context, dim, + rightLower, i1 + 1, leftUpper, i2 + 1); + } + } + + /* + * If we failed to find any acceptable splits, use trivial split. + */ + if (context.first) + { + fallbackSplit(entryvec, v); + PG_RETURN_POINTER(v); + } + + /* + * Ok, we have now selected the split across one axis. + * + * While considering the splits, we already determined that there will be + * enough entries in both groups to reach the desired ratio, but we did + * not memorize which entries go to which group. So determine that now. 
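+ *
+ * Roughly: an entry whose projection fits only within the left interval
+ * (upper bound <= leftUpper) goes to the left group, one that fits only
+ * within the right interval (lower bound >= rightLower) goes to the right
+ * group, and one that fits within both becomes a "common entry" to be
+ * placed later by penalty.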
+ */ + + /* Allocate vectors for results */ + v->spl_left = (OffsetNumber *) palloc(nentries * sizeof(OffsetNumber)); + v->spl_right = (OffsetNumber *) palloc(nentries * sizeof(OffsetNumber)); + v->spl_nleft = 0; + v->spl_nright = 0; + + /* Allocate bounding boxes of left and right groups */ + leftBox = palloc0(sizeof(BOX)); + rightBox = palloc0(sizeof(BOX)); + + /* + * Allocate an array for "common entries" - entries which can be placed to + * either group without affecting overlap along selected axis. + */ + commonEntriesCount = 0; + commonEntries = (CommonEntry *) palloc(nentries * sizeof(CommonEntry)); + + /* Helper macros to place an entry in the left or right group */ +#define PLACE_LEFT(box, off) \ + do { \ + if (v->spl_nleft > 0) \ + adjustBox(leftBox, box); \ + else \ + *leftBox = *(box); \ + v->spl_left[v->spl_nleft++] = off; \ + } while(0) + +#define PLACE_RIGHT(box, off) \ + do { \ + if (v->spl_nright > 0) \ + adjustBox(rightBox, box); \ + else \ + *rightBox = *(box); \ + v->spl_right[v->spl_nright++] = off; \ + } while(0) + + /* + * Distribute entries which can be distributed unambiguously, and collect + * common entries. + */ + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + float8 lower, + upper; + + /* + * Get upper and lower bounds along selected axis. + */ + box = DatumGetBoxP(entryvec->vector[i].key); + if (context.dim == 0) + { + lower = box->low.x; + upper = box->high.x; + } + else + { + lower = box->low.y; + upper = box->high.y; + } + + if (float8_le(upper, context.leftUpper)) + { + /* Fits to the left group */ + if (float8_ge(lower, context.rightLower)) + { + /* Fits also to the right group, so "common entry" */ + commonEntries[commonEntriesCount++].index = i; + } + else + { + /* Doesn't fit to the right group, so join to the left group */ + PLACE_LEFT(box, i); + } + } + else + { + /* + * Each entry should fit on either left or right group. Since this + * entry didn't fit on the left group, it better fit in the right + * group. + */ + Assert(float8_ge(lower, context.rightLower)); + + /* Doesn't fit to the left group, so join to the right group */ + PLACE_RIGHT(box, i); + } + } + + /* + * Distribute "common entries", if any. + */ + if (commonEntriesCount > 0) + { + /* + * Calculate minimum number of entries that must be placed in both + * groups, to reach LIMIT_RATIO. + */ + int m = ceil(LIMIT_RATIO * nentries); + + /* + * Calculate delta between penalties of join "common entries" to + * different groups. + */ + for (i = 0; i < commonEntriesCount; i++) + { + box = DatumGetBoxP(entryvec->vector[commonEntries[i].index].key); + commonEntries[i].delta = Abs(float8_mi(box_penalty(leftBox, box), + box_penalty(rightBox, box))); + } + + /* + * Sort "common entries" by calculated deltas in order to distribute + * the most ambiguous entries first. + */ + qsort(commonEntries, commonEntriesCount, sizeof(CommonEntry), common_entry_cmp); + + /* + * Distribute "common entries" between groups. + */ + for (i = 0; i < commonEntriesCount; i++) + { + box = DatumGetBoxP(entryvec->vector[commonEntries[i].index].key); + + /* + * Check if we have to place this entry in either group to achieve + * LIMIT_RATIO. 
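+ *
+ * For example, with LIMIT_RATIO = 0.3 and nentries = 10 we get m = 3; if
+ * three common entries remain and the left group is still empty, all three
+ * must go to the left.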
+ */ + if (v->spl_nleft + (commonEntriesCount - i) <= m) + PLACE_LEFT(box, commonEntries[i].index); + else if (v->spl_nright + (commonEntriesCount - i) <= m) + PLACE_RIGHT(box, commonEntries[i].index); + else + { + /* Otherwise select the group by minimal penalty */ + if (box_penalty(leftBox, box) < box_penalty(rightBox, box)) + PLACE_LEFT(box, commonEntries[i].index); + else + PLACE_RIGHT(box, commonEntries[i].index); + } + } + } + + v->spl_ldatum = PointerGetDatum(leftBox); + v->spl_rdatum = PointerGetDatum(rightBox); + PG_RETURN_POINTER(v); +} + +/* + * Equality method + * + * This is used for boxes, points, circles, and polygons, all of which store + * boxes as GiST index entries. + * + * Returns true only when boxes are exactly the same. We can't use fuzzy + * comparisons here without breaking index consistency; therefore, this isn't + * equivalent to box_same(). + */ +Datum +gist_box_same(PG_FUNCTION_ARGS) +{ + BOX *b1 = PG_GETARG_BOX_P(0); + BOX *b2 = PG_GETARG_BOX_P(1); + bool *result = (bool *) PG_GETARG_POINTER(2); + + if (b1 && b2) + *result = (float8_eq(b1->low.x, b2->low.x) && + float8_eq(b1->low.y, b2->low.y) && + float8_eq(b1->high.x, b2->high.x) && + float8_eq(b1->high.y, b2->high.y)); + else + *result = (b1 == NULL && b2 == NULL); + PG_RETURN_POINTER(result); +} + +/* + * Leaf-level consistency for boxes: just apply the query operator + */ +static bool +gist_box_leaf_consistent(BOX *key, BOX *query, StrategyNumber strategy) +{ + bool retval; + + switch (strategy) + { + case RTLeftStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_left, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverLeftStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_overleft, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverlapStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_overlap, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverRightStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_overright, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTRightStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_right, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTSameStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_same, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTContainsStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_contain, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTContainedByStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_contained, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverBelowStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_overbelow, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTBelowStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_below, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTAboveStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_above, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverAboveStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_overabove, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + retval = false; /* keep compiler quiet */ + break; + } + return retval; +} + +/***************************************** + * Common rtree functions (for boxes, polygons, and 
circles) + *****************************************/ + +/* + * Internal-page consistency for all these types + * + * We can use the same function since all types use bounding boxes as the + * internal-page representation. + */ +static bool +rtree_internal_consistent(BOX *key, BOX *query, StrategyNumber strategy) +{ + bool retval; + + switch (strategy) + { + case RTLeftStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_overright, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverLeftStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_right, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverlapStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_overlap, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverRightStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_left, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTRightStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_overleft, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTSameStrategyNumber: + case RTContainsStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_contain, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTContainedByStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_overlap, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverBelowStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_above, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTBelowStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_overabove, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTAboveStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_overbelow, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverAboveStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_below, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + retval = false; /* keep compiler quiet */ + break; + } + return retval; +} + +/************************************************** + * Polygon ops + **************************************************/ + +/* + * GiST compress for polygons: represent a polygon by its bounding box + */ +Datum +gist_poly_compress(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + GISTENTRY *retval; + + if (entry->leafkey) + { + POLYGON *in = DatumGetPolygonP(entry->key); + BOX *r; + + r = (BOX *) palloc(sizeof(BOX)); + memcpy((void *) r, (void *) &(in->boundbox), sizeof(BOX)); + + retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + gistentryinit(*retval, PointerGetDatum(r), + entry->rel, entry->page, + entry->offset, false); + } + else + retval = entry; + PG_RETURN_POINTER(retval); +} + +/* + * The GiST Consistent method for polygons + */ +Datum +gist_poly_consistent(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + POLYGON *query = PG_GETARG_POLYGON_P(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + + /* Oid subtype = PG_GETARG_OID(3); */ + bool *recheck = (bool *) PG_GETARG_POINTER(4); + bool result; + + /* All cases served by this function are inexact */ + *recheck = true; + + if (DatumGetBoxP(entry->key) == NULL || query == NULL) + PG_RETURN_BOOL(false); + + /* + * Since the operators require recheck anyway, we can just use + * 
rtree_internal_consistent even at leaf nodes. (This works in part + * because the index entries are bounding boxes not polygons.) + */ + result = rtree_internal_consistent(DatumGetBoxP(entry->key), + &(query->boundbox), strategy); + + /* Avoid memory leak if supplied poly is toasted */ + PG_FREE_IF_COPY(query, 1); + + PG_RETURN_BOOL(result); +} + +/************************************************** + * Circle ops + **************************************************/ + +/* + * GiST compress for circles: represent a circle by its bounding box + */ +Datum +gist_circle_compress(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + GISTENTRY *retval; + + if (entry->leafkey) + { + CIRCLE *in = DatumGetCircleP(entry->key); + BOX *r; + + r = (BOX *) palloc(sizeof(BOX)); + r->high.x = float8_pl(in->center.x, in->radius); + r->low.x = float8_mi(in->center.x, in->radius); + r->high.y = float8_pl(in->center.y, in->radius); + r->low.y = float8_mi(in->center.y, in->radius); + + retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + gistentryinit(*retval, PointerGetDatum(r), + entry->rel, entry->page, + entry->offset, false); + } + else + retval = entry; + PG_RETURN_POINTER(retval); +} + +/* + * The GiST Consistent method for circles + */ +Datum +gist_circle_consistent(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + CIRCLE *query = PG_GETARG_CIRCLE_P(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + + /* Oid subtype = PG_GETARG_OID(3); */ + bool *recheck = (bool *) PG_GETARG_POINTER(4); + BOX bbox; + bool result; + + /* All cases served by this function are inexact */ + *recheck = true; + + if (DatumGetBoxP(entry->key) == NULL || query == NULL) + PG_RETURN_BOOL(false); + + /* + * Since the operators require recheck anyway, we can just use + * rtree_internal_consistent even at leaf nodes. (This works in part + * because the index entries are bounding boxes not circles.) + */ + bbox.high.x = float8_pl(query->center.x, query->radius); + bbox.low.x = float8_mi(query->center.x, query->radius); + bbox.high.y = float8_pl(query->center.y, query->radius); + bbox.low.y = float8_mi(query->center.y, query->radius); + + result = rtree_internal_consistent(DatumGetBoxP(entry->key), + &bbox, strategy); + + PG_RETURN_BOOL(result); +} + +/************************************************** + * Point ops + **************************************************/ + +Datum +gist_point_compress(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + + if (entry->leafkey) /* Point, actually */ + { + BOX *box = palloc(sizeof(BOX)); + Point *point = DatumGetPointP(entry->key); + GISTENTRY *retval = palloc(sizeof(GISTENTRY)); + + box->high = box->low = *point; + + gistentryinit(*retval, BoxPGetDatum(box), + entry->rel, entry->page, entry->offset, false); + + PG_RETURN_POINTER(retval); + } + + PG_RETURN_POINTER(entry); +} + +/* + * GiST Fetch method for point + * + * Get point coordinates from its bounding box coordinates and form new + * gistentry. 
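+ * (Leaf keys for points are degenerate boxes with high == low -- see
+ * gist_point_compress above -- so either corner yields the original point;
+ * the code below reads it from "high".)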
+ */ +Datum +gist_point_fetch(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + BOX *in = DatumGetBoxP(entry->key); + Point *r; + GISTENTRY *retval; + + retval = palloc(sizeof(GISTENTRY)); + + r = (Point *) palloc(sizeof(Point)); + r->x = in->high.x; + r->y = in->high.y; + gistentryinit(*retval, PointerGetDatum(r), + entry->rel, entry->page, + entry->offset, false); + + PG_RETURN_POINTER(retval); +} + + +#define point_point_distance(p1,p2) \ + DatumGetFloat8(DirectFunctionCall2(point_distance, \ + PointPGetDatum(p1), PointPGetDatum(p2))) + +static float8 +computeDistance(bool isLeaf, BOX *box, Point *point) +{ + float8 result = 0.0; + + if (isLeaf) + { + /* simple point to point distance */ + result = point_point_distance(point, &box->low); + } + else if (point->x <= box->high.x && point->x >= box->low.x && + point->y <= box->high.y && point->y >= box->low.y) + { + /* point inside the box */ + result = 0.0; + } + else if (point->x <= box->high.x && point->x >= box->low.x) + { + /* point is over or below box */ + Assert(box->low.y <= box->high.y); + if (point->y > box->high.y) + result = float8_mi(point->y, box->high.y); + else if (point->y < box->low.y) + result = float8_mi(box->low.y, point->y); + else + elog(ERROR, "inconsistent point values"); + } + else if (point->y <= box->high.y && point->y >= box->low.y) + { + /* point is to left or right of box */ + Assert(box->low.x <= box->high.x); + if (point->x > box->high.x) + result = float8_mi(point->x, box->high.x); + else if (point->x < box->low.x) + result = float8_mi(box->low.x, point->x); + else + elog(ERROR, "inconsistent point values"); + } + else + { + /* closest point will be a vertex */ + Point p; + float8 subresult; + + result = point_point_distance(point, &box->low); + + subresult = point_point_distance(point, &box->high); + if (result > subresult) + result = subresult; + + p.x = box->low.x; + p.y = box->high.y; + subresult = point_point_distance(point, &p); + if (result > subresult) + result = subresult; + + p.x = box->high.x; + p.y = box->low.y; + subresult = point_point_distance(point, &p); + if (result > subresult) + result = subresult; + } + + return result; +} + +static bool +gist_point_consistent_internal(StrategyNumber strategy, + bool isLeaf, BOX *key, Point *query) +{ + bool result = false; + + switch (strategy) + { + case RTLeftStrategyNumber: + result = FPlt(key->low.x, query->x); + break; + case RTRightStrategyNumber: + result = FPgt(key->high.x, query->x); + break; + case RTAboveStrategyNumber: + result = FPgt(key->high.y, query->y); + break; + case RTBelowStrategyNumber: + result = FPlt(key->low.y, query->y); + break; + case RTSameStrategyNumber: + if (isLeaf) + { + /* key.high must equal key.low, so we can disregard it */ + result = (FPeq(key->low.x, query->x) && + FPeq(key->low.y, query->y)); + } + else + { + result = (FPle(query->x, key->high.x) && + FPge(query->x, key->low.x) && + FPle(query->y, key->high.y) && + FPge(query->y, key->low.y)); + } + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + result = false; /* keep compiler quiet */ + break; + } + + return result; +} + +#define GeoStrategyNumberOffset 20 +#define PointStrategyNumberGroup 0 +#define BoxStrategyNumberGroup 1 +#define PolygonStrategyNumberGroup 2 +#define CircleStrategyNumberGroup 3 + +Datum +gist_point_consistent(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + bool *recheck = (bool *) 
PG_GETARG_POINTER(4); + bool result; + StrategyNumber strategyGroup; + + /* + * We have to remap these strategy numbers to get this klugy + * classification logic to work. + */ + if (strategy == RTOldBelowStrategyNumber) + strategy = RTBelowStrategyNumber; + else if (strategy == RTOldAboveStrategyNumber) + strategy = RTAboveStrategyNumber; + + strategyGroup = strategy / GeoStrategyNumberOffset; + switch (strategyGroup) + { + case PointStrategyNumberGroup: + result = gist_point_consistent_internal(strategy % GeoStrategyNumberOffset, + GIST_LEAF(entry), + DatumGetBoxP(entry->key), + PG_GETARG_POINT_P(1)); + *recheck = false; + break; + case BoxStrategyNumberGroup: + { + /* + * The only operator in this group is point <@ box (on_pb), so + * we needn't examine strategy again. + * + * For historical reasons, on_pb uses exact rather than fuzzy + * comparisons. We could use box_overlap when at an internal + * page, but that would lead to possibly visiting child pages + * uselessly, because box_overlap uses fuzzy comparisons. + * Instead we write a non-fuzzy overlap test. The same code + * will also serve for leaf-page tests, since leaf keys have + * high == low. + */ + BOX *query, + *key; + + query = PG_GETARG_BOX_P(1); + key = DatumGetBoxP(entry->key); + + result = (key->high.x >= query->low.x && + key->low.x <= query->high.x && + key->high.y >= query->low.y && + key->low.y <= query->high.y); + *recheck = false; + } + break; + case PolygonStrategyNumberGroup: + { + POLYGON *query = PG_GETARG_POLYGON_P(1); + + result = DatumGetBool(DirectFunctionCall5(gist_poly_consistent, + PointerGetDatum(entry), + PolygonPGetDatum(query), + Int16GetDatum(RTOverlapStrategyNumber), + 0, PointerGetDatum(recheck))); + + if (GIST_LEAF(entry) && result) + { + /* + * We are on leaf page and quick check shows overlapping + * of polygon's bounding box and point + */ + BOX *box = DatumGetBoxP(entry->key); + + Assert(box->high.x == box->low.x + && box->high.y == box->low.y); + result = DatumGetBool(DirectFunctionCall2(poly_contain_pt, + PolygonPGetDatum(query), + PointPGetDatum(&box->high))); + *recheck = false; + } + } + break; + case CircleStrategyNumberGroup: + { + CIRCLE *query = PG_GETARG_CIRCLE_P(1); + + result = DatumGetBool(DirectFunctionCall5(gist_circle_consistent, + PointerGetDatum(entry), + CirclePGetDatum(query), + Int16GetDatum(RTOverlapStrategyNumber), + 0, PointerGetDatum(recheck))); + + if (GIST_LEAF(entry) && result) + { + /* + * We are on leaf page and quick check shows overlapping + * of polygon's bounding box and point + */ + BOX *box = DatumGetBoxP(entry->key); + + Assert(box->high.x == box->low.x + && box->high.y == box->low.y); + result = DatumGetBool(DirectFunctionCall2(circle_contain_pt, + CirclePGetDatum(query), + PointPGetDatum(&box->high))); + *recheck = false; + } + } + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + result = false; /* keep compiler quiet */ + break; + } + + PG_RETURN_BOOL(result); +} + +Datum +gist_point_distance(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + float8 distance; + StrategyNumber strategyGroup = strategy / GeoStrategyNumberOffset; + + switch (strategyGroup) + { + case PointStrategyNumberGroup: + distance = computeDistance(GIST_LEAF(entry), + DatumGetBoxP(entry->key), + PG_GETARG_POINT_P(1)); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + distance = 0.0; /* keep compiler quiet */ + break; + } + + 
PG_RETURN_FLOAT8(distance); +} + +static float8 +gist_bbox_distance(GISTENTRY *entry, Datum query, StrategyNumber strategy) +{ + float8 distance; + StrategyNumber strategyGroup = strategy / GeoStrategyNumberOffset; + + switch (strategyGroup) + { + case PointStrategyNumberGroup: + distance = computeDistance(false, + DatumGetBoxP(entry->key), + DatumGetPointP(query)); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + distance = 0.0; /* keep compiler quiet */ + } + + return distance; +} + +Datum +gist_box_distance(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + Datum query = PG_GETARG_DATUM(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + + /* Oid subtype = PG_GETARG_OID(3); */ + /* bool *recheck = (bool *) PG_GETARG_POINTER(4); */ + float8 distance; + + distance = gist_bbox_distance(entry, query, strategy); + + PG_RETURN_FLOAT8(distance); +} + +/* + * The inexact GiST distance methods for geometric types that store bounding + * boxes. + * + * Compute lossy distance from point to index entries. The result is inexact + * because index entries are bounding boxes, not the exact shapes of the + * indexed geometric types. We use distance from point to MBR of index entry. + * This is a lower bound estimate of distance from point to indexed geometric + * type. + */ +Datum +gist_circle_distance(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + Datum query = PG_GETARG_DATUM(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + + /* Oid subtype = PG_GETARG_OID(3); */ + bool *recheck = (bool *) PG_GETARG_POINTER(4); + float8 distance; + + distance = gist_bbox_distance(entry, query, strategy); + *recheck = true; + + PG_RETURN_FLOAT8(distance); +} + +Datum +gist_poly_distance(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + Datum query = PG_GETARG_DATUM(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + + /* Oid subtype = PG_GETARG_OID(3); */ + bool *recheck = (bool *) PG_GETARG_POINTER(4); + float8 distance; + + distance = gist_bbox_distance(entry, query, strategy); + *recheck = true; + + PG_RETURN_FLOAT8(distance); +} + +/* + * Z-order routines for fast index build + */ + +/* + * Compute Z-value of a point + * + * Z-order (also known as Morton Code) maps a two-dimensional point to a + * single integer, in a way that preserves locality. Points that are close in + * the two-dimensional space are mapped to integer that are not far from each + * other. We do that by interleaving the bits in the X and Y components. + * + * Morton Code is normally defined only for integers, but the X and Y values + * of a point are floating point. We expect floats to be in IEEE format. 
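+ *
+ * As a small illustration of the interleaving: reading from the least
+ * significant end, bit 2i of the Z-value comes from bit i of the converted
+ * X value and bit 2i+1 from bit i of the converted Y value, so X bits ...10
+ * and Y bits ...11 interleave to ...1110.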
+ */ +static uint64 +point_zorder_internal(float4 x, float4 y) +{ + uint32 ix = ieee_float32_to_uint32(x); + uint32 iy = ieee_float32_to_uint32(y); + + /* Interleave the bits */ + return part_bits32_by2(ix) | (part_bits32_by2(iy) << 1); +} + +/* Interleave 32 bits with zeroes */ +static uint64 +part_bits32_by2(uint32 x) +{ + uint64 n = x; + + n = (n | (n << 16)) & UINT64CONST(0x0000FFFF0000FFFF); + n = (n | (n << 8)) & UINT64CONST(0x00FF00FF00FF00FF); + n = (n | (n << 4)) & UINT64CONST(0x0F0F0F0F0F0F0F0F); + n = (n | (n << 2)) & UINT64CONST(0x3333333333333333); + n = (n | (n << 1)) & UINT64CONST(0x5555555555555555); + + return n; +} + +/* + * Convert a 32-bit IEEE float to uint32 in a way that preserves the ordering + */ +static uint32 +ieee_float32_to_uint32(float f) +{ + /*---- + * + * IEEE 754 floating point format + * ------------------------------ + * + * IEEE 754 floating point numbers have this format: + * + * exponent (8 bits) + * | + * s eeeeeeee mmmmmmmmmmmmmmmmmmmmmmm + * | | + * sign mantissa (23 bits) + * + * Infinity has all bits in the exponent set and the mantissa is all + * zeros. Negative infinity is the same but with the sign bit set. + * + * NaNs are represented with all bits in the exponent set, and the least + * significant bit in the mantissa also set. The rest of the mantissa bits + * can be used to distinguish different kinds of NaNs. + * + * The IEEE format has the nice property that when you take the bit + * representation and interpret it as an integer, the order is preserved, + * except for the sign. That holds for the +-Infinity values too. + * + * Mapping to uint32 + * ----------------- + * + * In order to have a smooth transition from negative to positive numbers, + * we map floats to unsigned integers like this: + * + * x < 0 to range 0-7FFFFFFF + * x = 0 to value 8000000 (both positive and negative zero) + * x > 0 to range 8000001-FFFFFFFF + * + * We don't care to distinguish different kind of NaNs, so they are all + * mapped to the same arbitrary value, FFFFFFFF. Because of the IEEE bit + * representation of NaNs, there aren't any non-NaN values that would be + * mapped to FFFFFFFF. In fact, there is a range of unused values on both + * ends of the uint32 space. + */ + if (isnan(f)) + return 0xFFFFFFFF; + else + { + union + { + float f; + uint32 i; + } u; + + u.f = f; + + /* Check the sign bit */ + if ((u.i & 0x80000000) != 0) + { + /* + * Map the negative value to range 0-7FFFFFFF. This flips the sign + * bit to 0 in the same instruction. + */ + Assert(f <= 0); /* can be -0 */ + u.i ^= 0xFFFFFFFF; + } + else + { + /* Map the positive value (or 0) to range 80000000-FFFFFFFF */ + u.i |= 0x80000000; + } + + return u.i; + } +} + +/* + * Compare the Z-order of points + */ +static int +gist_bbox_zorder_cmp(Datum a, Datum b, SortSupport ssup) +{ + Point *p1 = &(DatumGetBoxP(a)->low); + Point *p2 = &(DatumGetBoxP(b)->low); + uint64 z1; + uint64 z2; + + /* + * Do a quick check for equality first. It's not clear if this is worth it + * in general, but certainly is when used as tie-breaker with abbreviated + * keys, + */ + if (p1->x == p2->x && p1->y == p2->y) + return 0; + + z1 = point_zorder_internal(p1->x, p1->y); + z2 = point_zorder_internal(p2->x, p2->y); + if (z1 > z2) + return 1; + else if (z1 < z2) + return -1; + else + return 0; +} + +/* + * Abbreviated version of Z-order comparison + * + * The abbreviated format is a Z-order value computed from the two 32-bit + * floats. 
If SIZEOF_DATUM == 8, the 64-bit Z-order value fits fully in the + * abbreviated Datum, otherwise use its most significant bits. + */ +static Datum +gist_bbox_zorder_abbrev_convert(Datum original, SortSupport ssup) +{ + Point *p = &(DatumGetBoxP(original)->low); + uint64 z; + + z = point_zorder_internal(p->x, p->y); + +#if SIZEOF_DATUM == 8 + return (Datum) z; +#else + return (Datum) (z >> 32); +#endif +} + +static int +gist_bbox_zorder_cmp_abbrev(Datum z1, Datum z2, SortSupport ssup) +{ + /* + * Compare the pre-computed Z-orders as unsigned integers. Datum is a + * typedef for 'uintptr_t', so no casting is required. + */ + if (z1 > z2) + return 1; + else if (z1 < z2) + return -1; + else + return 0; +} + +/* + * We never consider aborting the abbreviation. + * + * On 64-bit systems, the abbreviation is not lossy so it is always + * worthwhile. (Perhaps it's not on 32-bit systems, but we don't bother + * with logic to decide.) + */ +static bool +gist_bbox_zorder_abbrev_abort(int memtupcount, SortSupport ssup) +{ + return false; +} + +/* + * Sort support routine for fast GiST index build by sorting. + */ +Datum +gist_point_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + if (ssup->abbreviate) + { + ssup->comparator = gist_bbox_zorder_cmp_abbrev; + ssup->abbrev_converter = gist_bbox_zorder_abbrev_convert; + ssup->abbrev_abort = gist_bbox_zorder_abbrev_abort; + ssup->abbrev_full_comparator = gist_bbox_zorder_cmp; + } + else + { + ssup->comparator = gist_bbox_zorder_cmp; + } + PG_RETURN_VOID(); +} diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c new file mode 100644 index 0000000..61e92cf --- /dev/null +++ b/src/backend/access/gist/gistscan.c @@ -0,0 +1,358 @@ +/*------------------------------------------------------------------------- + * + * gistscan.c + * routines to manage scans on GiST index relations + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistscan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/gist_private.h" +#include "access/gistscan.h" +#include "access/relscan.h" +#include "utils/float.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +/* + * Pairing heap comparison function for the GISTSearchItem queue + */ +static int +pairingheap_GISTSearchItem_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg) +{ + const GISTSearchItem *sa = (const GISTSearchItem *) a; + const GISTSearchItem *sb = (const GISTSearchItem *) b; + IndexScanDesc scan = (IndexScanDesc) arg; + int i; + + /* Order according to distance comparison */ + for (i = 0; i < scan->numberOfOrderBys; i++) + { + if (sa->distances[i].isnull) + { + if (!sb->distances[i].isnull) + return -1; + } + else if (sb->distances[i].isnull) + { + return 1; + } + else + { + int cmp = -float8_cmp_internal(sa->distances[i].value, + sb->distances[i].value); + + if (cmp != 0) + return cmp; + } + } + + /* Heap items go before inner pages, to ensure a depth-first search */ + if (GISTSearchItemIsHeap(*sa) && !GISTSearchItemIsHeap(*sb)) + return 1; + if (!GISTSearchItemIsHeap(*sa) && GISTSearchItemIsHeap(*sb)) + return -1; + + return 0; +} + + +/* + * Index AM API functions for scanning GiST indexes + */ + +IndexScanDesc +gistbeginscan(Relation r, int nkeys, int 
norderbys) +{ + IndexScanDesc scan; + GISTSTATE *giststate; + GISTScanOpaque so; + MemoryContext oldCxt; + + scan = RelationGetIndexScan(r, nkeys, norderbys); + + /* First, set up a GISTSTATE with a scan-lifespan memory context */ + giststate = initGISTstate(scan->indexRelation); + + /* + * Everything made below is in the scanCxt, or is a child of the scanCxt, + * so it'll all go away automatically in gistendscan. + */ + oldCxt = MemoryContextSwitchTo(giststate->scanCxt); + + /* initialize opaque data */ + so = (GISTScanOpaque) palloc0(sizeof(GISTScanOpaqueData)); + so->giststate = giststate; + giststate->tempCxt = createTempGistContext(); + so->queue = NULL; + so->queueCxt = giststate->scanCxt; /* see gistrescan */ + + /* workspaces with size dependent on numberOfOrderBys: */ + so->distances = palloc(sizeof(so->distances[0]) * scan->numberOfOrderBys); + so->qual_ok = true; /* in case there are zero keys */ + if (scan->numberOfOrderBys > 0) + { + scan->xs_orderbyvals = palloc0(sizeof(Datum) * scan->numberOfOrderBys); + scan->xs_orderbynulls = palloc(sizeof(bool) * scan->numberOfOrderBys); + memset(scan->xs_orderbynulls, true, sizeof(bool) * scan->numberOfOrderBys); + } + + so->killedItems = NULL; /* until needed */ + so->numKilled = 0; + so->curBlkno = InvalidBlockNumber; + so->curPageLSN = InvalidXLogRecPtr; + + scan->opaque = so; + + /* + * All fields required for index-only scans are initialized in gistrescan, + * as we don't know yet if we're doing an index-only scan or not. + */ + + MemoryContextSwitchTo(oldCxt); + + return scan; +} + +void +gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, + ScanKey orderbys, int norderbys) +{ + /* nkeys and norderbys arguments are ignored */ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + bool first_time; + int i; + MemoryContext oldCxt; + + /* rescan an existing indexscan --- reset state */ + + /* + * The first time through, we create the search queue in the scanCxt. + * Subsequent times through, we create the queue in a separate queueCxt, + * which is created on the second call and reset on later calls. Thus, in + * the common case where a scan is only rescan'd once, we just put the + * queue in scanCxt and don't pay the overhead of making a second memory + * context. If we do rescan more than once, the first queue is just left + * for dead until end of scan; this small wastage seems worth the savings + * in the common case. + */ + if (so->queue == NULL) + { + /* first time through */ + Assert(so->queueCxt == so->giststate->scanCxt); + first_time = true; + } + else if (so->queueCxt == so->giststate->scanCxt) + { + /* second time through */ + so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt, + "GiST queue context", + ALLOCSET_DEFAULT_SIZES); + first_time = false; + } + else + { + /* third or later time through */ + MemoryContextReset(so->queueCxt); + first_time = false; + } + + /* + * If we're doing an index-only scan, on the first call, also initialize a + * tuple descriptor to represent the returned index tuples and create a + * memory context to hold them during the scan. + */ + if (scan->xs_want_itup && !scan->xs_hitupdesc) + { + int natts; + int nkeyatts; + int attno; + + /* + * The storage type of the index can be different from the original + * datatype being indexed, so we cannot just grab the index's tuple + * descriptor. Instead, construct a descriptor with the original data + * types. 
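+ * (For instance, the point opclass stores BOX keys on disk -- see
+ * gist_point_compress in gistproc.c -- while an index-only scan must return
+ * point values, which is why the descriptor is built from rd_opcintype
+ * rather than from the index's own tuple descriptor.)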
+ */ + natts = RelationGetNumberOfAttributes(scan->indexRelation); + nkeyatts = IndexRelationGetNumberOfKeyAttributes(scan->indexRelation); + so->giststate->fetchTupdesc = CreateTemplateTupleDesc(natts); + for (attno = 1; attno <= nkeyatts; attno++) + { + TupleDescInitEntry(so->giststate->fetchTupdesc, attno, NULL, + scan->indexRelation->rd_opcintype[attno - 1], + -1, 0); + } + + for (; attno <= natts; attno++) + { + /* taking opcintype from giststate->tupdesc */ + TupleDescInitEntry(so->giststate->fetchTupdesc, attno, NULL, + TupleDescAttr(so->giststate->leafTupdesc, + attno - 1)->atttypid, + -1, 0); + } + scan->xs_hitupdesc = so->giststate->fetchTupdesc; + + /* Also create a memory context that will hold the returned tuples */ + so->pageDataCxt = AllocSetContextCreate(so->giststate->scanCxt, + "GiST page data context", + ALLOCSET_DEFAULT_SIZES); + } + + /* create new, empty pairing heap for search queue */ + oldCxt = MemoryContextSwitchTo(so->queueCxt); + so->queue = pairingheap_allocate(pairingheap_GISTSearchItem_cmp, scan); + MemoryContextSwitchTo(oldCxt); + + so->firstCall = true; + + /* Update scan key, if a new one is given */ + if (key && scan->numberOfKeys > 0) + { + void **fn_extras = NULL; + + /* + * If this isn't the first time through, preserve the fn_extra + * pointers, so that if the consistentFns are using them to cache + * data, that data is not leaked across a rescan. + */ + if (!first_time) + { + fn_extras = (void **) palloc(scan->numberOfKeys * sizeof(void *)); + for (i = 0; i < scan->numberOfKeys; i++) + fn_extras[i] = scan->keyData[i].sk_func.fn_extra; + } + + memmove(scan->keyData, key, + scan->numberOfKeys * sizeof(ScanKeyData)); + + /* + * Modify the scan key so that the Consistent method is called for all + * comparisons. The original operator is passed to the Consistent + * function in the form of its strategy number, which is available + * from the sk_strategy field, and its subtype from the sk_subtype + * field. + * + * Next, if any of keys is a NULL and that key is not marked with + * SK_SEARCHNULL/SK_SEARCHNOTNULL then nothing can be found (ie, we + * assume all indexable operators are strict). + */ + so->qual_ok = true; + + for (i = 0; i < scan->numberOfKeys; i++) + { + ScanKey skey = scan->keyData + i; + + /* + * Copy consistent support function to ScanKey structure instead + * of function implementing filtering operator. + */ + fmgr_info_copy(&(skey->sk_func), + &(so->giststate->consistentFn[skey->sk_attno - 1]), + so->giststate->scanCxt); + + /* Restore prior fn_extra pointers, if not first time */ + if (!first_time) + skey->sk_func.fn_extra = fn_extras[i]; + + if (skey->sk_flags & SK_ISNULL) + { + if (!(skey->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL))) + so->qual_ok = false; + } + } + + if (!first_time) + pfree(fn_extras); + } + + /* Update order-by key, if a new one is given */ + if (orderbys && scan->numberOfOrderBys > 0) + { + void **fn_extras = NULL; + + /* As above, preserve fn_extra if not first time through */ + if (!first_time) + { + fn_extras = (void **) palloc(scan->numberOfOrderBys * sizeof(void *)); + for (i = 0; i < scan->numberOfOrderBys; i++) + fn_extras[i] = scan->orderByData[i].sk_func.fn_extra; + } + + memmove(scan->orderByData, orderbys, + scan->numberOfOrderBys * sizeof(ScanKeyData)); + + so->orderByTypes = (Oid *) palloc(scan->numberOfOrderBys * sizeof(Oid)); + + /* + * Modify the order-by key so that the Distance method is called for + * all comparisons. 
The original operator is passed to the Distance + * function in the form of its strategy number, which is available + * from the sk_strategy field, and its subtype from the sk_subtype + * field. + */ + for (i = 0; i < scan->numberOfOrderBys; i++) + { + ScanKey skey = scan->orderByData + i; + FmgrInfo *finfo = &(so->giststate->distanceFn[skey->sk_attno - 1]); + + /* Check we actually have a distance function ... */ + if (!OidIsValid(finfo->fn_oid)) + elog(ERROR, "missing support function %d for attribute %d of index \"%s\"", + GIST_DISTANCE_PROC, skey->sk_attno, + RelationGetRelationName(scan->indexRelation)); + + /* + * Look up the datatype returned by the original ordering + * operator. GiST always uses a float8 for the distance function, + * but the ordering operator could be anything else. + * + * XXX: The distance function is only allowed to be lossy if the + * ordering operator's result type is float4 or float8. Otherwise + * we don't know how to return the distance to the executor. But + * we cannot check that here, as we won't know if the distance + * function is lossy until it returns *recheck = true for the + * first time. + */ + so->orderByTypes[i] = get_func_rettype(skey->sk_func.fn_oid); + + /* + * Copy distance support function to ScanKey structure instead of + * function implementing ordering operator. + */ + fmgr_info_copy(&(skey->sk_func), finfo, so->giststate->scanCxt); + + /* Restore prior fn_extra pointers, if not first time */ + if (!first_time) + skey->sk_func.fn_extra = fn_extras[i]; + } + + if (!first_time) + pfree(fn_extras); + } + + /* any previous xs_hitup will have been pfree'd in context resets above */ + scan->xs_hitup = NULL; +} + +void +gistendscan(IndexScanDesc scan) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + + /* + * freeGISTstate is enough to clean up everything made by gistbeginscan, + * as well as the queueCxt if there is a separate context for it. + */ + freeGISTstate(so->giststate); +} diff --git a/src/backend/access/gist/gistsplit.c b/src/backend/access/gist/gistsplit.c new file mode 100644 index 0000000..526ed12 --- /dev/null +++ b/src/backend/access/gist/gistsplit.c @@ -0,0 +1,779 @@ +/*------------------------------------------------------------------------- + * + * gistsplit.c + * Multi-column page splitting algorithm + * + * This file is concerned with making good page-split decisions in multi-column + * GiST indexes. The opclass-specific picksplit functions can only be expected + * to produce answers based on a single column. We first run the picksplit + * function for column 1; then, if there are more columns, we check if any of + * the tuples are "don't cares" so far as the column 1 split is concerned + * (that is, they could go to either side for no additional penalty). If so, + * we try to redistribute those tuples on the basis of the next column. + * Repeat till we're out of columns. + * + * gistSplitByKey() is the entry point to this file. 
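+ *
+ * For example (illustrative only), in a two-column index on (box, int4),
+ * column 1's picksplit may leave some tuples with zero penalty toward
+ * either page; those "don't care" tuples are then redistributed on the
+ * basis of the int4 column instead.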
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistsplit.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/gist_private.h" +#include "utils/rel.h" + +typedef struct +{ + OffsetNumber *entries; + int len; + Datum *attr; + bool *isnull; + bool *dontcare; +} GistSplitUnion; + + +/* + * Form unions of subkeys in itvec[] entries listed in gsvp->entries[], + * ignoring any tuples that are marked in gsvp->dontcare[]. Subroutine for + * gistunionsubkey. + */ +static void +gistunionsubkeyvec(GISTSTATE *giststate, IndexTuple *itvec, + GistSplitUnion *gsvp) +{ + IndexTuple *cleanedItVec; + int i, + cleanedLen = 0; + + cleanedItVec = (IndexTuple *) palloc(sizeof(IndexTuple) * gsvp->len); + + for (i = 0; i < gsvp->len; i++) + { + if (gsvp->dontcare && gsvp->dontcare[gsvp->entries[i]]) + continue; + + cleanedItVec[cleanedLen++] = itvec[gsvp->entries[i] - 1]; + } + + gistMakeUnionItVec(giststate, cleanedItVec, cleanedLen, + gsvp->attr, gsvp->isnull); + + pfree(cleanedItVec); +} + +/* + * Recompute unions of left- and right-side subkeys after a page split, + * ignoring any tuples that are marked in spl->spl_dontcare[]. + * + * Note: we always recompute union keys for all index columns. In some cases + * this might represent duplicate work for the leftmost column(s), but it's + * not safe to assume that "zero penalty to move a tuple" means "the union + * key doesn't change at all". Penalty functions aren't 100% accurate. + */ +static void +gistunionsubkey(GISTSTATE *giststate, IndexTuple *itvec, GistSplitVector *spl) +{ + GistSplitUnion gsvp; + + gsvp.dontcare = spl->spl_dontcare; + + gsvp.entries = spl->splitVector.spl_left; + gsvp.len = spl->splitVector.spl_nleft; + gsvp.attr = spl->spl_lattr; + gsvp.isnull = spl->spl_lisnull; + + gistunionsubkeyvec(giststate, itvec, &gsvp); + + gsvp.entries = spl->splitVector.spl_right; + gsvp.len = spl->splitVector.spl_nright; + gsvp.attr = spl->spl_rattr; + gsvp.isnull = spl->spl_risnull; + + gistunionsubkeyvec(giststate, itvec, &gsvp); +} + +/* + * Find tuples that are "don't cares", that is could be moved to the other + * side of the split with zero penalty, so far as the attno column is + * concerned. + * + * Don't-care tuples are marked by setting the corresponding entry in + * spl->spl_dontcare[] to "true". Caller must have initialized that array + * to zeroes. + * + * Returns number of don't-cares found. + */ +static int +findDontCares(Relation r, GISTSTATE *giststate, GISTENTRY *valvec, + GistSplitVector *spl, int attno) +{ + int i; + GISTENTRY entry; + int NumDontCare = 0; + + /* + * First, search the left-side tuples to see if any have zero penalty to + * be added to the right-side union key. 
+ * + * attno column is known all-not-null (see gistSplitByKey), so we need not + * check for nulls + */ + gistentryinit(entry, spl->splitVector.spl_rdatum, r, NULL, + (OffsetNumber) 0, false); + for (i = 0; i < spl->splitVector.spl_nleft; i++) + { + int j = spl->splitVector.spl_left[i]; + float penalty = gistpenalty(giststate, attno, &entry, false, + &valvec[j], false); + + if (penalty == 0.0) + { + spl->spl_dontcare[j] = true; + NumDontCare++; + } + } + + /* And conversely for the right-side tuples */ + gistentryinit(entry, spl->splitVector.spl_ldatum, r, NULL, + (OffsetNumber) 0, false); + for (i = 0; i < spl->splitVector.spl_nright; i++) + { + int j = spl->splitVector.spl_right[i]; + float penalty = gistpenalty(giststate, attno, &entry, false, + &valvec[j], false); + + if (penalty == 0.0) + { + spl->spl_dontcare[j] = true; + NumDontCare++; + } + } + + return NumDontCare; +} + +/* + * Remove tuples that are marked don't-cares from the tuple index array a[] + * of length *len. This is applied separately to the spl_left and spl_right + * arrays. + */ +static void +removeDontCares(OffsetNumber *a, int *len, const bool *dontcare) +{ + int origlen, + newlen, + i; + OffsetNumber *curwpos; + + origlen = newlen = *len; + curwpos = a; + for (i = 0; i < origlen; i++) + { + OffsetNumber ai = a[i]; + + if (dontcare[ai] == false) + { + /* re-emit item into a[] */ + *curwpos = ai; + curwpos++; + } + else + newlen--; + } + + *len = newlen; +} + +/* + * Place a single don't-care tuple into either the left or right side of the + * split, according to which has least penalty for merging the tuple into + * the previously-computed union keys. We need consider only columns starting + * at attno. + */ +static void +placeOne(Relation r, GISTSTATE *giststate, GistSplitVector *v, + IndexTuple itup, OffsetNumber off, int attno) +{ + GISTENTRY identry[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + bool toLeft = true; + + gistDeCompressAtt(giststate, r, itup, NULL, (OffsetNumber) 0, + identry, isnull); + + for (; attno < giststate->nonLeafTupdesc->natts; attno++) + { + float lpenalty, + rpenalty; + GISTENTRY entry; + + gistentryinit(entry, v->spl_lattr[attno], r, NULL, 0, false); + lpenalty = gistpenalty(giststate, attno, &entry, v->spl_lisnull[attno], + identry + attno, isnull[attno]); + gistentryinit(entry, v->spl_rattr[attno], r, NULL, 0, false); + rpenalty = gistpenalty(giststate, attno, &entry, v->spl_risnull[attno], + identry + attno, isnull[attno]); + + if (lpenalty != rpenalty) + { + if (lpenalty > rpenalty) + toLeft = false; + break; + } + } + + if (toLeft) + v->splitVector.spl_left[v->splitVector.spl_nleft++] = off; + else + v->splitVector.spl_right[v->splitVector.spl_nright++] = off; +} + +#define SWAPVAR( s, d, t ) \ +do { \ + (t) = (s); \ + (s) = (d); \ + (d) = (t); \ +} while(0) + +/* + * Clean up when we did a secondary split but the user-defined PickSplit + * method didn't support it (leaving spl_ldatum_exists or spl_rdatum_exists + * true). + * + * We consider whether to swap the left and right outputs of the secondary + * split; this can be worthwhile if the penalty for merging those tuples into + * the previously chosen sets is less that way. + * + * In any case we must update the union datums for the current column by + * adding in the previous union keys (oldL/oldR), since the user-defined + * PickSplit method didn't do so. 
+ */ +static void +supportSecondarySplit(Relation r, GISTSTATE *giststate, int attno, + GIST_SPLITVEC *sv, Datum oldL, Datum oldR) +{ + bool leaveOnLeft = true, + tmpBool; + GISTENTRY entryL, + entryR, + entrySL, + entrySR; + + gistentryinit(entryL, oldL, r, NULL, 0, false); + gistentryinit(entryR, oldR, r, NULL, 0, false); + gistentryinit(entrySL, sv->spl_ldatum, r, NULL, 0, false); + gistentryinit(entrySR, sv->spl_rdatum, r, NULL, 0, false); + + if (sv->spl_ldatum_exists && sv->spl_rdatum_exists) + { + float penalty1, + penalty2; + + penalty1 = gistpenalty(giststate, attno, &entryL, false, &entrySL, false) + + gistpenalty(giststate, attno, &entryR, false, &entrySR, false); + penalty2 = gistpenalty(giststate, attno, &entryL, false, &entrySR, false) + + gistpenalty(giststate, attno, &entryR, false, &entrySL, false); + + if (penalty1 > penalty2) + leaveOnLeft = false; + } + else + { + GISTENTRY *entry1 = (sv->spl_ldatum_exists) ? &entryL : &entryR; + float penalty1, + penalty2; + + /* + * There is only one previously defined union, so we just choose swap + * or not by lowest penalty for that side. We can only get here if a + * secondary split happened to have all NULLs in its column in the + * tuples that the outer recursion level had assigned to one side. + * (Note that the null checks in gistSplitByKey don't prevent the + * case, because they'll only be checking tuples that were considered + * don't-cares at the outer recursion level, not the tuples that went + * into determining the passed-down left and right union keys.) + */ + penalty1 = gistpenalty(giststate, attno, entry1, false, &entrySL, false); + penalty2 = gistpenalty(giststate, attno, entry1, false, &entrySR, false); + + if (penalty1 < penalty2) + leaveOnLeft = (sv->spl_ldatum_exists) ? true : false; + else + leaveOnLeft = (sv->spl_rdatum_exists) ? true : false; + } + + if (leaveOnLeft == false) + { + /* + * swap left and right + */ + OffsetNumber *off, + noff; + Datum datum; + + SWAPVAR(sv->spl_left, sv->spl_right, off); + SWAPVAR(sv->spl_nleft, sv->spl_nright, noff); + SWAPVAR(sv->spl_ldatum, sv->spl_rdatum, datum); + gistentryinit(entrySL, sv->spl_ldatum, r, NULL, 0, false); + gistentryinit(entrySR, sv->spl_rdatum, r, NULL, 0, false); + } + + if (sv->spl_ldatum_exists) + gistMakeUnionKey(giststate, attno, &entryL, false, &entrySL, false, + &sv->spl_ldatum, &tmpBool); + + if (sv->spl_rdatum_exists) + gistMakeUnionKey(giststate, attno, &entryR, false, &entrySR, false, + &sv->spl_rdatum, &tmpBool); + + sv->spl_ldatum_exists = sv->spl_rdatum_exists = false; +} + +/* + * Trivial picksplit implementation. Function called only + * if user-defined picksplit puts all keys on the same side of the split. + * That is a bug of user-defined picksplit but we don't want to fail. 
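+ *
+ * The fallback below just assigns the first half of the entries to the left
+ * side and the remainder to the right, then computes each side's union
+ * datum with the opclass union function.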
+ */ +static void +genericPickSplit(GISTSTATE *giststate, GistEntryVector *entryvec, GIST_SPLITVEC *v, int attno) +{ + OffsetNumber i, + maxoff; + int nbytes; + GistEntryVector *evec; + + maxoff = entryvec->n - 1; + + nbytes = (maxoff + 2) * sizeof(OffsetNumber); + + v->spl_left = (OffsetNumber *) palloc(nbytes); + v->spl_right = (OffsetNumber *) palloc(nbytes); + v->spl_nleft = v->spl_nright = 0; + + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + if (i <= (maxoff - FirstOffsetNumber + 1) / 2) + { + v->spl_left[v->spl_nleft] = i; + v->spl_nleft++; + } + else + { + v->spl_right[v->spl_nright] = i; + v->spl_nright++; + } + } + + /* + * Form union datums for each side + */ + evec = palloc(sizeof(GISTENTRY) * entryvec->n + GEVHDRSZ); + + evec->n = v->spl_nleft; + memcpy(evec->vector, entryvec->vector + FirstOffsetNumber, + sizeof(GISTENTRY) * evec->n); + v->spl_ldatum = FunctionCall2Coll(&giststate->unionFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(evec), + PointerGetDatum(&nbytes)); + + evec->n = v->spl_nright; + memcpy(evec->vector, entryvec->vector + FirstOffsetNumber + v->spl_nleft, + sizeof(GISTENTRY) * evec->n); + v->spl_rdatum = FunctionCall2Coll(&giststate->unionFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(evec), + PointerGetDatum(&nbytes)); +} + +/* + * Calls user picksplit method for attno column to split tuples into + * two vectors. + * + * Returns false if split is complete (there are no more index columns, or + * there is no need to consider them because split is optimal already). + * + * Returns true and v->spl_dontcare = NULL if the picksplit result is + * degenerate (all tuples seem to be don't-cares), so we should just + * disregard this column and split on the next column(s) instead. + * + * Returns true and v->spl_dontcare != NULL if there are don't-care tuples + * that could be relocated based on the next column(s). The don't-care + * tuples have been removed from the split and must be reinserted by caller. + * There is at least one non-don't-care tuple on each side of the split, + * and union keys for all columns are updated to include just those tuples. + * + * A true result implies there is at least one more index column. + */ +static bool +gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVector *v, + IndexTuple *itup, int len, GISTSTATE *giststate) +{ + GIST_SPLITVEC *sv = &v->splitVector; + + /* + * Prepare spl_ldatum/spl_rdatum/spl_ldatum_exists/spl_rdatum_exists in + * case we are doing a secondary split (see comments in gist.h). + */ + sv->spl_ldatum_exists = (v->spl_lisnull[attno]) ? false : true; + sv->spl_rdatum_exists = (v->spl_risnull[attno]) ? false : true; + sv->spl_ldatum = v->spl_lattr[attno]; + sv->spl_rdatum = v->spl_rattr[attno]; + + /* + * Let the opclass-specific PickSplit method do its thing. Note that at + * this point we know there are no null keys in the entryvec. + */ + FunctionCall2Coll(&giststate->picksplitFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(entryvec), + PointerGetDatum(sv)); + + if (sv->spl_nleft == 0 || sv->spl_nright == 0) + { + /* + * User-defined picksplit failed to create an actual split, ie it put + * everything on the same side. Complain but cope. + */ + ereport(DEBUG1, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("picksplit method for column %d of index \"%s\" failed", + attno + 1, RelationGetRelationName(r)), + errhint("The index is not optimal. 
To optimize it, contact a developer, or try to use the column as the second one in the CREATE INDEX command."))); + + /* + * Reinit GIST_SPLITVEC. Although these fields are not used by + * genericPickSplit(), set them up for further processing + */ + sv->spl_ldatum_exists = (v->spl_lisnull[attno]) ? false : true; + sv->spl_rdatum_exists = (v->spl_risnull[attno]) ? false : true; + sv->spl_ldatum = v->spl_lattr[attno]; + sv->spl_rdatum = v->spl_rattr[attno]; + + /* Do a generic split */ + genericPickSplit(giststate, entryvec, sv, attno); + } + else + { + /* hack for compatibility with old picksplit API */ + if (sv->spl_left[sv->spl_nleft - 1] == InvalidOffsetNumber) + sv->spl_left[sv->spl_nleft - 1] = (OffsetNumber) (entryvec->n - 1); + if (sv->spl_right[sv->spl_nright - 1] == InvalidOffsetNumber) + sv->spl_right[sv->spl_nright - 1] = (OffsetNumber) (entryvec->n - 1); + } + + /* Clean up if PickSplit didn't take care of a secondary split */ + if (sv->spl_ldatum_exists || sv->spl_rdatum_exists) + supportSecondarySplit(r, giststate, attno, sv, + v->spl_lattr[attno], v->spl_rattr[attno]); + + /* emit union datums computed by PickSplit back to v arrays */ + v->spl_lattr[attno] = sv->spl_ldatum; + v->spl_rattr[attno] = sv->spl_rdatum; + v->spl_lisnull[attno] = false; + v->spl_risnull[attno] = false; + + /* + * If index columns remain, then consider whether we can improve the split + * by using them. + */ + v->spl_dontcare = NULL; + + if (attno + 1 < giststate->nonLeafTupdesc->natts) + { + int NumDontCare; + + /* + * Make a quick check to see if left and right union keys are equal; + * if so, the split is certainly degenerate, so tell caller to + * re-split with the next column. + */ + if (gistKeyIsEQ(giststate, attno, sv->spl_ldatum, sv->spl_rdatum)) + return true; + + /* + * Locate don't-care tuples, if any. If there are none, the split is + * optimal, so just fall out and return false. + */ + v->spl_dontcare = (bool *) palloc0(sizeof(bool) * (entryvec->n + 1)); + + NumDontCare = findDontCares(r, giststate, entryvec->vector, v, attno); + + if (NumDontCare > 0) + { + /* + * Remove don't-cares from spl_left[] and spl_right[]. + */ + removeDontCares(sv->spl_left, &sv->spl_nleft, v->spl_dontcare); + removeDontCares(sv->spl_right, &sv->spl_nright, v->spl_dontcare); + + /* + * If all tuples on either side were don't-cares, the split is + * degenerate, and we're best off to ignore it and split on the + * next column. (We used to try to press on with a secondary + * split by forcing a random tuple on each side to be treated as + * non-don't-care, but it seems unlikely that that technique + * really gives a better result. Note that we don't want to try a + * secondary split with empty left or right primary split sides, + * because then there is no union key on that side for the + * PickSplit function to try to expand, so it can have no good + * figure of merit for what it's doing. Also note that this check + * ensures we can't produce a bogus one-side-only split in the + * NumDontCare == 1 special case below.) + */ + if (sv->spl_nleft == 0 || sv->spl_nright == 0) + { + v->spl_dontcare = NULL; + return true; + } + + /* + * Recompute union keys, considering only non-don't-care tuples. + * NOTE: this will set union keys for remaining index columns, + * which will cause later calls of gistUserPicksplit to pass those + * values down to user-defined PickSplit methods with + * spl_ldatum_exists/spl_rdatum_exists set true. 
+ */ + gistunionsubkey(giststate, itup, v); + + if (NumDontCare == 1) + { + /* + * If there's only one don't-care tuple then we can't do a + * PickSplit on it, so just choose whether to send it left or + * right by comparing penalties. We needed the + * gistunionsubkey step anyway so that we have appropriate + * union keys for figuring the penalties. + */ + OffsetNumber toMove; + + /* find it ... */ + for (toMove = FirstOffsetNumber; toMove < entryvec->n; toMove++) + { + if (v->spl_dontcare[toMove]) + break; + } + Assert(toMove < entryvec->n); + + /* ... and assign it to cheaper side */ + placeOne(r, giststate, v, itup[toMove - 1], toMove, attno + 1); + + /* + * At this point the union keys are wrong, but we don't care + * because we're done splitting. The outermost recursion + * level of gistSplitByKey will fix things before returning. + */ + } + else + return true; + } + } + + return false; +} + +/* + * simply split page in half + */ +static void +gistSplitHalf(GIST_SPLITVEC *v, int len) +{ + int i; + + v->spl_nright = v->spl_nleft = 0; + v->spl_left = (OffsetNumber *) palloc(len * sizeof(OffsetNumber)); + v->spl_right = (OffsetNumber *) palloc(len * sizeof(OffsetNumber)); + for (i = 1; i <= len; i++) + if (i < len / 2) + v->spl_right[v->spl_nright++] = i; + else + v->spl_left[v->spl_nleft++] = i; + + /* we need not compute union keys, caller took care of it */ +} + +/* + * gistSplitByKey: main entry point for page-splitting algorithm + * + * r: index relation + * page: page being split + * itup: array of IndexTuples to be processed + * len: number of IndexTuples to be processed (must be at least 2) + * giststate: additional info about index + * v: working state and output area + * attno: column we are working on (zero-based index) + * + * Outside caller must initialize v->spl_lisnull and v->spl_risnull arrays + * to all-true. On return, spl_left/spl_nleft contain indexes of tuples + * to go left, spl_right/spl_nright contain indexes of tuples to go right, + * spl_lattr/spl_lisnull contain left-side union key values, and + * spl_rattr/spl_risnull contain right-side union key values. Other fields + * in this struct are workspace for this file. + * + * Outside caller must pass zero for attno. The function may internally + * recurse to the next column by passing attno+1. + */ +void +gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, + GISTSTATE *giststate, GistSplitVector *v, int attno) +{ + GistEntryVector *entryvec; + OffsetNumber *offNullTuples; + int nOffNullTuples = 0; + int i; + + /* generate the item array, and identify tuples with null keys */ + /* note that entryvec->vector[0] goes unused in this code */ + entryvec = palloc(GEVHDRSZ + (len + 1) * sizeof(GISTENTRY)); + entryvec->n = len + 1; + offNullTuples = (OffsetNumber *) palloc(len * sizeof(OffsetNumber)); + + for (i = 1; i <= len; i++) + { + Datum datum; + bool IsNull; + + datum = index_getattr(itup[i - 1], attno + 1, giststate->leafTupdesc, + &IsNull); + gistdentryinit(giststate, attno, &(entryvec->vector[i]), + datum, r, page, i, + false, IsNull); + if (IsNull) + offNullTuples[nOffNullTuples++] = i; + } + + if (nOffNullTuples == len) + { + /* + * Corner case: All keys in attno column are null, so just transfer + * our attention to the next column. If there's no next column, just + * split page in half. 
+ */ + v->spl_risnull[attno] = v->spl_lisnull[attno] = true; + + if (attno + 1 < giststate->nonLeafTupdesc->natts) + gistSplitByKey(r, page, itup, len, giststate, v, attno + 1); + else + gistSplitHalf(&v->splitVector, len); + } + else if (nOffNullTuples > 0) + { + int j = 0; + + /* + * We don't want to mix NULL and not-NULL keys on one page, so split + * nulls to right page and not-nulls to left. + */ + v->splitVector.spl_right = offNullTuples; + v->splitVector.spl_nright = nOffNullTuples; + v->spl_risnull[attno] = true; + + v->splitVector.spl_left = (OffsetNumber *) palloc(len * sizeof(OffsetNumber)); + v->splitVector.spl_nleft = 0; + for (i = 1; i <= len; i++) + if (j < v->splitVector.spl_nright && offNullTuples[j] == i) + j++; + else + v->splitVector.spl_left[v->splitVector.spl_nleft++] = i; + + /* Compute union keys, unless outer recursion level will handle it */ + if (attno == 0 && giststate->nonLeafTupdesc->natts == 1) + { + v->spl_dontcare = NULL; + gistunionsubkey(giststate, itup, v); + } + } + else + { + /* + * All keys are not-null, so apply user-defined PickSplit method + */ + if (gistUserPicksplit(r, entryvec, attno, v, itup, len, giststate)) + { + /* + * Splitting on attno column is not optimal, so consider + * redistributing don't-care tuples according to the next column + */ + Assert(attno + 1 < giststate->nonLeafTupdesc->natts); + + if (v->spl_dontcare == NULL) + { + /* + * This split was actually degenerate, so ignore it altogether + * and just split according to the next column. + */ + gistSplitByKey(r, page, itup, len, giststate, v, attno + 1); + } + else + { + /* + * Form an array of just the don't-care tuples to pass to a + * recursive invocation of this function for the next column. + */ + IndexTuple *newitup = (IndexTuple *) palloc(len * sizeof(IndexTuple)); + OffsetNumber *map = (OffsetNumber *) palloc(len * sizeof(OffsetNumber)); + int newlen = 0; + GIST_SPLITVEC backupSplit; + + for (i = 0; i < len; i++) + { + if (v->spl_dontcare[i + 1]) + { + newitup[newlen] = itup[i]; + map[newlen] = i + 1; + newlen++; + } + } + + Assert(newlen > 0); + + /* + * Make a backup copy of v->splitVector, since the recursive + * call will overwrite that with its own result. + */ + backupSplit = v->splitVector; + backupSplit.spl_left = (OffsetNumber *) palloc(sizeof(OffsetNumber) * len); + memcpy(backupSplit.spl_left, v->splitVector.spl_left, sizeof(OffsetNumber) * v->splitVector.spl_nleft); + backupSplit.spl_right = (OffsetNumber *) palloc(sizeof(OffsetNumber) * len); + memcpy(backupSplit.spl_right, v->splitVector.spl_right, sizeof(OffsetNumber) * v->splitVector.spl_nright); + + /* Recursively decide how to split the don't-care tuples */ + gistSplitByKey(r, page, newitup, newlen, giststate, v, attno + 1); + + /* Merge result of subsplit with non-don't-care tuples */ + for (i = 0; i < v->splitVector.spl_nleft; i++) + backupSplit.spl_left[backupSplit.spl_nleft++] = map[v->splitVector.spl_left[i] - 1]; + for (i = 0; i < v->splitVector.spl_nright; i++) + backupSplit.spl_right[backupSplit.spl_nright++] = map[v->splitVector.spl_right[i] - 1]; + + v->splitVector = backupSplit; + } + } + } + + /* + * If we're handling a multicolumn index, at the end of the recursion + * recompute the left and right union datums for all index columns. 
This + * makes sure we hand back correct union datums in all corner cases, + * including when we haven't processed all columns to start with, or when + * a secondary split moved "don't care" tuples from one side to the other + * (we really shouldn't assume that that didn't change the union datums). + * + * Note: when we're in an internal recursion (attno > 0), we do not worry + * about whether the union datums we return with are sensible, since + * calling levels won't care. Also, in a single-column index, we expect + * that PickSplit (or the special cases above) produced correct union + * datums. + */ + if (attno == 0 && giststate->nonLeafTupdesc->natts > 1) + { + v->spl_dontcare = NULL; + gistunionsubkey(giststate, itup, v); + } +} diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c new file mode 100644 index 0000000..43ba03b --- /dev/null +++ b/src/backend/access/gist/gistutil.c @@ -0,0 +1,1066 @@ +/*------------------------------------------------------------------------- + * + * gistutil.c + * utilities routines for the postgres GiST index access method. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistutil.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <math.h> + +#include "access/gist_private.h" +#include "access/htup_details.h" +#include "access/reloptions.h" +#include "catalog/pg_opclass.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" +#include "utils/float.h" +#include "utils/lsyscache.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" + +/* + * Write itup vector to page, has no control of free space. + */ +void +gistfillbuffer(Page page, IndexTuple *itup, int len, OffsetNumber off) +{ + int i; + + if (off == InvalidOffsetNumber) + off = (PageIsEmpty(page)) ? 
FirstOffsetNumber : + OffsetNumberNext(PageGetMaxOffsetNumber(page)); + + for (i = 0; i < len; i++) + { + Size sz = IndexTupleSize(itup[i]); + OffsetNumber l; + + l = PageAddItem(page, (Item) itup[i], sz, off, false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to GiST index page, item %d out of %d, size %d bytes", + i, len, (int) sz); + off++; + } +} + +/* + * Check space for itup vector on page + */ +bool +gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace) +{ + unsigned int size = freespace, + deleted = 0; + int i; + + for (i = 0; i < len; i++) + size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData); + + if (todelete != InvalidOffsetNumber) + { + IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, todelete)); + + deleted = IndexTupleSize(itup) + sizeof(ItemIdData); + } + + return (PageGetFreeSpace(page) + deleted < size); +} + +bool +gistfitpage(IndexTuple *itvec, int len) +{ + int i; + Size size = 0; + + for (i = 0; i < len; i++) + size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData); + + /* TODO: Consider fillfactor */ + return (size <= GiSTPageSize); +} + +/* + * Read buffer into itup vector + */ +IndexTuple * +gistextractpage(Page page, int *len /* out */ ) +{ + OffsetNumber i, + maxoff; + IndexTuple *itvec; + + maxoff = PageGetMaxOffsetNumber(page); + *len = maxoff; + itvec = palloc(sizeof(IndexTuple) * maxoff); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + itvec[i - FirstOffsetNumber] = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + + return itvec; +} + +/* + * join two vectors into one + */ +IndexTuple * +gistjoinvector(IndexTuple *itvec, int *len, IndexTuple *additvec, int addlen) +{ + itvec = (IndexTuple *) repalloc((void *) itvec, sizeof(IndexTuple) * ((*len) + addlen)); + memmove(&itvec[*len], additvec, sizeof(IndexTuple) * addlen); + *len += addlen; + return itvec; +} + +/* + * make plain IndexTuple vector + */ + +IndexTupleData * +gistfillitupvec(IndexTuple *vec, int veclen, int *memlen) +{ + char *ptr, + *ret; + int i; + + *memlen = 0; + + for (i = 0; i < veclen; i++) + *memlen += IndexTupleSize(vec[i]); + + ptr = ret = palloc(*memlen); + + for (i = 0; i < veclen; i++) + { + memcpy(ptr, vec[i], IndexTupleSize(vec[i])); + ptr += IndexTupleSize(vec[i]); + } + + return (IndexTupleData *) ret; +} + +/* + * Make unions of keys in IndexTuple vector (one union datum per index column). + * Union Datums are returned into the attr/isnull arrays. + * Resulting Datums aren't compressed. 
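+ *
+ * A column whose keys are all NULL gets a NULL union datum. A single
+ * non-NULL key is duplicated before calling the union function, since some
+ * union methods expect at least two input entries.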
+ */ +void +gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, + Datum *attr, bool *isnull) +{ + int i; + GistEntryVector *evec; + int attrsize; + + evec = (GistEntryVector *) palloc((len + 2) * sizeof(GISTENTRY) + GEVHDRSZ); + + for (i = 0; i < giststate->nonLeafTupdesc->natts; i++) + { + int j; + + /* Collect non-null datums for this column */ + evec->n = 0; + for (j = 0; j < len; j++) + { + Datum datum; + bool IsNull; + + datum = index_getattr(itvec[j], i + 1, giststate->leafTupdesc, + &IsNull); + if (IsNull) + continue; + + gistdentryinit(giststate, i, + evec->vector + evec->n, + datum, + NULL, NULL, (OffsetNumber) 0, + false, IsNull); + evec->n++; + } + + /* If this column was all NULLs, the union is NULL */ + if (evec->n == 0) + { + attr[i] = (Datum) 0; + isnull[i] = true; + } + else + { + if (evec->n == 1) + { + /* unionFn may expect at least two inputs */ + evec->n = 2; + evec->vector[1] = evec->vector[0]; + } + + /* Make union and store in attr array */ + attr[i] = FunctionCall2Coll(&giststate->unionFn[i], + giststate->supportCollation[i], + PointerGetDatum(evec), + PointerGetDatum(&attrsize)); + + isnull[i] = false; + } + } +} + +/* + * Return an IndexTuple containing the result of applying the "union" + * method to the specified IndexTuple vector. + */ +IndexTuple +gistunion(Relation r, IndexTuple *itvec, int len, GISTSTATE *giststate) +{ + Datum attr[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + + gistMakeUnionItVec(giststate, itvec, len, attr, isnull); + + return gistFormTuple(giststate, r, attr, isnull, false); +} + +/* + * makes union of two key + */ +void +gistMakeUnionKey(GISTSTATE *giststate, int attno, + GISTENTRY *entry1, bool isnull1, + GISTENTRY *entry2, bool isnull2, + Datum *dst, bool *dstisnull) +{ + /* we need a GistEntryVector with room for exactly 2 elements */ + union + { + GistEntryVector gev; + char padding[2 * sizeof(GISTENTRY) + GEVHDRSZ]; + } storage; + GistEntryVector *evec = &storage.gev; + int dstsize; + + evec->n = 2; + + if (isnull1 && isnull2) + { + *dstisnull = true; + *dst = (Datum) 0; + } + else + { + if (isnull1 == false && isnull2 == false) + { + evec->vector[0] = *entry1; + evec->vector[1] = *entry2; + } + else if (isnull1 == false) + { + evec->vector[0] = *entry1; + evec->vector[1] = *entry1; + } + else + { + evec->vector[0] = *entry2; + evec->vector[1] = *entry2; + } + + *dstisnull = false; + *dst = FunctionCall2Coll(&giststate->unionFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(evec), + PointerGetDatum(&dstsize)); + } +} + +bool +gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b) +{ + bool result; + + FunctionCall3Coll(&giststate->equalFn[attno], + giststate->supportCollation[attno], + a, b, + PointerGetDatum(&result)); + return result; +} + +/* + * Decompress all keys in tuple + */ +void +gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p, + OffsetNumber o, GISTENTRY *attdata, bool *isnull) +{ + int i; + + for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++) + { + Datum datum; + + datum = index_getattr(tuple, i + 1, giststate->leafTupdesc, &isnull[i]); + gistdentryinit(giststate, i, &attdata[i], + datum, r, p, o, + false, isnull[i]); + } +} + +/* + * Forms union of oldtup and addtup, if union == oldtup then return NULL + */ +IndexTuple +gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *giststate) +{ + bool neednew = false; + GISTENTRY oldentries[INDEX_MAX_KEYS], + addentries[INDEX_MAX_KEYS]; + bool oldisnull[INDEX_MAX_KEYS], 
+ addisnull[INDEX_MAX_KEYS]; + Datum attr[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + IndexTuple newtup = NULL; + int i; + + gistDeCompressAtt(giststate, r, oldtup, NULL, + (OffsetNumber) 0, oldentries, oldisnull); + + gistDeCompressAtt(giststate, r, addtup, NULL, + (OffsetNumber) 0, addentries, addisnull); + + for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++) + { + gistMakeUnionKey(giststate, i, + oldentries + i, oldisnull[i], + addentries + i, addisnull[i], + attr + i, isnull + i); + + if (neednew) + /* we already need new key, so we can skip check */ + continue; + + if (isnull[i]) + /* union of key may be NULL if and only if both keys are NULL */ + continue; + + if (!addisnull[i]) + { + if (oldisnull[i] || + !gistKeyIsEQ(giststate, i, oldentries[i].key, attr[i])) + neednew = true; + } + } + + if (neednew) + { + /* need to update key */ + newtup = gistFormTuple(giststate, r, attr, isnull, false); + newtup->t_tid = oldtup->t_tid; + } + + return newtup; +} + +/* + * Search an upper index page for the entry with lowest penalty for insertion + * of the new index key contained in "it". + * + * Returns the index of the page entry to insert into. + */ +OffsetNumber +gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ + GISTSTATE *giststate) +{ + OffsetNumber result; + OffsetNumber maxoff; + OffsetNumber i; + float best_penalty[INDEX_MAX_KEYS]; + GISTENTRY entry, + identry[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + int keep_current_best; + + Assert(!GistPageIsLeaf(p)); + + gistDeCompressAtt(giststate, r, + it, NULL, (OffsetNumber) 0, + identry, isnull); + + /* we'll return FirstOffsetNumber if page is empty (shouldn't happen) */ + result = FirstOffsetNumber; + + /* + * The index may have multiple columns, and there's a penalty value for + * each column. The penalty associated with a column that appears earlier + * in the index definition is strictly more important than the penalty of + * a column that appears later in the index definition. + * + * best_penalty[j] is the best penalty we have seen so far for column j, + * or -1 when we haven't yet examined column j. Array entries to the + * right of the first -1 are undefined. + */ + best_penalty[0] = -1; + + /* + * If we find a tuple that's exactly as good as the currently best one, we + * could use either one. When inserting a lot of tuples with the same or + * similar keys, it's preferable to descend down the same path when + * possible, as that's more cache-friendly. On the other hand, if all + * inserts land on the same leaf page after a split, we're never going to + * insert anything to the other half of the split, and will end up using + * only 50% of the available space. Distributing the inserts evenly would + * lead to better space usage, but that hurts cache-locality during + * insertion. To get the best of both worlds, when we find a tuple that's + * exactly as good as the previous best, choose randomly whether to stick + * to the old best, or use the new one. Once we decide to stick to the + * old best, we keep sticking to it for any subsequent equally good tuples + * we might find. This favors tuples with low offsets, but still allows + * some inserts to go to other equally-good subtrees. + * + * keep_current_best is -1 if we haven't yet had to make a random choice + * whether to keep the current best tuple. If we have done so, and + * decided to keep it, keep_current_best is 1; if we've decided to + * replace, keep_current_best is 0. 
(This state will be reset to -1 as + * soon as we've made the replacement, but sometimes we make the choice in + * advance of actually finding a replacement best tuple.) + */ + keep_current_best = -1; + + /* + * Loop over tuples on page. + */ + maxoff = PageGetMaxOffsetNumber(p); + Assert(maxoff >= FirstOffsetNumber); + + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + IndexTuple itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i)); + bool zero_penalty; + int j; + + zero_penalty = true; + + /* Loop over index attributes. */ + for (j = 0; j < IndexRelationGetNumberOfKeyAttributes(r); j++) + { + Datum datum; + float usize; + bool IsNull; + + /* Compute penalty for this column. */ + datum = index_getattr(itup, j + 1, giststate->leafTupdesc, + &IsNull); + gistdentryinit(giststate, j, &entry, datum, r, p, i, + false, IsNull); + usize = gistpenalty(giststate, j, &entry, IsNull, + &identry[j], isnull[j]); + if (usize > 0) + zero_penalty = false; + + if (best_penalty[j] < 0 || usize < best_penalty[j]) + { + /* + * New best penalty for column. Tentatively select this tuple + * as the target, and record the best penalty. Then reset the + * next column's penalty to "unknown" (and indirectly, the + * same for all the ones to its right). This will force us to + * adopt this tuple's penalty values as the best for all the + * remaining columns during subsequent loop iterations. + */ + result = i; + best_penalty[j] = usize; + + if (j < IndexRelationGetNumberOfKeyAttributes(r) - 1) + best_penalty[j + 1] = -1; + + /* we have new best, so reset keep-it decision */ + keep_current_best = -1; + } + else if (best_penalty[j] == usize) + { + /* + * The current tuple is exactly as good for this column as the + * best tuple seen so far. The next iteration of this loop + * will compare the next column. + */ + } + else + { + /* + * The current tuple is worse for this column than the best + * tuple seen so far. Skip the remaining columns and move on + * to the next tuple, if any. + */ + zero_penalty = false; /* so outer loop won't exit */ + break; + } + } + + /* + * If we looped past the last column, and did not update "result", + * then this tuple is exactly as good as the prior best tuple. + */ + if (j == IndexRelationGetNumberOfKeyAttributes(r) && result != i) + { + if (keep_current_best == -1) + { + /* we didn't make the random choice yet for this old best */ + keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0; + } + if (keep_current_best == 0) + { + /* we choose to use the new tuple */ + result = i; + /* choose again if there are even more exactly-as-good ones */ + keep_current_best = -1; + } + } + + /* + * If we find a tuple with zero penalty for all columns, and we've + * decided we don't want to search for another tuple with equal + * penalty, there's no need to examine remaining tuples; just break + * out of the loop and return it. + */ + if (zero_penalty) + { + if (keep_current_best == -1) + { + /* we didn't make the random choice yet for this old best */ + keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 
1 : 0; + } + if (keep_current_best == 1) + break; + } + } + + return result; +} + +/* + * initialize a GiST entry with a decompressed version of key + */ +void +gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e, + Datum k, Relation r, Page pg, OffsetNumber o, + bool l, bool isNull) +{ + if (!isNull) + { + GISTENTRY *dep; + + gistentryinit(*e, k, r, pg, o, l); + + /* there may not be a decompress function in opclass */ + if (!OidIsValid(giststate->decompressFn[nkey].fn_oid)) + return; + + dep = (GISTENTRY *) + DatumGetPointer(FunctionCall1Coll(&giststate->decompressFn[nkey], + giststate->supportCollation[nkey], + PointerGetDatum(e))); + /* decompressFn may just return the given pointer */ + if (dep != e) + gistentryinit(*e, dep->key, dep->rel, dep->page, dep->offset, + dep->leafkey); + } + else + gistentryinit(*e, (Datum) 0, r, pg, o, l); +} + +IndexTuple +gistFormTuple(GISTSTATE *giststate, Relation r, + Datum *attdata, bool *isnull, bool isleaf) +{ + Datum compatt[INDEX_MAX_KEYS]; + IndexTuple res; + + gistCompressValues(giststate, r, attdata, isnull, isleaf, compatt); + + res = index_form_tuple(isleaf ? giststate->leafTupdesc : + giststate->nonLeafTupdesc, + compatt, isnull); + + /* + * The offset number on tuples on internal pages is unused. For historical + * reasons, it is set to 0xffff. + */ + ItemPointerSetOffsetNumber(&(res->t_tid), 0xffff); + return res; +} + +void +gistCompressValues(GISTSTATE *giststate, Relation r, + Datum *attdata, bool *isnull, bool isleaf, Datum *compatt) +{ + int i; + + /* + * Call the compress method on each attribute. + */ + for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++) + { + if (isnull[i]) + compatt[i] = (Datum) 0; + else + { + GISTENTRY centry; + GISTENTRY *cep; + + gistentryinit(centry, attdata[i], r, NULL, (OffsetNumber) 0, + isleaf); + /* there may not be a compress function in opclass */ + if (OidIsValid(giststate->compressFn[i].fn_oid)) + cep = (GISTENTRY *) + DatumGetPointer(FunctionCall1Coll(&giststate->compressFn[i], + giststate->supportCollation[i], + PointerGetDatum(¢ry))); + else + cep = ¢ry; + compatt[i] = cep->key; + } + } + + if (isleaf) + { + /* + * Emplace each included attribute if any. + */ + for (; i < r->rd_att->natts; i++) + { + if (isnull[i]) + compatt[i] = (Datum) 0; + else + compatt[i] = attdata[i]; + } + } +} + +/* + * initialize a GiST entry with fetched value in key field + */ +static Datum +gistFetchAtt(GISTSTATE *giststate, int nkey, Datum k, Relation r) +{ + GISTENTRY fentry; + GISTENTRY *fep; + + gistentryinit(fentry, k, r, NULL, (OffsetNumber) 0, false); + + fep = (GISTENTRY *) + DatumGetPointer(FunctionCall1Coll(&giststate->fetchFn[nkey], + giststate->supportCollation[nkey], + PointerGetDatum(&fentry))); + + /* fetchFn set 'key', return it to the caller */ + return fep->key; +} + +/* + * Fetch all keys in tuple. + * Returns a new HeapTuple containing the originally-indexed data. 
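+ *
+ * Key columns whose opclass provides a fetch function are reconstructed
+ * with it. Columns with neither a fetch nor a compress function are stored
+ * in their original form and are returned as-is; compressed columns without
+ * a fetch function cannot be reconstructed and are returned as NULLs.
+ * Included (non-key) columns are always returned as stored.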
+ */ +HeapTuple +gistFetchTuple(GISTSTATE *giststate, Relation r, IndexTuple tuple) +{ + MemoryContext oldcxt = MemoryContextSwitchTo(giststate->tempCxt); + Datum fetchatt[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + int i; + + for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++) + { + Datum datum; + + datum = index_getattr(tuple, i + 1, giststate->leafTupdesc, &isnull[i]); + + if (giststate->fetchFn[i].fn_oid != InvalidOid) + { + if (!isnull[i]) + fetchatt[i] = gistFetchAtt(giststate, i, datum, r); + else + fetchatt[i] = (Datum) 0; + } + else if (giststate->compressFn[i].fn_oid == InvalidOid) + { + /* + * If opclass does not provide compress method that could change + * original value, att is necessarily stored in original form. + */ + if (!isnull[i]) + fetchatt[i] = datum; + else + fetchatt[i] = (Datum) 0; + } + else + { + /* + * Index-only scans not supported for this column. Since the + * planner chose an index-only scan anyway, it is not interested + * in this column, and we can replace it with a NULL. + */ + isnull[i] = true; + fetchatt[i] = (Datum) 0; + } + } + + /* + * Get each included attribute. + */ + for (; i < r->rd_att->natts; i++) + { + fetchatt[i] = index_getattr(tuple, i + 1, giststate->leafTupdesc, + &isnull[i]); + } + MemoryContextSwitchTo(oldcxt); + + return heap_form_tuple(giststate->fetchTupdesc, fetchatt, isnull); +} + +float +gistpenalty(GISTSTATE *giststate, int attno, + GISTENTRY *orig, bool isNullOrig, + GISTENTRY *add, bool isNullAdd) +{ + float penalty = 0.0; + + if (giststate->penaltyFn[attno].fn_strict == false || + (isNullOrig == false && isNullAdd == false)) + { + FunctionCall3Coll(&giststate->penaltyFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(orig), + PointerGetDatum(add), + PointerGetDatum(&penalty)); + /* disallow negative or NaN penalty */ + if (isnan(penalty) || penalty < 0.0) + penalty = 0.0; + } + else if (isNullOrig && isNullAdd) + penalty = 0.0; + else + { + /* try to prevent mixing null and non-null values */ + penalty = get_float4_infinity(); + } + + return penalty; +} + +/* + * Initialize a new index page + */ +void +gistinitpage(Page page, uint32 f) +{ + GISTPageOpaque opaque; + + PageInit(page, BLCKSZ, sizeof(GISTPageOpaqueData)); + + opaque = GistPageGetOpaque(page); + opaque->rightlink = InvalidBlockNumber; + opaque->flags = f; + opaque->gist_page_id = GIST_PAGE_ID; +} + +/* + * Initialize a new index buffer + */ +void +GISTInitBuffer(Buffer b, uint32 f) +{ + Page page; + + page = BufferGetPage(b); + gistinitpage(page, f); +} + +/* + * Verify that a freshly-read page looks sane. + */ +void +gistcheckpage(Relation rel, Buffer buf) +{ + Page page = BufferGetPage(buf); + + /* + * ReadBuffer verifies that every newly-read page passes + * PageHeaderIsValid, which means it either contains a reasonably sane + * page header or is all-zero. We have to defend against the all-zero + * case, however. + */ + if (PageIsNew(page)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains unexpected zero page at block %u", + RelationGetRelationName(rel), + BufferGetBlockNumber(buf)), + errhint("Please REINDEX it."))); + + /* + * Additionally check that the special area looks sane. 
+ */ + if (PageGetSpecialSize(page) != MAXALIGN(sizeof(GISTPageOpaqueData))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains corrupted page at block %u", + RelationGetRelationName(rel), + BufferGetBlockNumber(buf)), + errhint("Please REINDEX it."))); +} + + +/* + * Allocate a new page (either by recycling, or by extending the index file) + * + * The returned buffer is already pinned and exclusive-locked + * + * Caller is responsible for initializing the page by calling GISTInitBuffer + */ +Buffer +gistNewBuffer(Relation r) +{ + Buffer buffer; + bool needLock; + + /* First, try to get a page from FSM */ + for (;;) + { + BlockNumber blkno = GetFreeIndexPage(r); + + if (blkno == InvalidBlockNumber) + break; /* nothing left in FSM */ + + buffer = ReadBuffer(r, blkno); + + /* + * We have to guard against the possibility that someone else already + * recycled this page; the buffer may be locked if so. + */ + if (ConditionalLockBuffer(buffer)) + { + Page page = BufferGetPage(buffer); + + /* + * If the page was never initialized, it's OK to use. + */ + if (PageIsNew(page)) + return buffer; + + gistcheckpage(r, buffer); + + /* + * Otherwise, recycle it if deleted, and too old to have any + * processes interested in it. + */ + if (gistPageRecyclable(page)) + { + /* + * If we are generating WAL for Hot Standby then create a WAL + * record that will allow us to conflict with queries running + * on standby, in case they have snapshots older than the + * page's deleteXid. + */ + if (XLogStandbyInfoActive() && RelationNeedsWAL(r)) + gistXLogPageReuse(r, blkno, GistPageGetDeleteXid(page)); + + return buffer; + } + + LockBuffer(buffer, GIST_UNLOCK); + } + + /* Can't use it, so release buffer and try again */ + ReleaseBuffer(buffer); + } + + /* Must extend the file */ + needLock = !RELATION_IS_LOCAL(r); + + if (needLock) + LockRelationForExtension(r, ExclusiveLock); + + buffer = ReadBuffer(r, P_NEW); + LockBuffer(buffer, GIST_EXCLUSIVE); + + if (needLock) + UnlockRelationForExtension(r, ExclusiveLock); + + return buffer; +} + +/* Can this page be recycled yet? */ +bool +gistPageRecyclable(Page page) +{ + if (PageIsNew(page)) + return true; + if (GistPageIsDeleted(page)) + { + /* + * The page was deleted, but when? If it was just deleted, a scan + * might have seen the downlink to it, and will read the page later. + * As long as that can happen, we must keep the deleted page around as + * a tombstone. + * + * For that check if the deletion XID could still be visible to + * anyone. If not, then no scan that's still in progress could have + * seen its downlink, and we can recycle it. + */ + FullTransactionId deletexid_full = GistPageGetDeleteXid(page); + + return GlobalVisCheckRemovableFullXid(NULL, deletexid_full); + } + return false; +} + +bytea * +gistoptions(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"fillfactor", RELOPT_TYPE_INT, offsetof(GiSTOptions, fillfactor)}, + {"buffering", RELOPT_TYPE_ENUM, offsetof(GiSTOptions, buffering_mode)} + }; + + return (bytea *) build_reloptions(reloptions, validate, + RELOPT_KIND_GIST, + sizeof(GiSTOptions), + tab, lengthof(tab)); +} + +/* + * gistproperty() -- Check boolean properties of indexes. + * + * This is optional for most AMs, but is required for GiST because the core + * property code doesn't support AMPROP_DISTANCE_ORDERABLE. We also handle + * AMPROP_RETURNABLE here to save opening the rel to call gistcanreturn. 
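+ *
+ * Both properties reduce to a syscache probe for the relevant support
+ * function (distance or fetch) of the column's opclass; as a special case,
+ * a column with no compress function is also considered returnable.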
+ */ +bool +gistproperty(Oid index_oid, int attno, + IndexAMProperty prop, const char *propname, + bool *res, bool *isnull) +{ + Oid opclass, + opfamily, + opcintype; + int16 procno; + + /* Only answer column-level inquiries */ + if (attno == 0) + return false; + + /* + * Currently, GiST distance-ordered scans require that there be a distance + * function in the opclass with the default types (i.e. the one loaded + * into the relcache entry, see initGISTstate). So we assume that if such + * a function exists, then there's a reason for it (rather than grubbing + * through all the opfamily's operators to find an ordered one). + * + * Essentially the same code can test whether we support returning the + * column data, since that's true if the opclass provides a fetch proc. + */ + + switch (prop) + { + case AMPROP_DISTANCE_ORDERABLE: + procno = GIST_DISTANCE_PROC; + break; + case AMPROP_RETURNABLE: + procno = GIST_FETCH_PROC; + break; + default: + return false; + } + + /* First we need to know the column's opclass. */ + opclass = get_index_column_opclass(index_oid, attno); + if (!OidIsValid(opclass)) + { + *isnull = true; + return true; + } + + /* Now look up the opclass family and input datatype. */ + if (!get_opclass_opfamily_and_input_type(opclass, &opfamily, &opcintype)) + { + *isnull = true; + return true; + } + + /* And now we can check whether the function is provided. */ + + *res = SearchSysCacheExists4(AMPROCNUM, + ObjectIdGetDatum(opfamily), + ObjectIdGetDatum(opcintype), + ObjectIdGetDatum(opcintype), + Int16GetDatum(procno)); + + /* + * Special case: even without a fetch function, AMPROP_RETURNABLE is true + * if the opclass has no compress function. + */ + if (prop == AMPROP_RETURNABLE && !*res) + { + *res = !SearchSysCacheExists4(AMPROCNUM, + ObjectIdGetDatum(opfamily), + ObjectIdGetDatum(opcintype), + ObjectIdGetDatum(opcintype), + Int16GetDatum(GIST_COMPRESS_PROC)); + } + + *isnull = false; + + return true; +} + +/* + * Some indexes are not WAL-logged, but we need LSNs to detect concurrent page + * splits anyway. This function provides a fake sequence of LSNs for that + * purpose. + */ +XLogRecPtr +gistGetFakeLSN(Relation rel) +{ + if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP) + { + /* + * Temporary relations are only accessible in our session, so a simple + * backend-local counter will do. + */ + static XLogRecPtr counter = FirstNormalUnloggedLSN; + + return counter++; + } + else if (RelationIsPermanent(rel)) + { + /* + * WAL-logging on this relation will start after commit, so its LSNs + * must be distinct numbers smaller than the LSN at the next commit. + * Emit a dummy WAL record if insert-LSN hasn't advanced after the + * last call. + */ + static XLogRecPtr lastlsn = InvalidXLogRecPtr; + XLogRecPtr currlsn = GetXLogInsertRecPtr(); + + /* Shouldn't be called for WAL-logging relations */ + Assert(!RelationNeedsWAL(rel)); + + /* No need for an actual record if we already have a distinct LSN */ + if (!XLogRecPtrIsInvalid(lastlsn) && lastlsn == currlsn) + currlsn = gistXLogAssignLSN(); + + lastlsn = currlsn; + return currlsn; + } + else + { + /* + * Unlogged relations are accessible from other backends, and survive + * (clean) restarts. GetFakeLSNForUnloggedRel() handles that for us. 
+ */ + Assert(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED); + return GetFakeLSNForUnloggedRel(); + } +} diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c new file mode 100644 index 0000000..0663193 --- /dev/null +++ b/src/backend/access/gist/gistvacuum.c @@ -0,0 +1,668 @@ +/*------------------------------------------------------------------------- + * + * gistvacuum.c + * vacuuming routines for the postgres GiST index access method. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistvacuum.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/gist_private.h" +#include "access/transam.h" +#include "commands/vacuum.h" +#include "lib/integerset.h" +#include "miscadmin.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" +#include "utils/memutils.h" + +/* Working state needed by gistbulkdelete */ +typedef struct +{ + IndexVacuumInfo *info; + IndexBulkDeleteResult *stats; + IndexBulkDeleteCallback callback; + void *callback_state; + GistNSN startNSN; + + /* + * These are used to memorize all internal and empty leaf pages. They are + * used for deleting all the empty pages. + */ + IntegerSet *internal_page_set; + IntegerSet *empty_leaf_set; + MemoryContext page_set_context; +} GistVacState; + +static void gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state); +static void gistvacuumpage(GistVacState *vstate, BlockNumber blkno, + BlockNumber orig_blkno); +static void gistvacuum_delete_empty_pages(IndexVacuumInfo *info, + GistVacState *vstate); +static bool gistdeletepage(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + Buffer buffer, OffsetNumber downlink, + Buffer leafBuffer); + +/* + * VACUUM bulkdelete stage: remove index entries. + */ +IndexBulkDeleteResult * +gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + /* allocate stats if first time through, else re-use existing struct */ + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + + gistvacuumscan(info, stats, callback, callback_state); + + return stats; +} + +/* + * VACUUM cleanup stage: delete empty pages, and update index statistics. + */ +IndexBulkDeleteResult * +gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) +{ + /* No-op in ANALYZE ONLY mode */ + if (info->analyze_only) + return stats; + + /* + * If gistbulkdelete was called, we need not do anything, just return the + * stats from the latest gistbulkdelete call. If it wasn't called, we + * still need to do a pass over the index, to obtain index statistics. + */ + if (stats == NULL) + { + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + gistvacuumscan(info, stats, NULL, NULL); + } + + /* + * It's quite possible for us to be fooled by concurrent page splits into + * double-counting some index tuples, so disbelieve any total that exceeds + * the underlying heap's count ... if we know that accurately. Otherwise + * this might just make matters worse. 
+ */ + if (!info->estimated_count) + { + if (stats->num_index_tuples > info->num_heap_tuples) + stats->num_index_tuples = info->num_heap_tuples; + } + + return stats; +} + +/* + * gistvacuumscan --- scan the index for VACUUMing purposes + * + * This scans the index for leaf tuples that are deletable according to the + * vacuum callback, and updates the stats. Both btbulkdelete and + * btvacuumcleanup invoke this (the latter only if no btbulkdelete call + * occurred). + * + * This also makes note of any empty leaf pages, as well as all internal + * pages while looping over all index pages. After scanning all the pages, we + * remove the empty pages so that they can be reused. Any deleted pages are + * added directly to the free space map. (They should've been added there + * when they were originally deleted, already, but it's possible that the FSM + * was lost at a crash, for example.) + * + * The caller is responsible for initially allocating/zeroing a stats struct. + */ +static void +gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + Relation rel = info->index; + GistVacState vstate; + BlockNumber num_pages; + bool needLock; + BlockNumber blkno; + MemoryContext oldctx; + + /* + * Reset fields that track information about the entire index now. This + * avoids double-counting in the case where a single VACUUM command + * requires multiple scans of the index. + * + * Avoid resetting the tuples_removed and pages_newly_deleted fields here, + * since they track information about the VACUUM command, and so must last + * across each call to gistvacuumscan(). + * + * (Note that pages_free is treated as state about the whole index, not + * the current VACUUM. This is appropriate because RecordFreeIndexPage() + * calls are idempotent, and get repeated for the same deleted pages in + * some scenarios. The point for us is to track the number of recyclable + * pages in the index at the end of the VACUUM command.) + */ + stats->num_pages = 0; + stats->estimated_count = false; + stats->num_index_tuples = 0; + stats->pages_deleted = 0; + stats->pages_free = 0; + + /* + * Create the integer sets to remember all the internal and the empty leaf + * pages in page_set_context. Internally, the integer set will remember + * this context so that the subsequent allocations for these integer sets + * will be done from the same context. + */ + vstate.page_set_context = GenerationContextCreate(CurrentMemoryContext, + "GiST VACUUM page set context", + 16 * 1024); + oldctx = MemoryContextSwitchTo(vstate.page_set_context); + vstate.internal_page_set = intset_create(); + vstate.empty_leaf_set = intset_create(); + MemoryContextSwitchTo(oldctx); + + /* Set up info to pass down to gistvacuumpage */ + vstate.info = info; + vstate.stats = stats; + vstate.callback = callback; + vstate.callback_state = callback_state; + if (RelationNeedsWAL(rel)) + vstate.startNSN = GetInsertRecPtr(); + else + vstate.startNSN = gistGetFakeLSN(rel); + + /* + * The outer loop iterates over all index pages, in physical order (we + * hope the kernel will cooperate in providing read-ahead for speed). It + * is critical that we visit all leaf pages, including ones added after we + * start the scan, else we might fail to delete some deletable tuples. + * Hence, we must repeatedly check the relation length. 
We must acquire + * the relation-extension lock while doing so to avoid a race condition: + * if someone else is extending the relation, there is a window where + * bufmgr/smgr have created a new all-zero page but it hasn't yet been + * write-locked by gistNewBuffer(). If we manage to scan such a page + * here, we'll improperly assume it can be recycled. Taking the lock + * synchronizes things enough to prevent a problem: either num_pages won't + * include the new page, or gistNewBuffer already has write lock on the + * buffer and it will be fully initialized before we can examine it. (See + * also vacuumlazy.c, which has the same issue.) Also, we need not worry + * if a page is added immediately after we look; the page splitting code + * already has write-lock on the left page before it adds a right page, so + * we must already have processed any tuples due to be moved into such a + * page. + * + * We can skip locking for new or temp relations, however, since no one + * else could be accessing them. + */ + needLock = !RELATION_IS_LOCAL(rel); + + blkno = GIST_ROOT_BLKNO; + for (;;) + { + /* Get the current relation length */ + if (needLock) + LockRelationForExtension(rel, ExclusiveLock); + num_pages = RelationGetNumberOfBlocks(rel); + if (needLock) + UnlockRelationForExtension(rel, ExclusiveLock); + + /* Quit if we've scanned the whole relation */ + if (blkno >= num_pages) + break; + /* Iterate over pages, then loop back to recheck length */ + for (; blkno < num_pages; blkno++) + gistvacuumpage(&vstate, blkno, blkno); + } + + /* + * If we found any recyclable pages (and recorded them in the FSM), then + * forcibly update the upper-level FSM pages to ensure that searchers can + * find them. It's possible that the pages were also found during + * previous scans and so this is a waste of time, but it's cheap enough + * relative to scanning the index that it shouldn't matter much, and + * making sure that free pages are available sooner not later seems + * worthwhile. + * + * Note that if no recyclable pages exist, we don't bother vacuuming the + * FSM at all. + */ + if (stats->pages_free > 0) + IndexFreeSpaceMapVacuum(rel); + + /* update statistics */ + stats->num_pages = num_pages; + + /* + * If we saw any empty pages, try to unlink them from the tree so that + * they can be reused. + */ + gistvacuum_delete_empty_pages(info, &vstate); + + /* we don't need the internal and empty page sets anymore */ + MemoryContextDelete(vstate.page_set_context); + vstate.page_set_context = NULL; + vstate.internal_page_set = NULL; + vstate.empty_leaf_set = NULL; +} + +/* + * gistvacuumpage --- VACUUM one page + * + * This processes a single page for gistbulkdelete(). In some cases we + * must go back and re-examine previously-scanned pages; this routine + * recurses when necessary to handle that case. + * + * blkno is the page to process. orig_blkno is the highest block number + * reached by the outer gistvacuumscan loop (the same as blkno, unless we + * are recursing to re-examine a previous page). 
+ */ +static void +gistvacuumpage(GistVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno) +{ + IndexVacuumInfo *info = vstate->info; + IndexBulkDeleteCallback callback = vstate->callback; + void *callback_state = vstate->callback_state; + Relation rel = info->index; + Buffer buffer; + Page page; + BlockNumber recurse_to; + +restart: + recurse_to = InvalidBlockNumber; + + /* call vacuum_delay_point while not holding any buffer lock */ + vacuum_delay_point(); + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, + info->strategy); + + /* + * We are not going to stay here for a long time, aggressively grab an + * exclusive lock. + */ + LockBuffer(buffer, GIST_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + + if (gistPageRecyclable(page)) + { + /* Okay to recycle this page */ + RecordFreeIndexPage(rel, blkno); + vstate->stats->pages_deleted++; + vstate->stats->pages_free++; + } + else if (GistPageIsDeleted(page)) + { + /* Already deleted, but can't recycle yet */ + vstate->stats->pages_deleted++; + } + else if (GistPageIsLeaf(page)) + { + OffsetNumber todelete[MaxOffsetNumber]; + int ntodelete = 0; + int nremain; + GISTPageOpaque opaque = GistPageGetOpaque(page); + OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + + /* + * Check whether we need to recurse back to earlier pages. What we + * are concerned about is a page split that happened since we started + * the vacuum scan. If the split moved some tuples to a lower page + * then we might have missed 'em. If so, set up for tail recursion. + * + * This is similar to the checks we do during searches, when following + * a downlink, but we don't need to jump to higher-numbered pages, + * because we will process them later, anyway. + */ + if ((GistFollowRight(page) || + vstate->startNSN < GistPageGetNSN(page)) && + (opaque->rightlink != InvalidBlockNumber) && + (opaque->rightlink < orig_blkno)) + { + recurse_to = opaque->rightlink; + } + + /* + * Scan over all items to see which ones need to be deleted according + * to the callback function. + */ + if (callback) + { + OffsetNumber off; + + for (off = FirstOffsetNumber; + off <= maxoff; + off = OffsetNumberNext(off)) + { + ItemId iid = PageGetItemId(page, off); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + + if (callback(&(idxtuple->t_tid), callback_state)) + todelete[ntodelete++] = off; + } + } + + /* + * Apply any needed deletes. We issue just one WAL record per page, + * so as to minimize WAL traffic. + */ + if (ntodelete > 0) + { + START_CRIT_SECTION(); + + MarkBufferDirty(buffer); + + PageIndexMultiDelete(page, todelete, ntodelete); + GistMarkTuplesDeleted(page); + + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + recptr = gistXLogUpdate(buffer, + todelete, ntodelete, + NULL, 0, InvalidBuffer); + PageSetLSN(page, recptr); + } + else + PageSetLSN(page, gistGetFakeLSN(rel)); + + END_CRIT_SECTION(); + + vstate->stats->tuples_removed += ntodelete; + /* must recompute maxoff */ + maxoff = PageGetMaxOffsetNumber(page); + } + + nremain = maxoff - FirstOffsetNumber + 1; + if (nremain == 0) + { + /* + * The page is now completely empty. Remember its block number, + * so that we will try to delete the page in the second stage. + * + * Skip this when recursing, because IntegerSet requires that the + * values are added in ascending order. The next VACUUM will pick + * it up. 
+ */ + if (blkno == orig_blkno) + intset_add_member(vstate->empty_leaf_set, blkno); + } + else + vstate->stats->num_index_tuples += nremain; + } + else + { + /* + * On an internal page, check for "invalid tuples", left behind by an + * incomplete page split on PostgreSQL 9.0 or below. These are not + * created by newer PostgreSQL versions, but unfortunately, there is + * no version number anywhere in a GiST index, so we don't know + * whether this index might still contain invalid tuples or not. + */ + OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + OffsetNumber off; + + for (off = FirstOffsetNumber; + off <= maxoff; + off = OffsetNumberNext(off)) + { + ItemId iid = PageGetItemId(page, off); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + + if (GistTupleIsInvalid(idxtuple)) + ereport(LOG, + (errmsg("index \"%s\" contains an inner tuple marked as invalid", + RelationGetRelationName(rel)), + errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), + errhint("Please REINDEX it."))); + } + + /* + * Remember the block number of this page, so that we can revisit it + * later in gistvacuum_delete_empty_pages(), when we search for + * parents of empty leaf pages. + */ + if (blkno == orig_blkno) + intset_add_member(vstate->internal_page_set, blkno); + } + + UnlockReleaseBuffer(buffer); + + /* + * This is really tail recursion, but if the compiler is too stupid to + * optimize it as such, we'd eat an uncomfortably large amount of stack + * space per recursion level (due to the deletable[] array). A failure is + * improbable since the number of levels isn't likely to be large ... but + * just in case, let's hand-optimize into a loop. + */ + if (recurse_to != InvalidBlockNumber) + { + blkno = recurse_to; + goto restart; + } +} + +/* + * Scan all internal pages, and try to delete their empty child pages. + */ +static void +gistvacuum_delete_empty_pages(IndexVacuumInfo *info, GistVacState *vstate) +{ + Relation rel = info->index; + BlockNumber empty_pages_remaining; + uint64 blkno; + + /* + * Rescan all inner pages to find those that have empty child pages. + */ + empty_pages_remaining = intset_num_entries(vstate->empty_leaf_set); + intset_begin_iterate(vstate->internal_page_set); + while (empty_pages_remaining > 0 && + intset_iterate_next(vstate->internal_page_set, &blkno)) + { + Buffer buffer; + Page page; + OffsetNumber off, + maxoff; + OffsetNumber todelete[MaxOffsetNumber]; + BlockNumber leafs_to_delete[MaxOffsetNumber]; + int ntodelete; + int deleted; + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber) blkno, + RBM_NORMAL, info->strategy); + + LockBuffer(buffer, GIST_SHARE); + page = (Page) BufferGetPage(buffer); + + if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page)) + { + /* + * This page was an internal page earlier, but now it's something + * else. Shouldn't happen... + */ + Assert(false); + UnlockReleaseBuffer(buffer); + continue; + } + + /* + * Scan all the downlinks, and see if any of them point to empty leaf + * pages. 
+ */ + maxoff = PageGetMaxOffsetNumber(page); + ntodelete = 0; + for (off = FirstOffsetNumber; + off <= maxoff && ntodelete < maxoff - 1; + off = OffsetNumberNext(off)) + { + ItemId iid = PageGetItemId(page, off); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + BlockNumber leafblk; + + leafblk = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + if (intset_is_member(vstate->empty_leaf_set, leafblk)) + { + leafs_to_delete[ntodelete] = leafblk; + todelete[ntodelete++] = off; + } + } + + /* + * In order to avoid deadlock, child page must be locked before + * parent, so we must release the lock on the parent, lock the child, + * and then re-acquire the lock the parent. (And we wouldn't want to + * do I/O, while holding a lock, anyway.) + * + * At the instant that we're not holding a lock on the parent, the + * downlink might get moved by a concurrent insert, so we must + * re-check that it still points to the same child page after we have + * acquired both locks. Also, another backend might have inserted a + * tuple to the page, so that it is no longer empty. gistdeletepage() + * re-checks all these conditions. + */ + LockBuffer(buffer, GIST_UNLOCK); + + deleted = 0; + for (int i = 0; i < ntodelete; i++) + { + Buffer leafbuf; + + /* + * Don't remove the last downlink from the parent. That would + * confuse the insertion code. + */ + if (PageGetMaxOffsetNumber(page) == FirstOffsetNumber) + break; + + leafbuf = ReadBufferExtended(rel, MAIN_FORKNUM, leafs_to_delete[i], + RBM_NORMAL, info->strategy); + LockBuffer(leafbuf, GIST_EXCLUSIVE); + gistcheckpage(rel, leafbuf); + + LockBuffer(buffer, GIST_EXCLUSIVE); + if (gistdeletepage(info, vstate->stats, + buffer, todelete[i] - deleted, + leafbuf)) + deleted++; + LockBuffer(buffer, GIST_UNLOCK); + + UnlockReleaseBuffer(leafbuf); + } + + ReleaseBuffer(buffer); + + /* + * We can stop the scan as soon as we have seen the downlinks, even if + * we were not able to remove them all. + */ + empty_pages_remaining -= ntodelete; + } +} + +/* + * gistdeletepage takes a leaf page, and its parent, and tries to delete the + * leaf. Both pages must be locked. + * + * Even if the page was empty when we first saw it, a concurrent inserter might + * have added a tuple to it since. Similarly, the downlink might have moved. + * We re-check all the conditions, to make sure the page is still deletable, + * before modifying anything. + * + * Returns true, if the page was deleted, and false if a concurrent update + * prevented it. + */ +static bool +gistdeletepage(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + Buffer parentBuffer, OffsetNumber downlink, + Buffer leafBuffer) +{ + Page parentPage = BufferGetPage(parentBuffer); + Page leafPage = BufferGetPage(leafBuffer); + ItemId iid; + IndexTuple idxtuple; + XLogRecPtr recptr; + FullTransactionId txid; + + /* + * Check that the leaf is still empty and deletable. + */ + if (!GistPageIsLeaf(leafPage)) + { + /* a leaf page should never become a non-leaf page */ + Assert(false); + return false; + } + + if (GistFollowRight(leafPage)) + return false; /* don't mess with a concurrent page split */ + + if (PageGetMaxOffsetNumber(leafPage) != InvalidOffsetNumber) + return false; /* not empty anymore */ + + /* + * Ok, the leaf is deletable. Is the downlink in the parent page still + * valid? It might have been moved by a concurrent insert. We could try + * to re-find it by scanning the page again, possibly moving right if the + * was split. But for now, let's keep it simple and just give up. 
The + * next VACUUM will pick it up. + */ + if (PageIsNew(parentPage) || GistPageIsDeleted(parentPage) || + GistPageIsLeaf(parentPage)) + { + /* shouldn't happen, internal pages are never deleted */ + Assert(false); + return false; + } + + if (PageGetMaxOffsetNumber(parentPage) < downlink + || PageGetMaxOffsetNumber(parentPage) <= FirstOffsetNumber) + return false; + + iid = PageGetItemId(parentPage, downlink); + idxtuple = (IndexTuple) PageGetItem(parentPage, iid); + if (BufferGetBlockNumber(leafBuffer) != + ItemPointerGetBlockNumber(&(idxtuple->t_tid))) + return false; + + /* + * All good, proceed with the deletion. + * + * The page cannot be immediately recycled, because in-progress scans that + * saw the downlink might still visit it. Mark the page with the current + * next-XID counter, so that we know when it can be recycled. Once that + * XID becomes older than GlobalXmin, we know that all scans that are + * currently in progress must have ended. (That's much more conservative + * than needed, but let's keep it safe and simple.) + */ + txid = ReadNextFullTransactionId(); + + START_CRIT_SECTION(); + + /* mark the page as deleted */ + MarkBufferDirty(leafBuffer); + GistPageSetDeleted(leafPage, txid); + stats->pages_newly_deleted++; + stats->pages_deleted++; + + /* remove the downlink from the parent */ + MarkBufferDirty(parentBuffer); + PageIndexTupleDelete(parentPage, downlink); + + if (RelationNeedsWAL(info->index)) + recptr = gistXLogPageDelete(leafBuffer, txid, parentBuffer, downlink); + else + recptr = gistGetFakeLSN(info->index); + PageSetLSN(parentPage, recptr); + PageSetLSN(leafPage, recptr); + + END_CRIT_SECTION(); + + return true; +} diff --git a/src/backend/access/gist/gistvalidate.c b/src/backend/access/gist/gistvalidate.c new file mode 100644 index 0000000..b885fa2 --- /dev/null +++ b/src/backend/access/gist/gistvalidate.c @@ -0,0 +1,355 @@ +/*------------------------------------------------------------------------- + * + * gistvalidate.c + * Opclass validator for GiST. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistvalidate.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/amvalidate.h" +#include "access/gist_private.h" +#include "access/htup_details.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_amproc.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_opfamily.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/regproc.h" +#include "utils/syscache.h" + + +/* + * Validator for a GiST opclass. 
+ */ +bool +gistvalidate(Oid opclassoid) +{ + bool result = true; + HeapTuple classtup; + Form_pg_opclass classform; + Oid opfamilyoid; + Oid opcintype; + Oid opckeytype; + char *opclassname; + HeapTuple familytup; + Form_pg_opfamily familyform; + char *opfamilyname; + CatCList *proclist, + *oprlist; + List *grouplist; + OpFamilyOpFuncGroup *opclassgroup; + int i; + ListCell *lc; + + /* Fetch opclass information */ + classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid)); + if (!HeapTupleIsValid(classtup)) + elog(ERROR, "cache lookup failed for operator class %u", opclassoid); + classform = (Form_pg_opclass) GETSTRUCT(classtup); + + opfamilyoid = classform->opcfamily; + opcintype = classform->opcintype; + opckeytype = classform->opckeytype; + if (!OidIsValid(opckeytype)) + opckeytype = opcintype; + opclassname = NameStr(classform->opcname); + + /* Fetch opfamily information */ + familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid)); + if (!HeapTupleIsValid(familytup)) + elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid); + familyform = (Form_pg_opfamily) GETSTRUCT(familytup); + + opfamilyname = NameStr(familyform->opfname); + + /* Fetch all operators and support functions of the opfamily */ + oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid)); + proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid)); + + /* Check individual support functions */ + for (i = 0; i < proclist->n_members; i++) + { + HeapTuple proctup = &proclist->members[i]->tuple; + Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup); + bool ok; + + /* + * All GiST support functions should be registered with matching + * left/right types + */ + if (procform->amproclefttype != procform->amprocrighttype) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains support function %s with different left and right input types", + opfamilyname, "gist", + format_procedure(procform->amproc)))); + result = false; + } + + /* + * We can't check signatures except within the specific opclass, since + * we need to know the associated opckeytype in many cases. 
+ */ + if (procform->amproclefttype != opcintype) + continue; + + /* Check procedure numbers and function signatures */ + switch (procform->amprocnum) + { + case GIST_CONSISTENT_PROC: + ok = check_amproc_signature(procform->amproc, BOOLOID, false, + 5, 5, INTERNALOID, opcintype, + INT2OID, OIDOID, INTERNALOID); + break; + case GIST_UNION_PROC: + ok = check_amproc_signature(procform->amproc, opckeytype, false, + 2, 2, INTERNALOID, INTERNALOID); + break; + case GIST_COMPRESS_PROC: + case GIST_DECOMPRESS_PROC: + case GIST_FETCH_PROC: + ok = check_amproc_signature(procform->amproc, INTERNALOID, true, + 1, 1, INTERNALOID); + break; + case GIST_PENALTY_PROC: + ok = check_amproc_signature(procform->amproc, INTERNALOID, true, + 3, 3, INTERNALOID, + INTERNALOID, INTERNALOID); + break; + case GIST_PICKSPLIT_PROC: + ok = check_amproc_signature(procform->amproc, INTERNALOID, true, + 2, 2, INTERNALOID, INTERNALOID); + break; + case GIST_EQUAL_PROC: + ok = check_amproc_signature(procform->amproc, INTERNALOID, false, + 3, 3, opckeytype, opckeytype, + INTERNALOID); + break; + case GIST_DISTANCE_PROC: + ok = check_amproc_signature(procform->amproc, FLOAT8OID, false, + 5, 5, INTERNALOID, opcintype, + INT2OID, OIDOID, INTERNALOID); + break; + case GIST_OPTIONS_PROC: + ok = check_amoptsproc_signature(procform->amproc); + break; + case GIST_SORTSUPPORT_PROC: + ok = check_amproc_signature(procform->amproc, VOIDOID, true, + 1, 1, INTERNALOID); + break; + default: + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d", + opfamilyname, "gist", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + continue; /* don't want additional message */ + } + + if (!ok) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d", + opfamilyname, "gist", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + } + } + + /* Check individual operators */ + for (i = 0; i < oprlist->n_members; i++) + { + HeapTuple oprtup = &oprlist->members[i]->tuple; + Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup); + Oid op_rettype; + + /* TODO: Check that only allowed strategy numbers exist */ + if (oprform->amopstrategy < 1) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d", + opfamilyname, "gist", + format_operator(oprform->amopopr), + oprform->amopstrategy))); + result = false; + } + + /* GiST supports ORDER BY operators */ + if (oprform->amoppurpose != AMOP_SEARCH) + { + /* ... but must have matching distance proc */ + if (!OidIsValid(get_opfamily_proc(opfamilyoid, + oprform->amoplefttype, + oprform->amoplefttype, + GIST_DISTANCE_PROC))) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains unsupported ORDER BY specification for operator %s", + opfamilyname, "gist", + format_operator(oprform->amopopr)))); + result = false; + } + /* ... 
and operator result must match the claimed btree opfamily */ + op_rettype = get_op_rettype(oprform->amopopr); + if (!opfamily_can_sort_type(oprform->amopsortfamily, op_rettype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains incorrect ORDER BY opfamily specification for operator %s", + opfamilyname, "gist", + format_operator(oprform->amopopr)))); + result = false; + } + } + else + { + /* Search operators must always return bool */ + op_rettype = BOOLOID; + } + + /* Check operator signature */ + if (!check_amop_signature(oprform->amopopr, op_rettype, + oprform->amoplefttype, + oprform->amoprighttype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature", + opfamilyname, "gist", + format_operator(oprform->amopopr)))); + result = false; + } + } + + /* Now check for inconsistent groups of operators/functions */ + grouplist = identify_opfamily_groups(oprlist, proclist); + opclassgroup = NULL; + foreach(lc, grouplist) + { + OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc); + + /* Remember the group exactly matching the test opclass */ + if (thisgroup->lefttype == opcintype && + thisgroup->righttype == opcintype) + opclassgroup = thisgroup; + + /* + * There is not a lot we can do to check the operator sets, since each + * GiST opclass is more or less a law unto itself, and some contain + * only operators that are binary-compatible with the opclass datatype + * (meaning that empty operator sets can be OK). That case also means + * that we shouldn't insist on nonempty function sets except for the + * opclass's own group. + */ + } + + /* Check that the originally-named opclass is complete */ + for (i = 1; i <= GISTNProcs; i++) + { + if (opclassgroup && + (opclassgroup->functionset & (((uint64) 1) << i)) != 0) + continue; /* got it */ + if (i == GIST_DISTANCE_PROC || i == GIST_FETCH_PROC || + i == GIST_COMPRESS_PROC || i == GIST_DECOMPRESS_PROC || + i == GIST_OPTIONS_PROC || i == GIST_SORTSUPPORT_PROC) + continue; /* optional methods */ + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator class \"%s\" of access method %s is missing support function %d", + opclassname, "gist", i))); + result = false; + } + + ReleaseCatCacheList(proclist); + ReleaseCatCacheList(oprlist); + ReleaseSysCache(familytup); + ReleaseSysCache(classtup); + + return result; +} + +/* + * Prechecking function for adding operators/functions to a GiST opfamily. + */ +void +gistadjustmembers(Oid opfamilyoid, + Oid opclassoid, + List *operators, + List *functions) +{ + ListCell *lc; + + /* + * Operator members of a GiST opfamily should never have hard + * dependencies, since their connection to the opfamily depends only on + * what the support functions think, and that can be altered. For + * consistency, we make all soft dependencies point to the opfamily, + * though a soft dependency on the opclass would work as well in the + * CREATE OPERATOR CLASS case. + */ + foreach(lc, operators) + { + OpFamilyMember *op = (OpFamilyMember *) lfirst(lc); + + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + } + + /* + * Required support functions should have hard dependencies. Preferably + * those are just dependencies on the opclass, but if we're in ALTER + * OPERATOR FAMILY, we leave the dependency pointing at the whole + * opfamily. 
(Given that GiST opclasses generally don't share opfamilies, + * it seems unlikely to be worth working harder.) + */ + foreach(lc, functions) + { + OpFamilyMember *op = (OpFamilyMember *) lfirst(lc); + + switch (op->number) + { + case GIST_CONSISTENT_PROC: + case GIST_UNION_PROC: + case GIST_PENALTY_PROC: + case GIST_PICKSPLIT_PROC: + case GIST_EQUAL_PROC: + /* Required support function */ + op->ref_is_hard = true; + break; + case GIST_COMPRESS_PROC: + case GIST_DECOMPRESS_PROC: + case GIST_DISTANCE_PROC: + case GIST_FETCH_PROC: + case GIST_OPTIONS_PROC: + case GIST_SORTSUPPORT_PROC: + /* Optional, so force it to be a soft family dependency */ + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("support function number %d is invalid for access method %s", + op->number, "gist"))); + break; + } + } +} diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c new file mode 100644 index 0000000..6464cb9 --- /dev/null +++ b/src/backend/access/gist/gistxlog.c @@ -0,0 +1,696 @@ +/*------------------------------------------------------------------------- + * + * gistxlog.c + * WAL replay logic for GiST. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistxlog.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/bufmask.h" +#include "access/gist_private.h" +#include "access/gistxlog.h" +#include "access/heapam_xlog.h" +#include "access/transam.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "storage/procarray.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +static MemoryContext opCtx; /* working memory for operations */ + +/* + * Replay the clearing of F_FOLLOW_RIGHT flag on a child page. + * + * Even if the WAL record includes a full-page image, we have to update the + * follow-right flag, because that change is not included in the full-page + * image. To be sure that the intermediate state with the wrong flag value is + * not visible to concurrent Hot Standby queries, this function handles + * restoring the full-page image as well as updating the flag. (Note that + * we never need to do anything else to the child page in the current WAL + * action.) + */ +static void +gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buffer; + Page page; + XLogRedoAction action; + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the updated NSN is not included in the image. 
+ */ + action = XLogReadBufferForRedo(record, block_id, &buffer); + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) + { + page = BufferGetPage(buffer); + + GistPageSetNSN(page, lsn); + GistClearFollowRight(page); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * redo any page update (except page split) + */ +static void +gistRedoPageUpdateRecord(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record); + Buffer buffer; + Page page; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + char *begin; + char *data; + Size datalen; + int ninserted = 0; + + data = begin = XLogRecGetBlockData(record, 0, &datalen); + + page = (Page) BufferGetPage(buffer); + + if (xldata->ntodelete == 1 && xldata->ntoinsert == 1) + { + /* + * When replacing one tuple with one other tuple, we must use + * PageIndexTupleOverwrite for consistency with gistplacetopage. + */ + OffsetNumber offnum = *((OffsetNumber *) data); + IndexTuple itup; + Size itupsize; + + data += sizeof(OffsetNumber); + itup = (IndexTuple) data; + itupsize = IndexTupleSize(itup); + if (!PageIndexTupleOverwrite(page, offnum, (Item) itup, itupsize)) + elog(ERROR, "failed to add item to GiST index page, size %d bytes", + (int) itupsize); + data += itupsize; + /* should be nothing left after consuming 1 tuple */ + Assert(data - begin == datalen); + /* update insertion count for assert check below */ + ninserted++; + } + else if (xldata->ntodelete > 0) + { + /* Otherwise, delete old tuples if any */ + OffsetNumber *todelete = (OffsetNumber *) data; + + data += sizeof(OffsetNumber) * xldata->ntodelete; + + PageIndexMultiDelete(page, todelete, xldata->ntodelete); + if (GistPageIsLeaf(page)) + GistMarkTuplesDeleted(page); + } + + /* Add new tuples if any */ + if (data - begin < datalen) + { + OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber : + OffsetNumberNext(PageGetMaxOffsetNumber(page)); + + while (data - begin < datalen) + { + IndexTuple itup = (IndexTuple) data; + Size sz = IndexTupleSize(itup); + OffsetNumber l; + + data += sz; + + l = PageAddItem(page, (Item) itup, sz, off, false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to GiST index page, size %d bytes", + (int) sz); + off++; + ninserted++; + } + } + + /* Check that XLOG record contained expected number of tuples */ + Assert(ninserted == xldata->ntoinsert); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + /* + * Fix follow-right data on left child page + * + * This must be done while still holding the lock on the target page. Note + * that even if the target page no longer exists, we still attempt to + * replay the change on the child page. + */ + if (XLogRecHasBlockRef(record, 1)) + gistRedoClearFollowRight(record, 1); + + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + + +/* + * redo delete on gist index page to remove tuples marked as DEAD during index + * tuple insertion + */ +static void +gistRedoDeleteRecord(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + gistxlogDelete *xldata = (gistxlogDelete *) XLogRecGetData(record); + Buffer buffer; + Page page; + + /* + * If we have any conflict processing to do, it must happen before we + * update the page. + * + * GiST delete records can conflict with standby queries. 
You might think + * that vacuum records would conflict as well, but we've handled that + * already. XLOG_HEAP2_PRUNE records provide the highest xid cleaned by + * the vacuum of the heap and so we can resolve any conflicts just once + * when that arrives. After that we know that no conflicts exist from + * individual gist vacuum records on that index. + */ + if (InHotStandby) + { + RelFileNode rnode; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); + + ResolveRecoveryConflictWithSnapshot(xldata->latestRemovedXid, rnode); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + if (XLogRecGetDataLen(record) > SizeOfGistxlogDelete) + { + OffsetNumber *todelete; + + todelete = (OffsetNumber *) ((char *) xldata + SizeOfGistxlogDelete); + + PageIndexMultiDelete(page, todelete, xldata->ntodelete); + } + + GistClearPageHasGarbage(page); + GistMarkTuplesDeleted(page); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * Returns an array of index pointers. + */ +static IndexTuple * +decodePageSplitRecord(char *begin, int len, int *n) +{ + char *ptr; + int i = 0; + IndexTuple *tuples; + + /* extract the number of tuples */ + memcpy(n, begin, sizeof(int)); + ptr = begin + sizeof(int); + + tuples = palloc(*n * sizeof(IndexTuple)); + + for (i = 0; i < *n; i++) + { + Assert(ptr - begin < len); + tuples[i] = (IndexTuple) ptr; + ptr += IndexTupleSize((IndexTuple) ptr); + } + Assert(ptr - begin == len); + + return tuples; +} + +static void +gistRedoPageSplitRecord(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record); + Buffer firstbuffer = InvalidBuffer; + Buffer buffer; + Page page; + int i; + bool isrootsplit = false; + + /* + * We must hold lock on the first-listed page throughout the action, + * including while updating the left child page (if any). We can unlock + * remaining pages in the list as soon as they've been written, because + * there is no path for concurrent queries to reach those pages without + * first visiting the first-listed page. 
+ */ + + /* loop around all pages */ + for (i = 0; i < xldata->npage; i++) + { + int flags; + char *data; + Size datalen; + int num; + BlockNumber blkno; + IndexTuple *tuples; + + XLogRecGetBlockTag(record, i + 1, NULL, NULL, &blkno); + if (blkno == GIST_ROOT_BLKNO) + { + Assert(i == 0); + isrootsplit = true; + } + + buffer = XLogInitBufferForRedo(record, i + 1); + page = (Page) BufferGetPage(buffer); + data = XLogRecGetBlockData(record, i + 1, &datalen); + + tuples = decodePageSplitRecord(data, datalen, &num); + + /* ok, clear buffer */ + if (xldata->origleaf && blkno != GIST_ROOT_BLKNO) + flags = F_LEAF; + else + flags = 0; + GISTInitBuffer(buffer, flags); + + /* and fill it */ + gistfillbuffer(page, tuples, num, FirstOffsetNumber); + + if (blkno == GIST_ROOT_BLKNO) + { + GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; + GistPageSetNSN(page, xldata->orignsn); + GistClearFollowRight(page); + } + else + { + if (i < xldata->npage - 1) + { + BlockNumber nextblkno; + + XLogRecGetBlockTag(record, i + 2, NULL, NULL, &nextblkno); + GistPageGetOpaque(page)->rightlink = nextblkno; + } + else + GistPageGetOpaque(page)->rightlink = xldata->origrlink; + GistPageSetNSN(page, xldata->orignsn); + if (i < xldata->npage - 1 && !isrootsplit && + xldata->markfollowright) + GistMarkFollowRight(page); + else + GistClearFollowRight(page); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + + if (i == 0) + firstbuffer = buffer; + else + UnlockReleaseBuffer(buffer); + } + + /* Fix follow-right data on left child page, if any */ + if (XLogRecHasBlockRef(record, 0)) + gistRedoClearFollowRight(record, 0); + + /* Finally, release lock on the first page */ + UnlockReleaseBuffer(firstbuffer); +} + +/* redo page deletion */ +static void +gistRedoPageDelete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record); + Buffer parentBuffer; + Buffer leafBuffer; + + if (XLogReadBufferForRedo(record, 0, &leafBuffer) == BLK_NEEDS_REDO) + { + Page page = (Page) BufferGetPage(leafBuffer); + + GistPageSetDeleted(page, xldata->deleteXid); + + PageSetLSN(page, lsn); + MarkBufferDirty(leafBuffer); + } + + if (XLogReadBufferForRedo(record, 1, &parentBuffer) == BLK_NEEDS_REDO) + { + Page page = (Page) BufferGetPage(parentBuffer); + + PageIndexTupleDelete(page, xldata->downlinkOffset); + + PageSetLSN(page, lsn); + MarkBufferDirty(parentBuffer); + } + + if (BufferIsValid(parentBuffer)) + UnlockReleaseBuffer(parentBuffer); + if (BufferIsValid(leafBuffer)) + UnlockReleaseBuffer(leafBuffer); +} + +static void +gistRedoPageReuse(XLogReaderState *record) +{ + gistxlogPageReuse *xlrec = (gistxlogPageReuse *) XLogRecGetData(record); + + /* + * PAGE_REUSE records exist to provide a conflict point when we reuse + * pages in the index via the FSM. That's all they do though. + * + * latestRemovedXid was the page's deleteXid. The + * GlobalVisCheckRemovableFullXid(deleteXid) test in gistPageRecyclable() + * conceptually mirrors the PGPROC->xmin > limitXmin test in + * GetConflictingVirtualXIDs(). Consequently, one XID value achieves the + * same exclusion effect on primary and standby. + */ + if (InHotStandby) + ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid, + xlrec->node); +} + +void +gist_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + MemoryContext oldCxt; + + /* + * GiST indexes do not require any conflict processing. 
NB: If we ever + * implement a similar optimization we have in b-tree, and remove killed + * tuples outside VACUUM, we'll need to handle that here. + */ + + oldCxt = MemoryContextSwitchTo(opCtx); + switch (info) + { + case XLOG_GIST_PAGE_UPDATE: + gistRedoPageUpdateRecord(record); + break; + case XLOG_GIST_DELETE: + gistRedoDeleteRecord(record); + break; + case XLOG_GIST_PAGE_REUSE: + gistRedoPageReuse(record); + break; + case XLOG_GIST_PAGE_SPLIT: + gistRedoPageSplitRecord(record); + break; + case XLOG_GIST_PAGE_DELETE: + gistRedoPageDelete(record); + break; + case XLOG_GIST_ASSIGN_LSN: + /* nop. See gistGetFakeLSN(). */ + break; + default: + elog(PANIC, "gist_redo: unknown op code %u", info); + } + + MemoryContextSwitchTo(oldCxt); + MemoryContextReset(opCtx); +} + +void +gist_xlog_startup(void) +{ + opCtx = createTempGistContext(); +} + +void +gist_xlog_cleanup(void) +{ + MemoryContextDelete(opCtx); +} + +/* + * Mask a Gist page before running consistency checks on it. + */ +void +gist_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + + mask_page_lsn_and_checksum(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + /* + * NSN is nothing but a special purpose LSN. Hence, mask it for the same + * reason as mask_page_lsn_and_checksum. + */ + GistPageSetNSN(page, (uint64) MASK_MARKER); + + /* + * We update F_FOLLOW_RIGHT flag on the left child after writing WAL + * record. Hence, mask this flag. See gistplacetopage() for details. + */ + GistMarkFollowRight(page); + + if (GistPageIsLeaf(page)) + { + /* + * In gist leaf pages, it is possible to modify the LP_FLAGS without + * emitting any WAL record. Hence, mask the line pointer flags. See + * gistkillitems() for details. + */ + mask_lp_flags(page); + } + + /* + * During gist redo, we never mark a page as garbage. Hence, mask it to + * ignore any differences. + */ + GistClearPageHasGarbage(page); +} + +/* + * Write WAL record of a page split. + */ +XLogRecPtr +gistXLogSplit(bool page_is_leaf, + SplitedPageLayout *dist, + BlockNumber origrlink, GistNSN orignsn, + Buffer leftchildbuf, bool markfollowright) +{ + gistxlogPageSplit xlrec; + SplitedPageLayout *ptr; + int npage = 0; + XLogRecPtr recptr; + int i; + + for (ptr = dist; ptr; ptr = ptr->next) + npage++; + + xlrec.origrlink = origrlink; + xlrec.orignsn = orignsn; + xlrec.origleaf = page_is_leaf; + xlrec.npage = (uint16) npage; + xlrec.markfollowright = markfollowright; + + XLogBeginInsert(); + + /* + * Include a full page image of the child buf. (only necessary if a + * checkpoint happened since the child page was split) + */ + if (BufferIsValid(leftchildbuf)) + XLogRegisterBuffer(0, leftchildbuf, REGBUF_STANDARD); + + /* + * NOTE: We register a lot of data. The caller must've called + * XLogEnsureRecordSpace() to prepare for that. We cannot do it here, + * because we're already in a critical section. If you change the number + * of buffer or data registrations here, make sure you modify the + * XLogEnsureRecordSpace() calls accordingly! + */ + XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageSplit)); + + i = 1; + for (ptr = dist; ptr; ptr = ptr->next) + { + XLogRegisterBuffer(i, ptr->buffer, REGBUF_WILL_INIT); + XLogRegisterBufData(i, (char *) &(ptr->block.num), sizeof(int)); + XLogRegisterBufData(i, (char *) ptr->list, ptr->lenlist); + i++; + } + + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT); + + return recptr; +} + +/* + * Write XLOG record describing a page deletion. This also includes removal of + * downlink from the parent page. 
+ */ +XLogRecPtr +gistXLogPageDelete(Buffer buffer, FullTransactionId xid, + Buffer parentBuffer, OffsetNumber downlinkOffset) +{ + gistxlogPageDelete xlrec; + XLogRecPtr recptr; + + xlrec.deleteXid = xid; + xlrec.downlinkOffset = downlinkOffset; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfGistxlogPageDelete); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBuffer(1, parentBuffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE); + + return recptr; +} + +/* + * Write an empty XLOG record to assign a distinct LSN. + */ +XLogRecPtr +gistXLogAssignLSN(void) +{ + int dummy = 0; + + /* + * Records other than SWITCH_WAL must have content. We use an integer 0 to + * follow the restriction. + */ + XLogBeginInsert(); + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); + XLogRegisterData((char *) &dummy, sizeof(dummy)); + return XLogInsert(RM_GIST_ID, XLOG_GIST_ASSIGN_LSN); +} + +/* + * Write XLOG record about reuse of a deleted page. + */ +void +gistXLogPageReuse(Relation rel, BlockNumber blkno, FullTransactionId latestRemovedXid) +{ + gistxlogPageReuse xlrec_reuse; + + /* + * Note that we don't register the buffer with the record, because this + * operation doesn't modify the page. This record only exists to provide a + * conflict point for Hot Standby. + */ + + /* XLOG stuff */ + xlrec_reuse.node = rel->rd_node; + xlrec_reuse.block = blkno; + xlrec_reuse.latestRemovedFullXid = latestRemovedXid; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec_reuse, SizeOfGistxlogPageReuse); + + XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_REUSE); +} + +/* + * Write XLOG record describing a page update. The update can include any + * number of deletions and/or insertions of tuples on a single index page. + * + * If this update inserts a downlink for a split page, also record that + * the F_FOLLOW_RIGHT flag on the child page is cleared and NSN set. + * + * Note that both the todelete array and the tuples are marked as belonging + * to the target buffer; they need not be stored in XLOG if XLogInsert decides + * to log the whole buffer contents instead. + */ +XLogRecPtr +gistXLogUpdate(Buffer buffer, + OffsetNumber *todelete, int ntodelete, + IndexTuple *itup, int ituplen, + Buffer leftchildbuf) +{ + gistxlogPageUpdate xlrec; + int i; + XLogRecPtr recptr; + + xlrec.ntodelete = ntodelete; + xlrec.ntoinsert = ituplen; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageUpdate)); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) todelete, sizeof(OffsetNumber) * ntodelete); + + /* new tuples */ + for (i = 0; i < ituplen; i++) + XLogRegisterBufData(0, (char *) (itup[i]), IndexTupleSize(itup[i])); + + /* + * Include a full page image of the child buf. (only necessary if a + * checkpoint happened since the child page was split) + */ + if (BufferIsValid(leftchildbuf)) + XLogRegisterBuffer(1, leftchildbuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE); + + return recptr; +} + +/* + * Write XLOG record describing a delete of leaf index tuples marked as DEAD + * during new tuple insertion. One may think that this case is already covered + * by gistXLogUpdate(). But deletion of index tuples might conflict with + * standby queries and needs special handling. 
+ */
+XLogRecPtr
+gistXLogDelete(Buffer buffer, OffsetNumber *todelete, int ntodelete,
+			   TransactionId latestRemovedXid)
+{
+	gistxlogDelete xlrec;
+	XLogRecPtr	recptr;
+
+	xlrec.latestRemovedXid = latestRemovedXid;
+	xlrec.ntodelete = ntodelete;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, SizeOfGistxlogDelete);
+
+	/*
+	 * We need the target-offsets array whether or not we store the whole
+	 * buffer, to allow us to find the latestRemovedXid on a standby server.
+	 */
+	XLogRegisterData((char *) todelete, ntodelete * sizeof(OffsetNumber));
+
+	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+
+	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_DELETE);
+
+	return recptr;
+}
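
The hand-optimized tail recursion in gistvacuumpage() above (the "goto restart" at the bottom of the function) is worth a standalone illustration: recursing normally would put another copy of the large todelete[] array on the stack for every page that has to be revisited, so the function instead overwrites its argument and jumps back to the top. The sketch below is not PostgreSQL code; process_page() and its page numbering are hypothetical stand-ins, but the control flow mirrors the pattern.

    #include <stdio.h>

    #define INVALID_PAGE (-1)

    /*
     * Hypothetical per-page work.  Returns an earlier page that must be
     * revisited, or INVALID_PAGE if none.  The scratch array stands in for
     * the large todelete[] buffer in gistvacuumpage().
     */
    static int
    process_page(int pageno)
    {
        int     scratch[1024];

        scratch[0] = pageno;
        printf("processed page %d (scratch[0] = %d)\n", pageno, scratch[0]);

        /* pretend that page 5 redirects us back to page 3, once */
        return (pageno == 5) ? 3 : INVALID_PAGE;
    }

    static void
    vacuum_one_page(int pageno)
    {
        int     recurse_to;

    restart:
        recurse_to = process_page(pageno);

        /*
         * Tail call replaced by a jump: no new stack frame, and therefore
         * no second copy of the scratch array, per revisited page.
         */
        if (recurse_to != INVALID_PAGE)
        {
            pageno = recurse_to;
            goto restart;
        }
    }

    int
    main(void)
    {
        vacuum_one_page(5);     /* processes page 5, then revisits page 3 */
        return 0;
    }

In the real code the redirect target comes from the page's rightlink, taken only when a concurrent split is detected and the rightlink points below the block the outer scan has already reached.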
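
A second pattern running through the redo routines in gistxlog.c is LSN gating: roughly speaking, XLogReadBufferForRedo() reports BLK_NEEDS_REDO only when the page's LSN is older than the record being replayed, and each routine above finishes with PageSetLSN(), so replaying the same record twice is harmless. The toy sketch below uses hypothetical names and a deliberately simplified "page"; it only shows why that convention makes redo idempotent. (gistRedoClearFollowRight() is the one routine above that also acts on BLK_RESTORED pages, because the NSN and flag change are not captured in a full-page image.)

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t ToyLSN;

    typedef struct ToyPage
    {
        ToyLSN  lsn;        /* LSN of the last record applied to the page */
        int     ntuples;    /* toy payload */
    } ToyPage;

    /*
     * Replay "delete one tuple" logged at record_lsn.  Applying the same
     * record a second time (e.g. after a crash during recovery) is a no-op,
     * because the page already carries that LSN.
     */
    static void
    toy_redo_delete(ToyPage *page, ToyLSN record_lsn)
    {
        if (page->lsn >= record_lsn)
            return;                 /* page already reflects this record */

        page->ntuples--;            /* the actual redo action */
        page->lsn = record_lsn;     /* what PageSetLSN() does in the real code */
    }

    int
    main(void)
    {
        ToyPage page = {10, 5};

        toy_redo_delete(&page, 20); /* applied: ntuples 5 -> 4 */
        toy_redo_delete(&page, 20); /* replayed: no change */
        printf("ntuples = %d, lsn = %llu\n", page.ntuples,
               (unsigned long long) page.lsn);
        return 0;
    }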