diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
commit | 46651ce6fe013220ed397add242004d764fc0153 (patch) | |
tree | 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/access/gin | |
parent | Initial commit. (diff) | |
download | postgresql-14-upstream.tar.xz postgresql-14-upstream.zip |
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/access/gin')
-rw-r--r-- | src/backend/access/gin/Makefile | 32 | ||||
-rw-r--r-- | src/backend/access/gin/README | 562 | ||||
-rw-r--r-- | src/backend/access/gin/ginarrayproc.c | 305 | ||||
-rw-r--r-- | src/backend/access/gin/ginbtree.c | 795 | ||||
-rw-r--r-- | src/backend/access/gin/ginbulk.c | 293 | ||||
-rw-r--r-- | src/backend/access/gin/gindatapage.c | 1942 | ||||
-rw-r--r-- | src/backend/access/gin/ginentrypage.c | 772 | ||||
-rw-r--r-- | src/backend/access/gin/ginfast.c | 1068 | ||||
-rw-r--r-- | src/backend/access/gin/ginget.c | 1970 | ||||
-rw-r--r-- | src/backend/access/gin/gininsert.c | 541 | ||||
-rw-r--r-- | src/backend/access/gin/ginlogic.c | 246 | ||||
-rw-r--r-- | src/backend/access/gin/ginpostinglist.c | 434 | ||||
-rw-r--r-- | src/backend/access/gin/ginscan.c | 468 | ||||
-rw-r--r-- | src/backend/access/gin/ginutil.c | 707 | ||||
-rw-r--r-- | src/backend/access/gin/ginvacuum.c | 822 | ||||
-rw-r--r-- | src/backend/access/gin/ginvalidate.c | 338 | ||||
-rw-r--r-- | src/backend/access/gin/ginxlog.c | 813 |
17 files changed, 12108 insertions, 0 deletions
diff --git a/src/backend/access/gin/Makefile b/src/backend/access/gin/Makefile new file mode 100644 index 0000000..3fceaee --- /dev/null +++ b/src/backend/access/gin/Makefile @@ -0,0 +1,32 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/gin +# +# IDENTIFICATION +# src/backend/access/gin/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/gin +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + ginarrayproc.o \ + ginbtree.o \ + ginbulk.o \ + gindatapage.o \ + ginentrypage.o \ + ginfast.o \ + ginget.o \ + gininsert.o \ + ginlogic.o \ + ginpostinglist.o \ + ginscan.o \ + ginutil.o \ + ginvacuum.o \ + ginvalidate.o \ + ginxlog.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README new file mode 100644 index 0000000..41d4e1e --- /dev/null +++ b/src/backend/access/gin/README @@ -0,0 +1,562 @@ +src/backend/access/gin/README + +Gin for PostgreSQL +================== + +Gin was sponsored by jfg://networks (http://www.jfg-networks.com/) + +Gin stands for Generalized Inverted Index and should be considered as a genie, +not a drink. + +Generalized means that the index does not know which operation it accelerates. +It instead works with custom strategies, defined for specific data types (read +"Index Method Strategies" in the PostgreSQL documentation). In that sense, Gin +is similar to GiST and differs from btree indices, which have predefined, +comparison-based operations. + +An inverted index is an index structure storing a set of (key, posting list) +pairs, where 'posting list' is a set of heap rows in which the key occurs. +(A text document would usually contain many keys.) The primary goal of +Gin indices is support for highly scalable, full-text search in PostgreSQL. 
+ +A Gin index consists of a B-tree index constructed over key values, +where each key is an element of some indexed items (element of array, lexeme +for tsvector) and where each tuple in a leaf page contains either a pointer to +a B-tree over item pointers (posting tree), or a simple list of item pointers +(posting list) if the list is small enough. + +Note: There is no delete operation in the key (entry) tree. The reason for +this is that in our experience, the set of distinct words in a large corpus +changes very slowly. This greatly simplifies the code and concurrency +algorithms. + +Core PostgreSQL includes built-in Gin support for one-dimensional arrays +(eg. integer[], text[]). The following operations are available: + + * contains: value_array @> query_array + * overlaps: value_array && query_array + * is contained by: value_array <@ query_array + +Synopsis +-------- + +=# create index txt_idx on aa using gin(a); + +Features +-------- + + * Concurrency + * Write-Ahead Logging (WAL). (Recoverability from crashes.) + * User-defined opclasses. (The scheme is similar to GiST.) + * Optimized index creation (Makes use of maintenance_work_mem to accumulate + postings in memory.) + * Text search support via an opclass + * Soft upper limit on the returned results set using a GUC variable: + gin_fuzzy_search_limit + +Gin Fuzzy Limit +--------------- + +There are often situations when a full-text search returns a very large set of +results. Since reading tuples from the disk and sorting them could take a +lot of time, this is unacceptable for production. (Note that the search +itself is very fast.) + +Such queries usually contain very frequent lexemes, so the results are not +very helpful. To facilitate execution of such queries Gin has a configurable +soft upper limit on the size of the returned set, determined by the +'gin_fuzzy_search_limit' GUC variable. This is set to 0 by default (no +limit). 
+ +If a non-zero search limit is set, then the returned set is a subset of the +whole result set, chosen at random. + +"Soft" means that the actual number of returned results could differ +from the specified limit, depending on the query and the quality of the +system's random number generator. + +From experience, a value of 'gin_fuzzy_search_limit' in the thousands +(eg. 5000-20000) works well. This means that 'gin_fuzzy_search_limit' will +have no effect for queries returning a result set with less tuples than this +number. + +Index structure +--------------- + +The "items" that a GIN index indexes are composite values that contain +zero or more "keys". For example, an item might be an integer array, and +then the keys would be the individual integer values. The index actually +stores and searches for the key values, not the items per se. In the +pg_opclass entry for a GIN opclass, the opcintype is the data type of the +items, and the opckeytype is the data type of the keys. GIN is optimized +for cases where items contain many keys and the same key values appear +in many different items. + +A GIN index contains a metapage, a btree of key entries, and possibly +"posting tree" pages, which hold the overflow when a key entry acquires +too many heap tuple pointers to fit in a btree page. Additionally, if the +fast-update feature is enabled, there can be "list pages" holding "pending" +key entries that haven't yet been merged into the main btree. The list +pages have to be scanned linearly when doing a search, so the pending +entries should be merged into the main btree before there get to be too +many of them. The advantage of the pending list is that bulk insertion of +a few thousand entries can be much faster than retail insertion. (The win +comes mainly from not having to do multiple searches/insertions when the +same key appears in multiple new heap tuples.) 
+ +Key entries are nominally of the same IndexTuple format as used in other +index types, but since a leaf key entry typically refers to multiple heap +tuples, there are significant differences. (See GinFormTuple, which works +by building a "normal" index tuple and then modifying it.) The points to +know are: + +* In a single-column index, a key tuple just contains the key datum, but +in a multi-column index, a key tuple contains the pair (column number, +key datum) where the column number is stored as an int2. This is needed +to support different key data types in different columns. This much of +the tuple is built by index_form_tuple according to the usual rules. +The column number (if present) can never be null, but the key datum can +be, in which case a null bitmap is present as usual. (As usual for index +tuples, the size of the null bitmap is fixed at INDEX_MAX_KEYS.) + +* If the key datum is null (ie, IndexTupleHasNulls() is true), then +just after the nominal index data (ie, at offset IndexInfoFindDataOffset +or IndexInfoFindDataOffset + sizeof(int2)) there is a byte indicating +the "category" of the null entry. These are the possible categories: + 1 = ordinary null key value extracted from an indexable item + 2 = placeholder for zero-key indexable item + 3 = placeholder for null indexable item +Placeholder null entries are inserted into the index because otherwise +there would be no index entry at all for an empty or null indexable item, +which would mean that full index scans couldn't be done and various corner +cases would give wrong answers. The different categories of null entries +are treated as distinct keys by the btree, but heap itempointers for the +same category of null entry are merged into one index entry just as happens +with ordinary key entries. 
+ +* In a key entry at the btree leaf level, at the next SHORTALIGN boundary, +there is a list of item pointers, in compressed format (see Posting List +Compression section), pointing to the heap tuples for which the indexable +items contain this key. This is called the "posting list". + +If the list would be too big for the index tuple to fit on an index page, the +ItemPointers are pushed out to a separate posting page or pages, and none +appear in the key entry itself. The separate pages are called a "posting +tree" (see below); Note that in either case, the ItemPointers associated with +a key can easily be read out in sorted order; this is relied on by the scan +algorithms. + +* The index tuple header fields of a leaf key entry are abused as follows: + +1) Posting list case: + +* ItemPointerGetBlockNumber(&itup->t_tid) contains the offset from index + tuple start to the posting list. + Access macros: GinGetPostingOffset(itup) / GinSetPostingOffset(itup,n) + +* ItemPointerGetOffsetNumber(&itup->t_tid) contains the number of elements + in the posting list (number of heap itempointers). + Access macros: GinGetNPosting(itup) / GinSetNPosting(itup,n) + +* If IndexTupleHasNulls(itup) is true, the null category byte can be + accessed/set with GinGetNullCategory(itup,gs) / GinSetNullCategory(itup,gs,c) + +* The posting list can be accessed with GinGetPosting(itup) + +* If GinItupIsCompressed(itup), the posting list is stored in compressed + format. Otherwise it is just an array of ItemPointers. New tuples are always + stored in compressed format, uncompressed items can be present if the + database was migrated from 9.3 or earlier version. + +2) Posting tree case: + +* ItemPointerGetBlockNumber(&itup->t_tid) contains the index block number + of the root of the posting tree. 
+ Access macros: GinGetPostingTree(itup) / GinSetPostingTree(itup, blkno) + +* ItemPointerGetOffsetNumber(&itup->t_tid) contains the magic number + GIN_TREE_POSTING, which distinguishes this from the posting-list case + (it's large enough that that many heap itempointers couldn't possibly + fit on an index page). This value is inserted automatically by the + GinSetPostingTree macro. + +* If IndexTupleHasNulls(itup) is true, the null category byte can be + accessed/set with GinGetNullCategory(itup,gs) / GinSetNullCategory(itup,gs,c) + +* The posting list is not present and must not be accessed. + +Use the macro GinIsPostingTree(itup) to determine which case applies. + +In both cases, itup->t_info & INDEX_SIZE_MASK contains actual total size of +tuple, and the INDEX_VAR_MASK and INDEX_NULL_MASK bits have their normal +meanings as set by index_form_tuple. + +Index tuples in non-leaf levels of the btree contain the optional column +number, key datum, and null category byte as above. They do not contain +a posting list. ItemPointerGetBlockNumber(&itup->t_tid) is the downlink +to the next lower btree level, and ItemPointerGetOffsetNumber(&itup->t_tid) +is InvalidOffsetNumber. Use the access macros GinGetDownlink/GinSetDownlink +to get/set the downlink. + +Index entries that appear in "pending list" pages work a tad differently as +well. The optional column number, key datum, and null category byte are as +for other GIN index entries. However, there is always exactly one heap +itempointer associated with a pending entry, and it is stored in the t_tid +header field just as in non-GIN indexes. There is no posting list. +Furthermore, the code that searches the pending list assumes that all +entries for a given heap tuple appear consecutively in the pending list and +are sorted by the column-number-plus-key-datum. The GIN_LIST_FULLROW page +flag bit tells whether entries for a given heap tuple are spread across +multiple pending-list pages. 
If GIN_LIST_FULLROW is set, the page contains +all the entries for one or more heap tuples. If GIN_LIST_FULLROW is clear, +the page contains entries for only one heap tuple, *and* they are not all +the entries for that tuple. (Thus, a heap tuple whose entries do not all +fit on one pending-list page must have those pages to itself, even if this +results in wasting much of the space on the preceding page and the last +page for the tuple.) + +GIN packs downlinks and pivot keys into internal page tuples in a different way +than nbtree does. Lehman & Yao define the page layout as follows. + +P_0, K_1, P_1, K_2, P_2, ... , K_n, P_n, K_{n+1} + +Here P_i is a downlink and K_i is a key. K_i splits the key space between P_{i-1} +and P_i (1 <= i <= n). K_{n+1} is the high key. + +In an internal page tuple, a key and a downlink are grouped together. nbtree packs +keys and downlinks into tuples as follows. + +(K_{n+1}, None), (-Inf, P_0), (K_1, P_1), ... , (K_n, P_n) + +Here tuples are shown in parentheses. So, the high key is stored separately. P_i +is grouped with K_i. P_0 is grouped with the -Inf key. + +GIN packs keys and downlinks into tuples in a different way. + +(P_0, K_1), (P_1, K_2), ... , (P_n, K_{n+1}) + +P_i is grouped with K_{i+1}. The -Inf key is not needed. + +There are a couple of additional notes regarding the K_{n+1} key. +1) In the entry tree's rightmost page, the key coupled with P_n doesn't really matter. +The high key is assumed to be infinity. +2) In a posting tree, the key coupled with P_n never matters. The high key for +non-rightmost pages is stored separately and accessed via +GinDataPageGetRightBound(). + +Posting tree +------------ + +If a posting list is too large to store in-line in a key entry, a posting tree +is created. A posting tree is a B-tree structure, where the ItemPointer is +used as the key. + +Internal posting tree pages use the standard PageHeader and the same "opaque" +struct as other GIN pages, but do not contain regular index tuples. 
Instead, +the contents of the page is an array of PostingItem structs. Each PostingItem +consists of the block number of the child page, and the right bound of that +child page, as an ItemPointer. The right bound of the page is stored right +after the page header, before the PostingItem array. + +Posting tree leaf pages also use the standard PageHeader and opaque struct, +and the right bound of the page is stored right after the page header, but +the page content comprises of a number of compressed posting lists. The +compressed posting lists are stored one after each other, between page header +and pd_lower. The space between pd_lower and pd_upper is unused, which allows +full-page images of posting tree leaf pages to skip the unused space in middle +(buffer_std = true in XLogRecData). + +The item pointers are stored in a number of independent compressed posting +lists (also called segments), instead of one big one, to make random access +to a given item pointer faster: to find an item in a compressed list, you +have to read the list from the beginning, but when the items are split into +multiple lists, you can first skip over to the list containing the item you're +looking for, and read only that segment. Also, an update only needs to +re-encode the affected segment. + +Posting List Compression +------------------------ + +To fit as many item pointers on a page as possible, posting tree leaf pages +and posting lists stored inline in entry tree leaf tuples use a lightweight +form of compression. We take advantage of the fact that the item pointers +are stored in sorted order. Instead of storing the block and offset number of +each item pointer separately, we store the difference from the previous item. +That in itself doesn't do much, but it allows us to use so-called varbyte +encoding to compress them. + +Varbyte encoding is a method to encode integers, allowing smaller numbers to +take less space at the cost of larger numbers. 
Each integer is represented by +variable number of bytes. High bit of each byte in varbyte encoding determines +whether the next byte is still part of this number. Therefore, to read a single +varbyte encoded number, you have to read bytes until you find a byte with the +high bit not set. + +When encoding, the block and offset number forming the item pointer are +combined into a single integer. The offset number is stored in the 11 low +bits (see MaxHeapTuplesPerPageBits in ginpostinglist.c), and the block number +is stored in the higher bits. That requires 43 bits in total, which +conveniently fits in at most 6 bytes. + +A compressed posting list is passed around and stored on disk in a +GinPostingList struct. The first item in the list is stored uncompressed +as a regular ItemPointerData, followed by the length of the list in bytes, +followed by the packed items. + +Concurrency +----------- + +The entry tree and each posting tree are B-trees, with right-links connecting +sibling pages at the same level. This is the same structure that is used in +the regular B-tree indexam (invented by Lehman & Yao), but we don't support +scanning a GIN trees backwards, so we don't need left-links. The entry tree +leaves don't have dedicated high keys, instead greatest leaf tuple serves as +high key. That works because tuples are never deleted from the entry tree. + +The algorithms used to operate entry and posting trees are considered below. + +### Locating the leaf page + +When we search for leaf page in GIN btree to perform a read, we descend from +the root page to the leaf through using downlinks taking pin and shared lock on +one page at once. So, we release pin and shared lock on previous page before +getting them on the next page. + +The picture below shows tree state after finding the leaf page. Lower case +letters depicts tree pages. 'S' depicts shared lock on the page. 
+ + a + / | \ + b c d + / | \ | \ | \ + eS f g h i j k + +### Stepping right + +Concurrent page splits move the keyspace to the right, so after following a +downlink, the page actually containing the key we're looking for might be +somewhere to the right of the page we landed on. In that case, we follow the +right-links until we find the page we're looking for. + +During stepping right we take pin and shared lock on the right sibling before +releasing them from the current page. This mechanism was designed to protect +against stepping onto a deleted page. We step to the right sibling while holding a lock on +the page whose rightlink points there. So, it's guaranteed that nobody can update the rightlink +concurrently, and therefore nobody can delete the right sibling. + +The picture below shows two pages locked at once during stepping right. + + a + / | \ + b c d + / | \ | \ | \ + eS fS g h i j k + +### Insert + +While finding the appropriate leaf for insertion we also descend from the root to +the leaf, share-locking one page at a time. But during insertion we don't +release pins from root and internal pages. That could save us some lookups to +the buffers hash table for downlinks insertion assuming parents are not changed +due to concurrent splits. Once we reach the leaf we re-lock the page in exclusive +mode. + +The picture below shows leaf page locked in exclusive mode and ready for +insertion. 'P' and 'E' depict pin and exclusive lock respectively. + + + aP + / | \ + b cP d + / | \ | \ | \ + e f g hE i j k + + +If insert causes a page split, the parent is locked in exclusive mode before +unlocking the left child. So, the insertion algorithm can exclusively lock both +parent and child pages at once, starting from the child. + +The picture below shows tree state after leaf page split. 'q' is the new page +produced by the split. Parent 'c' is about to have a downlink inserted. + + aP + / | \ + b cE d + / | \ / | \ | \ + e f g hE q i j k + + +### Page deletion + +Vacuum never deletes tuples or pages from the entry tree. 
It traverses entry +tree leafs in logical order by rightlinks and removes deletable TIDs from +posting lists. Posting trees are processed by links from entry tree leafs. They +are vacuumed in two stages. At the first stage, deletable TIDs are removed from +leafs. If the first stage detects at least one empty page, then at the second stage +ginScanToDelete() deletes empty pages. + +ginScanToDelete() traverses the whole tree in depth-first manner. It starts +from the super-exclusive lock on the tree root. This lock prevents all the +concurrent insertions into this tree while we're deleting pages. However, +there still might be some in-progress readers, who traversed root before +we locked it. + +The picture below shows tree state after page deletion algorithm traversed to +leftmost leaf of the tree. + + aE + / | \ + bE c d + / | \ | \ | \ + eE f g h i j k + +Deletion algorithm keeps exclusive locks on left siblings of pages comprising +currently investigated path. Thus, if current page is to be removed, all +required pages to remove both downlink and rightlink are already locked. That +avoids potential right to left page locking order, which could deadlock with +concurrent stepping right. + +A search concurrent to page deletion might already have read a pointer to the +page to be deleted, and might be just about to follow it. A page can be reached +via the right-link of its left sibling, or via its downlink in the parent. + +To prevent a backend from reaching a deleted page via a right-link, stepping +right algorithm doesn't release lock on the current page until lock of the +right page is acquired. + +The downlink is more tricky. A search descending the tree must release the lock +on the parent page before locking the child, or it could deadlock with a +concurrent split of the child page; a page split locks the parent, while already +holding a lock on the child page. So, deleted page cannot be reclaimed +immediately. 
Instead, we have to wait for every transaction that might still be about +to reference this page to finish. Corresponding processes must observe that +the page is marked deleted and recover accordingly. + +The picture below shows tree state after page deletion algorithm further +traversed the tree. Currently investigated path is 'a-c-h'. Left siblings 'b' +and 'g' of 'c' and 'h' respectively are also exclusively locked. + + aE + / | \ + bE cE d + / | \ | \ | \ + e f gE hE i j k + +The next picture shows tree state after page 'h' was deleted. It's marked with +'deleted' flag and newest xid, which might visit it. Downlink from 'c' to 'h' +is also deleted. + + aE + / | \ + bE cE d + / | \ \ | \ + e f gE hD iE j k + +However, it's still possible that a concurrent reader has seen the downlink from 'c' +to 'h' before we deleted it. In that case this reader will step right from 'h' +until it finds a non-deleted page. Xid-marking of page 'h' guarantees that this +page won't be reused until all such readers are gone. Next leaf page under +investigation is 'i'. 'g' remains locked as it becomes left sibling of 'i'. + +The next picture shows tree state after 'i' and 'c' were deleted. Internal page +'c' was deleted because it appeared to have no downlinks. The path under +investigation is 'a-d-j'. Pages 'b' and 'g' are locked as left siblings of 'd' +and 'j'. + + aE + / \ + bE cD dE + / | \ | \ + e f gE hD iD jE k + +During the replay of page deletion at standby, the page's left sibling, the +target page, and its parent, are locked in that order. This order guarantees +no deadlock with concurrent reads. + +Predicate Locking +----------------- + +GIN supports predicate locking, for serializable snapshot isolation. +Predicate locks represent that a scan has scanned a range of values. They +are not concerned with physical pages as such, but the logical key values. 
+A predicate lock on a page covers the key range that would belong on that +page, whether or not there are any matching tuples there currently. In other +words, a predicate lock on an index page covers the "gaps" between the index +tuples. To minimize false positives, predicate locks are acquired at the +finest level possible. + +* Like in the B-tree index, it is enough to lock only leaf pages, because all + insertions happen at the leaf level. + +* In an equality search (i.e. not a partial match search), if a key entry has + a posting tree, we lock the posting tree root page, to represent a lock on + just that key entry. Otherwise, we lock the entry tree page. We also lock + the entry tree page if no match is found, to lock the "gap" where the entry + would've been, had there been one. + +* In a partial match search, we lock all the entry leaf pages that we scan, + in addition to locks on posting tree roots, to represent the "gaps" between + values. + +* In addition to the locks on entry leaf pages and posting tree roots, all + scans grab a lock on the metapage. This is to interlock with insertions to + the fast update pending list. An insertion to the pending list can really + belong anywhere in the tree, and the lock on the metapage represents that. + +The interlock for fastupdate pending lists means that with fastupdate=on, +we effectively always grab a full-index lock, so you could get a lot of false +positives. + +Compatibility +------------- + +Compression of TIDs was introduced in 9.4. Some GIN indexes could remain in +uncompressed format because of pg_upgrade from 9.3 or earlier versions. +For compatibility, the old uncompressed format is also supported. The following +rules are used to handle it: + +* GIN_ITUP_COMPRESSED flag marks index tuples that contain a compressed posting list. +This flag is stored in the high bit of ItemPointerGetBlockNumber(&itup->t_tid). +Use GinItupIsCompressed(itup) to check the flag. 
+ +* Posting tree pages in the new format are marked with the GIN_COMPRESSED flag. + Macros GinPageIsCompressed(page) and GinPageSetCompressed(page) are used to + check and set this flag. + +* All scan operations check the format of the posting list and use the corresponding code +to read its content. + +* When updating an index tuple containing an uncompressed posting list, it +will be replaced with a new index tuple containing a compressed list. + +* When updating an uncompressed posting tree leaf page, it's compressed. + +* If vacuum finds some dead TIDs in uncompressed posting lists, they are +converted into compressed posting lists. This assumes that the compressed +posting list fits in the space occupied by the uncompressed list. IOW, we +assume that the compressed version of the page, with the dead items removed, +takes less space than the old uncompressed version. + +Limitations +----------- + + * Gin doesn't use scan->kill_prior_tuple & scan->ignore_killed_tuples + * Gin searches entries only by equality matching, or simple range + matching using the "partial match" feature. + +TODO +---- + +Nearest future: + + * Opclasses for more types (no programming, just many catalog changes) + +Distant future: + + * Replace B-tree of entries with something like GiST + +Authors +------- + +Original work was done by Teodor Sigaev (teodor@sigaev.ru) and Oleg Bartunov +(oleg@sai.msu.su). 
diff --git a/src/backend/access/gin/ginarrayproc.c b/src/backend/access/gin/ginarrayproc.c new file mode 100644 index 0000000..bf73e32 --- /dev/null +++ b/src/backend/access/gin/ginarrayproc.c @@ -0,0 +1,305 @@ +/*------------------------------------------------------------------------- + * + * ginarrayproc.c + * support functions for GIN's indexing of any array + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginarrayproc.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/gin.h" +#include "access/stratnum.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" + + +#define GinOverlapStrategy 1 +#define GinContainsStrategy 2 +#define GinContainedStrategy 3 +#define GinEqualStrategy 4 + + +/* + * extractValue support function + */ +Datum +ginarrayextract(PG_FUNCTION_ARGS) +{ + /* Make copy of array input to ensure it doesn't disappear while in use */ + ArrayType *array = PG_GETARG_ARRAYTYPE_P_COPY(0); + int32 *nkeys = (int32 *) PG_GETARG_POINTER(1); + bool **nullFlags = (bool **) PG_GETARG_POINTER(2); + int16 elmlen; + bool elmbyval; + char elmalign; + Datum *elems; + bool *nulls; + int nelems; + + get_typlenbyvalalign(ARR_ELEMTYPE(array), + &elmlen, &elmbyval, &elmalign); + + deconstruct_array(array, + ARR_ELEMTYPE(array), + elmlen, elmbyval, elmalign, + &elems, &nulls, &nelems); + + *nkeys = nelems; + *nullFlags = nulls; + + /* we should not free array, elems[i] points into it */ + PG_RETURN_POINTER(elems); +} + +/* + * Formerly, ginarrayextract had only two arguments. Now it has three, + * but we still need a pg_proc entry with two args to support reloading + * pre-9.1 contrib/intarray opclass declarations. This compatibility + * function should go away eventually. 
+ */ +Datum +ginarrayextract_2args(PG_FUNCTION_ARGS) +{ + if (PG_NARGS() < 3) /* should not happen */ + elog(ERROR, "ginarrayextract requires three arguments"); + return ginarrayextract(fcinfo); +} + +/* + * extractQuery support function + */ +Datum +ginqueryarrayextract(PG_FUNCTION_ARGS) +{ + /* Make copy of array input to ensure it doesn't disappear while in use */ + ArrayType *array = PG_GETARG_ARRAYTYPE_P_COPY(0); + int32 *nkeys = (int32 *) PG_GETARG_POINTER(1); + StrategyNumber strategy = PG_GETARG_UINT16(2); + + /* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */ + /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ + bool **nullFlags = (bool **) PG_GETARG_POINTER(5); + int32 *searchMode = (int32 *) PG_GETARG_POINTER(6); + int16 elmlen; + bool elmbyval; + char elmalign; + Datum *elems; + bool *nulls; + int nelems; + + get_typlenbyvalalign(ARR_ELEMTYPE(array), + &elmlen, &elmbyval, &elmalign); + + deconstruct_array(array, + ARR_ELEMTYPE(array), + elmlen, elmbyval, elmalign, + &elems, &nulls, &nelems); + + *nkeys = nelems; + *nullFlags = nulls; + + switch (strategy) + { + case GinOverlapStrategy: + *searchMode = GIN_SEARCH_MODE_DEFAULT; + break; + case GinContainsStrategy: + if (nelems > 0) + *searchMode = GIN_SEARCH_MODE_DEFAULT; + else /* everything contains the empty set */ + *searchMode = GIN_SEARCH_MODE_ALL; + break; + case GinContainedStrategy: + /* empty set is contained in everything */ + *searchMode = GIN_SEARCH_MODE_INCLUDE_EMPTY; + break; + case GinEqualStrategy: + if (nelems > 0) + *searchMode = GIN_SEARCH_MODE_DEFAULT; + else + *searchMode = GIN_SEARCH_MODE_INCLUDE_EMPTY; + break; + default: + elog(ERROR, "ginqueryarrayextract: unknown strategy number: %d", + strategy); + } + + /* we should not free array, elems[i] points into it */ + PG_RETURN_POINTER(elems); +} + +/* + * consistent support function + */ +Datum +ginarrayconsistent(PG_FUNCTION_ARGS) +{ + bool *check = (bool *) PG_GETARG_POINTER(0); + StrategyNumber strategy = 
PG_GETARG_UINT16(1); + + /* ArrayType *query = PG_GETARG_ARRAYTYPE_P(2); */ + int32 nkeys = PG_GETARG_INT32(3); + + /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ + bool *recheck = (bool *) PG_GETARG_POINTER(5); + + /* Datum *queryKeys = (Datum *) PG_GETARG_POINTER(6); */ + bool *nullFlags = (bool *) PG_GETARG_POINTER(7); + bool res; + int32 i; + + switch (strategy) + { + case GinOverlapStrategy: + /* result is not lossy */ + *recheck = false; + /* must have a match for at least one non-null element */ + res = false; + for (i = 0; i < nkeys; i++) + { + if (check[i] && !nullFlags[i]) + { + res = true; + break; + } + } + break; + case GinContainsStrategy: + /* result is not lossy */ + *recheck = false; + /* must have all elements in check[] true, and no nulls */ + res = true; + for (i = 0; i < nkeys; i++) + { + if (!check[i] || nullFlags[i]) + { + res = false; + break; + } + } + break; + case GinContainedStrategy: + /* we will need recheck */ + *recheck = true; + /* can't do anything else useful here */ + res = true; + break; + case GinEqualStrategy: + /* we will need recheck */ + *recheck = true; + + /* + * Must have all elements in check[] true; no discrimination + * against nulls here. This is because array_contain_compare and + * array_eq handle nulls differently ... 
+ */ + res = true; + for (i = 0; i < nkeys; i++) + { + if (!check[i]) + { + res = false; + break; + } + } + break; + default: + elog(ERROR, "ginarrayconsistent: unknown strategy number: %d", + strategy); + res = false; + } + + PG_RETURN_BOOL(res); +} + +/* + * triconsistent support function + */ +Datum +ginarraytriconsistent(PG_FUNCTION_ARGS) +{ + GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0); + StrategyNumber strategy = PG_GETARG_UINT16(1); + + /* ArrayType *query = PG_GETARG_ARRAYTYPE_P(2); */ + int32 nkeys = PG_GETARG_INT32(3); + + /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ + /* Datum *queryKeys = (Datum *) PG_GETARG_POINTER(5); */ + bool *nullFlags = (bool *) PG_GETARG_POINTER(6); + GinTernaryValue res; + int32 i; + + switch (strategy) + { + case GinOverlapStrategy: + /* must have a match for at least one non-null element */ + res = GIN_FALSE; + for (i = 0; i < nkeys; i++) + { + if (!nullFlags[i]) + { + if (check[i] == GIN_TRUE) + { + res = GIN_TRUE; + break; + } + else if (check[i] == GIN_MAYBE && res == GIN_FALSE) + { + res = GIN_MAYBE; + } + } + } + break; + case GinContainsStrategy: + /* must have all elements in check[] true, and no nulls */ + res = GIN_TRUE; + for (i = 0; i < nkeys; i++) + { + if (check[i] == GIN_FALSE || nullFlags[i]) + { + res = GIN_FALSE; + break; + } + if (check[i] == GIN_MAYBE) + { + res = GIN_MAYBE; + } + } + break; + case GinContainedStrategy: + /* can't do anything else useful here */ + res = GIN_MAYBE; + break; + case GinEqualStrategy: + + /* + * Must have all elements in check[] true; no discrimination + * against nulls here. This is because array_contain_compare and + * array_eq handle nulls differently ... 
+ */ + res = GIN_MAYBE; + for (i = 0; i < nkeys; i++) + { + if (check[i] == GIN_FALSE) + { + res = GIN_FALSE; + break; + } + } + break; + default: + elog(ERROR, "ginarrayconsistent: unknown strategy number: %d", + strategy); + res = false; + } + + PG_RETURN_GIN_TERNARY_VALUE(res); +} diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c new file mode 100644 index 0000000..482cf10 --- /dev/null +++ b/src/backend/access/gin/ginbtree.c @@ -0,0 +1,795 @@ +/*------------------------------------------------------------------------- + * + * ginbtree.c + * page utilities routines for the postgres inverted index access method. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginbtree.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "storage/predicate.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +static void ginFindParents(GinBtree btree, GinBtreeStack *stack); +static bool ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + Buffer childbuf, GinStatsData *buildStats); +static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack, + bool freestack, GinStatsData *buildStats); + +/* + * Lock buffer by needed method for search. 
+ */ +int +ginTraverseLock(Buffer buffer, bool searchMode) +{ + Page page; + int access = GIN_SHARE; /* lock mode we end up holding; this is the return value */ + + LockBuffer(buffer, GIN_SHARE); + page = BufferGetPage(buffer); + if (GinPageIsLeaf(page)) + { + if (searchMode == false) + { + /* we should relock our page: trade the share lock for an exclusive one */ + LockBuffer(buffer, GIN_UNLOCK); + LockBuffer(buffer, GIN_EXCLUSIVE); + + /* But root can become non-leaf during relock */ + if (!GinPageIsLeaf(page)) + { + /* restore old lock type (very rare) */ + LockBuffer(buffer, GIN_UNLOCK); + LockBuffer(buffer, GIN_SHARE); + } + else + access = GIN_EXCLUSIVE; + } + } + + return access; +} + +/* + * Descend the tree to the leaf page that contains or would contain the key + * we're searching for. The key should already be filled in 'btree', in + * tree-type specific manner. If btree->fullScan is true, descends to the + * leftmost leaf page. + * + * If 'searchmode' is false, on return stack->buffer is exclusively locked, + * and the stack represents the full path to the root. Otherwise stack->buffer + * is share-locked, and stack->parent is NULL. + * + * If 'rootConflictCheck' is true, tree root is checked for serialization + * conflict. + */ +GinBtreeStack * +ginFindLeafPage(GinBtree btree, bool searchMode, + bool rootConflictCheck, Snapshot snapshot) +{ + GinBtreeStack *stack; + + stack = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); + stack->blkno = btree->rootBlkno; + stack->buffer = ReadBuffer(btree->index, btree->rootBlkno); + stack->parent = NULL; /* the root entry has no parent */ + stack->predictNumber = 1; + + if (rootConflictCheck) + CheckForSerializableConflictIn(btree->index, NULL, btree->rootBlkno); + + for (;;) + { + Page page; + BlockNumber child; + int access; + + stack->off = InvalidOffsetNumber; + + page = BufferGetPage(stack->buffer); + TestForOldSnapshot(snapshot, btree->index, page); + + access = ginTraverseLock(stack->buffer, searchMode); /* lock mode actually taken on this page */ + + /* + * If we're going to modify the tree, finish any incomplete splits we + * encounter on the way. 
+ */ + if (!searchMode && GinPageIsIncompleteSplit(page)) + ginFinishSplit(btree, stack, false, NULL); + + /* + * ok, page is correctly locked, we should check to move right .., + * root never has a right link, so small optimization + */ + while (btree->fullScan == false && stack->blkno != btree->rootBlkno && + btree->isMoveRight(btree, page)) + { + BlockNumber rightlink = GinPageGetOpaque(page)->rightlink; + + if (rightlink == InvalidBlockNumber) + /* rightmost page */ + break; + + stack->buffer = ginStepRight(stack->buffer, btree->index, access); + stack->blkno = rightlink; + page = BufferGetPage(stack->buffer); + TestForOldSnapshot(snapshot, btree->index, page); + + if (!searchMode && GinPageIsIncompleteSplit(page)) + ginFinishSplit(btree, stack, false, NULL); + } + + if (GinPageIsLeaf(page)) /* we found, return locked page */ + return stack; + + /* now we have correct buffer, try to find child */ + child = btree->findChildPage(btree, stack); + + LockBuffer(stack->buffer, GIN_UNLOCK); + Assert(child != InvalidBlockNumber); + Assert(stack->blkno != child); + + if (searchMode) + { + /* in search mode we may forget path to leaf: reuse the same stack entry */ + stack->blkno = child; + stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno); + } + else + { + GinBtreeStack *ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); + + ptr->parent = stack; /* push a new entry so the path back to the root is kept */ + stack = ptr; + stack->blkno = child; + stack->buffer = ReadBuffer(btree->index, stack->blkno); + stack->predictNumber = 1; + } + } +} + +/* + * Step right from current page. + * + * The next page is locked first, before releasing the current page. This is + * crucial to protect from concurrent page deletion (see comment in + * ginDeletePage). 
+ */ +Buffer +ginStepRight(Buffer buffer, Relation index, int lockmode) +{ + Buffer nextbuffer; + Page page = BufferGetPage(buffer); + bool isLeaf = GinPageIsLeaf(page); + bool isData = GinPageIsData(page); + BlockNumber blkno = GinPageGetOpaque(page)->rightlink; + + nextbuffer = ReadBuffer(index, blkno); + LockBuffer(nextbuffer, lockmode); /* take the right sibling's lock first ... */ + UnlockReleaseBuffer(buffer); /* ... and only then let go of the page we came from */ + + /* Sanity check that the page we stepped to is of similar kind. */ + page = BufferGetPage(nextbuffer); + if (isLeaf != GinPageIsLeaf(page) || isData != GinPageIsData(page)) + elog(ERROR, "right sibling of GIN page is of different type"); + + return nextbuffer; +} + +void +freeGinBtreeStack(GinBtreeStack *stack) +{ + while (stack) /* walk from the given entry up through its ->parent chain */ + { + GinBtreeStack *tmp = stack->parent; + + if (stack->buffer != InvalidBuffer) /* entries without a valid buffer have no pin to drop */ + ReleaseBuffer(stack->buffer); + + pfree(stack); + stack = tmp; + } +} + +/* + * Try to find parent for current stack position. Returns correct parent and + * child's offset in stack->parent. The root page is never released, to + * prevent conflict with vacuum process. + */ +static void +ginFindParents(GinBtree btree, GinBtreeStack *stack) +{ + Page page; + Buffer buffer; + BlockNumber blkno, + leftmostBlkno; + OffsetNumber offset; + GinBtreeStack *root; + GinBtreeStack *ptr; + + /* + * Unwind the stack all the way up to the root, leaving only the root + * item. + * + * Be careful not to release the pin on the root page! The pin on root + * page is required to lock out concurrent vacuums on the tree. 
+ */ + root = stack->parent; + while (root->parent) + { + ReleaseBuffer(root->buffer); + root = root->parent; + } + + Assert(root->blkno == btree->rootBlkno); + Assert(BufferGetBlockNumber(root->buffer) == btree->rootBlkno); + root->off = InvalidOffsetNumber; + + blkno = root->blkno; + buffer = root->buffer; + + ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); /* becomes the new stack->parent once the downlink is found */ + + for (;;) + { + LockBuffer(buffer, GIN_EXCLUSIVE); + page = BufferGetPage(buffer); + if (GinPageIsLeaf(page)) + elog(ERROR, "Lost path"); + + if (GinPageIsIncompleteSplit(page)) + { + Assert(blkno != btree->rootBlkno); + ptr->blkno = blkno; + ptr->buffer = buffer; + + /* + * parent may be wrong, but if so, the ginFinishSplit call will + * recurse to call ginFindParents again to fix it. + */ + ptr->parent = root; + ptr->off = InvalidOffsetNumber; + + ginFinishSplit(btree, ptr, false, NULL); + } + + leftmostBlkno = btree->getLeftMostChild(btree, page); + + /* scan this level left to right for the page holding the downlink to stack->blkno */ + while ((offset = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber) + { + blkno = GinPageGetOpaque(page)->rightlink; + if (blkno == InvalidBlockNumber) + { + UnlockReleaseBuffer(buffer); + break; + } + buffer = ginStepRight(buffer, btree->index, GIN_EXCLUSIVE); + page = BufferGetPage(buffer); + + /* finish any incomplete splits, as above */ + if (GinPageIsIncompleteSplit(page)) + { + Assert(blkno != btree->rootBlkno); + ptr->blkno = blkno; + ptr->buffer = buffer; + ptr->parent = root; + ptr->off = InvalidOffsetNumber; + + ginFinishSplit(btree, ptr, false, NULL); + } + } + + if (blkno != InvalidBlockNumber) + { + ptr->blkno = blkno; + ptr->buffer = buffer; + ptr->parent = root; /* it may be wrong, but in next call we will + * correct */ + ptr->off = offset; + stack->parent = ptr; + return; + } + + /* Descend down to next level */ + blkno = leftmostBlkno; + buffer = ReadBuffer(btree->index, blkno); + } +} + +/* + * Insert a new item to a page. + * + * Returns true if the insertion was finished. 
On false, the page was split and + * the parent needs to be updated. (A root split returns true as it doesn't + * need any further action by the caller to complete.) + * + * When inserting a downlink to an internal page, 'childbuf' contains the + * child page that was split. Its GIN_INCOMPLETE_SPLIT flag will be cleared + * atomically with the insert. Also, the existing item at offset stack->off + * in the target page is updated to point to updateblkno. + * + * stack->buffer is locked on entry, and is kept locked. + * Likewise for childbuf, if given. + */ +static bool +ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + Buffer childbuf, GinStatsData *buildStats) +{ + Page page = BufferGetPage(stack->buffer); + bool result; + GinPlaceToPageRC rc; + uint16 xlflags = 0; /* GIN_INSERT_ISDATA / GIN_INSERT_ISLEAF bits copied into the WAL record */ + Page childpage = NULL; + Page newlpage = NULL, + newrpage = NULL; + void *ptp_workspace = NULL; + MemoryContext tmpCxt; + MemoryContext oldCxt; + + /* + * We do all the work of this function and its subfunctions in a temporary + * memory context. This avoids leakages and simplifies APIs, since some + * subfunctions allocate storage that has to survive until we've finished + * the WAL insertion. + */ + tmpCxt = AllocSetContextCreate(CurrentMemoryContext, + "ginPlaceToPage temporary context", + ALLOCSET_DEFAULT_SIZES); + oldCxt = MemoryContextSwitchTo(tmpCxt); + + if (GinPageIsData(page)) + xlflags |= GIN_INSERT_ISDATA; + if (GinPageIsLeaf(page)) + { + xlflags |= GIN_INSERT_ISLEAF; + Assert(!BufferIsValid(childbuf)); + Assert(updateblkno == InvalidBlockNumber); + } + else + { + Assert(BufferIsValid(childbuf)); + Assert(updateblkno != InvalidBlockNumber); + childpage = BufferGetPage(childbuf); + } + + /* + * See if the incoming tuple will fit on the page. beginPlaceToPage will + * decide if the page needs to be split, and will compute the split + * contents if so. 
See comments for beginPlaceToPage and execPlaceToPage + * functions for more details of the API here. + */ + rc = btree->beginPlaceToPage(btree, stack->buffer, stack, + insertdata, updateblkno, + &ptp_workspace, + &newlpage, &newrpage); + + if (rc == GPTP_NO_WORK) + { + /* Nothing to do */ + result = true; + } + else if (rc == GPTP_INSERT) + { + /* It will fit, perform the insertion */ + START_CRIT_SECTION(); + + if (RelationNeedsWAL(btree->index) && !btree->isBuild) + { + XLogBeginInsert(); + XLogRegisterBuffer(0, stack->buffer, REGBUF_STANDARD); + if (BufferIsValid(childbuf)) + XLogRegisterBuffer(1, childbuf, REGBUF_STANDARD); + } + + /* Perform the page update, and register any extra WAL data */ + btree->execPlaceToPage(btree, stack->buffer, stack, + insertdata, updateblkno, ptp_workspace); + + MarkBufferDirty(stack->buffer); + + /* An insert to an internal page finishes the split of the child. */ + if (BufferIsValid(childbuf)) + { + GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; + MarkBufferDirty(childbuf); + } + + if (RelationNeedsWAL(btree->index) && !btree->isBuild) + { + XLogRecPtr recptr; + ginxlogInsert xlrec; + BlockIdData childblknos[2]; + + xlrec.flags = xlflags; + + XLogRegisterData((char *) &xlrec, sizeof(ginxlogInsert)); + + /* + * Log information about child if this was an insertion of a + * downlink. + */ + if (BufferIsValid(childbuf)) + { + BlockIdSet(&childblknos[0], BufferGetBlockNumber(childbuf)); + BlockIdSet(&childblknos[1], GinPageGetOpaque(childpage)->rightlink); + XLogRegisterData((char *) childblknos, + sizeof(BlockIdData) * 2); + } + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT); + PageSetLSN(page, recptr); + if (BufferIsValid(childbuf)) + PageSetLSN(childpage, recptr); + } + + END_CRIT_SECTION(); + + /* Insertion is complete. */ + result = true; + } + else if (rc == GPTP_SPLIT) + { + /* + * Didn't fit, need to split. 
The split has been computed in newlpage + * and newrpage, which are pointers to palloc'd pages, not associated + * with buffers. stack->buffer is not touched yet. + */ + Buffer rbuffer; + BlockNumber savedRightLink; + ginxlogSplit data; + Buffer lbuffer = InvalidBuffer; + Page newrootpg = NULL; + + /* Get a new index page to become the right page */ + rbuffer = GinNewBuffer(btree->index); + + /* During index build, count the new page */ + if (buildStats) + { + if (btree->isData) + buildStats->nDataPages++; + else + buildStats->nEntryPages++; + } + + savedRightLink = GinPageGetOpaque(page)->rightlink; /* right sibling of the page as it was before the split */ + + /* Begin setting up WAL record */ + data.node = btree->index->rd_node; + data.flags = xlflags; + if (BufferIsValid(childbuf)) + { + data.leftChildBlkno = BufferGetBlockNumber(childbuf); + data.rightChildBlkno = GinPageGetOpaque(childpage)->rightlink; + } + else + data.leftChildBlkno = data.rightChildBlkno = InvalidBlockNumber; + + if (stack->parent == NULL) + { + /* + * splitting the root, so we need to allocate new left page and + * place pointers to left and right page on root page. + */ + lbuffer = GinNewBuffer(btree->index); + + /* During index build, count the new left page */ + if (buildStats) + { + if (btree->isData) + buildStats->nDataPages++; + else + buildStats->nEntryPages++; + } + + data.rrlink = InvalidBlockNumber; + data.flags |= GIN_SPLIT_ROOT; + + GinPageGetOpaque(newrpage)->rightlink = InvalidBlockNumber; + GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); + + /* + * Construct a new root page containing downlinks to the new left + * and right pages. (Do this in a temporary copy rather than + * overwriting the original page directly, since we're not in the + * critical section yet.) 
+ */ + newrootpg = PageGetTempPage(newrpage); + GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ); + + btree->fillRoot(btree, newrootpg, + BufferGetBlockNumber(lbuffer), newlpage, + BufferGetBlockNumber(rbuffer), newrpage); + + if (GinPageIsLeaf(BufferGetPage(stack->buffer))) + { + + PredicateLockPageSplit(btree->index, + BufferGetBlockNumber(stack->buffer), + BufferGetBlockNumber(lbuffer)); + + PredicateLockPageSplit(btree->index, + BufferGetBlockNumber(stack->buffer), + BufferGetBlockNumber(rbuffer)); + } + + } + else + { + /* splitting a non-root page */ + data.rrlink = savedRightLink; + + GinPageGetOpaque(newrpage)->rightlink = savedRightLink; + GinPageGetOpaque(newlpage)->flags |= GIN_INCOMPLETE_SPLIT; + GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); + + if (GinPageIsLeaf(BufferGetPage(stack->buffer))) + { + + PredicateLockPageSplit(btree->index, + BufferGetBlockNumber(stack->buffer), + BufferGetBlockNumber(rbuffer)); + } + } + + /* + * OK, we have the new contents of the left page in a temporary copy + * now (newlpage), and likewise for the new contents of the + * newly-allocated right block. The original page is still unchanged. + * + * If this is a root split, we also have a temporary page containing + * the new contents of the root. + */ + + START_CRIT_SECTION(); + + MarkBufferDirty(rbuffer); + MarkBufferDirty(stack->buffer); + + /* + * Restore the temporary copies over the real buffers. 
+ */ + if (stack->parent == NULL) + { + /* Splitting the root, three pages to update */ + MarkBufferDirty(lbuffer); + memcpy(page, newrootpg, BLCKSZ); + memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ); + memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); + } + else + { + /* Normal split, only two pages to update */ + memcpy(page, newlpage, BLCKSZ); + memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); + } + + /* We also clear childbuf's INCOMPLETE_SPLIT flag, if passed */ + if (BufferIsValid(childbuf)) + { + GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; + MarkBufferDirty(childbuf); + } + + /* write WAL record */ + if (RelationNeedsWAL(btree->index) && !btree->isBuild) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + + /* + * We just take full page images of all the split pages. Splits + * are uncommon enough that it's not worth complicating the code + * to be more efficient. + */ + if (stack->parent == NULL) + { + XLogRegisterBuffer(0, lbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + XLogRegisterBuffer(2, stack->buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + } + else + { + XLogRegisterBuffer(0, stack->buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + } + if (BufferIsValid(childbuf)) + XLogRegisterBuffer(3, childbuf, REGBUF_STANDARD); + + XLogRegisterData((char *) &data, sizeof(ginxlogSplit)); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT); + + PageSetLSN(page, recptr); + PageSetLSN(BufferGetPage(rbuffer), recptr); + if (stack->parent == NULL) + PageSetLSN(BufferGetPage(lbuffer), recptr); + if (BufferIsValid(childbuf)) + PageSetLSN(childpage, recptr); + } + END_CRIT_SECTION(); + + /* + * We can release the locks/pins on the new pages now, but keep + * stack->buffer locked. childbuf doesn't get unlocked either. 
+ */ + UnlockReleaseBuffer(rbuffer); + if (stack->parent == NULL) + UnlockReleaseBuffer(lbuffer); + + /* + * If we split the root, we're done. Otherwise the split is not + * complete until the downlink for the new page has been inserted to + * the parent. + */ + result = (stack->parent == NULL); + } + else + { + elog(ERROR, "invalid return code from GIN beginPlaceToPage method: %d", rc); + result = false; /* keep compiler quiet */ + } + + /* Clean up temp context */ + MemoryContextSwitchTo(oldCxt); + MemoryContextDelete(tmpCxt); + + return result; +} + +/* + * Finish a split by inserting the downlink for the new page to parent. + * + * On entry, stack->buffer is exclusively locked. + * + * If freestack is true, all the buffers are released and unlocked as we + * crawl up the tree, and 'stack' is freed. Otherwise stack->buffer is kept + * locked, and stack is unmodified, except for possibly moving right to find + * the correct parent of page. + */ +static void +ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack, + GinStatsData *buildStats) +{ + Page page; + bool done; + bool first = true; /* true while 'stack' is still the entry we were called with */ + + /* + * freestack == false when we encounter an incompletely split page during + * a scan, while freestack == true is used in the normal scenario that a + * split is finished right after the initial insert. + */ + if (!freestack) + elog(DEBUG1, "finishing incomplete split of block %u in gin index \"%s\"", + stack->blkno, RelationGetRelationName(btree->index)); + + /* this loop crawls up the stack until the insertion is complete */ + do + { + GinBtreeStack *parent = stack->parent; + void *insertdata; + BlockNumber updateblkno; + + /* search parent to lock */ + LockBuffer(parent->buffer, GIN_EXCLUSIVE); + + /* + * If the parent page was incompletely split, finish that split first, + * then continue with the current one. + * + * Note: we have to finish *all* incomplete splits we encounter, even + * if we have to move right. 
Otherwise we might choose as the target a + * page that has no downlink in the parent, and splitting it further + * would fail. + */ + if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer))) + ginFinishSplit(btree, parent, false, buildStats); + + /* move right if it's needed */ + page = BufferGetPage(parent->buffer); + while ((parent->off = btree->findChildPtr(btree, page, stack->blkno, parent->off)) == InvalidOffsetNumber) + { + if (GinPageRightMost(page)) + { + /* + * rightmost page, but we don't find parent, we should use + * plain search... + */ + LockBuffer(parent->buffer, GIN_UNLOCK); + ginFindParents(btree, stack); + parent = stack->parent; + Assert(parent != NULL); + break; + } + + parent->buffer = ginStepRight(parent->buffer, btree->index, GIN_EXCLUSIVE); + parent->blkno = BufferGetBlockNumber(parent->buffer); + page = BufferGetPage(parent->buffer); + + if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer))) + ginFinishSplit(btree, parent, false, buildStats); + } + + /* insert the downlink */ + insertdata = btree->prepareDownlink(btree, stack->buffer); + updateblkno = GinPageGetOpaque(BufferGetPage(stack->buffer))->rightlink; + done = ginPlaceToPage(btree, parent, + insertdata, updateblkno, + stack->buffer, buildStats); + pfree(insertdata); + + /* + * If the caller requested to free the stack, unlock and release the + * child buffer now. Otherwise keep it pinned and locked, but if we + * have to recurse up the tree, we can unlock the upper pages, only + * keeping the page at the bottom of the stack locked. + */ + if (!first || freestack) + LockBuffer(stack->buffer, GIN_UNLOCK); + if (freestack) + { + ReleaseBuffer(stack->buffer); + pfree(stack); + } + stack = parent; /* continue one level up if the parent had to be split too */ + + first = false; + } while (!done); + + /* unlock the parent */ + LockBuffer(stack->buffer, GIN_UNLOCK); + + if (freestack) + freeGinBtreeStack(stack); +} + +/* + * Insert a value to tree described by stack. + * + * The value to be inserted is given in 'insertdata'. 
Its format depends + * on whether this is an entry or data tree, ginInsertValue just passes it + * through to the tree-specific callback function. + * + * During an index build, buildStats is non-null and the counters it contains + * are incremented as needed. + * + * NB: the passed-in stack is freed, as though by freeGinBtreeStack. + */ +void +ginInsertValue(GinBtree btree, GinBtreeStack *stack, void *insertdata, + GinStatsData *buildStats) +{ + bool done; + + /* If the leaf page was incompletely split, finish the split first */ + if (GinPageIsIncompleteSplit(BufferGetPage(stack->buffer))) + ginFinishSplit(btree, stack, false, buildStats); + + done = ginPlaceToPage(btree, stack, + insertdata, InvalidBlockNumber, + InvalidBuffer, buildStats); + if (done) + { + LockBuffer(stack->buffer, GIN_UNLOCK); + freeGinBtreeStack(stack); + } + else + ginFinishSplit(btree, stack, true, buildStats); /* freestack = true: the stack is released and freed there */ +} diff --git a/src/backend/access/gin/ginbulk.c b/src/backend/access/gin/ginbulk.c new file mode 100644 index 0000000..4c5067c --- /dev/null +++ b/src/backend/access/gin/ginbulk.c @@ -0,0 +1,293 @@ +/*------------------------------------------------------------------------- + * + * ginbulk.c + * routines for fast build of inverted index + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginbulk.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <limits.h> + +#include "access/gin_private.h" +#include "utils/datum.h" +#include "utils/memutils.h" + + +#define DEF_NENTRY 2048 /* GinEntryAccumulator allocation quantum */ +#define DEF_NPTR 5 /* ItemPointer initial allocation quantum */ + + +/* Combiner function for rbtree.c */ +static void +ginCombineData(RBTNode *existing, const RBTNode *newdata, void *arg) +{ + GinEntryAccumulator *eo = (GinEntryAccumulator *) 
existing; + const GinEntryAccumulator *en = (const GinEntryAccumulator *) newdata; + BuildAccumulator *accum = (BuildAccumulator *) arg; + + /* + * Note this code assumes that newdata contains only one itempointer. + */ + if (eo->count >= eo->maxcount) + { + if (eo->maxcount > INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("posting list is too long"), + errhint("Reduce maintenance_work_mem."))); + + /* swap the old chunk's size for the new one in the memory accounting */ + accum->allocatedMemory -= GetMemoryChunkSpace(eo->list); + eo->maxcount *= 2; /* the TID array is full: double its capacity */ + eo->list = (ItemPointerData *) + repalloc_huge(eo->list, sizeof(ItemPointerData) * eo->maxcount); + accum->allocatedMemory += GetMemoryChunkSpace(eo->list); + } + + /* If item pointers are not ordered, they will need to be sorted later */ + if (eo->shouldSort == false) + { + int res; + + res = ginCompareItemPointers(eo->list + eo->count - 1, en->list); + Assert(res != 0); + + if (res > 0) + eo->shouldSort = true; + } + + eo->list[eo->count] = en->list[0]; + eo->count++; +} + +/* Comparator function for rbtree.c */ +static int +cmpEntryAccumulator(const RBTNode *a, const RBTNode *b, void *arg) +{ + const GinEntryAccumulator *ea = (const GinEntryAccumulator *) a; + const GinEntryAccumulator *eb = (const GinEntryAccumulator *) b; + BuildAccumulator *accum = (BuildAccumulator *) arg; + + return ginCompareAttEntries(accum->ginstate, + ea->attnum, ea->key, ea->category, + eb->attnum, eb->key, eb->category); +} + +/* Allocator function for rbtree.c */ +static RBTNode * +ginAllocEntryAccumulator(void *arg) +{ + BuildAccumulator *accum = (BuildAccumulator *) arg; + GinEntryAccumulator *ea; + + /* + * Allocate memory by rather big chunks to decrease overhead. We have no + * need to reclaim RBTNodes individually, so this costs nothing. 
+ */ + if (accum->entryallocator == NULL || accum->eas_used >= DEF_NENTRY) + { + accum->entryallocator = palloc(sizeof(GinEntryAccumulator) * DEF_NENTRY); + accum->allocatedMemory += GetMemoryChunkSpace(accum->entryallocator); + accum->eas_used = 0; /* start handing out entries from the fresh chunk */ + } + + /* Allocate new RBTNode from current chunk */ + ea = accum->entryallocator + accum->eas_used; + accum->eas_used++; + + return (RBTNode *) ea; +} + +void +ginInitBA(BuildAccumulator *accum) +{ + /* accum->ginstate is intentionally not set here */ + accum->allocatedMemory = 0; + accum->entryallocator = NULL; + accum->eas_used = 0; + /* the accumulator itself is passed as the callbacks' 'arg' */ + accum->tree = rbt_create(sizeof(GinEntryAccumulator), + cmpEntryAccumulator, + ginCombineData, + ginAllocEntryAccumulator, + NULL, /* no freefunc needed */ + (void *) accum); +} + +/* + * This is basically the same as datumCopy(), but extended to count + * palloc'd space in accum->allocatedMemory. + */ +static Datum +getDatumCopy(BuildAccumulator *accum, OffsetNumber attnum, Datum value) +{ + Form_pg_attribute att; + Datum res; + + att = TupleDescAttr(accum->ginstate->origTupdesc, attnum - 1); + if (att->attbyval) + res = value; /* by-value datum: nothing is allocated, so nothing to count */ + else + { + res = datumCopy(value, false, att->attlen); + accum->allocatedMemory += GetMemoryChunkSpace(DatumGetPointer(res)); + } + return res; +} + +/* + * Find/store one entry from indexed value. + */ +static void +ginInsertBAEntry(BuildAccumulator *accum, + ItemPointer heapptr, OffsetNumber attnum, + Datum key, GinNullCategory category) +{ + GinEntryAccumulator eatmp; + GinEntryAccumulator *ea; + bool isNew; + + /* + * For the moment, fill only the fields of eatmp that will be looked at by + * cmpEntryAccumulator or ginCombineData. 
+ */ + eatmp.attnum = attnum; + eatmp.key = key; + eatmp.category = category; + /* temporarily set up single-entry itempointer list */ + eatmp.list = heapptr; + + ea = (GinEntryAccumulator *) rbt_insert(accum->tree, (RBTNode *) &eatmp, + &isNew); + + if (isNew) + { + /* + * Finish initializing new tree entry, including making permanent + * copies of the datum (if it's not null) and itempointer. + */ + if (category == GIN_CAT_NORM_KEY) + ea->key = getDatumCopy(accum, attnum, key); + ea->maxcount = DEF_NPTR; + ea->count = 1; + ea->shouldSort = false; + ea->list = + (ItemPointerData *) palloc(sizeof(ItemPointerData) * DEF_NPTR); + ea->list[0] = *heapptr; + accum->allocatedMemory += GetMemoryChunkSpace(ea->list); + } + else + { + /* + * ginCombineData did everything needed. + */ + } +} + +/* + * Insert the entries for one heap pointer. + * + * Since the entries are being inserted into a balanced binary tree, you + * might think that the order of insertion wouldn't be critical, but it turns + * out that inserting the entries in sorted order results in a lot of + * rebalancing operations and is slow. To prevent this, we attempt to insert + * the nodes in an order that will produce a nearly-balanced tree if the input + * is in fact sorted. + * + * We do this as follows. First, we imagine that we have an array whose size + * is the smallest power of two greater than or equal to the actual array + * size. Second, we insert the middle entry of our virtual array into the + * tree; then, we insert the middles of each half of our virtual array, then + * middles of quarters, etc. 
+ */ +void +ginInsertBAEntries(BuildAccumulator *accum, + ItemPointer heapptr, OffsetNumber attnum, + Datum *entries, GinNullCategory *categories, + int32 nentries) +{ + uint32 step = nentries; + + if (nentries <= 0) + return; + + Assert(ItemPointerIsValid(heapptr) && attnum >= FirstOffsetNumber); + + /* + * step will contain largest power of 2 and <= nentries + */ + step |= (step >> 1); + step |= (step >> 2); + step |= (step >> 4); + step |= (step >> 8); + step |= (step >> 16); + step >>= 1; + step++; + + while (step > 0) + { + int i; + + for (i = step - 1; i < nentries && i >= 0; i += step << 1 /* *2 */ ) + ginInsertBAEntry(accum, heapptr, attnum, + entries[i], categories[i]); + + step >>= 1; /* /2 */ + } +} + +static int +qsortCompareItemPointers(const void *a, const void *b) +{ + int res = ginCompareItemPointers((ItemPointer) a, (ItemPointer) b); + + /* Assert that there are no equal item pointers being sorted */ + Assert(res != 0); + return res; +} + +/* Prepare to read out the rbtree contents using ginGetBAEntry */ +void +ginBeginBAScan(BuildAccumulator *accum) +{ + rbt_begin_iterate(accum->tree, LeftRightWalk, &accum->tree_walk); +} + +/* + * Get the next entry in sequence from the BuildAccumulator's rbtree. + * This consists of a single key datum and a list (array) of one or more + * heap TIDs in which that key is found. The list is guaranteed sorted. 
+ */ +ItemPointerData * +ginGetBAEntry(BuildAccumulator *accum, + OffsetNumber *attnum, Datum *key, GinNullCategory *category, + uint32 *n) +{ + GinEntryAccumulator *ea = (GinEntryAccumulator *) rbt_iterate(&accum->tree_walk); + + /* the tree walk is exhausted */ + if (ea == NULL) + return NULL; /* no more entries */ + + /* report which key this is and how many TIDs belong to it */ + *attnum = ea->attnum; + *key = ea->key; + *category = ea->category; + *n = ea->count; + + Assert(ea->list != NULL && ea->count > 0); + + /* sort only when ginCombineData saw an out-of-order insertion */ + if (ea->shouldSort && ea->count > 1) + qsort(ea->list, ea->count, sizeof(ItemPointerData), + qsortCompareItemPointers); + + return ea->list; +} diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c new file mode 100644 index 0000000..06c0586 --- /dev/null +++ b/src/backend/access/gin/gindatapage.c @@ -0,0 +1,1942 @@ +/*------------------------------------------------------------------------- + * + * gindatapage.c + * routines for handling GIN posting tree pages. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/gindatapage.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/xloginsert.h" +#include "lib/ilist.h" +#include "miscadmin.h" +#include "storage/predicate.h" +#include "utils/rel.h" + +/* + * Min, Max and Target size of posting lists stored on leaf pages, in bytes. + * + * The code can deal with any size, but random access is more efficient when + * a number of smaller lists are stored, rather than one big list. If a + * posting list would become larger than Max size as a result of insertions, + * it is split into two. If a posting list would be smaller than minimum + * size, it is merged with the next posting list. 
+ */ +#define GinPostingListSegmentMaxSize 384 +#define GinPostingListSegmentTargetSize 256 +#define GinPostingListSegmentMinSize 128 + +/* + * At least this many items fit in a GinPostingListSegmentMaxSize-bytes + * long segment. This is used when estimating how much space is required + * for N items, at minimum. + */ +#define MinTuplesPerSegment ((GinPostingListSegmentMaxSize - 2) / 6) + +/* + * A working struct for manipulating a posting tree leaf page. + */ +typedef struct +{ + dlist_head segments; /* a list of leafSegmentInfos */ + + /* + * The following fields represent how the segments are split across pages, + * if a page split is required. Filled in by leafRepackItems. + */ + dlist_node *lastleft; /* last segment on left page */ + int lsize; /* total size on left page */ + int rsize; /* total size on right page */ + + bool oldformat; /* page is in pre-9.4 format on disk */ + + /* + * If we need WAL data representing the reconstructed leaf page, it's + * stored here by computeLeafRecompressWALData. + */ + char *walinfo; /* buffer start */ + int walinfolen; /* and length */ +} disassembledLeaf; + +typedef struct +{ + dlist_node node; /* linked list pointers */ + + /*------------- + * 'action' indicates the status of this in-memory segment, compared to + * what's on disk. It is one of the GIN_SEGMENT_* action codes: + * + * UNMODIFIED no changes + * DELETE the segment is to be removed. 'seg' and 'items' are + * ignored + * INSERT this is a completely new segment + * REPLACE this replaces an existing segment with new content + * ADDITEMS like REPLACE, but no items have been removed, and we track + * in detail what items have been added to this segment, in + * 'modifieditems' + *------------- + */ + char action; + + ItemPointerData *modifieditems; + uint16 nmodifieditems; + + /* + * The following fields represent the items in this segment. If 'items' is + * not NULL, it contains a palloc'd array of the items in this segment. 
If + * 'seg' is not NULL, it contains the items in an already-compressed + * format. It can point to an on-disk page (!modified), or a palloc'd + * segment in memory. If both are set, they must represent the same items. + */ + GinPostingList *seg; + ItemPointer items; + int nitems; /* # of items in 'items', if items != NULL */ +} leafSegmentInfo; + +static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems); +static void dataSplitPageInternal(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + Page *newlpage, Page *newrpage); + +static disassembledLeaf *disassembleLeaf(Page page); +static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining); +static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, + int nNewItems); + +static void computeLeafRecompressWALData(disassembledLeaf *leaf); +static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf); +static void dataPlaceToPageLeafSplit(disassembledLeaf *leaf, + ItemPointerData lbound, ItemPointerData rbound, + Page lpage, Page rpage); + +/* + * Read TIDs from leaf data page to single uncompressed array. The TIDs are + * returned in ascending order. + * + * advancePast is a hint, indicating that the caller is only interested in + * TIDs > advancePast. To return all items, use ItemPointerSetMin. + * + * Note: This function can still return items smaller than advancePast that + * are in the same posting list as the items of interest, so the caller must + * still check all the returned items. But passing it allows this function to + * skip whole posting lists. 
+ */ +ItemPointer +GinDataLeafPageGetItems(Page page, int *nitems, ItemPointerData advancePast) +{ + ItemPointer result; + + if (GinPageIsCompressed(page)) + { + GinPostingList *seg = GinDataLeafPageGetPostingList(page); + Size len = GinDataLeafPageGetPostingListSize(page); + Pointer endptr = ((Pointer) seg) + len; + GinPostingList *next; + + /* Skip to the segment containing advancePast+1 */ + if (ItemPointerIsValid(&advancePast)) + { + next = GinNextPostingListSegment(seg); + while ((Pointer) next < endptr && + ginCompareItemPointers(&next->first, &advancePast) <= 0) + { + seg = next; + next = GinNextPostingListSegment(seg); + } + len = endptr - (Pointer) seg; + } + + if (len > 0) + result = ginPostingListDecodeAllSegments(seg, len, nitems); + else + { + result = NULL; + *nitems = 0; + } + } + else + { + ItemPointer tmp = dataLeafPageGetUncompressed(page, nitems); + + result = palloc((*nitems) * sizeof(ItemPointerData)); + memcpy(result, tmp, (*nitems) * sizeof(ItemPointerData)); + } + + return result; +} + +/* + * Places all TIDs from leaf data page to bitmap. + */ +int +GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm) +{ + ItemPointer uncompressed; + int nitems; + + if (GinPageIsCompressed(page)) + { + GinPostingList *segment = GinDataLeafPageGetPostingList(page); + Size len = GinDataLeafPageGetPostingListSize(page); + + nitems = ginPostingListDecodeAllSegmentsToTbm(segment, len, tbm); + } + else + { + uncompressed = dataLeafPageGetUncompressed(page, &nitems); + + if (nitems > 0) + tbm_add_tuples(tbm, uncompressed, nitems, false); + } + + return nitems; +} + +/* + * Get pointer to the uncompressed array of items on a pre-9.4 format + * uncompressed leaf page. The number of items in the array is returned in + * *nitems. 
+ */ +static ItemPointer +dataLeafPageGetUncompressed(Page page, int *nitems) +{ + ItemPointer items; + + Assert(!GinPageIsCompressed(page)); + + /* + * In the old pre-9.4 page format, the whole page content is used for + * uncompressed items, and the number of items is stored in 'maxoff' + */ + items = (ItemPointer) GinDataPageGetData(page); + *nitems = GinPageGetOpaque(page)->maxoff; + + return items; +} + +/* + * Check if we should follow the right link to find the item we're searching + * for. + * + * Compares inserting item pointer with the right bound of the current page. + */ +static bool +dataIsMoveRight(GinBtree btree, Page page) +{ + ItemPointer iptr = GinDataPageGetRightBound(page); + + if (GinPageRightMost(page)) + return false; + + if (GinPageIsDeleted(page)) + return true; + + return (ginCompareItemPointers(&btree->itemptr, iptr) > 0) ? true : false; +} + +/* + * Find correct PostingItem in non-leaf page. It is assumed that this is + * the correct page, and the searched value SHOULD be on the page. 
+ */ +static BlockNumber +dataLocateItem(GinBtree btree, GinBtreeStack *stack) +{ + OffsetNumber low, + high, + maxoff; + PostingItem *pitem = NULL; + int result; + Page page = BufferGetPage(stack->buffer); + + Assert(!GinPageIsLeaf(page)); + Assert(GinPageIsData(page)); + + if (btree->fullScan) + { + stack->off = FirstOffsetNumber; + stack->predictNumber *= GinPageGetOpaque(page)->maxoff; + return btree->getLeftMostChild(btree, page); + } + + low = FirstOffsetNumber; + maxoff = high = GinPageGetOpaque(page)->maxoff; + Assert(high >= low); + + high++; + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + + pitem = GinDataPageGetPostingItem(page, mid); + + if (mid == maxoff) + { + /* + * Right infinity, page already correctly chosen with a help of + * dataIsMoveRight + */ + result = -1; + } + else + { + pitem = GinDataPageGetPostingItem(page, mid); + result = ginCompareItemPointers(&btree->itemptr, &(pitem->key)); + } + + if (result == 0) + { + stack->off = mid; + return PostingItemGetBlockNumber(pitem); + } + else if (result > 0) + low = mid + 1; + else + high = mid; + } + + Assert(high >= FirstOffsetNumber && high <= maxoff); + + stack->off = high; + pitem = GinDataPageGetPostingItem(page, high); + return PostingItemGetBlockNumber(pitem); +} + +/* + * Find link to blkno on non-leaf page, returns offset of PostingItem + */ +static OffsetNumber +dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff) +{ + OffsetNumber i, + maxoff = GinPageGetOpaque(page)->maxoff; + PostingItem *pitem; + + Assert(!GinPageIsLeaf(page)); + Assert(GinPageIsData(page)); + + /* if page isn't changed, we return storedOff */ + if (storedOff >= FirstOffsetNumber && storedOff <= maxoff) + { + pitem = GinDataPageGetPostingItem(page, storedOff); + if (PostingItemGetBlockNumber(pitem) == blkno) + return storedOff; + + /* + * we hope, that needed pointer goes to right. 
It's true if there + * wasn't a deletion + */ + for (i = storedOff + 1; i <= maxoff; i++) + { + pitem = GinDataPageGetPostingItem(page, i); + if (PostingItemGetBlockNumber(pitem) == blkno) + return i; + } + + maxoff = storedOff - 1; + } + + /* last chance */ + for (i = FirstOffsetNumber; i <= maxoff; i++) + { + pitem = GinDataPageGetPostingItem(page, i); + if (PostingItemGetBlockNumber(pitem) == blkno) + return i; + } + + return InvalidOffsetNumber; +} + +/* + * Return blkno of leftmost child + */ +static BlockNumber +dataGetLeftMostPage(GinBtree btree, Page page) +{ + PostingItem *pitem; + + Assert(!GinPageIsLeaf(page)); + Assert(GinPageIsData(page)); + Assert(GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber); + + pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber); + return PostingItemGetBlockNumber(pitem); +} + +/* + * Add PostingItem to a non-leaf page. + */ +void +GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset) +{ + OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; + char *ptr; + + Assert(PostingItemGetBlockNumber(data) != InvalidBlockNumber); + Assert(!GinPageIsLeaf(page)); + + if (offset == InvalidOffsetNumber) + { + ptr = (char *) GinDataPageGetPostingItem(page, maxoff + 1); + } + else + { + ptr = (char *) GinDataPageGetPostingItem(page, offset); + if (offset != maxoff + 1) + memmove(ptr + sizeof(PostingItem), + ptr, + (maxoff - offset + 1) * sizeof(PostingItem)); + } + memcpy(ptr, data, sizeof(PostingItem)); + + maxoff++; + GinPageGetOpaque(page)->maxoff = maxoff; + + /* + * Also set pd_lower to the end of the posting items, to follow the + * "standard" page layout, so that we can squeeze out the unused space + * from full-page images. 
+ */ + GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem)); +} + +/* + * Delete posting item from non-leaf page + */ +void +GinPageDeletePostingItem(Page page, OffsetNumber offset) +{ + OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; + + Assert(!GinPageIsLeaf(page)); + Assert(offset >= FirstOffsetNumber && offset <= maxoff); + + if (offset != maxoff) + memmove(GinDataPageGetPostingItem(page, offset), + GinDataPageGetPostingItem(page, offset + 1), + sizeof(PostingItem) * (maxoff - offset)); + + maxoff--; + GinPageGetOpaque(page)->maxoff = maxoff; + + GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem)); +} + +/* + * Prepare to insert data on a leaf data page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. + * + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. + * + * In neither case should the given page buffer be modified here. + */ +static GinPlaceToPageRC +dataBeginPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, + void **ptp_workspace, + Page *newlpage, Page *newrpage) +{ + GinBtreeDataLeafInsertData *items = insertdata; + ItemPointer newItems = &items->items[items->curitem]; + int maxitems = items->nitem - items->curitem; + Page page = BufferGetPage(buf); + int i; + ItemPointerData rbound; + ItemPointerData lbound; + bool needsplit; + bool append; + int segsize; + Size freespace; + disassembledLeaf *leaf; + leafSegmentInfo *lastleftinfo; + ItemPointerData maxOldItem; + ItemPointerData remaining; + + rbound = *GinDataPageGetRightBound(page); + + /* + * Count how many of the new items belong to this page. 
+ */ + if (!GinPageRightMost(page)) + { + for (i = 0; i < maxitems; i++) + { + if (ginCompareItemPointers(&newItems[i], &rbound) > 0) + { + /* + * This needs to go to some other location in the tree. (The + * caller should've chosen the insert location so that at + * least the first item goes here.) + */ + Assert(i > 0); + break; + } + } + maxitems = i; + } + + /* Disassemble the data on the page */ + leaf = disassembleLeaf(page); + + /* + * Are we appending to the end of the page? IOW, are all the new items + * larger than any of the existing items. + */ + if (!dlist_is_empty(&leaf->segments)) + { + lastleftinfo = dlist_container(leafSegmentInfo, node, + dlist_tail_node(&leaf->segments)); + if (!lastleftinfo->items) + lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg, + &lastleftinfo->nitems); + maxOldItem = lastleftinfo->items[lastleftinfo->nitems - 1]; + if (ginCompareItemPointers(&newItems[0], &maxOldItem) >= 0) + append = true; + else + append = false; + } + else + { + ItemPointerSetMin(&maxOldItem); + append = true; + } + + /* + * If we're appending to the end of the page, we will append as many items + * as we can fit (after splitting), and stop when the pages becomes full. + * Otherwise we have to limit the number of new items to insert, because + * once we start packing we can't just stop when we run out of space, + * because we must make sure that all the old items still fit. + */ + if (GinPageIsCompressed(page)) + freespace = GinDataLeafPageGetFreeSpace(page); + else + freespace = 0; + if (append) + { + /* + * Even when appending, trying to append more items than will fit is + * not completely free, because we will merge the new items and old + * items into an array below. In the best case, every new item fits in + * a single byte, and we can use all the free space on the old page as + * well as the new page. For simplicity, ignore segment overhead etc. 
+ */ + maxitems = Min(maxitems, freespace + GinDataPageMaxDataSize); + } + else + { + /* + * Calculate a conservative estimate of how many new items we can fit + * on the two pages after splitting. + * + * We can use any remaining free space on the old page to store full + * segments, as well as the new page. Each full-sized segment can hold + * at least MinTuplesPerSegment items + */ + int nnewsegments; + + nnewsegments = freespace / GinPostingListSegmentMaxSize; + nnewsegments += GinDataPageMaxDataSize / GinPostingListSegmentMaxSize; + maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment); + } + + /* Add the new items to the segment list */ + if (!addItemsToLeaf(leaf, newItems, maxitems)) + { + /* all items were duplicates, we have nothing to do */ + items->curitem += maxitems; + + return GPTP_NO_WORK; + } + + /* + * Pack the items back to compressed segments, ready for writing to disk. + */ + needsplit = leafRepackItems(leaf, &remaining); + + /* + * Did all the new items fit? + * + * If we're appending, it's OK if they didn't. But as a sanity check, + * verify that all the old items fit. + */ + if (ItemPointerIsValid(&remaining)) + { + if (!append || ItemPointerCompare(&maxOldItem, &remaining) >= 0) + elog(ERROR, "could not split GIN page; all old items didn't fit"); + + /* Count how many of the new items did fit. */ + for (i = 0; i < maxitems; i++) + { + if (ginCompareItemPointers(&newItems[i], &remaining) >= 0) + break; + } + if (i == 0) + elog(ERROR, "could not split GIN page; no new items fit"); + maxitems = i; + } + + if (!needsplit) + { + /* + * Great, all the items fit on a single page. If needed, prepare data + * for a WAL record describing the changes we'll make. + */ + if (RelationNeedsWAL(btree->index) && !btree->isBuild) + computeLeafRecompressWALData(leaf); + + /* + * We're ready to enter the critical section, but + * dataExecPlaceToPageLeaf will need access to the "leaf" data. 
+ */ + *ptp_workspace = leaf; + + if (append) + elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, + items->nitem - items->curitem - maxitems); + else + elog(DEBUG2, "inserted %d new items to block %u; %d bytes (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, + items->nitem - items->curitem - maxitems); + } + else + { + /* + * Have to split. + * + * leafRepackItems already divided the segments between the left and + * the right page. It filled the left page as full as possible, and + * put the rest to the right page. When building a new index, that's + * good, because the table is scanned from beginning to end and there + * won't be any more insertions to the left page during the build. + * This packs the index as tight as possible. But otherwise, split + * 50/50, by moving segments from the left page to the right page + * until they're balanced. + * + * As a further heuristic, when appending items to the end of the + * page, try to make the left page 75% full, on the assumption that + * subsequent insertions will probably also go to the end. This packs + * the index somewhat tighter when appending to a table, which is very + * common. + */ + if (!btree->isBuild) + { + while (dlist_has_prev(&leaf->segments, leaf->lastleft)) + { + lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft); + + /* ignore deleted segments */ + if (lastleftinfo->action != GIN_SEGMENT_DELETE) + { + segsize = SizeOfGinPostingList(lastleftinfo->seg); + + /* + * Note that we check that the right page doesn't become + * more full than the left page even when appending. It's + * possible that we added enough items to make both pages + * more than 75% full. 
+ */ + if ((leaf->lsize - segsize) - (leaf->rsize + segsize) < 0) + break; + if (append) + { + if ((leaf->lsize - segsize) < (BLCKSZ * 3) / 4) + break; + } + + leaf->lsize -= segsize; + leaf->rsize += segsize; + } + leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft); + } + } + Assert(leaf->lsize <= GinDataPageMaxDataSize); + Assert(leaf->rsize <= GinDataPageMaxDataSize); + + /* + * Fetch the max item in the left page's last segment; it becomes the + * right bound of the page. + */ + lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft); + if (!lastleftinfo->items) + lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg, + &lastleftinfo->nitems); + lbound = lastleftinfo->items[lastleftinfo->nitems - 1]; + + /* + * Now allocate a couple of temporary page images, and fill them. + */ + *newlpage = palloc(BLCKSZ); + *newrpage = palloc(BLCKSZ); + + dataPlaceToPageLeafSplit(leaf, lbound, rbound, + *newlpage, *newrpage); + + Assert(GinPageRightMost(page) || + ginCompareItemPointers(GinDataPageGetRightBound(*newlpage), + GinDataPageGetRightBound(*newrpage)) < 0); + + if (append) + elog(DEBUG2, "appended %d items to block %u; split %d/%d (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize, + items->nitem - items->curitem - maxitems); + else + elog(DEBUG2, "inserted %d items to block %u; split %d/%d (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize, + items->nitem - items->curitem - maxitems); + } + + items->curitem += maxitems; + + return needsplit ? GPTP_SPLIT : GPTP_INSERT; +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section, and XLOG record creation (if + * needed) is already started. The target buffer is registered in slot 0. 
+ */ +static void +dataExecPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, void *ptp_workspace) +{ + disassembledLeaf *leaf = (disassembledLeaf *) ptp_workspace; + + /* Apply changes to page */ + dataPlaceToPageLeafRecompress(buf, leaf); + + /* If needed, register WAL data built by computeLeafRecompressWALData */ + if (RelationNeedsWAL(btree->index) && !btree->isBuild) + { + XLogRegisterBufData(0, leaf->walinfo, leaf->walinfolen); + } +} + +/* + * Vacuum a posting tree leaf page. + */ +void +ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) +{ + Page page = BufferGetPage(buffer); + disassembledLeaf *leaf; + bool removedsomething = false; + dlist_iter iter; + + leaf = disassembleLeaf(page); + + /* Vacuum each segment. */ + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); + int oldsegsize; + ItemPointer cleaned; + int ncleaned; + + if (!seginfo->items) + seginfo->items = ginPostingListDecode(seginfo->seg, + &seginfo->nitems); + if (seginfo->seg) + oldsegsize = SizeOfGinPostingList(seginfo->seg); + else + oldsegsize = GinDataPageMaxDataSize; + + cleaned = ginVacuumItemPointers(gvs, + seginfo->items, + seginfo->nitems, + &ncleaned); + pfree(seginfo->items); + seginfo->items = NULL; + seginfo->nitems = 0; + if (cleaned) + { + if (ncleaned > 0) + { + int npacked; + + seginfo->seg = ginCompressPostingList(cleaned, + ncleaned, + oldsegsize, + &npacked); + /* Removing an item never increases the size of the segment */ + if (npacked != ncleaned) + elog(ERROR, "could not fit vacuumed posting list"); + seginfo->action = GIN_SEGMENT_REPLACE; + } + else + { + seginfo->seg = NULL; + seginfo->items = NULL; + seginfo->action = GIN_SEGMENT_DELETE; + } + seginfo->nitems = ncleaned; + + removedsomething = true; + } + } + + /* + * If we removed any items, reconstruct the page from the pieces. 
+ * + * We don't try to re-encode the segments here, even though some of them + * might be really small now that we've removed some items from them. It + * seems like a waste of effort, as there isn't really any benefit from + * larger segments per se; larger segments only help to pack more items in + * the same space. We might as well delay doing that until the next + * insertion, which will need to re-encode at least part of the page + * anyway. + * + * Also note if the page was in uncompressed, pre-9.4 format before, it is + * now represented as one huge segment that contains all the items. It + * might make sense to split that, to speed up random access, but we don't + * bother. You'll have to REINDEX anyway if you want the full gain of the + * new tighter index format. + */ + if (removedsomething) + { + bool modified; + + /* + * Make sure we have a palloc'd copy of all segments, after the first + * segment that is modified. (dataPlaceToPageLeafRecompress requires + * this). + */ + modified = false; + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); + + if (seginfo->action != GIN_SEGMENT_UNMODIFIED) + modified = true; + if (modified && seginfo->action != GIN_SEGMENT_DELETE) + { + int segsize = SizeOfGinPostingList(seginfo->seg); + GinPostingList *tmp = (GinPostingList *) palloc(segsize); + + memcpy(tmp, seginfo->seg, segsize); + seginfo->seg = tmp; + } + } + + if (RelationNeedsWAL(indexrel)) + computeLeafRecompressWALData(leaf); + + /* Apply changes to page */ + START_CRIT_SECTION(); + + dataPlaceToPageLeafRecompress(buffer, leaf); + + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(indexrel)) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, leaf->walinfo, leaf->walinfolen); + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + } +} + +/* + * 
Construct a ginxlogRecompressDataLeaf record representing the changes + * in *leaf. (Because this requires a palloc, we have to do it before + * we enter the critical section that actually updates the page.) + */ +static void +computeLeafRecompressWALData(disassembledLeaf *leaf) +{ + int nmodified = 0; + char *walbufbegin; + char *walbufend; + dlist_iter iter; + int segno; + ginxlogRecompressDataLeaf *recompress_xlog; + + /* Count the modified segments */ + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); + + if (seginfo->action != GIN_SEGMENT_UNMODIFIED) + nmodified++; + } + + walbufbegin = + palloc(sizeof(ginxlogRecompressDataLeaf) + + BLCKSZ + /* max size needed to hold the segment data */ + nmodified * 2 /* (segno + action) per action */ + ); + walbufend = walbufbegin; + + recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend; + walbufend += sizeof(ginxlogRecompressDataLeaf); + + recompress_xlog->nactions = nmodified; + + segno = 0; + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); + int segsize = 0; + int datalen; + uint8 action = seginfo->action; + + if (action == GIN_SEGMENT_UNMODIFIED) + { + segno++; + continue; + } + + if (action != GIN_SEGMENT_DELETE) + segsize = SizeOfGinPostingList(seginfo->seg); + + /* + * If storing the uncompressed list of added item pointers would take + * more space than storing the compressed segment as is, do that + * instead. 
+ */ + if (action == GIN_SEGMENT_ADDITEMS && + seginfo->nmodifieditems * sizeof(ItemPointerData) > segsize) + { + action = GIN_SEGMENT_REPLACE; + } + + *((uint8 *) (walbufend++)) = segno; + *(walbufend++) = action; + + switch (action) + { + case GIN_SEGMENT_DELETE: + datalen = 0; + break; + + case GIN_SEGMENT_ADDITEMS: + datalen = seginfo->nmodifieditems * sizeof(ItemPointerData); + memcpy(walbufend, &seginfo->nmodifieditems, sizeof(uint16)); + memcpy(walbufend + sizeof(uint16), seginfo->modifieditems, datalen); + datalen += sizeof(uint16); + break; + + case GIN_SEGMENT_INSERT: + case GIN_SEGMENT_REPLACE: + datalen = SHORTALIGN(segsize); + memcpy(walbufend, seginfo->seg, segsize); + break; + + default: + elog(ERROR, "unexpected GIN leaf action %d", action); + } + walbufend += datalen; + + if (action != GIN_SEGMENT_INSERT) + segno++; + } + + /* Pass back the constructed info via *leaf */ + leaf->walinfo = walbufbegin; + leaf->walinfolen = walbufend - walbufbegin; +} + +/* + * Assemble a disassembled posting tree leaf page back to a buffer. + * + * This just updates the target buffer; WAL stuff is caller's responsibility. + * + * NOTE: The segment pointers must not point directly to the same buffer, + * except for segments that have not been modified and whose preceding + * segments have not been modified either. + */ +static void +dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf) +{ + Page page = BufferGetPage(buf); + char *ptr; + int newsize; + bool modified = false; + dlist_iter iter; + int segsize; + + /* + * If the page was in pre-9.4 format before, convert the header, and force + * all segments to be copied to the page whether they were modified or + * not. 
+ */ + if (!GinPageIsCompressed(page)) + { + Assert(leaf->oldformat); + GinPageSetCompressed(page); + GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber; + modified = true; + } + + ptr = (char *) GinDataLeafPageGetPostingList(page); + newsize = 0; + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); + + if (seginfo->action != GIN_SEGMENT_UNMODIFIED) + modified = true; + + if (seginfo->action != GIN_SEGMENT_DELETE) + { + segsize = SizeOfGinPostingList(seginfo->seg); + + if (modified) + memcpy(ptr, seginfo->seg, segsize); + + ptr += segsize; + newsize += segsize; + } + } + + Assert(newsize <= GinDataPageMaxDataSize); + GinDataPageSetDataSize(page, newsize); +} + +/* + * Like dataPlaceToPageLeafRecompress, but writes the disassembled leaf + * segments to two pages instead of one. + * + * This is different from the non-split cases in that this does not modify + * the original page directly, but writes to temporary in-memory copies of + * the new left and right pages. + */ +static void +dataPlaceToPageLeafSplit(disassembledLeaf *leaf, + ItemPointerData lbound, ItemPointerData rbound, + Page lpage, Page rpage) +{ + char *ptr; + int segsize; + int lsize; + int rsize; + dlist_node *node; + dlist_node *firstright; + leafSegmentInfo *seginfo; + + /* Initialize temporary pages to hold the new left and right pages */ + GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); + GinInitPage(rpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); + + /* + * Copy the segments that go to the left page. + * + * XXX: We should skip copying the unmodified part of the left page, like + * we do when recompressing. 
+ */ + lsize = 0; + ptr = (char *) GinDataLeafPageGetPostingList(lpage); + firstright = dlist_next_node(&leaf->segments, leaf->lastleft); + for (node = dlist_head_node(&leaf->segments); + node != firstright; + node = dlist_next_node(&leaf->segments, node)) + { + seginfo = dlist_container(leafSegmentInfo, node, node); + + if (seginfo->action != GIN_SEGMENT_DELETE) + { + segsize = SizeOfGinPostingList(seginfo->seg); + memcpy(ptr, seginfo->seg, segsize); + ptr += segsize; + lsize += segsize; + } + } + Assert(lsize == leaf->lsize); + GinDataPageSetDataSize(lpage, lsize); + *GinDataPageGetRightBound(lpage) = lbound; + + /* Copy the segments that go to the right page */ + ptr = (char *) GinDataLeafPageGetPostingList(rpage); + rsize = 0; + for (node = firstright; + ; + node = dlist_next_node(&leaf->segments, node)) + { + seginfo = dlist_container(leafSegmentInfo, node, node); + + if (seginfo->action != GIN_SEGMENT_DELETE) + { + segsize = SizeOfGinPostingList(seginfo->seg); + memcpy(ptr, seginfo->seg, segsize); + ptr += segsize; + rsize += segsize; + } + + if (!dlist_has_next(&leaf->segments, node)) + break; + } + Assert(rsize == leaf->rsize); + GinDataPageSetDataSize(rpage, rsize); + *GinDataPageGetRightBound(rpage) = rbound; +} + +/* + * Prepare to insert data on an internal data page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. + * + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. + * + * In neither case should the given page buffer be modified here. + * + * Note: on insertion to an internal node, in addition to inserting the given + * item, the downlink of the existing item at stack->off will be updated to + * point to updateblkno. 
+ */ +static GinPlaceToPageRC +dataBeginPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void **ptp_workspace, + Page *newlpage, Page *newrpage) +{ + Page page = BufferGetPage(buf); + + /* If it doesn't fit, deal with split case */ + if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem)) + { + dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno, + newlpage, newrpage); + return GPTP_SPLIT; + } + + /* Else, we're ready to proceed with insertion */ + return GPTP_INSERT; +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section, and XLOG record creation (if + * needed) is already started. The target buffer is registered in slot 0. + */ +static void +dataExecPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void *ptp_workspace) +{ + Page page = BufferGetPage(buf); + OffsetNumber off = stack->off; + PostingItem *pitem; + + /* Update existing downlink to point to next page (on internal page) */ + pitem = GinDataPageGetPostingItem(page, off); + PostingItemSetBlockNumber(pitem, updateblkno); + + /* Add new item */ + pitem = (PostingItem *) insertdata; + GinDataPageAddPostingItem(page, pitem, off); + + if (RelationNeedsWAL(btree->index) && !btree->isBuild) + { + /* + * This must be static, because it has to survive until XLogInsert, + * and we can't palloc here. Ugly, but the XLogInsert infrastructure + * isn't reentrant anyway. + */ + static ginxlogInsertDataInternal data; + + data.offset = off; + data.newitem = *pitem; + + XLogRegisterBufData(0, (char *) &data, + sizeof(ginxlogInsertDataInternal)); + } +} + +/* + * Prepare to insert data on a posting-tree data page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. 
*ptp_workspace can be + * set to pass information along to the execPlaceToPage function. + * + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. + * + * In neither case should the given page buffer be modified here. + * + * Note: on insertion to an internal node, in addition to inserting the given + * item, the downlink of the existing item at stack->off will be updated to + * point to updateblkno. + * + * Calls relevant function for internal or leaf page because they are handled + * very differently. + */ +static GinPlaceToPageRC +dataBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void **ptp_workspace, + Page *newlpage, Page *newrpage) +{ + Page page = BufferGetPage(buf); + + Assert(GinPageIsData(page)); + + if (GinPageIsLeaf(page)) + return dataBeginPlaceToPageLeaf(btree, buf, stack, insertdata, + ptp_workspace, + newlpage, newrpage); + else + return dataBeginPlaceToPageInternal(btree, buf, stack, + insertdata, updateblkno, + ptp_workspace, + newlpage, newrpage); +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section, and XLOG record creation (if + * needed) is already started. The target buffer is registered in slot 0. + * + * Calls relevant function for internal or leaf page because they are handled + * very differently. + */ +static void +dataExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void *ptp_workspace) +{ + Page page = BufferGetPage(buf); + + if (GinPageIsLeaf(page)) + dataExecPlaceToPageLeaf(btree, buf, stack, insertdata, + ptp_workspace); + else + dataExecPlaceToPageInternal(btree, buf, stack, insertdata, + updateblkno, ptp_workspace); +} + +/* + * Split internal page and insert new data. + * + * Returns new temp pages to *newlpage and *newrpage. 
+ * The original buffer is left untouched. + */ +static void +dataSplitPageInternal(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + Page *newlpage, Page *newrpage) +{ + Page oldpage = BufferGetPage(origbuf); + OffsetNumber off = stack->off; + int nitems = GinPageGetOpaque(oldpage)->maxoff; + int nleftitems; + int nrightitems; + Size pageSize = PageGetPageSize(oldpage); + ItemPointerData oldbound = *GinDataPageGetRightBound(oldpage); + ItemPointer bound; + Page lpage; + Page rpage; + OffsetNumber separator; + PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1]; + + lpage = PageGetTempPage(oldpage); + rpage = PageGetTempPage(oldpage); + GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize); + GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize); + + /* + * First construct a new list of PostingItems, which includes all the old + * items, and the new item. + */ + memcpy(allitems, GinDataPageGetPostingItem(oldpage, FirstOffsetNumber), + (off - 1) * sizeof(PostingItem)); + + allitems[off - 1] = *((PostingItem *) insertdata); + memcpy(&allitems[off], GinDataPageGetPostingItem(oldpage, off), + (nitems - (off - 1)) * sizeof(PostingItem)); + nitems++; + + /* Update existing downlink to point to next page */ + PostingItemSetBlockNumber(&allitems[off], updateblkno); + + /* + * When creating a new index, fit as many tuples as possible on the left + * page, on the assumption that the table is scanned from beginning to + * end. This packs the index as tight as possible. 
+ */ + if (btree->isBuild && GinPageRightMost(oldpage)) + separator = GinNonLeafDataPageGetFreeSpace(rpage) / sizeof(PostingItem); + else + separator = nitems / 2; + nleftitems = separator; + nrightitems = nitems - separator; + + memcpy(GinDataPageGetPostingItem(lpage, FirstOffsetNumber), + allitems, + nleftitems * sizeof(PostingItem)); + GinPageGetOpaque(lpage)->maxoff = nleftitems; + memcpy(GinDataPageGetPostingItem(rpage, FirstOffsetNumber), + &allitems[separator], + nrightitems * sizeof(PostingItem)); + GinPageGetOpaque(rpage)->maxoff = nrightitems; + + /* + * Also set pd_lower for both pages, like GinDataPageAddPostingItem does. + */ + GinDataPageSetDataSize(lpage, nleftitems * sizeof(PostingItem)); + GinDataPageSetDataSize(rpage, nrightitems * sizeof(PostingItem)); + + /* set up right bound for left page */ + bound = GinDataPageGetRightBound(lpage); + *bound = GinDataPageGetPostingItem(lpage, nleftitems)->key; + + /* set up right bound for right page */ + *GinDataPageGetRightBound(rpage) = oldbound; + + /* return temp pages to caller */ + *newlpage = lpage; + *newrpage = rpage; +} + +/* + * Construct insertion payload for inserting the downlink for given buffer. + */ +static void * +dataPrepareDownlink(GinBtree btree, Buffer lbuf) +{ + PostingItem *pitem = palloc(sizeof(PostingItem)); + Page lpage = BufferGetPage(lbuf); + + PostingItemSetBlockNumber(pitem, BufferGetBlockNumber(lbuf)); + pitem->key = *GinDataPageGetRightBound(lpage); + + return pitem; +} + +/* + * Fills new root by right bound values from child. 
+ * Also called from ginxlog, should not use btree + */ +void +ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage) +{ + PostingItem li, + ri; + + li.key = *GinDataPageGetRightBound(lpage); + PostingItemSetBlockNumber(&li, lblkno); + GinDataPageAddPostingItem(root, &li, InvalidOffsetNumber); + + ri.key = *GinDataPageGetRightBound(rpage); + PostingItemSetBlockNumber(&ri, rblkno); + GinDataPageAddPostingItem(root, &ri, InvalidOffsetNumber); +} + + +/*** Functions to work with disassembled leaf pages ***/ + +/* + * Disassemble page into a disassembledLeaf struct. + */ +static disassembledLeaf * +disassembleLeaf(Page page) +{ + disassembledLeaf *leaf; + GinPostingList *seg; + Pointer segbegin; + Pointer segend; + + leaf = palloc0(sizeof(disassembledLeaf)); + dlist_init(&leaf->segments); + + if (GinPageIsCompressed(page)) + { + /* + * Create a leafSegmentInfo entry for each segment. + */ + seg = GinDataLeafPageGetPostingList(page); + segbegin = (Pointer) seg; + segend = segbegin + GinDataLeafPageGetPostingListSize(page); + while ((Pointer) seg < segend) + { + leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo)); + + seginfo->action = GIN_SEGMENT_UNMODIFIED; + seginfo->seg = seg; + seginfo->items = NULL; + seginfo->nitems = 0; + dlist_push_tail(&leaf->segments, &seginfo->node); + + seg = GinNextPostingListSegment(seg); + } + leaf->oldformat = false; + } + else + { + /* + * A pre-9.4 format uncompressed page is represented by a single + * segment, with an array of items. The corner case is uncompressed + * page containing no items, which is represented as no segments. 
+ */ + ItemPointer uncompressed; + int nuncompressed; + leafSegmentInfo *seginfo; + + uncompressed = dataLeafPageGetUncompressed(page, &nuncompressed); + + if (nuncompressed > 0) + { + seginfo = palloc(sizeof(leafSegmentInfo)); + + seginfo->action = GIN_SEGMENT_REPLACE; + seginfo->seg = NULL; + seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData)); + memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData)); + seginfo->nitems = nuncompressed; + + dlist_push_tail(&leaf->segments, &seginfo->node); + } + + leaf->oldformat = true; + } + + return leaf; +} + +/* + * Distribute newItems to the segments. + * + * Any segments that acquire new items are decoded, and the new items are + * merged with the old items. + * + * Returns true if any new items were added. False means they were all + * duplicates of existing items on the page. + */ +static bool +addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems) +{ + dlist_iter iter; + ItemPointer nextnew = newItems; + int newleft = nNewItems; + bool modified = false; + leafSegmentInfo *newseg; + + /* + * If the page is completely empty, just construct one new segment to hold + * all the new items. + */ + if (dlist_is_empty(&leaf->segments)) + { + newseg = palloc(sizeof(leafSegmentInfo)); + newseg->seg = NULL; + newseg->items = newItems; + newseg->nitems = nNewItems; + newseg->action = GIN_SEGMENT_INSERT; + dlist_push_tail(&leaf->segments, &newseg->node); + return true; + } + + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *cur = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, iter.cur); + int nthis; + ItemPointer tmpitems; + int ntmpitems; + + /* + * How many of the new items fall into this segment? 
+ */ + if (!dlist_has_next(&leaf->segments, iter.cur)) + nthis = newleft; + else + { + leafSegmentInfo *next; + ItemPointerData next_first; + + next = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, + dlist_next_node(&leaf->segments, iter.cur)); + if (next->items) + next_first = next->items[0]; + else + { + Assert(next->seg != NULL); + next_first = next->seg->first; + } + + nthis = 0; + while (nthis < newleft && ginCompareItemPointers(&nextnew[nthis], &next_first) < 0) + nthis++; + } + if (nthis == 0) + continue; + + /* Merge the new items with the existing items. */ + if (!cur->items) + cur->items = ginPostingListDecode(cur->seg, &cur->nitems); + + /* + * Fast path for the important special case that we're appending to + * the end of the page: don't let the last segment on the page grow + * larger than the target, create a new segment before that happens. + */ + if (!dlist_has_next(&leaf->segments, iter.cur) && + ginCompareItemPointers(&cur->items[cur->nitems - 1], &nextnew[0]) < 0 && + cur->seg != NULL && + SizeOfGinPostingList(cur->seg) >= GinPostingListSegmentTargetSize) + { + newseg = palloc(sizeof(leafSegmentInfo)); + newseg->seg = NULL; + newseg->items = nextnew; + newseg->nitems = nthis; + newseg->action = GIN_SEGMENT_INSERT; + dlist_push_tail(&leaf->segments, &newseg->node); + modified = true; + break; + } + + tmpitems = ginMergeItemPointers(cur->items, cur->nitems, + nextnew, nthis, + &ntmpitems); + if (ntmpitems != cur->nitems) + { + /* + * If there are no duplicates, track the added items so that we + * can emit a compact ADDITEMS WAL record later on. 
(it doesn't + * seem worth re-checking which items were duplicates, if there + * were any) + */ + if (ntmpitems == nthis + cur->nitems && + cur->action == GIN_SEGMENT_UNMODIFIED) + { + cur->action = GIN_SEGMENT_ADDITEMS; + cur->modifieditems = nextnew; + cur->nmodifieditems = nthis; + } + else + cur->action = GIN_SEGMENT_REPLACE; + + cur->items = tmpitems; + cur->nitems = ntmpitems; + cur->seg = NULL; + modified = true; + } + + nextnew += nthis; + newleft -= nthis; + if (newleft == 0) + break; + } + + return modified; +} + +/* + * Recompresses all segments that have been modified. + * + * If not all the items fit on two pages (ie. after split), we store as + * many items as fit, and set *remaining to the first item that didn't fit. + * If all items fit, *remaining is set to invalid. + * + * Returns true if the page has to be split. + */ +static bool +leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining) +{ + int pgused = 0; + bool needsplit = false; + dlist_iter iter; + int segsize; + leafSegmentInfo *nextseg; + int npacked; + bool modified; + dlist_node *cur_node; + dlist_node *next_node; + + ItemPointerSetInvalid(remaining); + + /* + * cannot use dlist_foreach_modify here because we insert adjacent items + * while iterating. + */ + for (cur_node = dlist_head_node(&leaf->segments); + cur_node != NULL; + cur_node = next_node) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + cur_node); + + if (dlist_has_next(&leaf->segments, cur_node)) + next_node = dlist_next_node(&leaf->segments, cur_node); + else + next_node = NULL; + + /* Compress the posting list, if necessary */ + if (seginfo->action != GIN_SEGMENT_DELETE) + { + if (seginfo->seg == NULL) + { + if (seginfo->nitems > GinPostingListSegmentMaxSize) + npacked = 0; /* no chance that it would fit. 
*/ + else + { + seginfo->seg = ginCompressPostingList(seginfo->items, + seginfo->nitems, + GinPostingListSegmentMaxSize, + &npacked); + } + if (npacked != seginfo->nitems) + { + /* + * Too large. Compress again to the target size, and + * create a new segment to represent the remaining items. + * The new segment is inserted after this one, so it will + * be processed in the next iteration of this loop. + */ + if (seginfo->seg) + pfree(seginfo->seg); + seginfo->seg = ginCompressPostingList(seginfo->items, + seginfo->nitems, + GinPostingListSegmentTargetSize, + &npacked); + if (seginfo->action != GIN_SEGMENT_INSERT) + seginfo->action = GIN_SEGMENT_REPLACE; + + nextseg = palloc(sizeof(leafSegmentInfo)); + nextseg->action = GIN_SEGMENT_INSERT; + nextseg->seg = NULL; + nextseg->items = &seginfo->items[npacked]; + nextseg->nitems = seginfo->nitems - npacked; + next_node = &nextseg->node; + dlist_insert_after(cur_node, next_node); + } + } + + /* + * If the segment is very small, merge it with the next segment. 
+ */ + if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize && next_node) + { + int nmerged; + + nextseg = dlist_container(leafSegmentInfo, node, next_node); + + if (seginfo->items == NULL) + seginfo->items = ginPostingListDecode(seginfo->seg, + &seginfo->nitems); + if (nextseg->items == NULL) + nextseg->items = ginPostingListDecode(nextseg->seg, + &nextseg->nitems); + nextseg->items = + ginMergeItemPointers(seginfo->items, seginfo->nitems, + nextseg->items, nextseg->nitems, + &nmerged); + Assert(nmerged == seginfo->nitems + nextseg->nitems); + nextseg->nitems = nmerged; + nextseg->seg = NULL; + + nextseg->action = GIN_SEGMENT_REPLACE; + nextseg->modifieditems = NULL; + nextseg->nmodifieditems = 0; + + if (seginfo->action == GIN_SEGMENT_INSERT) + { + dlist_delete(cur_node); + continue; + } + else + { + seginfo->action = GIN_SEGMENT_DELETE; + seginfo->seg = NULL; + } + } + + seginfo->items = NULL; + seginfo->nitems = 0; + } + + if (seginfo->action == GIN_SEGMENT_DELETE) + continue; + + /* + * OK, we now have a compressed version of this segment ready for + * copying to the page. Did we exceed the size that fits on one page? + */ + segsize = SizeOfGinPostingList(seginfo->seg); + if (pgused + segsize > GinDataPageMaxDataSize) + { + if (!needsplit) + { + /* switch to right page */ + Assert(pgused > 0); + leaf->lastleft = dlist_prev_node(&leaf->segments, cur_node); + needsplit = true; + leaf->lsize = pgused; + pgused = 0; + } + else + { + /* + * Filled both pages. The last segment we constructed did not + * fit. + */ + *remaining = seginfo->seg->first; + + /* + * remove all segments that did not fit from the list. 
+ */ + while (dlist_has_next(&leaf->segments, cur_node)) + dlist_delete(dlist_next_node(&leaf->segments, cur_node)); + dlist_delete(cur_node); + break; + } + } + + pgused += segsize; + } + + if (!needsplit) + { + leaf->lsize = pgused; + leaf->rsize = 0; + } + else + leaf->rsize = pgused; + + Assert(leaf->lsize <= GinDataPageMaxDataSize); + Assert(leaf->rsize <= GinDataPageMaxDataSize); + + /* + * Make a palloc'd copy of every segment after the first modified one, + * because as we start copying items to the original page, we might + * overwrite an existing segment. + */ + modified = false; + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); + + if (!modified && seginfo->action != GIN_SEGMENT_UNMODIFIED) + { + modified = true; + } + else if (modified && seginfo->action == GIN_SEGMENT_UNMODIFIED) + { + GinPostingList *tmp; + + segsize = SizeOfGinPostingList(seginfo->seg); + tmp = palloc(segsize); + memcpy(tmp, seginfo->seg, segsize); + seginfo->seg = tmp; + } + } + + return needsplit; +} + + +/*** Functions that are exported to the rest of the GIN code ***/ + +/* + * Creates new posting tree containing the given TIDs. Returns the page + * number of the root of the new posting tree. + * + * items[] must be in sorted order with no duplicates. + */ +BlockNumber +createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, + GinStatsData *buildStats, Buffer entrybuffer) +{ + BlockNumber blkno; + Buffer buffer; + Page tmppage; + Page page; + Pointer ptr; + int nrootitems; + int rootsize; + bool is_build = (buildStats != NULL); + + /* Construct the new root page in memory first. */ + tmppage = (Page) palloc(BLCKSZ); + GinInitPage(tmppage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); + GinPageGetOpaque(tmppage)->rightlink = InvalidBlockNumber; + + /* + * Write as many of the items to the root page as fit. In segments of max + * GinPostingListSegmentMaxSize bytes each. 
+ */ + nrootitems = 0; + rootsize = 0; + ptr = (Pointer) GinDataLeafPageGetPostingList(tmppage); + while (nrootitems < nitems) + { + GinPostingList *segment; + int npacked; + int segsize; + + segment = ginCompressPostingList(&items[nrootitems], + nitems - nrootitems, + GinPostingListSegmentMaxSize, + &npacked); + segsize = SizeOfGinPostingList(segment); + if (rootsize + segsize > GinDataPageMaxDataSize) + break; + + memcpy(ptr, segment, segsize); + ptr += segsize; + rootsize += segsize; + nrootitems += npacked; + pfree(segment); + } + GinDataPageSetDataSize(tmppage, rootsize); + + /* + * All set. Get a new physical page, and copy the in-memory page to it. + */ + buffer = GinNewBuffer(index); + page = BufferGetPage(buffer); + blkno = BufferGetBlockNumber(buffer); + + /* + * Copy any predicate locks from the entry tree leaf (containing posting + * list) to the posting tree. + */ + PredicateLockPageSplit(index, BufferGetBlockNumber(entrybuffer), blkno); + + START_CRIT_SECTION(); + + PageRestoreTempPage(tmppage, page); + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(index) && !is_build) + { + XLogRecPtr recptr; + ginxlogCreatePostingTree data; + + data.size = rootsize; + + XLogBeginInsert(); + XLogRegisterData((char *) &data, sizeof(ginxlogCreatePostingTree)); + + XLogRegisterData((char *) GinDataLeafPageGetPostingList(page), + rootsize); + XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE); + PageSetLSN(page, recptr); + } + + UnlockReleaseBuffer(buffer); + + END_CRIT_SECTION(); + + /* During index build, count the newly-added data page */ + if (buildStats) + buildStats->nDataPages++; + + elog(DEBUG2, "created GIN posting tree with %d items", nrootitems); + + /* + * Add any remaining TIDs to the newly-created posting tree. 
+ */ + if (nitems > nrootitems) + { + ginInsertItemPointers(index, blkno, + items + nrootitems, + nitems - nrootitems, + buildStats); + } + + return blkno; +} + +static void +ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno) +{ + memset(btree, 0, sizeof(GinBtreeData)); + + btree->index = index; + btree->rootBlkno = rootBlkno; + + btree->findChildPage = dataLocateItem; + btree->getLeftMostChild = dataGetLeftMostPage; + btree->isMoveRight = dataIsMoveRight; + btree->findItem = NULL; + btree->findChildPtr = dataFindChildPtr; + btree->beginPlaceToPage = dataBeginPlaceToPage; + btree->execPlaceToPage = dataExecPlaceToPage; + btree->fillRoot = ginDataFillRoot; + btree->prepareDownlink = dataPrepareDownlink; + + btree->isData = true; + btree->fullScan = false; + btree->isBuild = false; +} + +/* + * Inserts array of item pointers, may execute several tree scan (very rare) + */ +void +ginInsertItemPointers(Relation index, BlockNumber rootBlkno, + ItemPointerData *items, uint32 nitem, + GinStatsData *buildStats) +{ + GinBtreeData btree; + GinBtreeDataLeafInsertData insertdata; + GinBtreeStack *stack; + + ginPrepareDataScan(&btree, index, rootBlkno); + btree.isBuild = (buildStats != NULL); + insertdata.items = items; + insertdata.nitem = nitem; + insertdata.curitem = 0; + + while (insertdata.curitem < insertdata.nitem) + { + /* search for the leaf page where the first item should go to */ + btree.itemptr = insertdata.items[insertdata.curitem]; + stack = ginFindLeafPage(&btree, false, true, NULL); + + ginInsertValue(&btree, stack, &insertdata, buildStats); + } +} + +/* + * Starts a new scan on a posting tree. 
+ */ +GinBtreeStack * +ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno, + Snapshot snapshot) +{ + GinBtreeStack *stack; + + ginPrepareDataScan(btree, index, rootBlkno); + + btree->fullScan = true; + + stack = ginFindLeafPage(btree, true, false, snapshot); + + return stack; +} diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c new file mode 100644 index 0000000..29c36bc --- /dev/null +++ b/src/backend/access/gin/ginentrypage.c @@ -0,0 +1,772 @@ +/*------------------------------------------------------------------------- + * + * ginentrypage.c + * routines for handling GIN entry tree pages. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginentrypage.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "utils/rel.h" + +static void entrySplitPage(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + GinBtreeEntryInsertData *insertData, + BlockNumber updateblkno, + Page *newlpage, Page *newrpage); + +/* + * Form a tuple for entry tree. + * + * If the tuple would be too big to be stored, function throws a suitable + * error if errorTooBig is true, or returns NULL if errorTooBig is false. + * + * See src/backend/access/gin/README for a description of the index tuple + * format that is being built here. We build on the assumption that we + * are making a leaf-level key entry containing a posting list of nipd items. + * If the caller is actually trying to make a posting-tree entry, non-leaf + * entry, or pending-list entry, it should pass dataSize = 0 and then overwrite + * the t_tid fields as necessary. 
In any case, 'data' can be NULL to skip + * filling in the posting list; the caller is responsible for filling it + * afterwards if data = NULL and nipd > 0. + */ +IndexTuple +GinFormTuple(GinState *ginstate, + OffsetNumber attnum, Datum key, GinNullCategory category, + Pointer data, Size dataSize, int nipd, + bool errorTooBig) +{ + Datum datums[2]; + bool isnull[2]; + IndexTuple itup; + uint32 newsize; + + /* Build the basic tuple: optional column number, plus key datum */ + if (ginstate->oneCol) + { + datums[0] = key; + isnull[0] = (category != GIN_CAT_NORM_KEY); + } + else + { + datums[0] = UInt16GetDatum(attnum); + isnull[0] = false; + datums[1] = key; + isnull[1] = (category != GIN_CAT_NORM_KEY); + } + + itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull); + + /* + * Determine and store offset to the posting list, making sure there is + * room for the category byte if needed. + * + * Note: because index_form_tuple MAXALIGNs the tuple size, there may well + * be some wasted pad space. Is it worth recomputing the data length to + * prevent that? That would also allow us to Assert that the real data + * doesn't overlap the GinNullCategory byte, which this code currently + * takes on faith. + */ + newsize = IndexTupleSize(itup); + + if (IndexTupleHasNulls(itup)) + { + uint32 minsize; + + Assert(category != GIN_CAT_NORM_KEY); + minsize = GinCategoryOffset(itup, ginstate) + sizeof(GinNullCategory); + newsize = Max(newsize, minsize); + } + + newsize = SHORTALIGN(newsize); + + GinSetPostingOffset(itup, newsize); + GinSetNPosting(itup, nipd); + + /* + * Add space needed for posting list, if any. Then check that the tuple + * won't be too big to store. 
+ */ + newsize += dataSize; + + newsize = MAXALIGN(newsize); + + if (newsize > GinMaxItemSize) + { + if (errorTooBig) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", + (Size) newsize, (Size) GinMaxItemSize, + RelationGetRelationName(ginstate->index)))); + pfree(itup); + return NULL; + } + + /* + * Resize tuple if needed + */ + if (newsize != IndexTupleSize(itup)) + { + itup = repalloc(itup, newsize); + + /* + * PostgreSQL 9.3 and earlier did not clear this new space, so we + * might find uninitialized padding when reading tuples from disk. + */ + memset((char *) itup + IndexTupleSize(itup), + 0, newsize - IndexTupleSize(itup)); + /* set new size in tuple header */ + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + } + + /* + * Copy in the posting list, if provided + */ + if (data) + { + char *ptr = GinGetPosting(itup); + + memcpy(ptr, data, dataSize); + } + + /* + * Insert category byte, if needed + */ + if (category != GIN_CAT_NORM_KEY) + { + Assert(IndexTupleHasNulls(itup)); + GinSetNullCategory(itup, ginstate, category); + } + return itup; +} + +/* + * Read item pointers from leaf entry tuple. + * + * Returns a palloc'd array of ItemPointers. The number of items is returned + * in *nitems. 
+ */ +ItemPointer +ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup, + int *nitems) +{ + Pointer ptr = GinGetPosting(itup); + int nipd = GinGetNPosting(itup); + ItemPointer ipd; + int ndecoded; + + if (GinItupIsCompressed(itup)) + { + if (nipd > 0) + { + ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded); + if (nipd != ndecoded) + elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded", + nipd, ndecoded); + } + else + { + ipd = palloc(0); + } + } + else + { + ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd); + memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd); + } + *nitems = nipd; + return ipd; +} + +/* + * Form a non-leaf entry tuple by copying the key data from the given tuple, + * which can be either a leaf or non-leaf entry tuple. + * + * Any posting list in the source tuple is not copied. The specified child + * block number is inserted into t_tid. + */ +static IndexTuple +GinFormInteriorTuple(IndexTuple itup, Page page, BlockNumber childblk) +{ + IndexTuple nitup; + + if (GinPageIsLeaf(page) && !GinIsPostingTree(itup)) + { + /* Tuple contains a posting list, just copy stuff before that */ + uint32 origsize = GinGetPostingOffset(itup); + + origsize = MAXALIGN(origsize); + nitup = (IndexTuple) palloc(origsize); + memcpy(nitup, itup, origsize); + /* ... be sure to fix the size header field ... */ + nitup->t_info &= ~INDEX_SIZE_MASK; + nitup->t_info |= origsize; + } + else + { + /* Copy the tuple as-is */ + nitup = (IndexTuple) palloc(IndexTupleSize(itup)); + memcpy(nitup, itup, IndexTupleSize(itup)); + } + + /* Now insert the correct downlink */ + GinSetDownlink(nitup, childblk); + + return nitup; +} + +/* + * Entry tree is a "static", ie tuple never deletes from it, + * so we don't use right bound, we use rightmost key instead. 
+ */ +static IndexTuple +getRightMostTuple(Page page) +{ + OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + + return (IndexTuple) PageGetItem(page, PageGetItemId(page, maxoff)); +} + +static bool +entryIsMoveRight(GinBtree btree, Page page) +{ + IndexTuple itup; + OffsetNumber attnum; + Datum key; + GinNullCategory category; + + if (GinPageRightMost(page)) + return false; + + itup = getRightMostTuple(page); + attnum = gintuple_get_attrnum(btree->ginstate, itup); + key = gintuple_get_key(btree->ginstate, itup, &category); + + if (ginCompareAttEntries(btree->ginstate, + btree->entryAttnum, btree->entryKey, btree->entryCategory, + attnum, key, category) > 0) + return true; + + return false; +} + +/* + * Find correct tuple in non-leaf page. It supposed that + * page correctly chosen and searching value SHOULD be on page + */ +static BlockNumber +entryLocateEntry(GinBtree btree, GinBtreeStack *stack) +{ + OffsetNumber low, + high, + maxoff; + IndexTuple itup = NULL; + int result; + Page page = BufferGetPage(stack->buffer); + + Assert(!GinPageIsLeaf(page)); + Assert(!GinPageIsData(page)); + + if (btree->fullScan) + { + stack->off = FirstOffsetNumber; + stack->predictNumber *= PageGetMaxOffsetNumber(page); + return btree->getLeftMostChild(btree, page); + } + + low = FirstOffsetNumber; + maxoff = high = PageGetMaxOffsetNumber(page); + Assert(high >= low); + + high++; + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + + if (mid == maxoff && GinPageRightMost(page)) + { + /* Right infinity */ + result = -1; + } + else + { + OffsetNumber attnum; + Datum key; + GinNullCategory category; + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid)); + attnum = gintuple_get_attrnum(btree->ginstate, itup); + key = gintuple_get_key(btree->ginstate, itup, &category); + result = ginCompareAttEntries(btree->ginstate, + btree->entryAttnum, + btree->entryKey, + btree->entryCategory, + attnum, key, category); + } + + if (result == 0) + { + stack->off = 
mid; + Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO); + return GinGetDownlink(itup); + } + else if (result > 0) + low = mid + 1; + else + high = mid; + } + + Assert(high >= FirstOffsetNumber && high <= maxoff); + + stack->off = high; + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, high)); + Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO); + return GinGetDownlink(itup); +} + +/* + * Searches correct position for value on leaf page. + * Page should be correctly chosen. + * Returns true if value found on page. + */ +static bool +entryLocateLeafEntry(GinBtree btree, GinBtreeStack *stack) +{ + Page page = BufferGetPage(stack->buffer); + OffsetNumber low, + high; + + Assert(GinPageIsLeaf(page)); + Assert(!GinPageIsData(page)); + + if (btree->fullScan) + { + stack->off = FirstOffsetNumber; + return true; + } + + low = FirstOffsetNumber; + high = PageGetMaxOffsetNumber(page); + + if (high < low) + { + stack->off = FirstOffsetNumber; + return false; + } + + high++; + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + IndexTuple itup; + OffsetNumber attnum; + Datum key; + GinNullCategory category; + int result; + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid)); + attnum = gintuple_get_attrnum(btree->ginstate, itup); + key = gintuple_get_key(btree->ginstate, itup, &category); + result = ginCompareAttEntries(btree->ginstate, + btree->entryAttnum, + btree->entryKey, + btree->entryCategory, + attnum, key, category); + if (result == 0) + { + stack->off = mid; + return true; + } + else if (result > 0) + low = mid + 1; + else + high = mid; + } + + stack->off = high; + return false; +} + +static OffsetNumber +entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff) +{ + OffsetNumber i, + maxoff = PageGetMaxOffsetNumber(page); + IndexTuple itup; + + Assert(!GinPageIsLeaf(page)); + Assert(!GinPageIsData(page)); + + /* if page isn't changed, we returns storedOff */ + if (storedOff >= FirstOffsetNumber 
&& storedOff <= maxoff) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, storedOff)); + if (GinGetDownlink(itup) == blkno) + return storedOff; + + /* + * we hope, that needed pointer goes to right. It's true if there + * wasn't a deletion + */ + for (i = storedOff + 1; i <= maxoff; i++) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + if (GinGetDownlink(itup) == blkno) + return i; + } + maxoff = storedOff - 1; + } + + /* last chance */ + for (i = FirstOffsetNumber; i <= maxoff; i++) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + if (GinGetDownlink(itup) == blkno) + return i; + } + + return InvalidOffsetNumber; +} + +static BlockNumber +entryGetLeftMostPage(GinBtree btree, Page page) +{ + IndexTuple itup; + + Assert(!GinPageIsLeaf(page)); + Assert(!GinPageIsData(page)); + Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber)); + return GinGetDownlink(itup); +} + +static bool +entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off, + GinBtreeEntryInsertData *insertData) +{ + Size releasedsz = 0; + Size addedsz; + Page page = BufferGetPage(buf); + + Assert(insertData->entry); + Assert(!GinPageIsData(page)); + + if (insertData->isDelete) + { + IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); + + releasedsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); + } + + addedsz = MAXALIGN(IndexTupleSize(insertData->entry)) + sizeof(ItemIdData); + + if (PageGetFreeSpace(page) + releasedsz >= addedsz) + return true; + + return false; +} + +/* + * Delete tuple on leaf page if tuples existed and we + * should update it, update old child blkno to new right page + * if child split occurred + */ +static void +entryPreparePage(GinBtree btree, Page page, OffsetNumber off, + GinBtreeEntryInsertData *insertData, BlockNumber updateblkno) +{ + Assert(insertData->entry); + Assert(!GinPageIsData(page)); + 
+ if (insertData->isDelete) + { + Assert(GinPageIsLeaf(page)); + PageIndexTupleDelete(page, off); + } + + if (!GinPageIsLeaf(page) && updateblkno != InvalidBlockNumber) + { + IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); + + GinSetDownlink(itup, updateblkno); + } +} + +/* + * Prepare to insert data on an entry page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. + * + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. + * + * In neither case should the given page buffer be modified here. + * + * Note: on insertion to an internal node, in addition to inserting the given + * item, the downlink of the existing item at stack->off will be updated to + * point to updateblkno. + */ +static GinPlaceToPageRC +entryBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertPayload, BlockNumber updateblkno, + void **ptp_workspace, + Page *newlpage, Page *newrpage) +{ + GinBtreeEntryInsertData *insertData = insertPayload; + OffsetNumber off = stack->off; + + /* If it doesn't fit, deal with split case */ + if (!entryIsEnoughSpace(btree, buf, off, insertData)) + { + entrySplitPage(btree, buf, stack, insertData, updateblkno, + newlpage, newrpage); + return GPTP_SPLIT; + } + + /* Else, we're ready to proceed with insertion */ + return GPTP_INSERT; +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section, and XLOG record creation (if + * needed) is already started. The target buffer is registered in slot 0. 
+ */ +static void +entryExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertPayload, BlockNumber updateblkno, + void *ptp_workspace) +{ + GinBtreeEntryInsertData *insertData = insertPayload; + Page page = BufferGetPage(buf); + OffsetNumber off = stack->off; + OffsetNumber placed; + + entryPreparePage(btree, page, off, insertData, updateblkno); + + placed = PageAddItem(page, + (Item) insertData->entry, + IndexTupleSize(insertData->entry), + off, false, false); + if (placed != off) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(btree->index)); + + if (RelationNeedsWAL(btree->index) && !btree->isBuild) + { + /* + * This must be static, because it has to survive until XLogInsert, + * and we can't palloc here. Ugly, but the XLogInsert infrastructure + * isn't reentrant anyway. + */ + static ginxlogInsertEntry data; + + data.isDelete = insertData->isDelete; + data.offset = off; + + XLogRegisterBufData(0, (char *) &data, + offsetof(ginxlogInsertEntry, tuple)); + XLogRegisterBufData(0, (char *) insertData->entry, + IndexTupleSize(insertData->entry)); + } +} + +/* + * Split entry page and insert new data. + * + * Returns new temp pages to *newlpage and *newrpage. + * The original buffer is left untouched. 
+ */ +static void +entrySplitPage(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + GinBtreeEntryInsertData *insertData, + BlockNumber updateblkno, + Page *newlpage, Page *newrpage) +{ + OffsetNumber off = stack->off; + OffsetNumber i, + maxoff, + separator = InvalidOffsetNumber; + Size totalsize = 0; + Size lsize = 0, + size; + char *ptr; + IndexTuple itup; + Page page; + Page lpage = PageGetTempPageCopy(BufferGetPage(origbuf)); + Page rpage = PageGetTempPageCopy(BufferGetPage(origbuf)); + Size pageSize = PageGetPageSize(lpage); + PGAlignedBlock tupstore[2]; /* could need 2 pages' worth of tuples */ + + entryPreparePage(btree, lpage, off, insertData, updateblkno); + + /* + * First, append all the existing tuples and the new tuple we're inserting + * one after another in a temporary workspace. + */ + maxoff = PageGetMaxOffsetNumber(lpage); + ptr = tupstore[0].data; + for (i = FirstOffsetNumber; i <= maxoff; i++) + { + if (i == off) + { + size = MAXALIGN(IndexTupleSize(insertData->entry)); + memcpy(ptr, insertData->entry, size); + ptr += size; + totalsize += size + sizeof(ItemIdData); + } + + itup = (IndexTuple) PageGetItem(lpage, PageGetItemId(lpage, i)); + size = MAXALIGN(IndexTupleSize(itup)); + memcpy(ptr, itup, size); + ptr += size; + totalsize += size + sizeof(ItemIdData); + } + + if (off == maxoff + 1) + { + size = MAXALIGN(IndexTupleSize(insertData->entry)); + memcpy(ptr, insertData->entry, size); + ptr += size; + totalsize += size + sizeof(ItemIdData); + } + + /* + * Initialize the left and right pages, and copy all the tuples back to + * them. + */ + GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize); + GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize); + + ptr = tupstore[0].data; + maxoff++; + lsize = 0; + + page = lpage; + for (i = FirstOffsetNumber; i <= maxoff; i++) + { + itup = (IndexTuple) ptr; + + /* + * Decide where to split. We try to equalize the pages' total data + * size, not number of tuples. 
+ */ + if (lsize > totalsize / 2) + { + if (separator == InvalidOffsetNumber) + separator = i - 1; + page = rpage; + } + else + { + lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); + } + + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(btree->index)); + ptr += MAXALIGN(IndexTupleSize(itup)); + } + + /* return temp pages to caller */ + *newlpage = lpage; + *newrpage = rpage; +} + +/* + * Construct insertion payload for inserting the downlink for given buffer. + */ +static void * +entryPrepareDownlink(GinBtree btree, Buffer lbuf) +{ + GinBtreeEntryInsertData *insertData; + Page lpage = BufferGetPage(lbuf); + BlockNumber lblkno = BufferGetBlockNumber(lbuf); + IndexTuple itup; + + itup = getRightMostTuple(lpage); + + insertData = palloc(sizeof(GinBtreeEntryInsertData)); + insertData->entry = GinFormInteriorTuple(itup, lpage, lblkno); + insertData->isDelete = false; + + return insertData; +} + +/* + * Fills new root by rightest values from child. 
+ * Also called from ginxlog, should not use btree + */ +void +ginEntryFillRoot(GinBtree btree, Page root, + BlockNumber lblkno, Page lpage, + BlockNumber rblkno, Page rpage) +{ + IndexTuple itup; + + itup = GinFormInteriorTuple(getRightMostTuple(lpage), lpage, lblkno); + if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index root page"); + pfree(itup); + + itup = GinFormInteriorTuple(getRightMostTuple(rpage), rpage, rblkno); + if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index root page"); + pfree(itup); +} + +/* + * Set up GinBtree for entry page access + * + * Note: during WAL recovery, there may be no valid data in ginstate + * other than a faked-up Relation pointer; the key datum is bogus too. + */ +void +ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum, + Datum key, GinNullCategory category, + GinState *ginstate) +{ + memset(btree, 0, sizeof(GinBtreeData)); + + btree->index = ginstate->index; + btree->rootBlkno = GIN_ROOT_BLKNO; + btree->ginstate = ginstate; + + btree->findChildPage = entryLocateEntry; + btree->getLeftMostChild = entryGetLeftMostPage; + btree->isMoveRight = entryIsMoveRight; + btree->findItem = entryLocateLeafEntry; + btree->findChildPtr = entryFindChildPtr; + btree->beginPlaceToPage = entryBeginPlaceToPage; + btree->execPlaceToPage = entryExecPlaceToPage; + btree->fillRoot = ginEntryFillRoot; + btree->prepareDownlink = entryPrepareDownlink; + + btree->isData = false; + btree->fullScan = false; + btree->isBuild = false; + + btree->entryAttnum = attnum; + btree->entryKey = key; + btree->entryCategory = category; +} diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c new file mode 100644 index 0000000..e0d9940 --- /dev/null +++ b/src/backend/access/gin/ginfast.c @@ -0,0 +1,1068 @@ 
/*-------------------------------------------------------------------------
 *
 * ginfast.c
 *    Fast insert routines for the Postgres inverted index access method.
 *    Pending entries are stored in linear list of pages.  Later on
 *    (typically during VACUUM), ginInsertCleanup() will be invoked to
 *    transfer pending entries into the regular index structure.  This
 *    wins because bulk insertion is much more efficient than retail.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *          src/backend/access/gin/ginfast.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/gin_private.h"
#include "access/ginxlog.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "catalog/pg_am.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "port/pg_bitutils.h"
#include "postmaster/autovacuum.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "utils/rel.h"

/* GUC parameter */
int         gin_pending_list_limit = 0;

/*
 * Bytes of a block left after the (aligned) page header and the (aligned)
 * GIN opaque area; used below to estimate the pending list's size from its
 * page count.
 */
#define GIN_PAGE_FREESIZE \
    ( BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(GinPageOpaqueData)) )

/*
 * Growable pair of parallel arrays holding key datums and their null
 * categories; entry i of 'keys' belongs with entry i of 'categories'.
 */
typedef struct KeyArray
{
    Datum      *keys;           /* expansible array */
    GinNullCategory *categories;    /* another expansible array */
    int32       nvalues;        /* current number of valid entries */
    int32       maxvalues;      /* allocated size of arrays */
} KeyArray;
+ */ +static int32 +writeListPage(Relation index, Buffer buffer, + IndexTuple *tuples, int32 ntuples, BlockNumber rightlink) +{ + Page page = BufferGetPage(buffer); + int32 i, + freesize, + size = 0; + OffsetNumber l, + off; + PGAlignedBlock workspace; + char *ptr; + + START_CRIT_SECTION(); + + GinInitBuffer(buffer, GIN_LIST); + + off = FirstOffsetNumber; + ptr = workspace.data; + + for (i = 0; i < ntuples; i++) + { + int this_size = IndexTupleSize(tuples[i]); + + memcpy(ptr, tuples[i], this_size); + ptr += this_size; + size += this_size; + + l = PageAddItem(page, (Item) tuples[i], this_size, off, false, false); + + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(index)); + + off++; + } + + Assert(size <= BLCKSZ); /* else we overran workspace */ + + GinPageGetOpaque(page)->rightlink = rightlink; + + /* + * tail page may contain only whole row(s) or final part of row placed on + * previous pages (a "row" here meaning all the index tuples generated for + * one heap tuple) + */ + if (rightlink == InvalidBlockNumber) + { + GinPageSetFullRow(page); + GinPageGetOpaque(page)->maxoff = 1; + } + else + { + GinPageGetOpaque(page)->maxoff = 0; + } + + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(index)) + { + ginxlogInsertListPage data; + XLogRecPtr recptr; + + data.rightlink = rightlink; + data.ntuples = ntuples; + + XLogBeginInsert(); + XLogRegisterData((char *) &data, sizeof(ginxlogInsertListPage)); + + XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT); + XLogRegisterBufData(0, workspace.data, size); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE); + PageSetLSN(page, recptr); + } + + /* get free space before releasing buffer */ + freesize = PageGetExactFreeSpace(page); + + UnlockReleaseBuffer(buffer); + + END_CRIT_SECTION(); + + return freesize; +} + +static void +makeSublist(Relation index, IndexTuple *tuples, int32 ntuples, + GinMetaPageData *res) +{ + Buffer curBuffer = InvalidBuffer; + 
Buffer prevBuffer = InvalidBuffer; + int i, + size = 0, + tupsize; + int startTuple = 0; + + Assert(ntuples > 0); + + /* + * Split tuples into pages + */ + for (i = 0; i < ntuples; i++) + { + if (curBuffer == InvalidBuffer) + { + curBuffer = GinNewBuffer(index); + + if (prevBuffer != InvalidBuffer) + { + res->nPendingPages++; + writeListPage(index, prevBuffer, + tuples + startTuple, + i - startTuple, + BufferGetBlockNumber(curBuffer)); + } + else + { + res->head = BufferGetBlockNumber(curBuffer); + } + + prevBuffer = curBuffer; + startTuple = i; + size = 0; + } + + tupsize = MAXALIGN(IndexTupleSize(tuples[i])) + sizeof(ItemIdData); + + if (size + tupsize > GinListPageSize) + { + /* won't fit, force a new page and reprocess */ + i--; + curBuffer = InvalidBuffer; + } + else + { + size += tupsize; + } + } + + /* + * Write last page + */ + res->tail = BufferGetBlockNumber(curBuffer); + res->tailFreeSize = writeListPage(index, curBuffer, + tuples + startTuple, + ntuples - startTuple, + InvalidBlockNumber); + res->nPendingPages++; + /* that was only one heap tuple */ + res->nPendingHeapTuples = 1; +} + +/* + * Write the index tuples contained in *collector into the index's + * pending list. 
+ * + * Function guarantees that all these tuples will be inserted consecutively, + * preserving order + */ +void +ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) +{ + Relation index = ginstate->index; + Buffer metabuffer; + Page metapage; + GinMetaPageData *metadata = NULL; + Buffer buffer = InvalidBuffer; + Page page = NULL; + ginxlogUpdateMeta data; + bool separateList = false; + bool needCleanup = false; + int cleanupSize; + bool needWal; + + if (collector->ntuples == 0) + return; + + needWal = RelationNeedsWAL(index); + + data.node = index->rd_node; + data.ntuples = 0; + data.newRightlink = data.prevTail = InvalidBlockNumber; + + metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); + metapage = BufferGetPage(metabuffer); + + /* + * An insertion to the pending list could logically belong anywhere in the + * tree, so it conflicts with all serializable scans. All scans acquire a + * predicate lock on the metabuffer to represent that. + */ + CheckForSerializableConflictIn(index, NULL, GIN_METAPAGE_BLKNO); + + if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize) + { + /* + * Total size is greater than one page => make sublist + */ + separateList = true; + } + else + { + LockBuffer(metabuffer, GIN_EXCLUSIVE); + metadata = GinPageGetMeta(metapage); + + if (metadata->head == InvalidBlockNumber || + collector->sumsize + collector->ntuples * sizeof(ItemIdData) > metadata->tailFreeSize) + { + /* + * Pending list is empty or total size is greater than freespace + * on tail page => make sublist + * + * We unlock metabuffer to keep high concurrency + */ + separateList = true; + LockBuffer(metabuffer, GIN_UNLOCK); + } + } + + if (separateList) + { + /* + * We should make sublist separately and append it to the tail + */ + GinMetaPageData sublist; + + memset(&sublist, 0, sizeof(GinMetaPageData)); + makeSublist(index, collector->tuples, collector->ntuples, &sublist); + + if (needWal) + XLogBeginInsert(); + + /* + * 
metapage was unlocked, see above + */ + LockBuffer(metabuffer, GIN_EXCLUSIVE); + metadata = GinPageGetMeta(metapage); + + if (metadata->head == InvalidBlockNumber) + { + /* + * Main list is empty, so just insert sublist as main list + */ + START_CRIT_SECTION(); + + metadata->head = sublist.head; + metadata->tail = sublist.tail; + metadata->tailFreeSize = sublist.tailFreeSize; + + metadata->nPendingPages = sublist.nPendingPages; + metadata->nPendingHeapTuples = sublist.nPendingHeapTuples; + } + else + { + /* + * Merge lists + */ + data.prevTail = metadata->tail; + data.newRightlink = sublist.head; + + buffer = ReadBuffer(index, metadata->tail); + LockBuffer(buffer, GIN_EXCLUSIVE); + page = BufferGetPage(buffer); + + Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber); + + START_CRIT_SECTION(); + + GinPageGetOpaque(page)->rightlink = sublist.head; + + MarkBufferDirty(buffer); + + metadata->tail = sublist.tail; + metadata->tailFreeSize = sublist.tailFreeSize; + + metadata->nPendingPages += sublist.nPendingPages; + metadata->nPendingHeapTuples += sublist.nPendingHeapTuples; + + if (needWal) + XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); + } + } + else + { + /* + * Insert into tail page. Metapage is already locked + */ + OffsetNumber l, + off; + int i, + tupsize; + char *ptr; + char *collectordata; + + buffer = ReadBuffer(index, metadata->tail); + LockBuffer(buffer, GIN_EXCLUSIVE); + page = BufferGetPage(buffer); + + off = (PageIsEmpty(page)) ? 
FirstOffsetNumber : + OffsetNumberNext(PageGetMaxOffsetNumber(page)); + + collectordata = ptr = (char *) palloc(collector->sumsize); + + data.ntuples = collector->ntuples; + + if (needWal) + XLogBeginInsert(); + + START_CRIT_SECTION(); + + /* + * Increase counter of heap tuples + */ + Assert(GinPageGetOpaque(page)->maxoff <= metadata->nPendingHeapTuples); + GinPageGetOpaque(page)->maxoff++; + metadata->nPendingHeapTuples++; + + for (i = 0; i < collector->ntuples; i++) + { + tupsize = IndexTupleSize(collector->tuples[i]); + l = PageAddItem(page, (Item) collector->tuples[i], tupsize, off, false, false); + + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(index)); + + memcpy(ptr, collector->tuples[i], tupsize); + ptr += tupsize; + + off++; + } + + Assert((ptr - collectordata) <= collector->sumsize); + if (needWal) + { + XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); + XLogRegisterBufData(1, collectordata, collector->sumsize); + } + + metadata->tailFreeSize = PageGetExactFreeSpace(page); + + MarkBufferDirty(buffer); + } + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c compresses + * the page. (We must do this here because pre-v11 versions of PG did not + * set the metapage's pd_lower correctly, so a pg_upgraded index might + * contain the wrong value.) 
+ */ + ((PageHeader) metapage)->pd_lower = + ((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage; + + /* + * Write metabuffer, make xlog entry + */ + MarkBufferDirty(metabuffer); + + if (needWal) + { + XLogRecPtr recptr; + + memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); + + XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT | REGBUF_STANDARD); + XLogRegisterData((char *) &data, sizeof(ginxlogUpdateMeta)); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE); + PageSetLSN(metapage, recptr); + + if (buffer != InvalidBuffer) + { + PageSetLSN(page, recptr); + } + } + + if (buffer != InvalidBuffer) + UnlockReleaseBuffer(buffer); + + /* + * Force pending list cleanup when it becomes too long. And, + * ginInsertCleanup could take significant amount of time, so we prefer to + * call it when it can do all the work in a single collection cycle. In + * non-vacuum mode, it shouldn't require maintenance_work_mem, so fire it + * while pending list is still small enough to fit into + * gin_pending_list_limit. + * + * ginInsertCleanup() should not be called inside our CRIT_SECTION. + */ + cleanupSize = GinGetPendingListCleanupSize(index); + if (metadata->nPendingPages * GIN_PAGE_FREESIZE > cleanupSize * 1024L) + needCleanup = true; + + UnlockReleaseBuffer(metabuffer); + + END_CRIT_SECTION(); + + /* + * Since it could contend with concurrent cleanup process we cleanup + * pending list not forcibly. + */ + if (needCleanup) + ginInsertCleanup(ginstate, false, true, false, NULL); +} + +/* + * Create temporary index tuples for a single indexable item (one index column + * for the heap tuple specified by ht_ctid), and append them to the array + * in *collector. They will subsequently be written out using + * ginHeapTupleFastInsert. Note that to guarantee consistent state, all + * temp tuples for a given heap tuple must be written in one call to + * ginHeapTupleFastInsert. 
+ */ +void +ginHeapTupleFastCollect(GinState *ginstate, + GinTupleCollector *collector, + OffsetNumber attnum, Datum value, bool isNull, + ItemPointer ht_ctid) +{ + Datum *entries; + GinNullCategory *categories; + int32 i, + nentries; + + /* + * Extract the key values that need to be inserted in the index + */ + entries = ginExtractEntries(ginstate, attnum, value, isNull, + &nentries, &categories); + + /* + * Protect against integer overflow in allocation calculations + */ + if (nentries < 0 || + collector->ntuples + nentries > MaxAllocSize / sizeof(IndexTuple)) + elog(ERROR, "too many entries for GIN index"); + + /* + * Allocate/reallocate memory for storing collected tuples + */ + if (collector->tuples == NULL) + { + /* + * Determine the number of elements to allocate in the tuples array + * initially. Make it a power of 2 to avoid wasting memory when + * resizing (since palloc likes powers of 2). + */ + collector->lentuples = pg_nextpower2_32(Max(16, nentries)); + collector->tuples = (IndexTuple *) palloc(sizeof(IndexTuple) * collector->lentuples); + } + else if (collector->lentuples < collector->ntuples + nentries) + { + /* + * Advance lentuples to the next suitable power of 2. This won't + * overflow, though we could get to a value that exceeds + * MaxAllocSize/sizeof(IndexTuple), causing an error in repalloc. + */ + collector->lentuples = pg_nextpower2_32(collector->ntuples + nentries); + collector->tuples = (IndexTuple *) repalloc(collector->tuples, + sizeof(IndexTuple) * collector->lentuples); + } + + /* + * Build an index tuple for each key value, and add to array. In pending + * tuples we just stick the heap TID into t_tid. 
+ */ + for (i = 0; i < nentries; i++) + { + IndexTuple itup; + + itup = GinFormTuple(ginstate, attnum, entries[i], categories[i], + NULL, 0, 0, true); + itup->t_tid = *ht_ctid; + collector->tuples[collector->ntuples++] = itup; + collector->sumsize += IndexTupleSize(itup); + } +} + +/* + * Deletes pending list pages up to (not including) newHead page. + * If newHead == InvalidBlockNumber then function drops the whole list. + * + * metapage is pinned and exclusive-locked throughout this function. + */ +static void +shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, + bool fill_fsm, IndexBulkDeleteResult *stats) +{ + Page metapage; + GinMetaPageData *metadata; + BlockNumber blknoToDelete; + + metapage = BufferGetPage(metabuffer); + metadata = GinPageGetMeta(metapage); + blknoToDelete = metadata->head; + + do + { + Page page; + int i; + int64 nDeletedHeapTuples = 0; + ginxlogDeleteListPages data; + Buffer buffers[GIN_NDELETE_AT_ONCE]; + BlockNumber freespace[GIN_NDELETE_AT_ONCE]; + + data.ndeleted = 0; + while (data.ndeleted < GIN_NDELETE_AT_ONCE && blknoToDelete != newHead) + { + freespace[data.ndeleted] = blknoToDelete; + buffers[data.ndeleted] = ReadBuffer(index, blknoToDelete); + LockBuffer(buffers[data.ndeleted], GIN_EXCLUSIVE); + page = BufferGetPage(buffers[data.ndeleted]); + + data.ndeleted++; + + Assert(!GinPageIsDeleted(page)); + + nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff; + blknoToDelete = GinPageGetOpaque(page)->rightlink; + } + + if (stats) + stats->pages_deleted += data.ndeleted; + + /* + * This operation touches an unusually large number of pages, so + * prepare the XLogInsert machinery for that before entering the + * critical section. 
+ */ + if (RelationNeedsWAL(index)) + XLogEnsureRecordSpace(data.ndeleted, 0); + + START_CRIT_SECTION(); + + metadata->head = blknoToDelete; + + Assert(metadata->nPendingPages >= data.ndeleted); + metadata->nPendingPages -= data.ndeleted; + Assert(metadata->nPendingHeapTuples >= nDeletedHeapTuples); + metadata->nPendingHeapTuples -= nDeletedHeapTuples; + + if (blknoToDelete == InvalidBlockNumber) + { + metadata->tail = InvalidBlockNumber; + metadata->tailFreeSize = 0; + metadata->nPendingPages = 0; + metadata->nPendingHeapTuples = 0; + } + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c + * compresses the page. (We must do this here because pre-v11 + * versions of PG did not set the metapage's pd_lower correctly, so a + * pg_upgraded index might contain the wrong value.) + */ + ((PageHeader) metapage)->pd_lower = + ((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage; + + MarkBufferDirty(metabuffer); + + for (i = 0; i < data.ndeleted; i++) + { + page = BufferGetPage(buffers[i]); + GinPageGetOpaque(page)->flags = GIN_DELETED; + MarkBufferDirty(buffers[i]); + } + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, metabuffer, + REGBUF_WILL_INIT | REGBUF_STANDARD); + for (i = 0; i < data.ndeleted; i++) + XLogRegisterBuffer(i + 1, buffers[i], REGBUF_WILL_INIT); + + memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); + + XLogRegisterData((char *) &data, + sizeof(ginxlogDeleteListPages)); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE); + PageSetLSN(metapage, recptr); + + for (i = 0; i < data.ndeleted; i++) + { + page = BufferGetPage(buffers[i]); + PageSetLSN(page, recptr); + } + } + + for (i = 0; i < data.ndeleted; i++) + UnlockReleaseBuffer(buffers[i]); + + END_CRIT_SECTION(); + + for (i = 0; fill_fsm && i < data.ndeleted; i++) + RecordFreeIndexPage(index, freespace[i]); + + } while (blknoToDelete 
!= newHead); +} + +/* Initialize empty KeyArray */ +static void +initKeyArray(KeyArray *keys, int32 maxvalues) +{ + keys->keys = (Datum *) palloc(sizeof(Datum) * maxvalues); + keys->categories = (GinNullCategory *) + palloc(sizeof(GinNullCategory) * maxvalues); + keys->nvalues = 0; + keys->maxvalues = maxvalues; +} + +/* Add datum to KeyArray, resizing if needed */ +static void +addDatum(KeyArray *keys, Datum datum, GinNullCategory category) +{ + if (keys->nvalues >= keys->maxvalues) + { + keys->maxvalues *= 2; + keys->keys = (Datum *) + repalloc(keys->keys, sizeof(Datum) * keys->maxvalues); + keys->categories = (GinNullCategory *) + repalloc(keys->categories, sizeof(GinNullCategory) * keys->maxvalues); + } + + keys->keys[keys->nvalues] = datum; + keys->categories[keys->nvalues] = category; + keys->nvalues++; +} + +/* + * Collect data from a pending-list page in preparation for insertion into + * the main index. + * + * Go through all tuples >= startoff on page and collect values in accum + * + * Note that ka is just workspace --- it does not carry any state across + * calls. 
+ */ +static void +processPendingPage(BuildAccumulator *accum, KeyArray *ka, + Page page, OffsetNumber startoff) +{ + ItemPointerData heapptr; + OffsetNumber i, + maxoff; + OffsetNumber attrnum; + + /* reset *ka to empty */ + ka->nvalues = 0; + + maxoff = PageGetMaxOffsetNumber(page); + Assert(maxoff >= FirstOffsetNumber); + ItemPointerSetInvalid(&heapptr); + attrnum = 0; + + for (i = startoff; i <= maxoff; i = OffsetNumberNext(i)) + { + IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + OffsetNumber curattnum; + Datum curkey; + GinNullCategory curcategory; + + /* Check for change of heap TID or attnum */ + curattnum = gintuple_get_attrnum(accum->ginstate, itup); + + if (!ItemPointerIsValid(&heapptr)) + { + heapptr = itup->t_tid; + attrnum = curattnum; + } + else if (!(ItemPointerEquals(&heapptr, &itup->t_tid) && + curattnum == attrnum)) + { + /* + * ginInsertBAEntries can insert several datums per call, but only + * for one heap tuple and one column. So call it at a boundary, + * and reset ka. + */ + ginInsertBAEntries(accum, &heapptr, attrnum, + ka->keys, ka->categories, ka->nvalues); + ka->nvalues = 0; + heapptr = itup->t_tid; + attrnum = curattnum; + } + + /* Add key to KeyArray */ + curkey = gintuple_get_key(accum->ginstate, itup, &curcategory); + addDatum(ka, curkey, curcategory); + } + + /* Dump out all remaining keys */ + ginInsertBAEntries(accum, &heapptr, attrnum, + ka->keys, ka->categories, ka->nvalues); +} + +/* + * Move tuples from pending pages into regular GIN structure. + * + * On first glance it looks completely not crash-safe. But if we crash + * after posting entries to the main index and before removing them from the + * pending list, it's okay because when we redo the posting later on, nothing + * bad will happen. + * + * fill_fsm indicates that ginInsertCleanup should add deleted pages + * to FSM otherwise caller is responsible to put deleted pages into + * FSM. 
+ * + * If stats isn't null, we count deleted pending pages into the counts. + */ +void +ginInsertCleanup(GinState *ginstate, bool full_clean, + bool fill_fsm, bool forceCleanup, + IndexBulkDeleteResult *stats) +{ + Relation index = ginstate->index; + Buffer metabuffer, + buffer; + Page metapage, + page; + GinMetaPageData *metadata; + MemoryContext opCtx, + oldCtx; + BuildAccumulator accum; + KeyArray datums; + BlockNumber blkno, + blknoFinish; + bool cleanupFinish = false; + bool fsm_vac = false; + Size workMemory; + + /* + * We would like to prevent concurrent cleanup process. For that we will + * lock metapage in exclusive mode using LockPage() call. Nobody other + * will use that lock for metapage, so we keep possibility of concurrent + * insertion into pending list + */ + + if (forceCleanup) + { + /* + * We are called from [auto]vacuum/analyze or gin_clean_pending_list() + * and we would like to wait concurrent cleanup to finish. + */ + LockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock); + workMemory = + (IsAutoVacuumWorkerProcess() && autovacuum_work_mem != -1) ? + autovacuum_work_mem : maintenance_work_mem; + } + else + { + /* + * We are called from regular insert and if we see concurrent cleanup + * just exit in hope that concurrent process will clean up pending + * list. + */ + if (!ConditionalLockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock)) + return; + workMemory = work_mem; + } + + metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); + LockBuffer(metabuffer, GIN_SHARE); + metapage = BufferGetPage(metabuffer); + metadata = GinPageGetMeta(metapage); + + if (metadata->head == InvalidBlockNumber) + { + /* Nothing to do */ + UnlockReleaseBuffer(metabuffer); + UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock); + return; + } + + /* + * Remember a tail page to prevent infinite cleanup if other backends add + * new tuples faster than we can cleanup. 
+ */ + blknoFinish = metadata->tail; + + /* + * Read and lock head of pending list + */ + blkno = metadata->head; + buffer = ReadBuffer(index, blkno); + LockBuffer(buffer, GIN_SHARE); + page = BufferGetPage(buffer); + + LockBuffer(metabuffer, GIN_UNLOCK); + + /* + * Initialize. All temporary space will be in opCtx + */ + opCtx = AllocSetContextCreate(CurrentMemoryContext, + "GIN insert cleanup temporary context", + ALLOCSET_DEFAULT_SIZES); + + oldCtx = MemoryContextSwitchTo(opCtx); + + initKeyArray(&datums, 128); + ginInitBA(&accum); + accum.ginstate = ginstate; + + /* + * At the top of this loop, we have pin and lock on the current page of + * the pending list. However, we'll release that before exiting the loop. + * Note we also have pin but not lock on the metapage. + */ + for (;;) + { + Assert(!GinPageIsDeleted(page)); + + /* + * Are we walk through the page which as we remember was a tail when + * we start our cleanup? But if caller asks us to clean up whole + * pending list then ignore old tail, we will work until list becomes + * empty. + */ + if (blkno == blknoFinish && full_clean == false) + cleanupFinish = true; + + /* + * read page's datums into accum + */ + processPendingPage(&accum, &datums, page, FirstOffsetNumber); + + vacuum_delay_point(); + + /* + * Is it time to flush memory to disk? Flush if we are at the end of + * the pending list, or if we have a full row and memory is getting + * full. + */ + if (GinPageGetOpaque(page)->rightlink == InvalidBlockNumber || + (GinPageHasFullRow(page) && + (accum.allocatedMemory >= workMemory * 1024L))) + { + ItemPointerData *list; + uint32 nlist; + Datum key; + GinNullCategory category; + OffsetNumber maxoff, + attnum; + + /* + * Unlock current page to increase performance. Changes of page + * will be checked later by comparing maxoff after completion of + * memory flush. 
+ */ + maxoff = PageGetMaxOffsetNumber(page); + LockBuffer(buffer, GIN_UNLOCK); + + /* + * Moving collected data into regular structure can take + * significant amount of time - so, run it without locking pending + * list. + */ + ginBeginBAScan(&accum); + while ((list = ginGetBAEntry(&accum, + &attnum, &key, &category, &nlist)) != NULL) + { + ginEntryInsert(ginstate, attnum, key, category, + list, nlist, NULL); + vacuum_delay_point(); + } + + /* + * Lock the whole list to remove pages + */ + LockBuffer(metabuffer, GIN_EXCLUSIVE); + LockBuffer(buffer, GIN_SHARE); + + Assert(!GinPageIsDeleted(page)); + + /* + * While we left the page unlocked, more stuff might have gotten + * added to it. If so, process those entries immediately. There + * shouldn't be very many, so we don't worry about the fact that + * we're doing this with exclusive lock. Insertion algorithm + * guarantees that inserted row(s) will not continue on next page. + * NOTE: intentionally no vacuum_delay_point in this loop. + */ + if (PageGetMaxOffsetNumber(page) != maxoff) + { + ginInitBA(&accum); + processPendingPage(&accum, &datums, page, maxoff + 1); + + ginBeginBAScan(&accum); + while ((list = ginGetBAEntry(&accum, + &attnum, &key, &category, &nlist)) != NULL) + ginEntryInsert(ginstate, attnum, key, category, + list, nlist, NULL); + } + + /* + * Remember next page - it will become the new list head + */ + blkno = GinPageGetOpaque(page)->rightlink; + UnlockReleaseBuffer(buffer); /* shiftList will do exclusive + * locking */ + + /* + * remove read pages from pending list, at this point all content + * of read pages is in regular structure + */ + shiftList(index, metabuffer, blkno, fill_fsm, stats); + + /* At this point, some pending pages have been freed up */ + fsm_vac = true; + + Assert(blkno == metadata->head); + LockBuffer(metabuffer, GIN_UNLOCK); + + /* + * if we removed the whole pending list or we cleanup tail (which + * we remembered on start our cleanup process) then just exit + */ + if 
(blkno == InvalidBlockNumber || cleanupFinish) + break; + + /* + * release memory used so far and reinit state + */ + MemoryContextReset(opCtx); + initKeyArray(&datums, datums.maxvalues); + ginInitBA(&accum); + } + else + { + blkno = GinPageGetOpaque(page)->rightlink; + UnlockReleaseBuffer(buffer); + } + + /* + * Read next page in pending list + */ + vacuum_delay_point(); + buffer = ReadBuffer(index, blkno); + LockBuffer(buffer, GIN_SHARE); + page = BufferGetPage(buffer); + } + + UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock); + ReleaseBuffer(metabuffer); + + /* + * As pending list pages can have a high churn rate, it is desirable to + * recycle them immediately to the FreeSpaceMap when ordinary backends + * clean the list. + */ + if (fsm_vac && fill_fsm) + IndexFreeSpaceMapVacuum(index); + + /* Clean up temporary space */ + MemoryContextSwitchTo(oldCtx); + MemoryContextDelete(opCtx); +} + +/* + * SQL-callable function to clean the insert pending list + */ +Datum +gin_clean_pending_list(PG_FUNCTION_ARGS) +{ + Oid indexoid = PG_GETARG_OID(0); + Relation indexRel = index_open(indexoid, RowExclusiveLock); + IndexBulkDeleteResult stats; + GinState ginstate; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("GIN pending list cannot be cleaned up during recovery."))); + + /* Must be a GIN index */ + if (indexRel->rd_rel->relkind != RELKIND_INDEX || + indexRel->rd_rel->relam != GIN_AM_OID) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not a GIN index", + RelationGetRelationName(indexRel)))); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. 
+ */ + if (RELATION_IS_OTHER_TEMP(indexRel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary indexes of other sessions"))); + + /* User must own the index (comparable to privileges needed for VACUUM) */ + if (!pg_class_ownercheck(indexoid, GetUserId())) + aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX, + RelationGetRelationName(indexRel)); + + memset(&stats, 0, sizeof(stats)); + initGinState(&ginstate, indexRel); + ginInsertCleanup(&ginstate, true, true, true, &stats); + + index_close(indexRel, RowExclusiveLock); + + PG_RETURN_INT64((int64) stats.pages_deleted); +} diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c new file mode 100644 index 0000000..03191e0 --- /dev/null +++ b/src/backend/access/gin/ginget.c @@ -0,0 +1,1970 @@ +/*------------------------------------------------------------------------- + * + * ginget.c + * fetch tuples from a GIN scan. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginget.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/relscan.h" +#include "miscadmin.h" +#include "storage/predicate.h" +#include "utils/datum.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* GUC parameter */ +int GinFuzzySearchLimit = 0; + +typedef struct pendingPosition +{ + Buffer pendingBuffer; + OffsetNumber firstOffset; + OffsetNumber lastOffset; + ItemPointerData item; + bool *hasMatchKey; +} pendingPosition; + + +/* + * Goes to the next page if current offset is outside of bounds + */ +static bool +moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot) +{ + Page page = BufferGetPage(stack->buffer); + + if (stack->off > PageGetMaxOffsetNumber(page)) + { + /* + * We scanned the 
whole page, so we should take right page + */ + if (GinPageRightMost(page)) + return false; /* no more pages */ + + stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE); + stack->blkno = BufferGetBlockNumber(stack->buffer); + stack->off = FirstOffsetNumber; + PredicateLockPage(btree->index, stack->blkno, snapshot); + } + + return true; +} + +/* + * Scan all pages of a posting tree and save all its heap ItemPointers + * in scanEntry->matchBitmap + */ +static void +scanPostingTree(Relation index, GinScanEntry scanEntry, + BlockNumber rootPostingTree, Snapshot snapshot) +{ + GinBtreeData btree; + GinBtreeStack *stack; + Buffer buffer; + Page page; + + /* Descend to the leftmost leaf page */ + stack = ginScanBeginPostingTree(&btree, index, rootPostingTree, snapshot); + buffer = stack->buffer; + + IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */ + + freeGinBtreeStack(stack); + + /* + * Loop iterates through all leaf pages of posting tree + */ + for (;;) + { + page = BufferGetPage(buffer); + if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0) + { + int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap); + + scanEntry->predictNumberResult += n; + } + + if (GinPageRightMost(page)) + break; /* no more pages */ + + buffer = ginStepRight(buffer, index, GIN_SHARE); + } + + UnlockReleaseBuffer(buffer); +} + +/* + * Collects TIDs into scanEntry->matchBitmap for all heap tuples that + * match the search entry. This supports three different match modes: + * + * 1. Partial-match support: scan from current point until the + * comparePartialFn says we're done. + * 2. SEARCH_MODE_ALL: scan from current point (which should be first + * key for the current attnum) until we hit null items or end of attnum + * 3. 
SEARCH_MODE_EVERYTHING: scan from current point (which should be first + * key for the current attnum) until we hit end of attnum + * + * Returns true if done, false if it's necessary to restart scan from scratch + */ +static bool +collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, + GinScanEntry scanEntry, Snapshot snapshot) +{ + OffsetNumber attnum; + Form_pg_attribute attr; + + /* Initialize empty bitmap result */ + scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL); + + /* Null query cannot partial-match anything */ + if (scanEntry->isPartialMatch && + scanEntry->queryCategory != GIN_CAT_NORM_KEY) + return true; + + /* Locate tupdesc entry for key column (for attbyval/attlen data) */ + attnum = scanEntry->attnum; + attr = TupleDescAttr(btree->ginstate->origTupdesc, attnum - 1); + + /* + * Predicate lock entry leaf page, following pages will be locked by + * moveRightIfItNeeded(). PredicateLockPage() identifies the page by + * block number, so translate the pinned buffer into its block number + * rather than passing the Buffer itself. + */ + PredicateLockPage(btree->index, + BufferGetBlockNumber(stack->buffer), + snapshot); + + for (;;) + { + Page page; + IndexTuple itup; + Datum idatum; + GinNullCategory icategory; + + /* + * stack->off points to the interested entry, buffer is already locked + */ + if (moveRightIfItNeeded(btree, stack, snapshot) == false) + return true; + + page = BufferGetPage(stack->buffer); + TestForOldSnapshot(snapshot, btree->index, page); + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off)); + + /* + * If tuple stores another attribute then stop scan + */ + if (gintuple_get_attrnum(btree->ginstate, itup) != attnum) + return true; + + /* Safe to fetch attribute value */ + idatum = gintuple_get_key(btree->ginstate, itup, &icategory); + + /* + * Check for appropriate scan stop conditions + */ + if (scanEntry->isPartialMatch) + { + int32 cmp; + + /* + * In partial match, stop scan at any null (including + * placeholders); partial matches never match nulls + */ + if (icategory != GIN_CAT_NORM_KEY) + return true; + + /*---------- + * Check of partial match.
+ * case cmp == 0 => match + * case cmp > 0 => not match and finish scan + * case cmp < 0 => not match and continue scan + *---------- + */ + cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1], + btree->ginstate->supportCollation[attnum - 1], + scanEntry->queryKey, + idatum, + UInt16GetDatum(scanEntry->strategy), + PointerGetDatum(scanEntry->extra_data))); + + if (cmp > 0) + return true; + else if (cmp < 0) + { + stack->off++; + continue; + } + } + else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL) + { + /* + * In ALL mode, we are not interested in null items, so we can + * stop if we get to a null-item placeholder (which will be the + * last entry for a given attnum). We do want to include NULL_KEY + * and EMPTY_ITEM entries, though. + */ + if (icategory == GIN_CAT_NULL_ITEM) + return true; + } + + /* + * OK, we want to return the TIDs listed in this entry. + */ + if (GinIsPostingTree(itup)) + { + BlockNumber rootPostingTree = GinGetPostingTree(itup); + + /* + * We should unlock current page (but not unpin) during tree scan + * to prevent deadlock with vacuum processes. + * + * We save current entry value (idatum) to be able to re-find our + * tuple after re-locking + */ + if (icategory == GIN_CAT_NORM_KEY) + idatum = datumCopy(idatum, attr->attbyval, attr->attlen); + + LockBuffer(stack->buffer, GIN_UNLOCK); + + /* + * Acquire predicate lock on the posting tree. We already hold a + * lock on the entry page, but insertions to the posting tree + * don't check for conflicts on that level. + */ + PredicateLockPage(btree->index, rootPostingTree, snapshot); + + /* Collect all the TIDs in this entry's posting tree */ + scanPostingTree(btree->index, scanEntry, rootPostingTree, + snapshot); + + /* + * We lock again the entry page and while it was unlocked insert + * might have occurred, so we need to re-find our position. 
+ */ + LockBuffer(stack->buffer, GIN_SHARE); + page = BufferGetPage(stack->buffer); + if (!GinPageIsLeaf(page)) + { + /* + * The root page became a non-leaf page while we had it + * unlocked. Report that the scan must start over; this doesn't + * occur often - the root can become a non-leaf only once in the + * life of an index. + */ + return false; + } + + /* Search forward to re-find idatum */ + for (;;) + { + if (moveRightIfItNeeded(btree, stack, snapshot) == false) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to re-find tuple within index \"%s\"", + RelationGetRelationName(btree->index)))); + + page = BufferGetPage(stack->buffer); + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off)); + + if (gintuple_get_attrnum(btree->ginstate, itup) == attnum) + { + Datum newDatum; + GinNullCategory newCategory; + + newDatum = gintuple_get_key(btree->ginstate, itup, + &newCategory); + + if (ginCompareEntries(btree->ginstate, attnum, + newDatum, newCategory, + idatum, icategory) == 0) + break; /* Found! */ + } + + stack->off++; + } + + if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval) + pfree(DatumGetPointer(idatum)); + } + else + { + ItemPointer ipd; + int nipd; + + ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd); + tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false); + scanEntry->predictNumberResult += GinGetNPosting(itup); + pfree(ipd); + } + + /* + * Done with this entry, go to the next + */ + stack->off++; + } +} + +/* + * Start* functions setup beginning state of searches: finds correct buffer and pins it.
+ */ +static void +startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot) +{ + GinBtreeData btreeEntry; + GinBtreeStack *stackEntry; + Page page; + bool needUnlock; + +restartScanEntry: + entry->buffer = InvalidBuffer; + ItemPointerSetMin(&entry->curItem); + entry->offset = InvalidOffsetNumber; + if (entry->list) + pfree(entry->list); + entry->list = NULL; + entry->nlist = 0; + entry->matchBitmap = NULL; + entry->matchResult = NULL; + entry->reduceResult = false; + entry->predictNumberResult = 0; + + /* + * we should find entry, and begin scan of posting tree or just store + * posting list in memory + */ + ginPrepareEntryScan(&btreeEntry, entry->attnum, + entry->queryKey, entry->queryCategory, + ginstate); + stackEntry = ginFindLeafPage(&btreeEntry, true, false, snapshot); + page = BufferGetPage(stackEntry->buffer); + + /* ginFindLeafPage() will have already checked snapshot age. */ + needUnlock = true; + + entry->isFinished = true; + + if (entry->isPartialMatch || + entry->queryCategory == GIN_CAT_EMPTY_QUERY) + { + /* + * btreeEntry.findItem locates the first item >= given search key. + * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item + * because of the way the GIN_CAT_EMPTY_QUERY category code is + * assigned.) We scan forward from there and collect all TIDs needed + * for the entry type. + */ + btreeEntry.findItem(&btreeEntry, stackEntry); + if (collectMatchBitmap(&btreeEntry, stackEntry, entry, snapshot) + == false) + { + /* + * GIN tree was seriously restructured, so we will cleanup all + * found data and rescan. 
See comments near 'return false' in + * collectMatchBitmap() + */ + if (entry->matchBitmap) + { + if (entry->matchIterator) + tbm_end_iterate(entry->matchIterator); + entry->matchIterator = NULL; + tbm_free(entry->matchBitmap); + entry->matchBitmap = NULL; + } + LockBuffer(stackEntry->buffer, GIN_UNLOCK); + freeGinBtreeStack(stackEntry); + goto restartScanEntry; + } + + if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap)) + { + entry->matchIterator = tbm_begin_iterate(entry->matchBitmap); + entry->isFinished = false; + } + } + else if (btreeEntry.findItem(&btreeEntry, stackEntry)) + { + IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off)); + + if (GinIsPostingTree(itup)) + { + BlockNumber rootPostingTree = GinGetPostingTree(itup); + GinBtreeStack *stack; + Page page; + ItemPointerData minItem; + + /* + * This is an equality scan, so lock the root of the posting tree. + * It represents a lock on the exact key value, and covers all the + * items in the posting tree. + */ + PredicateLockPage(ginstate->index, rootPostingTree, snapshot); + + /* + * We should unlock entry page before touching posting tree to + * prevent deadlocks with vacuum processes. Because entry is never + * deleted from page and posting tree is never reduced to the + * posting list, we can unlock page after getting BlockNumber of + * root of posting tree. + */ + LockBuffer(stackEntry->buffer, GIN_UNLOCK); + needUnlock = false; + + stack = ginScanBeginPostingTree(&entry->btree, ginstate->index, + rootPostingTree, snapshot); + entry->buffer = stack->buffer; + + /* + * We keep buffer pinned because we need to prevent deletion of + * page during scan. See GIN's vacuum implementation. RefCount is + * increased to keep buffer pinned after freeGinBtreeStack() call. + */ + IncrBufferRefCount(entry->buffer); + + page = BufferGetPage(entry->buffer); + + /* + * Load the first page into memory. 
+ */ + ItemPointerSetMin(&minItem); + entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem); + + entry->predictNumberResult = stack->predictNumber * entry->nlist; + + LockBuffer(entry->buffer, GIN_UNLOCK); + freeGinBtreeStack(stack); + entry->isFinished = false; + } + else + { + /* + * Lock the entry leaf page. This is more coarse-grained than + * necessary, because it will conflict with any insertions that + * land on the same leaf page, not only the exact key we searched + * for. But locking an individual tuple would require updating + * that lock whenever it moves because of insertions or vacuums, + * which seems too complicated. + */ + PredicateLockPage(ginstate->index, + BufferGetBlockNumber(stackEntry->buffer), + snapshot); + if (GinGetNPosting(itup) > 0) + { + entry->list = ginReadTuple(ginstate, entry->attnum, itup, + &entry->nlist); + entry->predictNumberResult = entry->nlist; + + entry->isFinished = false; + } + } + } + else + { + /* + * No entry found. Predicate lock the leaf page, to lock the place + * where the entry would've been, had there been one. + */ + PredicateLockPage(ginstate->index, + BufferGetBlockNumber(stackEntry->buffer), snapshot); + } + + if (needUnlock) + LockBuffer(stackEntry->buffer, GIN_UNLOCK); + freeGinBtreeStack(stackEntry); +} + +/* + * Comparison function for scan entry indexes. Sorts by predictNumberResult, + * least frequent items first. 
+ */ +static int +entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg) +{ + const GinScanKey key = (const GinScanKey) arg; + int i1 = *(const int *) a1; + int i2 = *(const int *) a2; + uint32 n1 = key->scanEntry[i1]->predictNumberResult; + uint32 n2 = key->scanEntry[i2]->predictNumberResult; + + if (n1 < n2) + return -1; + else if (n1 == n2) + return 0; + else + return 1; +} + +static void +startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key) +{ + MemoryContext oldCtx = CurrentMemoryContext; + int i; + int j; + int *entryIndexes; + + ItemPointerSetMin(&key->curItem); + key->curItemMatches = false; + key->recheckCurItem = false; + key->isFinished = false; + + /* + * Divide the entries into two distinct sets: required and additional. + * Additional entries are not enough for a match alone, without any items + * from the required set, but are needed by the consistent function to + * decide if an item matches. When scanning, we can skip over items from + * additional entries that have no corresponding matches in any of the + * required entries. That speeds up queries like "frequent & rare" + * considerably, if the frequent term can be put in the additional set. + * + * There can be many legal ways to divide them entries into these two + * sets. A conservative division is to just put everything in the required + * set, but the more you can put in the additional set, the more you can + * skip during the scan. To maximize skipping, we try to put as many + * frequent items as possible into additional, and less frequent ones into + * required. To do that, sort the entries by frequency + * (predictNumberResult), and put entries into the required set in that + * order, until the consistent function says that none of the remaining + * entries can form a match, without any items from the required set. The + * rest go to the additional set. + * + * Exclude-only scan keys are known to have no required entries. 
+ */ + if (key->excludeOnly) + { + MemoryContextSwitchTo(so->keyCtx); + + key->nrequired = 0; + key->nadditional = key->nentries; + key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry)); + for (i = 0; i < key->nadditional; i++) + key->additionalEntries[i] = key->scanEntry[i]; + } + else if (key->nentries > 1) + { + MemoryContextSwitchTo(so->tempCtx); + + entryIndexes = (int *) palloc(sizeof(int) * key->nentries); + for (i = 0; i < key->nentries; i++) + entryIndexes[i] = i; + qsort_arg(entryIndexes, key->nentries, sizeof(int), + entryIndexByFrequencyCmp, key); + + for (i = 0; i < key->nentries - 1; i++) + { + /* Pass all entries <= i as FALSE, and the rest as MAYBE */ + for (j = 0; j <= i; j++) + key->entryRes[entryIndexes[j]] = GIN_FALSE; + for (j = i + 1; j < key->nentries; j++) + key->entryRes[entryIndexes[j]] = GIN_MAYBE; + + if (key->triConsistentFn(key) == GIN_FALSE) + break; + } + /* i is now the last required entry. */ + + MemoryContextSwitchTo(so->keyCtx); + + key->nrequired = i + 1; + key->nadditional = key->nentries - key->nrequired; + key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry)); + key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry)); + + j = 0; + for (i = 0; i < key->nrequired; i++) + key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]]; + for (i = 0; i < key->nadditional; i++) + key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]]; + + /* clean up after consistentFn calls (also frees entryIndexes) */ + MemoryContextReset(so->tempCtx); + } + else + { + MemoryContextSwitchTo(so->keyCtx); + + key->nrequired = 1; + key->nadditional = 0; + key->requiredEntries = palloc(1 * sizeof(GinScanEntry)); + key->requiredEntries[0] = key->scanEntry[0]; + } + MemoryContextSwitchTo(oldCtx); +} + +static void +startScan(IndexScanDesc scan) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + GinState *ginstate = &so->ginstate; + uint32 i; + + for (i = 0; i < so->totalentries; i++) + 
startScanEntry(ginstate, so->entries[i], scan->xs_snapshot); + + if (GinFuzzySearchLimit > 0) + { + /* + * If all of keys more than threshold we will try to reduce result, we + * hope (and only hope, for intersection operation of array our + * supposition isn't true), that total result will not more than + * minimal predictNumberResult. + */ + bool reduce = true; + + for (i = 0; i < so->totalentries; i++) + { + if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit) + { + reduce = false; + break; + } + } + if (reduce) + { + for (i = 0; i < so->totalentries; i++) + { + so->entries[i]->predictNumberResult /= so->totalentries; + so->entries[i]->reduceResult = true; + } + } + } + + /* + * Now that we have the estimates for the entry frequencies, finish + * initializing the scan keys. + */ + for (i = 0; i < so->nkeys; i++) + startScanKey(ginstate, so, so->keys + i); +} + +/* + * Load the next batch of item pointers from a posting tree. + * + * Note that we copy the page into GinScanEntry->list array and unlock it, but + * keep it pinned to prevent interference with vacuum. + */ +static void +entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, + ItemPointerData advancePast, Snapshot snapshot) +{ + Page page; + int i; + bool stepright; + + if (!BufferIsValid(entry->buffer)) + { + entry->isFinished = true; + return; + } + + /* + * We have two strategies for finding the correct page: step right from + * the current page, or descend the tree again from the root. If + * advancePast equals the current item, the next matching item should be + * on the next page, so we step right. Otherwise, descend from root. + */ + if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0) + { + stepright = true; + LockBuffer(entry->buffer, GIN_SHARE); + } + else + { + GinBtreeStack *stack; + + ReleaseBuffer(entry->buffer); + + /* + * Set the search key, and find the correct leaf page. 
+ */ + if (ItemPointerIsLossyPage(&advancePast)) + { + ItemPointerSet(&entry->btree.itemptr, + GinItemPointerGetBlockNumber(&advancePast) + 1, + FirstOffsetNumber); + } + else + { + ItemPointerSet(&entry->btree.itemptr, + GinItemPointerGetBlockNumber(&advancePast), + OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast))); + } + entry->btree.fullScan = false; + stack = ginFindLeafPage(&entry->btree, true, false, snapshot); + + /* we don't need the stack, just the buffer. */ + entry->buffer = stack->buffer; + IncrBufferRefCount(entry->buffer); + freeGinBtreeStack(stack); + stepright = false; + } + + elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d", + GinItemPointerGetBlockNumber(&advancePast), + GinItemPointerGetOffsetNumber(&advancePast), + !stepright); + + page = BufferGetPage(entry->buffer); + for (;;) + { + entry->offset = InvalidOffsetNumber; + if (entry->list) + { + pfree(entry->list); + entry->list = NULL; + entry->nlist = 0; + } + + if (stepright) + { + /* + * We've processed all the entries on this page. If it was the + * last page in the tree, we're done. + */ + if (GinPageRightMost(page)) + { + UnlockReleaseBuffer(entry->buffer); + entry->buffer = InvalidBuffer; + entry->isFinished = true; + return; + } + + /* + * Step to next page, following the right link. then find the + * first ItemPointer greater than advancePast. + */ + entry->buffer = ginStepRight(entry->buffer, + ginstate->index, + GIN_SHARE); + page = BufferGetPage(entry->buffer); + } + stepright = true; + + if (GinPageGetOpaque(page)->flags & GIN_DELETED) + continue; /* page was deleted by concurrent vacuum */ + + /* + * The first item > advancePast might not be on this page, but + * somewhere to the right, if the page was split, or a non-match from + * another key in the query allowed us to skip some items from this + * entry. Keep following the right-links until we re-find the correct + * page. 
+ */ + if (!GinPageRightMost(page) && + ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0) + { + /* + * the item we're looking is > the right bound of the page, so it + * can't be on this page. + */ + continue; + } + + entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast); + + for (i = 0; i < entry->nlist; i++) + { + if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0) + { + entry->offset = i; + + if (GinPageRightMost(page)) + { + /* after processing the copied items, we're done. */ + UnlockReleaseBuffer(entry->buffer); + entry->buffer = InvalidBuffer; + } + else + LockBuffer(entry->buffer, GIN_UNLOCK); + return; + } + } + } +} + +#define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE)) +#define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) ) + +/* + * Sets entry->curItem to next heap item pointer > advancePast, for one entry + * of one scan key, or sets entry->isFinished to true if there are no more. + * + * Item pointers are returned in ascending order. + * + * Note: this can return a "lossy page" item pointer, indicating that the + * entry potentially matches all items on that heap page. However, it is + * not allowed to return both a lossy page pointer and exact (regular) + * item pointers for the same page. (Doing so would break the key-combination + * logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the + * current implementation this is guaranteed by the behavior of tidbitmaps. 
+ */ +static void +entryGetItem(GinState *ginstate, GinScanEntry entry, + ItemPointerData advancePast, Snapshot snapshot) +{ + Assert(!entry->isFinished); + + Assert(!ItemPointerIsValid(&entry->curItem) || + ginCompareItemPointers(&entry->curItem, &advancePast) <= 0); + + if (entry->matchBitmap) + { + /* A bitmap result */ + BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast); + OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast); + + for (;;) + { + /* + * If we've exhausted all items on this block, move to next block + * in the bitmap. + */ + while (entry->matchResult == NULL || + (entry->matchResult->ntuples >= 0 && + entry->offset >= entry->matchResult->ntuples) || + entry->matchResult->blockno < advancePastBlk || + (ItemPointerIsLossyPage(&advancePast) && + entry->matchResult->blockno == advancePastBlk)) + { + entry->matchResult = tbm_iterate(entry->matchIterator); + + if (entry->matchResult == NULL) + { + ItemPointerSetInvalid(&entry->curItem); + tbm_end_iterate(entry->matchIterator); + entry->matchIterator = NULL; + entry->isFinished = true; + break; + } + + /* + * Reset counter to the beginning of entry->matchResult. Note: + * entry->offset is still greater than matchResult->ntuples if + * matchResult is lossy. So, on next call we will get next + * result from TIDBitmap. + */ + entry->offset = 0; + } + if (entry->isFinished) + break; + + /* + * We're now on the first page after advancePast which has any + * items on it. If it's a lossy result, return that. + */ + if (entry->matchResult->ntuples < 0) + { + ItemPointerSetLossyPage(&entry->curItem, + entry->matchResult->blockno); + + /* + * We might as well fall out of the loop; we could not + * estimate number of results on this page to support correct + * reducing of result even if it's enabled. + */ + break; + } + + /* + * Not a lossy page. Skip over any offsets <= advancePast, and + * return that. 
+ */ + if (entry->matchResult->blockno == advancePastBlk) + { + /* + * First, do a quick check against the last offset on the + * page. If that's <= advancePast, so are all the other + * offsets, so just go back to the top to get the next page. + */ + if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff) + { + entry->offset = entry->matchResult->ntuples; + continue; + } + + /* Otherwise scan to find the first item > advancePast */ + while (entry->matchResult->offsets[entry->offset] <= advancePastOff) + entry->offset++; + } + + ItemPointerSet(&entry->curItem, + entry->matchResult->blockno, + entry->matchResult->offsets[entry->offset]); + entry->offset++; + + /* Done unless we need to reduce the result */ + if (!entry->reduceResult || !dropItem(entry)) + break; + } + } + else if (!BufferIsValid(entry->buffer)) + { + /* + * A posting list from an entry tuple, or the last page of a posting + * tree. + */ + for (;;) + { + if (entry->offset >= entry->nlist) + { + ItemPointerSetInvalid(&entry->curItem); + entry->isFinished = true; + break; + } + + entry->curItem = entry->list[entry->offset++]; + + /* If we're not past advancePast, keep scanning */ + if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) + continue; + + /* Done unless we need to reduce the result */ + if (!entry->reduceResult || !dropItem(entry)) + break; + } + } + else + { + /* A posting tree */ + for (;;) + { + /* If we've processed the current batch, load more items */ + while (entry->offset >= entry->nlist) + { + entryLoadMoreItems(ginstate, entry, advancePast, snapshot); + + if (entry->isFinished) + { + ItemPointerSetInvalid(&entry->curItem); + return; + } + } + + entry->curItem = entry->list[entry->offset++]; + + /* If we're not past advancePast, keep scanning */ + if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) + continue; + + /* Done unless we need to reduce the result */ + if (!entry->reduceResult || !dropItem(entry)) + break; + + /* + *
Advance advancePast (so that entryLoadMoreItems will load the + * right data), and keep scanning + */ + advancePast = entry->curItem; + } + } +} + +/* + * Identify the "current" item among the input entry streams for this scan key + * that is greater than advancePast, and test whether it passes the scan key + * qual condition. + * + * The current item is the smallest curItem among the inputs. key->curItem + * is set to that value. key->curItemMatches is set to indicate whether that + * TID passes the consistentFn test. If so, key->recheckCurItem is set true + * iff recheck is needed for this item pointer (including the case where the + * item pointer is a lossy page pointer). + * + * If all entry streams are exhausted, sets key->isFinished to true. + * + * Item pointers must be returned in ascending order. + * + * Note: this can return a "lossy page" item pointer, indicating that the + * key potentially matches all items on that heap page. However, it is + * not allowed to return both a lossy page pointer and exact (regular) + * item pointers for the same page. (Doing so would break the key-combination + * logic in scanGetItem.) + */ +static void +keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key, + ItemPointerData advancePast, Snapshot snapshot) +{ + ItemPointerData minItem; + ItemPointerData curPageLossy; + uint32 i; + bool haveLossyEntry; + GinScanEntry entry; + GinTernaryValue res; + MemoryContext oldCtx; + bool allFinished; + + Assert(!key->isFinished); + + /* + * We might have already tested this item; if so, no need to repeat work. + * (Note: the ">" case can happen, if advancePast is exact but we + * previously had to set curItem to a lossy-page pointer.) + */ + if (ginCompareItemPointers(&key->curItem, &advancePast) > 0) + return; + + /* + * Find the minimum item > advancePast among the active entry streams. 
+ * + * Note: a lossy-page entry is encoded by a ItemPointer with max value for + * offset (0xffff), so that it will sort after any exact entries for the + * same page. So we'll prefer to return exact pointers not lossy + * pointers, which is good. + */ + ItemPointerSetMax(&minItem); + allFinished = true; + for (i = 0; i < key->nrequired; i++) + { + entry = key->requiredEntries[i]; + + if (entry->isFinished) + continue; + + /* + * Advance this stream if necessary. + * + * In particular, since entry->curItem was initialized with + * ItemPointerSetMin, this ensures we fetch the first item for each + * entry on the first call. + */ + if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) + { + entryGetItem(ginstate, entry, advancePast, snapshot); + if (entry->isFinished) + continue; + } + + allFinished = false; + if (ginCompareItemPointers(&entry->curItem, &minItem) < 0) + minItem = entry->curItem; + } + + if (allFinished && !key->excludeOnly) + { + /* all entries are finished */ + key->isFinished = true; + return; + } + + if (!key->excludeOnly) + { + /* + * For a normal scan key, we now know there are no matches < minItem. + * + * If minItem is lossy, it means that there were no exact items on the + * page among requiredEntries, because lossy pointers sort after exact + * items. However, there might be exact items for the same page among + * additionalEntries, so we mustn't advance past them. + */ + if (ItemPointerIsLossyPage(&minItem)) + { + if (GinItemPointerGetBlockNumber(&advancePast) < + GinItemPointerGetBlockNumber(&minItem)) + { + ItemPointerSet(&advancePast, + GinItemPointerGetBlockNumber(&minItem), + InvalidOffsetNumber); + } + } + else + { + Assert(GinItemPointerGetOffsetNumber(&minItem) > 0); + ItemPointerSet(&advancePast, + GinItemPointerGetBlockNumber(&minItem), + OffsetNumberPrev(GinItemPointerGetOffsetNumber(&minItem))); + } + } + else + { + /* + * excludeOnly scan keys don't have any entries that are necessarily + * present in matching items. 
So, we consider the item just after + * advancePast. + */ + Assert(key->nrequired == 0); + ItemPointerSet(&minItem, + GinItemPointerGetBlockNumber(&advancePast), + OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast))); + } + + /* + * We might not have loaded all the entry streams for this TID yet. We + * could call the consistent function, passing MAYBE for those entries, to + * see if it can decide if this TID matches based on the information we + * have. But if the consistent-function is expensive, and cannot in fact + * decide with partial information, that could be a big loss. So, load all + * the additional entries, before calling the consistent function. + */ + for (i = 0; i < key->nadditional; i++) + { + entry = key->additionalEntries[i]; + + if (entry->isFinished) + continue; + + if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) + { + entryGetItem(ginstate, entry, advancePast, snapshot); + if (entry->isFinished) + continue; + } + + /* + * Normally, none of the items in additionalEntries can have a curItem + * larger than minItem. But if minItem is a lossy page, then there + * might be exact items on the same page among additionalEntries. + */ + if (ginCompareItemPointers(&entry->curItem, &minItem) < 0) + { + Assert(ItemPointerIsLossyPage(&minItem)); + minItem = entry->curItem; + } + } + + /* + * Ok, we've advanced all the entries up to minItem now. Set key->curItem, + * and perform consistentFn test. + * + * Lossy-page entries pose a problem, since we don't know the correct + * entryRes state to pass to the consistentFn, and we also don't know what + * its combining logic will be (could be AND, OR, or even NOT). If the + * logic is OR then the consistentFn might succeed for all items in the + * lossy page even when none of the other entries match. + * + * Our strategy is to call the tri-state consistent function, with the + * lossy-page entries set to MAYBE, and all the other entries FALSE. 
If it + * returns FALSE, none of the lossy items alone are enough for a match, so + * we don't need to return a lossy-page pointer. Otherwise, return a + * lossy-page pointer to indicate that the whole heap page must be + * checked. (On subsequent calls, we'll do nothing until minItem is past + * the page altogether, thus ensuring that we never return both regular + * and lossy pointers for the same page.) + * + * An exception is that it doesn't matter what we pass for lossy pointers + * in "hidden" entries, because the consistentFn's result can't depend on + * them. We could pass them as MAYBE as well, but if we're using the + * "shim" implementation of a tri-state consistent function (see + * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass + * them as true. + * + * Note that only lossy-page entries pointing to the current item's page + * should trigger this processing; we might have future lossy pages in the + * entry array, but they aren't relevant yet. + */ + key->curItem = minItem; + ItemPointerSetLossyPage(&curPageLossy, + GinItemPointerGetBlockNumber(&key->curItem)); + haveLossyEntry = false; + for (i = 0; i < key->nentries; i++) + { + entry = key->scanEntry[i]; + if (entry->isFinished == false && + ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0) + { + if (i < key->nuserentries) + key->entryRes[i] = GIN_MAYBE; + else + key->entryRes[i] = GIN_TRUE; + haveLossyEntry = true; + } + else + key->entryRes[i] = GIN_FALSE; + } + + /* prepare for calling consistentFn in temp context */ + oldCtx = MemoryContextSwitchTo(tempCtx); + + if (haveLossyEntry) + { + /* Have lossy-page entries, so see if whole page matches */ + res = key->triConsistentFn(key); + + if (res == GIN_TRUE || res == GIN_MAYBE) + { + /* Yes, so clean up ... 
*/ + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(tempCtx); + + /* and return lossy pointer for whole page */ + key->curItem = curPageLossy; + key->curItemMatches = true; + key->recheckCurItem = true; + return; + } + } + + /* + * At this point we know that we don't need to return a lossy whole-page + * pointer, but we might have matches for individual exact item pointers, + * possibly in combination with a lossy pointer. Pass lossy pointers as + * MAYBE to the ternary consistent function, to let it decide if this + * tuple satisfies the overall key, even though we don't know if the lossy + * entries match. + * + * Prepare entryRes array to be passed to consistentFn. + */ + for (i = 0; i < key->nentries; i++) + { + entry = key->scanEntry[i]; + if (entry->isFinished) + key->entryRes[i] = GIN_FALSE; +#if 0 + + /* + * This case can't currently happen, because we loaded all the entries + * for this item earlier. + */ + else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) + key->entryRes[i] = GIN_MAYBE; +#endif + else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0) + key->entryRes[i] = GIN_MAYBE; + else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0) + key->entryRes[i] = GIN_TRUE; + else + key->entryRes[i] = GIN_FALSE; + } + + res = key->triConsistentFn(key); + + switch (res) + { + case GIN_TRUE: + key->curItemMatches = true; + /* triConsistentFn set recheckCurItem */ + break; + + case GIN_FALSE: + key->curItemMatches = false; + break; + + case GIN_MAYBE: + key->curItemMatches = true; + key->recheckCurItem = true; + break; + + default: + + /* + * the 'default' case shouldn't happen, but if the consistent + * function returns something bogus, this is the safe result + */ + key->curItemMatches = true; + key->recheckCurItem = true; + break; + } + + /* + * We have a tuple, and we know if it matches or not. 
If it's a non-match, + * we could continue to find the next matching tuple, but let's break out + * and give scanGetItem a chance to advance the other keys. They might be + * able to skip past to a much higher TID, allowing us to save work. + */ + + /* clean up after consistentFn calls */ + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(tempCtx); +} + +/* + * Get next heap item pointer (after advancePast) from scan. + * Returns true if anything found. + * On success, *item and *recheck are set. + * + * Note: this is very nearly the same logic as in keyGetItem(), except + * that we know the keys are to be combined with AND logic, whereas in + * keyGetItem() the combination logic is known only to the consistentFn. + */ +static bool +scanGetItem(IndexScanDesc scan, ItemPointerData advancePast, + ItemPointerData *item, bool *recheck) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + uint32 i; + bool match; + + /*---------- + * Advance the scan keys in lock-step, until we find an item that matches + * all the keys. If any key reports isFinished, meaning its subset of the + * entries is exhausted, we can stop. Otherwise, set *item to the next + * matching item. + * + * This logic works only if a keyGetItem stream can never contain both + * exact and lossy pointers for the same page. Else we could have a + * case like + * + * stream 1 stream 2 + * ... ... + * 42/6 42/7 + * 50/1 42/0xffff + * ... ... + * + * We would conclude that 42/6 is not a match and advance stream 1, + * thus never detecting the match to the lossy pointer in stream 2. + * (keyGetItem has a similar problem versus entryGetItem.) + *---------- + */ + do + { + ItemPointerSetMin(item); + match = true; + for (i = 0; i < so->nkeys && match; i++) + { + GinScanKey key = so->keys + i; + + /* + * If we're considering a lossy page, skip excludeOnly keys, They + * can't exclude the whole page anyway. 
+ */ + if (ItemPointerIsLossyPage(item) && key->excludeOnly) + { + /* + * ginNewScanKey() should never mark the first key as + * excludeOnly. + */ + Assert(i > 0); + continue; + } + + /* Fetch the next item for this key that is > advancePast. */ + keyGetItem(&so->ginstate, so->tempCtx, key, advancePast, + scan->xs_snapshot); + + if (key->isFinished) + return false; + + /* + * If it's not a match, we can immediately conclude that nothing + * <= this item matches, without checking the rest of the keys. + */ + if (!key->curItemMatches) + { + advancePast = key->curItem; + match = false; + break; + } + + /* + * It's a match. We can conclude that nothing < matches, so the + * other key streams can skip to this item. + * + * Beware of lossy pointers, though; from a lossy pointer, we can + * only conclude that nothing smaller than this *block* matches. + */ + if (ItemPointerIsLossyPage(&key->curItem)) + { + if (GinItemPointerGetBlockNumber(&advancePast) < + GinItemPointerGetBlockNumber(&key->curItem)) + { + ItemPointerSet(&advancePast, + GinItemPointerGetBlockNumber(&key->curItem), + InvalidOffsetNumber); + } + } + else + { + Assert(GinItemPointerGetOffsetNumber(&key->curItem) > 0); + ItemPointerSet(&advancePast, + GinItemPointerGetBlockNumber(&key->curItem), + OffsetNumberPrev(GinItemPointerGetOffsetNumber(&key->curItem))); + } + + /* + * If this is the first key, remember this location as a potential + * match, and proceed to check the rest of the keys. + * + * Otherwise, check if this is the same item that we checked the + * previous keys for (or a lossy pointer for the same page). 
If + * not, loop back to check the previous keys for this item (we + * will check this key again too, but keyGetItem returns quickly + * for that) + */ + if (i == 0) + { + *item = key->curItem; + } + else + { + if (ItemPointerIsLossyPage(&key->curItem) || + ItemPointerIsLossyPage(item)) + { + Assert(GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item)); + match = (GinItemPointerGetBlockNumber(&key->curItem) == + GinItemPointerGetBlockNumber(item)); + } + else + { + Assert(ginCompareItemPointers(&key->curItem, item) >= 0); + match = (ginCompareItemPointers(&key->curItem, item) == 0); + } + } + } + } while (!match); + + Assert(!ItemPointerIsMin(item)); + + /* + * Now *item contains the first ItemPointer after previous result that + * satisfied all the keys for that exact TID, or a lossy reference to the + * same page. + * + * We must return recheck = true if any of the keys are marked recheck. + */ + *recheck = false; + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + + if (key->recheckCurItem) + { + *recheck = true; + break; + } + } + + return true; +} + + +/* + * Functions for scanning the pending list + */ + + +/* + * Get ItemPointer of next heap row to be checked from pending list. + * Returns false if there are no more. On pages with several heap rows + * it returns each row separately, on page with part of heap row returns + * per page data. pos->firstOffset and pos->lastOffset are set to identify + * the range of pending-list tuples belonging to this heap row. + * + * The pendingBuffer is presumed pinned and share-locked on entry, and is + * pinned and share-locked on success exit. On failure exit it's released. 
+ */ +static bool +scanGetCandidate(IndexScanDesc scan, pendingPosition *pos) +{ + OffsetNumber maxoff; + Page page; + IndexTuple itup; + + ItemPointerSetInvalid(&pos->item); /* no candidate heap row identified yet */ + for (;;) + { + page = BufferGetPage(pos->pendingBuffer); + TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); + + maxoff = PageGetMaxOffsetNumber(page); + if (pos->firstOffset > maxoff) /* no tuples left on this page: follow its rightlink */ + { + BlockNumber blkno = GinPageGetOpaque(page)->rightlink; + + if (blkno == InvalidBlockNumber) /* no right sibling: release the buffer and report exhaustion */ + { + UnlockReleaseBuffer(pos->pendingBuffer); + pos->pendingBuffer = InvalidBuffer; + + return false; + } + else + { + /* + * Here we must prevent deletion of next page by insertcleanup + * process, which may be trying to obtain exclusive lock on + * current page. So, we lock next page before releasing the + * current one + */ + Buffer tmpbuf = ReadBuffer(scan->indexRelation, blkno); + + LockBuffer(tmpbuf, GIN_SHARE); + UnlockReleaseBuffer(pos->pendingBuffer); + + pos->pendingBuffer = tmpbuf; + pos->firstOffset = FirstOffsetNumber; /* restart at the first tuple of the new page and loop again */ + } + } + else + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset)); + pos->item = itup->t_tid; /* heap TID of the row that starts at firstOffset */ + if (GinPageHasFullRow(page)) + { + /* + * find itempointer to the next row: advance lastOffset until a + * tuple with a different t_tid is seen or the page ends + */ + for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset)); + if (!ItemPointerEquals(&pos->item, &itup->t_tid)) + break; + } + } + else + { + /* + * All itempointers are the same on this page, so the range + * runs to one past the last tuple + */ + pos->lastOffset = maxoff + 1; + } + + /* + * Now pos->firstOffset points to the first tuple of current heap + * row, pos->lastOffset points to the first tuple of next heap row + * (or to the end of page) + */ + break; + } + } + + return true; +} + +/* + * Scan pending-list page from current tuple (off) up till the first of: + * - match is found (then returns true) + * - no later match is possible + * - tuple's attribute number is not equal to entry's 
attrnum + * - reach end of page + * + * datum[]/category[]/datumExtracted[] arrays are used to cache the results + * of gintuple_get_key() on the current page. + */ +static bool +matchPartialInPendingList(GinState *ginstate, Page page, + OffsetNumber off, OffsetNumber maxoff, + GinScanEntry entry, + Datum *datum, GinNullCategory *category, + bool *datumExtracted) +{ + IndexTuple itup; + int32 cmp; + + /* Partial match to a null is not possible */ + if (entry->queryCategory != GIN_CAT_NORM_KEY) + return false; + + while (off < maxoff) /* maxoff is exclusive: the tuple at maxoff itself is not examined */ + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); + + /* A tuple for a different index attribute ends the scan */ + if (gintuple_get_attrnum(ginstate, itup) != entry->attnum) + return false; + + /* Cache arrays are indexed by offset - 1; extract each key only once */ + if (datumExtracted[off - 1] == false) + { + datum[off - 1] = gintuple_get_key(ginstate, itup, + &category[off - 1]); + datumExtracted[off - 1] = true; + } + + /* Once we hit nulls, no further match is possible */ + if (category[off - 1] != GIN_CAT_NORM_KEY) + return false; + + /*---------- + * Check partial match. + * case cmp == 0 => match + * case cmp > 0 => not match and end scan (no later match possible) + * case cmp < 0 => not match and continue scan + *---------- + */ + cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1], + ginstate->supportCollation[entry->attnum - 1], + entry->queryKey, + datum[off - 1], + UInt16GetDatum(entry->strategy), + PointerGetDatum(entry->extra_data))); + if (cmp == 0) + return true; + else if (cmp > 0) + return false; + + off++; + } + + return false; /* reached maxoff without finding a match */ +} + +/* + * Set up the entryRes array for each key by looking at + * every entry for current heap row in pending list. + * + * Returns true if each scan key has at least one entryRes match. + * This corresponds to the situations where the normal index search will + * try to apply the key's consistentFn. (A tuple not meeting that requirement + * cannot be returned by the normal search since no entry stream will + * source its TID.) 
+ * + * The pendingBuffer is presumed pinned and share-locked on entry. + */ +static bool +collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + OffsetNumber attrnum; + Page page; + IndexTuple itup; + int i, + j; + + /* + * Reset all entryRes and hasMatchKey flags + */ + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + + memset(key->entryRes, GIN_FALSE, key->nentries); + } + memset(pos->hasMatchKey, false, so->nkeys); + + /* + * Outer loop iterates over multiple pending-list pages when a single heap + * row has entries spanning those pages. + */ + for (;;) + { + Datum datum[BLCKSZ / sizeof(IndexTupleData)]; + GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)]; + bool datumExtracted[BLCKSZ / sizeof(IndexTupleData)]; + + Assert(pos->lastOffset > pos->firstOffset); + /* Clear the cache flags for offsets firstOffset .. lastOffset - 1 (arrays are indexed by offset - 1) */ + memset(datumExtracted + pos->firstOffset - 1, 0, + sizeof(bool) * (pos->lastOffset - pos->firstOffset)); + + page = BufferGetPage(pos->pendingBuffer); + TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); + + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + + for (j = 0; j < key->nentries; j++) + { + GinScanEntry entry = key->scanEntry[j]; + OffsetNumber StopLow = pos->firstOffset, + StopHigh = pos->lastOffset, + StopMiddle; + + /* If already matched on earlier page, do no extra work */ + if (key->entryRes[j]) + continue; + + /* + * Interesting tuples are from pos->firstOffset to + * pos->lastOffset and they are ordered by (attnum, Datum) as + * it's done in entry tree. So we can use binary search to + * avoid linear scanning. 
+ */ + while (StopLow < StopHigh) + { + int res; + + /* midpoint of the half-open range [StopLow, StopHigh) */ + StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle)); + + attrnum = gintuple_get_attrnum(&so->ginstate, itup); + + if (key->attnum < attrnum) + { + StopHigh = StopMiddle; + continue; + } + if (key->attnum > attrnum) + { + StopLow = StopMiddle + 1; + continue; + } + + if (datumExtracted[StopMiddle - 1] == false) + { + datum[StopMiddle - 1] = + gintuple_get_key(&so->ginstate, itup, + &category[StopMiddle - 1]); + datumExtracted[StopMiddle - 1] = true; + } + + if (entry->queryCategory == GIN_CAT_EMPTY_QUERY) + { + /* special behavior depending on searchMode */ + if (entry->searchMode == GIN_SEARCH_MODE_ALL) + { + /* match anything except NULL_ITEM */ + if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM) + res = -1; + else + res = 0; + } + else + { + /* match everything */ + res = 0; + } + } + else + { + res = ginCompareEntries(&so->ginstate, + entry->attnum, + entry->queryKey, + entry->queryCategory, + datum[StopMiddle - 1], + category[StopMiddle - 1]); + } + + if (res == 0) + { + /* + * Found exact match (there can be only one, except in + * EMPTY_QUERY mode). + * + * If doing partial match, scan forward from here to + * end of page to check for matches. + * + * See comment above about tuple's ordering. + */ + if (entry->isPartialMatch) + key->entryRes[j] = + matchPartialInPendingList(&so->ginstate, + page, + StopMiddle, + pos->lastOffset, + entry, + datum, + category, + datumExtracted); + else + key->entryRes[j] = true; + + /* done with binary search */ + break; + } + else if (res < 0) + StopHigh = StopMiddle; + else + StopLow = StopMiddle + 1; + } + + if (StopLow >= StopHigh && entry->isPartialMatch) + { + /* + * No exact match on this page. If doing partial match, + * scan from the first tuple greater than target value to + * end of page. 
Note that since we don't remember whether + * the comparePartialFn told us to stop early on a + * previous page, we will uselessly apply comparePartialFn + * to the first tuple on each subsequent page. + */ + key->entryRes[j] = + matchPartialInPendingList(&so->ginstate, + page, + StopHigh, + pos->lastOffset, + entry, + datum, + category, + datumExtracted); + } + + pos->hasMatchKey[i] |= key->entryRes[j]; + } + } + + /* Advance firstOffset over the scanned tuples */ + pos->firstOffset = pos->lastOffset; + + if (GinPageHasFullRow(page)) + { + /* + * We have examined all pending entries for the current heap row. + * Break out of loop over pages. + */ + break; + } + else + { + /* + * Advance to next page of pending entries for the current heap + * row. Complain if there isn't one. + */ + ItemPointerData item = pos->item; + + if (scanGetCandidate(scan, pos) == false || + !ItemPointerEquals(&pos->item, &item)) + elog(ERROR, "could not find additional pending pages for same heap tuple"); + } + } + + /* + * All scan keys except excludeOnly require at least one entry to match. + * excludeOnly keys are an exception, because their implied + * GIN_CAT_EMPTY_QUERY scanEntry always matches. So return "true" if all + * non-excludeOnly scan keys have at least one match. + */ + for (i = 0; i < so->nkeys; i++) + { + if (pos->hasMatchKey[i] == false && !so->keys[i].excludeOnly) + return false; + } + + return true; +} + +/* + * Collect all matched rows from pending list into bitmap. + */ +static void +scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + MemoryContext oldCtx; + bool recheck, + match; + int i; + pendingPosition pos; + Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO); + Page page; + BlockNumber blkno; + + *ntids = 0; + + /* + * Acquire predicate lock on the metapage, to conflict with any fastupdate + * insertions. 
+ */ + PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot); + + LockBuffer(metabuffer, GIN_SHARE); + page = BufferGetPage(metabuffer); + TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); + blkno = GinPageGetMeta(page)->head; + + /* + * fetch head of list before unlocking metapage. head page must be pinned + * to prevent deletion by vacuum process + */ + if (blkno == InvalidBlockNumber) + { + /* No pending list, so proceed with normal scan */ + UnlockReleaseBuffer(metabuffer); + return; + } + + pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno); + LockBuffer(pos.pendingBuffer, GIN_SHARE); + pos.firstOffset = FirstOffsetNumber; + UnlockReleaseBuffer(metabuffer); + pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys); /* one flag per scan key; pfree'd before returning */ + + /* + * loop for each heap row. scanGetCandidate returns full row or row's + * tuples from first page. + */ + while (scanGetCandidate(scan, &pos)) + { + /* + * Check entries in tuple and set up entryRes array. + * + * If pending tuples belonging to the current heap row are spread + * across several pages, collectMatchesForHeapRow will read all of + * those pages. + */ + if (!collectMatchesForHeapRow(scan, &pos)) + continue; + + /* + * Matching of entries of one row is finished, so check row using + * consistent functions. 
+ */ + oldCtx = MemoryContextSwitchTo(so->tempCtx); + recheck = false; + match = true; + + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + + if (!key->boolConsistentFn(key)) + { + match = false; + break; + } + recheck |= key->recheckCurItem; + } + + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(so->tempCtx); + + if (match) + { + tbm_add_tuples(tbm, &pos.item, 1, recheck); + (*ntids)++; + } + } + + pfree(pos.hasMatchKey); +} + + +/* True if the scan's opaque state has isVoidRes set; expands its argument rather than relying on a caller variable named "scan" */ +#define GinIsVoidRes(s) ( ((GinScanOpaque) (s)->opaque)->isVoidRes ) + +int64 +gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + int64 ntids; + ItemPointerData iptr; + bool recheck; + + /* + * Set up the scan keys, and check for unsatisfiable query. + */ + ginFreeScanKeys(so); /* there should be no keys yet, but just to be + * sure */ + ginNewScanKey(scan); + + if (GinIsVoidRes(scan)) + return 0; + + ntids = 0; + + /* + * First, scan the pending list and collect any matching entries into the + * bitmap. After we scan a pending item, some other backend could post it + * into the main index, and so we might visit it a second time during the + * main scan. This is okay because we'll just re-set the same bit in the + * bitmap. (The possibility of duplicate visits is a major reason why GIN + * can't support the amgettuple API, however.) Note that it would not do + * to scan the main index before the pending list, since concurrent + * cleanup could then make us miss entries entirely. + */ + scanPendingInsert(scan, tbm, &ntids); + + /* + * Now scan the main index. 
+ */ + startScan(scan); + + ItemPointerSetMin(&iptr); + + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + if (!scanGetItem(scan, iptr, &iptr, &recheck)) + break; + + if (ItemPointerIsLossyPage(&iptr)) + tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr)); + else + tbm_add_tuples(tbm, &iptr, 1, recheck); + ntids++; /* counts one per returned item, whether a lossy page or an exact TID */ + } + + return ntids; +} diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c new file mode 100644 index 0000000..0e8672c --- /dev/null +++ b/src/backend/access/gin/gininsert.c @@ -0,0 +1,541 @@ +/*------------------------------------------------------------------------- + * + * gininsert.c + * insert routines for the postgres inverted index access method. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/gininsert.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/tableam.h" +#include "access/xloginsert.h" +#include "catalog/index.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/indexfsm.h" +#include "storage/predicate.h" +#include "storage/smgr.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +typedef struct +{ + GinState ginstate; + double indtuples; + GinStatsData buildStats; + MemoryContext tmpCtx; + MemoryContext funcCtx; + BuildAccumulator accum; +} GinBuildState; + + +/* + * Adds array of item pointers to tuple's posting list, or + * creates posting tree and tuple pointing to tree in case + * of not enough space. Max size of tuple is defined in + * GinFormTuple(). Returns a new, modified index tuple. + * items[] must be in sorted order with no duplicates. 
+ */ +static IndexTuple +addItemPointersToLeafTuple(GinState *ginstate, + IndexTuple old, + ItemPointerData *items, uint32 nitem, + GinStatsData *buildStats, Buffer buffer) +{ + OffsetNumber attnum; + Datum key; + GinNullCategory category; + IndexTuple res; + ItemPointerData *newItems, + *oldItems; + int oldNPosting, + newNPosting; + GinPostingList *compressedList; + + Assert(!GinIsPostingTree(old)); + + attnum = gintuple_get_attrnum(ginstate, old); + key = gintuple_get_key(ginstate, old, &category); + + /* merge the old and new posting lists */ + oldItems = ginReadTuple(ginstate, attnum, old, &oldNPosting); + + newItems = ginMergeItemPointers(items, nitem, + oldItems, oldNPosting, + &newNPosting); + + /* Compress the posting list, and try to a build tuple with room for it */ + res = NULL; + compressedList = ginCompressPostingList(newItems, newNPosting, GinMaxItemSize, + NULL); + pfree(newItems); + if (compressedList) + { + res = GinFormTuple(ginstate, attnum, key, category, + (char *) compressedList, + SizeOfGinPostingList(compressedList), + newNPosting, + false); + pfree(compressedList); + } + if (!res) + { + /* posting list would be too big, convert to posting tree */ + BlockNumber postingRoot; + + /* + * Initialize posting tree with the old tuple's posting list. It's + * surely small enough to fit on one posting-tree page, and should + * already be in order with no duplicates. 
+ */ + postingRoot = createPostingTree(ginstate->index, + oldItems, + oldNPosting, + buildStats, + buffer); + + /* Now insert the TIDs-to-be-added into the posting tree */ + ginInsertItemPointers(ginstate->index, postingRoot, + items, nitem, + buildStats); + + /* And build a new posting-tree-only result tuple */ + res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true); + GinSetPostingTree(res, postingRoot); + } + pfree(oldItems); + + return res; +} + +/* + * Build a fresh leaf tuple, either posting-list or posting-tree format + * depending on whether the given items list will fit. + * items[] must be in sorted order with no duplicates. + * + * This is basically the same logic as in addItemPointersToLeafTuple, + * but working from slightly different input. + */ +static IndexTuple +buildFreshLeafTuple(GinState *ginstate, + OffsetNumber attnum, Datum key, GinNullCategory category, + ItemPointerData *items, uint32 nitem, + GinStatsData *buildStats, Buffer buffer) +{ + IndexTuple res = NULL; + GinPostingList *compressedList; + + /* try to build a posting list tuple with all the items */ + compressedList = ginCompressPostingList(items, nitem, GinMaxItemSize, NULL); + if (compressedList) + { + res = GinFormTuple(ginstate, attnum, key, category, + (char *) compressedList, + SizeOfGinPostingList(compressedList), + nitem, false); + pfree(compressedList); + } + if (!res) + { + /* posting list would be too big, build posting tree */ + BlockNumber postingRoot; + + /* + * Build posting-tree-only result tuple. We do this first so as to + * fail quickly if the key is too big. + */ + res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true); + + /* + * Initialize a new posting tree with the TIDs. 
+ */ + postingRoot = createPostingTree(ginstate->index, items, nitem, + buildStats, buffer); + + /* And save the root link in the result tuple */ + GinSetPostingTree(res, postingRoot); + } + + return res; +} + +/* + * Insert one or more heap TIDs associated with the given key value. + * This will either add a single key entry, or enlarge a pre-existing entry. + * + * During an index build, buildStats is non-null and the counters + * it contains should be incremented as needed. + */ +void +ginEntryInsert(GinState *ginstate, + OffsetNumber attnum, Datum key, GinNullCategory category, + ItemPointerData *items, uint32 nitem, + GinStatsData *buildStats) +{ + GinBtreeData btree; + GinBtreeEntryInsertData insertdata; + GinBtreeStack *stack; + IndexTuple itup; + Page page; + + insertdata.isDelete = false; + + ginPrepareEntryScan(&btree, attnum, key, category, ginstate); + btree.isBuild = (buildStats != NULL); + + stack = ginFindLeafPage(&btree, false, false, NULL); + page = BufferGetPage(stack->buffer); + + if (btree.findItem(&btree, stack)) + { + /* found pre-existing entry */ + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off)); + + if (GinIsPostingTree(itup)) + { + /* add entries to existing posting tree */ + BlockNumber rootPostingTree = GinGetPostingTree(itup); + + /* release all stack */ + LockBuffer(stack->buffer, GIN_UNLOCK); + freeGinBtreeStack(stack); + + /* insert into posting tree */ + ginInsertItemPointers(ginstate->index, rootPostingTree, + items, nitem, + buildStats); + return; + } + + CheckForSerializableConflictIn(ginstate->index, NULL, + BufferGetBlockNumber(stack->buffer)); + /* modify an existing leaf entry */ + itup = addItemPointersToLeafTuple(ginstate, itup, + items, nitem, buildStats, stack->buffer); + + insertdata.isDelete = true; + } + else + { + CheckForSerializableConflictIn(ginstate->index, NULL, + BufferGetBlockNumber(stack->buffer)); + /* no match, so construct a new leaf entry */ + itup = buildFreshLeafTuple(ginstate, 
attnum, key, category, + items, nitem, buildStats, stack->buffer); + + /* + * nEntries counts leaf tuples, so increment it only when we make a + * new one. + */ + if (buildStats) + buildStats->nEntries++; + } + + /* Insert the new or modified leaf tuple */ + insertdata.entry = itup; + ginInsertValue(&btree, stack, &insertdata, buildStats); + pfree(itup); +} + +/* + * Extract index entries for a single indexable item, and add them to the + * BuildAccumulator's state. + * + * This function is used only during initial index creation. + */ +static void +ginHeapTupleBulkInsert(GinBuildState *buildstate, OffsetNumber attnum, + Datum value, bool isNull, + ItemPointer heapptr) +{ + Datum *entries; + GinNullCategory *categories; + int32 nentries; + MemoryContext oldCtx; + + oldCtx = MemoryContextSwitchTo(buildstate->funcCtx); + entries = ginExtractEntries(buildstate->accum.ginstate, attnum, + value, isNull, + &nentries, &categories); + MemoryContextSwitchTo(oldCtx); + + ginInsertBAEntries(&buildstate->accum, heapptr, attnum, + entries, categories, nentries); + + buildstate->indtuples += nentries; + + MemoryContextReset(buildstate->funcCtx); +} + +static void +ginBuildCallback(Relation index, ItemPointer tid, Datum *values, + bool *isnull, bool tupleIsAlive, void *state) +{ + GinBuildState *buildstate = (GinBuildState *) state; + MemoryContext oldCtx; + int i; + + oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); + + for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++) + ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1), + values[i], isnull[i], tid); + + /* If we've maxed out our available memory, dump everything to the index */ + if (buildstate->accum.allocatedMemory >= (Size) maintenance_work_mem * 1024L) + { + ItemPointerData *list; + Datum key; + GinNullCategory category; + uint32 nlist; + OffsetNumber attnum; + + ginBeginBAScan(&buildstate->accum); + while ((list = ginGetBAEntry(&buildstate->accum, + &attnum, &key, &category, &nlist)) != NULL) + { + 
/* there could be many entries, so be willing to abort here */ + CHECK_FOR_INTERRUPTS(); + ginEntryInsert(&buildstate->ginstate, attnum, key, category, + list, nlist, &buildstate->buildStats); + } + + MemoryContextReset(buildstate->tmpCtx); + ginInitBA(&buildstate->accum); + } + + MemoryContextSwitchTo(oldCtx); +} + +IndexBuildResult * +ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) +{ + IndexBuildResult *result; + double reltuples; + GinBuildState buildstate; + Buffer RootBuffer, + MetaBuffer; + ItemPointerData *list; + Datum key; + GinNullCategory category; + uint32 nlist; + MemoryContext oldCtx; + OffsetNumber attnum; + + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + + initGinState(&buildstate.ginstate, index); + buildstate.indtuples = 0; + memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); + + /* initialize the meta page */ + MetaBuffer = GinNewBuffer(index); + + /* initialize the root page */ + RootBuffer = GinNewBuffer(index); + + START_CRIT_SECTION(); + GinInitMetabuffer(MetaBuffer); + MarkBufferDirty(MetaBuffer); + GinInitBuffer(RootBuffer, GIN_LEAF); + MarkBufferDirty(RootBuffer); + + + UnlockReleaseBuffer(MetaBuffer); + UnlockReleaseBuffer(RootBuffer); + END_CRIT_SECTION(); + + /* count the root as first entry page */ + buildstate.buildStats.nEntryPages++; + + /* + * create a temporary memory context that is used to hold data not yet + * dumped out to the index + */ + buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, + "Gin build temporary context", + ALLOCSET_DEFAULT_SIZES); + + /* + * create a temporary memory context that is used for calling + * ginExtractEntries(), and can be reset after each tuple + */ + buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext, + "Gin build temporary context for user-defined function", + ALLOCSET_DEFAULT_SIZES); + + buildstate.accum.ginstate = &buildstate.ginstate; + 
ginInitBA(&buildstate.accum); + + /* + * Do the heap scan. We disallow sync scan here because dataPlaceToPage + * prefers to receive tuples in TID order. + */ + reltuples = table_index_build_scan(heap, index, indexInfo, false, true, + ginBuildCallback, (void *) &buildstate, + NULL); + + /* dump remaining entries to the index */ + oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); + ginBeginBAScan(&buildstate.accum); + while ((list = ginGetBAEntry(&buildstate.accum, + &attnum, &key, &category, &nlist)) != NULL) + { + /* there could be many entries, so be willing to abort here */ + CHECK_FOR_INTERRUPTS(); + ginEntryInsert(&buildstate.ginstate, attnum, key, category, + list, nlist, &buildstate.buildStats); + } + MemoryContextSwitchTo(oldCtx); + + MemoryContextDelete(buildstate.funcCtx); + MemoryContextDelete(buildstate.tmpCtx); + + /* + * Update metapage stats + */ + buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); + ginUpdateStats(index, &buildstate.buildStats, true); + + /* + * We didn't write WAL records as we built the index, so if WAL-logging is + * required, write all pages to the WAL now. + */ + if (RelationNeedsWAL(index)) + { + log_newpage_range(index, MAIN_FORKNUM, + 0, RelationGetNumberOfBlocks(index), + true); + } + + /* + * Return statistics + */ + result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + + result->heap_tuples = reltuples; + result->index_tuples = buildstate.indtuples; + + return result; +} + +/* + * ginbuildempty() -- build an empty gin index in the initialization fork + */ +void +ginbuildempty(Relation index) +{ + Buffer RootBuffer, + MetaBuffer; + + /* An empty GIN index has two pages. */ + MetaBuffer = + ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); + LockBuffer(MetaBuffer, BUFFER_LOCK_EXCLUSIVE); + RootBuffer = + ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); + LockBuffer(RootBuffer, BUFFER_LOCK_EXCLUSIVE); + + /* Initialize and xlog metabuffer and root buffer. 
*/ + START_CRIT_SECTION(); + GinInitMetabuffer(MetaBuffer); + MarkBufferDirty(MetaBuffer); + log_newpage_buffer(MetaBuffer, true); + GinInitBuffer(RootBuffer, GIN_LEAF); + MarkBufferDirty(RootBuffer); + log_newpage_buffer(RootBuffer, false); + END_CRIT_SECTION(); + + /* Unlock and release the buffers. */ + UnlockReleaseBuffer(MetaBuffer); + UnlockReleaseBuffer(RootBuffer); +} + +/* + * Insert index entries for a single indexable item during "normal" + * (non-fast-update) insertion + */ +static void +ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum, + Datum value, bool isNull, + ItemPointer item) +{ + Datum *entries; + GinNullCategory *categories; + int32 i, + nentries; + + entries = ginExtractEntries(ginstate, attnum, value, isNull, + &nentries, &categories); + + for (i = 0; i < nentries; i++) + ginEntryInsert(ginstate, attnum, entries[i], categories[i], + item, 1, NULL); +} + +bool +gininsert(Relation index, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo *indexInfo) +{ + GinState *ginstate = (GinState *) indexInfo->ii_AmCache; + MemoryContext oldCtx; + MemoryContext insertCtx; + int i; + + /* Initialize GinState cache if first call in this statement */ + if (ginstate == NULL) + { + oldCtx = MemoryContextSwitchTo(indexInfo->ii_Context); + ginstate = (GinState *) palloc(sizeof(GinState)); + initGinState(ginstate, index); + indexInfo->ii_AmCache = (void *) ginstate; + MemoryContextSwitchTo(oldCtx); + } + + insertCtx = AllocSetContextCreate(CurrentMemoryContext, + "Gin insert temporary context", + ALLOCSET_DEFAULT_SIZES); + + oldCtx = MemoryContextSwitchTo(insertCtx); + + if (GinGetUseFastUpdate(index)) + { + GinTupleCollector collector; + + memset(&collector, 0, sizeof(GinTupleCollector)); + + for (i = 0; i < ginstate->origTupdesc->natts; i++) + ginHeapTupleFastCollect(ginstate, &collector, + (OffsetNumber) (i + 1), + values[i], isnull[i], + ht_ctid); + + 
ginHeapTupleFastInsert(ginstate, &collector); + } + else + { + for (i = 0; i < ginstate->origTupdesc->natts; i++) + ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1), + values[i], isnull[i], + ht_ctid); + } + + MemoryContextSwitchTo(oldCtx); + MemoryContextDelete(insertCtx); + + return false; +} diff --git a/src/backend/access/gin/ginlogic.c b/src/backend/access/gin/ginlogic.c new file mode 100644 index 0000000..6bf3288 --- /dev/null +++ b/src/backend/access/gin/ginlogic.c @@ -0,0 +1,246 @@ +/*------------------------------------------------------------------------- + * + * ginlogic.c + * routines for performing binary- and ternary-logic consistent checks. + * + * A GIN operator class can provide a boolean or ternary consistent + * function, or both. This file provides both boolean and ternary + * interfaces to the rest of the GIN code, even if only one of them is + * implemented by the opclass. + * + * Providing a boolean interface when the opclass implements only the + * ternary function is straightforward - just call the ternary function + * with the check-array as is, and map the GIN_TRUE, GIN_FALSE, GIN_MAYBE + * return codes to TRUE, FALSE and TRUE+recheck, respectively. Providing + * a ternary interface when the opclass only implements a boolean function + * is implemented by calling the boolean function many times, with all the + * MAYBE arguments set to all combinations of TRUE and FALSE (up to a + * certain number of MAYBE arguments). + * + * (A boolean function is enough to determine if an item matches, but a + * GIN scan can apply various optimizations if it can determine that an + * item matches or doesn't match, even if it doesn't know if some of the + * keys are present or not. That's what the ternary consistent function + * is used for.) 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginlogic.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/reloptions.h" +#include "catalog/pg_collation.h" +#include "catalog/pg_type.h" +#include "miscadmin.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" + + +/* + * Maximum number of MAYBE inputs that shimTriConsistentFn will try to + * resolve by calling all combinations. + */ +#define MAX_MAYBE_ENTRIES 4 + +/* + * Dummy consistent functions for an EVERYTHING key. Just claim it matches. + */ +static bool +trueConsistentFn(GinScanKey key) +{ + key->recheckCurItem = false; + return true; +} +static GinTernaryValue +trueTriConsistentFn(GinScanKey key) +{ + return GIN_TRUE; +} + +/* + * A helper function for calling a regular, binary logic, consistent function. + */ +static bool +directBoolConsistentFn(GinScanKey key) +{ + /* + * Initialize recheckCurItem in case the consistentFn doesn't know it + * should set it. The safe assumption in that case is to force recheck. + */ + key->recheckCurItem = true; + + return DatumGetBool(FunctionCall8Coll(key->consistentFmgrInfo, + key->collation, + PointerGetDatum(key->entryRes), + UInt16GetDatum(key->strategy), + key->query, + UInt32GetDatum(key->nuserentries), + PointerGetDatum(key->extra_data), + PointerGetDatum(&key->recheckCurItem), + PointerGetDatum(key->queryValues), + PointerGetDatum(key->queryCategories))); +} + +/* + * A helper function for calling a native ternary logic consistent function. 
+ */ +static GinTernaryValue +directTriConsistentFn(GinScanKey key) +{ + return DatumGetGinTernaryValue(FunctionCall7Coll(key->triConsistentFmgrInfo, + key->collation, + PointerGetDatum(key->entryRes), + UInt16GetDatum(key->strategy), + key->query, + UInt32GetDatum(key->nuserentries), + PointerGetDatum(key->extra_data), + PointerGetDatum(key->queryValues), + PointerGetDatum(key->queryCategories))); +} + +/* + * This function implements a binary logic consistency check, using a ternary + * logic consistent function provided by the opclass. GIN_MAYBE return value + * is interpreted as true with recheck flag. + */ +static bool +shimBoolConsistentFn(GinScanKey key) +{ + GinTernaryValue result; + + result = DatumGetGinTernaryValue(FunctionCall7Coll(key->triConsistentFmgrInfo, + key->collation, + PointerGetDatum(key->entryRes), + UInt16GetDatum(key->strategy), + key->query, + UInt32GetDatum(key->nuserentries), + PointerGetDatum(key->extra_data), + PointerGetDatum(key->queryValues), + PointerGetDatum(key->queryCategories))); + if (result == GIN_MAYBE) + { + key->recheckCurItem = true; + return true; + } + else + { + key->recheckCurItem = false; + return result; + } +} + +/* + * This function implements a tri-state consistency check, using a boolean + * consistent function provided by the opclass. + * + * Our strategy is to call consistentFn with MAYBE inputs replaced with every + * combination of TRUE/FALSE. If consistentFn returns the same value for every + * combination, that's the overall result. Otherwise, return MAYBE. Testing + * every combination is O(n^2), so this is only feasible for a small number of + * MAYBE inputs. + * + * NB: This function modifies the key->entryRes array! 
+ */ +static GinTernaryValue +shimTriConsistentFn(GinScanKey key) +{ + int nmaybe; + int maybeEntries[MAX_MAYBE_ENTRIES]; + int i; + bool boolResult; + bool recheck = false; + GinTernaryValue curResult; + + /* + * Count how many MAYBE inputs there are, and store their indexes in + * maybeEntries. If there are too many MAYBE inputs, it's not feasible to + * test all combinations, so give up and return MAYBE. + */ + nmaybe = 0; + for (i = 0; i < key->nentries; i++) + { + if (key->entryRes[i] == GIN_MAYBE) + { + if (nmaybe >= MAX_MAYBE_ENTRIES) + return GIN_MAYBE; + maybeEntries[nmaybe++] = i; + } + } + + /* + * If none of the inputs were MAYBE, so we can just call consistent + * function as is. + */ + if (nmaybe == 0) + return directBoolConsistentFn(key); + + /* First call consistent function with all the maybe-inputs set FALSE */ + for (i = 0; i < nmaybe; i++) + key->entryRes[maybeEntries[i]] = GIN_FALSE; + curResult = directBoolConsistentFn(key); + + for (;;) + { + /* Twiddle the entries for next combination. */ + for (i = 0; i < nmaybe; i++) + { + if (key->entryRes[maybeEntries[i]] == GIN_FALSE) + { + key->entryRes[maybeEntries[i]] = GIN_TRUE; + break; + } + else + key->entryRes[maybeEntries[i]] = GIN_FALSE; + } + if (i == nmaybe) + break; + + boolResult = directBoolConsistentFn(key); + recheck |= key->recheckCurItem; + + if (curResult != boolResult) + return GIN_MAYBE; + } + + /* TRUE with recheck is taken to mean MAYBE */ + if (curResult == GIN_TRUE && recheck) + curResult = GIN_MAYBE; + + return curResult; +} + +/* + * Set up the implementation of the consistent functions for a scan key. 
+ */ +void +ginInitConsistentFunction(GinState *ginstate, GinScanKey key) +{ + if (key->searchMode == GIN_SEARCH_MODE_EVERYTHING) + { + key->boolConsistentFn = trueConsistentFn; + key->triConsistentFn = trueTriConsistentFn; + } + else + { + key->consistentFmgrInfo = &ginstate->consistentFn[key->attnum - 1]; + key->triConsistentFmgrInfo = &ginstate->triConsistentFn[key->attnum - 1]; + key->collation = ginstate->supportCollation[key->attnum - 1]; + + if (OidIsValid(ginstate->consistentFn[key->attnum - 1].fn_oid)) + key->boolConsistentFn = directBoolConsistentFn; + else + key->boolConsistentFn = shimBoolConsistentFn; + + if (OidIsValid(ginstate->triConsistentFn[key->attnum - 1].fn_oid)) + key->triConsistentFn = directTriConsistentFn; + else + key->triConsistentFn = shimTriConsistentFn; + } +} diff --git a/src/backend/access/gin/ginpostinglist.c b/src/backend/access/gin/ginpostinglist.c new file mode 100644 index 0000000..216b2b9 --- /dev/null +++ b/src/backend/access/gin/ginpostinglist.c @@ -0,0 +1,434 @@ +/*------------------------------------------------------------------------- + * + * ginpostinglist.c + * routines for dealing with posting lists. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginpostinglist.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" + +#ifdef USE_ASSERT_CHECKING +#define CHECK_ENCODING_ROUNDTRIP +#endif + +/* + * For encoding purposes, item pointers are represented as 64-bit unsigned + * integers. The lowest 11 bits represent the offset number, and the next + * lowest 32 bits are the block number. That leaves 21 bits unused, i.e. + * only 43 low bits are used. + * + * 11 bits is enough for the offset number, because MaxHeapTuplesPerPage < + * 2^11 on all supported block sizes. 
 * We are frugal with the bits, because
 * smaller integers use fewer bytes in the varbyte encoding, saving disk
 * space. (If we get a new table AM in the future that wants to use the full
 * range of possible offset numbers, we'll need to change this.)
 *
 * These 43-bit integers are encoded using varbyte encoding. In each byte,
 * the 7 low bits contain data, while the highest bit is a continuation bit.
 * When the continuation bit is set, the next byte is part of the same
 * integer, otherwise this is the last byte of this integer. 43 bits need
 * at most 7 bytes in this encoding:
 *
 * 0XXXXXXX
 * 1XXXXXXX 0XXXXYYY
 * 1XXXXXXX 1XXXXYYY 0YYYYYYY
 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0uuuuuuY
 *
 * X = bits used for offset number
 * Y = bits used for block number
 * u = unused bit
 *
 * The bytes are stored in little-endian order.
 *
 * An important property of this encoding is that removing an item from a
 * list never increases the size of the resulting compressed posting list.
 * Proof:
 *
 * Removing a number is actually a replacement of two numbers with their sum.
 * We have to prove that varbyte encoding of a sum can't be longer than varbyte
 * encoding of its summands. Sum of two numbers is at most one bit wider than
 * the larger of the summands. Widening a number by one bit enlarges its length
 * in varbyte encoding by at most one byte. Therefore, varbyte encoding of sum
 * is at most one byte longer than varbyte encoding of larger summand. Lesser
 * summand is at least one byte, so the sum cannot take more space than the
 * summands, Q.E.D.
 *
 * This property greatly simplifies VACUUM, which can assume that posting
 * lists always fit on the same page after vacuuming.
Note that even though + * that holds for removing items from a posting list, you must also be + * careful to not cause expansion e.g. when merging uncompressed items on the + * page into the compressed lists, when vacuuming. + */ + +/* + * How many bits do you need to encode offset number? OffsetNumber is a 16-bit + * integer, but you can't fit that many items on a page. 11 ought to be more + * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and + * use the minimum number of bits, but that would require changing the on-disk + * format if MaxHeapTuplesPerPage changes. Better to leave some slack. + */ +#define MaxHeapTuplesPerPageBits 11 + +/* Max. number of bytes needed to encode the largest supported integer. */ +#define MaxBytesPerInteger 7 + +static inline uint64 +itemptr_to_uint64(const ItemPointer iptr) +{ + uint64 val; + + Assert(ItemPointerIsValid(iptr)); + Assert(GinItemPointerGetOffsetNumber(iptr) < (1 << MaxHeapTuplesPerPageBits)); + + val = GinItemPointerGetBlockNumber(iptr); + val <<= MaxHeapTuplesPerPageBits; + val |= GinItemPointerGetOffsetNumber(iptr); + + return val; +} + +static inline void +uint64_to_itemptr(uint64 val, ItemPointer iptr) +{ + GinItemPointerSetOffsetNumber(iptr, val & ((1 << MaxHeapTuplesPerPageBits) - 1)); + val = val >> MaxHeapTuplesPerPageBits; + GinItemPointerSetBlockNumber(iptr, val); + + Assert(ItemPointerIsValid(iptr)); +} + +/* + * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer. + */ +static void +encode_varbyte(uint64 val, unsigned char **ptr) +{ + unsigned char *p = *ptr; + + while (val > 0x7F) + { + *(p++) = 0x80 | (val & 0x7F); + val >>= 7; + } + *(p++) = (unsigned char) val; + + *ptr = p; +} + +/* + * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer. 
+ */ +static uint64 +decode_varbyte(unsigned char **ptr) +{ + uint64 val; + unsigned char *p = *ptr; + uint64 c; + + /* 1st byte */ + c = *(p++); + val = c & 0x7F; + if (c & 0x80) + { + /* 2nd byte */ + c = *(p++); + val |= (c & 0x7F) << 7; + if (c & 0x80) + { + /* 3rd byte */ + c = *(p++); + val |= (c & 0x7F) << 14; + if (c & 0x80) + { + /* 4th byte */ + c = *(p++); + val |= (c & 0x7F) << 21; + if (c & 0x80) + { + /* 5th byte */ + c = *(p++); + val |= (c & 0x7F) << 28; + if (c & 0x80) + { + /* 6th byte */ + c = *(p++); + val |= (c & 0x7F) << 35; + if (c & 0x80) + { + /* 7th byte, should not have continuation bit */ + c = *(p++); + val |= c << 42; + Assert((c & 0x80) == 0); + } + } + } + } + } + } + + *ptr = p; + + return val; +} + +/* + * Encode a posting list. + * + * The encoded list is returned in a palloc'd struct, which will be at most + * 'maxsize' bytes in size. The number items in the returned segment is + * returned in *nwritten. If it's not equal to nipd, not all the items fit + * in 'maxsize', and only the first *nwritten were encoded. + * + * The allocated size of the returned struct is short-aligned, and the padding + * byte at the end, if any, is zero. 
+ */ +GinPostingList * +ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize, + int *nwritten) +{ + uint64 prev; + int totalpacked = 0; + int maxbytes; + GinPostingList *result; + unsigned char *ptr; + unsigned char *endptr; + + maxsize = SHORTALIGN_DOWN(maxsize); + + result = palloc(maxsize); + + maxbytes = maxsize - offsetof(GinPostingList, bytes); + Assert(maxbytes > 0); + + /* Store the first special item */ + result->first = ipd[0]; + + prev = itemptr_to_uint64(&result->first); + + ptr = result->bytes; + endptr = result->bytes + maxbytes; + for (totalpacked = 1; totalpacked < nipd; totalpacked++) + { + uint64 val = itemptr_to_uint64(&ipd[totalpacked]); + uint64 delta = val - prev; + + Assert(val > prev); + + if (endptr - ptr >= MaxBytesPerInteger) + encode_varbyte(delta, &ptr); + else + { + /* + * There are less than 7 bytes left. Have to check if the next + * item fits in that space before writing it out. + */ + unsigned char buf[MaxBytesPerInteger]; + unsigned char *p = buf; + + encode_varbyte(delta, &p); + if (p - buf > (endptr - ptr)) + break; /* output is full */ + + memcpy(ptr, buf, p - buf); + ptr += (p - buf); + } + prev = val; + } + result->nbytes = ptr - result->bytes; + + /* + * If we wrote an odd number of bytes, zero out the padding byte at the + * end. + */ + if (result->nbytes != SHORTALIGN(result->nbytes)) + result->bytes[result->nbytes] = 0; + + if (nwritten) + *nwritten = totalpacked; + + Assert(SizeOfGinPostingList(result) <= maxsize); + + /* + * Check that the encoded segment decodes back to the original items. + */ +#if defined (CHECK_ENCODING_ROUNDTRIP) + { + int ndecoded; + ItemPointer tmp = ginPostingListDecode(result, &ndecoded); + + Assert(ndecoded == totalpacked); + Assert(memcmp(tmp, ipd, ndecoded * sizeof(ItemPointerData)) == 0); + pfree(tmp); + } +#endif + + return result; +} + +/* + * Decode a compressed posting list into an array of item pointers. + * The number of items is returned in *ndecoded. 
+ */ +ItemPointer +ginPostingListDecode(GinPostingList *plist, int *ndecoded) +{ + return ginPostingListDecodeAllSegments(plist, + SizeOfGinPostingList(plist), + ndecoded); +} + +/* + * Decode multiple posting list segments into an array of item pointers. + * The number of items is returned in *ndecoded_out. The segments are stored + * one after each other, with total size 'len' bytes. + */ +ItemPointer +ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out) +{ + ItemPointer result; + int nallocated; + uint64 val; + char *endseg = ((char *) segment) + len; + int ndecoded; + unsigned char *ptr; + unsigned char *endptr; + + /* + * Guess an initial size of the array. + */ + nallocated = segment->nbytes * 2 + 1; + result = palloc(nallocated * sizeof(ItemPointerData)); + + ndecoded = 0; + while ((char *) segment < endseg) + { + /* enlarge output array if needed */ + if (ndecoded >= nallocated) + { + nallocated *= 2; + result = repalloc(result, nallocated * sizeof(ItemPointerData)); + } + + /* copy the first item */ + Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first))); + Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0); + result[ndecoded] = segment->first; + ndecoded++; + + val = itemptr_to_uint64(&segment->first); + ptr = segment->bytes; + endptr = segment->bytes + segment->nbytes; + while (ptr < endptr) + { + /* enlarge output array if needed */ + if (ndecoded >= nallocated) + { + nallocated *= 2; + result = repalloc(result, nallocated * sizeof(ItemPointerData)); + } + + val += decode_varbyte(&ptr); + + uint64_to_itemptr(val, &result[ndecoded]); + ndecoded++; + } + segment = GinNextPostingListSegment(segment); + } + + if (ndecoded_out) + *ndecoded_out = ndecoded; + return result; +} + +/* + * Add all item pointers from a bunch of posting lists to a TIDBitmap. 
+ */ +int +ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len, + TIDBitmap *tbm) +{ + int ndecoded; + ItemPointer items; + + items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded); + tbm_add_tuples(tbm, items, ndecoded, false); + pfree(items); + + return ndecoded; +} + +/* + * Merge two ordered arrays of itempointers, eliminating any duplicates. + * + * Returns a palloc'd array, and *nmerged is set to the number of items in + * the result, after eliminating duplicates. + */ +ItemPointer +ginMergeItemPointers(ItemPointerData *a, uint32 na, + ItemPointerData *b, uint32 nb, + int *nmerged) +{ + ItemPointerData *dst; + + dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData)); + + /* + * If the argument arrays don't overlap, we can just append them to each + * other. + */ + if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0) + { + memcpy(dst, a, na * sizeof(ItemPointerData)); + memcpy(&dst[na], b, nb * sizeof(ItemPointerData)); + *nmerged = na + nb; + } + else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0) + { + memcpy(dst, b, nb * sizeof(ItemPointerData)); + memcpy(&dst[nb], a, na * sizeof(ItemPointerData)); + *nmerged = na + nb; + } + else + { + ItemPointerData *dptr = dst; + ItemPointerData *aptr = a; + ItemPointerData *bptr = b; + + while (aptr - a < na && bptr - b < nb) + { + int cmp = ginCompareItemPointers(aptr, bptr); + + if (cmp > 0) + *dptr++ = *bptr++; + else if (cmp == 0) + { + /* only keep one copy of the identical items */ + *dptr++ = *bptr++; + aptr++; + } + else + *dptr++ = *aptr++; + } + + while (aptr - a < na) + *dptr++ = *aptr++; + + while (bptr - b < nb) + *dptr++ = *bptr++; + + *nmerged = dptr - dst; + } + + return dst; +} diff --git a/src/backend/access/gin/ginscan.c b/src/backend/access/gin/ginscan.c new file mode 100644 index 0000000..55e2d49 --- /dev/null +++ b/src/backend/access/gin/ginscan.c @@ -0,0 +1,468 @@ +/*------------------------------------------------------------------------- + * 
+ * ginscan.c + * routines to manage scans of inverted index relations + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginscan.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/relscan.h" +#include "pgstat.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +IndexScanDesc +ginbeginscan(Relation rel, int nkeys, int norderbys) +{ + IndexScanDesc scan; + GinScanOpaque so; + + /* no order by operators allowed */ + Assert(norderbys == 0); + + scan = RelationGetIndexScan(rel, nkeys, norderbys); + + /* allocate private workspace */ + so = (GinScanOpaque) palloc(sizeof(GinScanOpaqueData)); + so->keys = NULL; + so->nkeys = 0; + so->tempCtx = AllocSetContextCreate(CurrentMemoryContext, + "Gin scan temporary context", + ALLOCSET_DEFAULT_SIZES); + so->keyCtx = AllocSetContextCreate(CurrentMemoryContext, + "Gin scan key context", + ALLOCSET_DEFAULT_SIZES); + initGinState(&so->ginstate, scan->indexRelation); + + scan->opaque = so; + + return scan; +} + +/* + * Create a new GinScanEntry, unless an equivalent one already exists, + * in which case just return it + */ +static GinScanEntry +ginFillScanEntry(GinScanOpaque so, OffsetNumber attnum, + StrategyNumber strategy, int32 searchMode, + Datum queryKey, GinNullCategory queryCategory, + bool isPartialMatch, Pointer extra_data) +{ + GinState *ginstate = &so->ginstate; + GinScanEntry scanEntry; + uint32 i; + + /* + * Look for an existing equivalent entry. + * + * Entries with non-null extra_data are never considered identical, since + * we can't know exactly what the opclass might be doing with that. 
+ */ + if (extra_data == NULL) + { + for (i = 0; i < so->totalentries; i++) + { + GinScanEntry prevEntry = so->entries[i]; + + if (prevEntry->extra_data == NULL && + prevEntry->isPartialMatch == isPartialMatch && + prevEntry->strategy == strategy && + prevEntry->searchMode == searchMode && + prevEntry->attnum == attnum && + ginCompareEntries(ginstate, attnum, + prevEntry->queryKey, + prevEntry->queryCategory, + queryKey, + queryCategory) == 0) + { + /* Successful match */ + return prevEntry; + } + } + } + + /* Nope, create a new entry */ + scanEntry = (GinScanEntry) palloc(sizeof(GinScanEntryData)); + scanEntry->queryKey = queryKey; + scanEntry->queryCategory = queryCategory; + scanEntry->isPartialMatch = isPartialMatch; + scanEntry->extra_data = extra_data; + scanEntry->strategy = strategy; + scanEntry->searchMode = searchMode; + scanEntry->attnum = attnum; + + scanEntry->buffer = InvalidBuffer; + ItemPointerSetMin(&scanEntry->curItem); + scanEntry->matchBitmap = NULL; + scanEntry->matchIterator = NULL; + scanEntry->matchResult = NULL; + scanEntry->list = NULL; + scanEntry->nlist = 0; + scanEntry->offset = InvalidOffsetNumber; + scanEntry->isFinished = false; + scanEntry->reduceResult = false; + + /* Add it to so's array */ + if (so->totalentries >= so->allocentries) + { + so->allocentries *= 2; + so->entries = (GinScanEntry *) + repalloc(so->entries, so->allocentries * sizeof(GinScanEntry)); + } + so->entries[so->totalentries++] = scanEntry; + + return scanEntry; +} + +/* + * Append hidden scan entry of given category to the scan key. + * + * NB: this had better be called at most once per scan key, since + * ginFillScanKey leaves room for only one hidden entry. Currently, + * it seems sufficiently clear that this is true that we don't bother + * with any cross-check logic. 
+ */ +static void +ginScanKeyAddHiddenEntry(GinScanOpaque so, GinScanKey key, + GinNullCategory queryCategory) +{ + int i = key->nentries++; /* slot to fill; nentries now counts the hidden entry too */ + + /* strategy is of no interest because this is not a partial-match item */ + key->scanEntry[i] = ginFillScanEntry(so, key->attnum, + InvalidStrategy, key->searchMode, + (Datum) 0, queryCategory, + false, NULL); +} + +/* + * Initialize the next GinScanKey (the slot so->keys[so->nkeys], after which so->nkeys is incremented) using the output from the extractQueryFn; one scan entry per query value is obtained from ginFillScanEntry + */ +static void +ginFillScanKey(GinScanOpaque so, OffsetNumber attnum, + StrategyNumber strategy, int32 searchMode, + Datum query, uint32 nQueryValues, + Datum *queryValues, GinNullCategory *queryCategories, + bool *partial_matches, Pointer *extra_data) +{ + GinScanKey key = &(so->keys[so->nkeys++]); + GinState *ginstate = &so->ginstate; + uint32 i; + + key->nentries = nQueryValues; + key->nuserentries = nQueryValues; + + /* Allocate one extra array slot for possible "hidden" entry */ + key->scanEntry = (GinScanEntry *) palloc(sizeof(GinScanEntry) * + (nQueryValues + 1)); + key->entryRes = (GinTernaryValue *) palloc0(sizeof(GinTernaryValue) * + (nQueryValues + 1)); + + key->query = query; + key->queryValues = queryValues; + key->queryCategories = queryCategories; + key->extra_data = extra_data; + key->strategy = strategy; + key->searchMode = searchMode; + key->attnum = attnum; + + /* + * Initially, scan keys of GIN_SEARCH_MODE_ALL mode are marked + * excludeOnly. This might get changed later. 
+ */ + key->excludeOnly = (searchMode == GIN_SEARCH_MODE_ALL); + + ItemPointerSetMin(&key->curItem); + key->curItemMatches = false; + key->recheckCurItem = false; + key->isFinished = false; + key->nrequired = 0; + key->nadditional = 0; + key->requiredEntries = NULL; + key->additionalEntries = NULL; + + ginInitConsistentFunction(ginstate, key); + + /* Set up normal scan entries using extractQueryFn's outputs */ + for (i = 0; i < nQueryValues; i++) + { + Datum queryKey; + GinNullCategory queryCategory; + bool isPartialMatch; + Pointer this_extra; + + queryKey = queryValues[i]; + queryCategory = queryCategories[i]; + isPartialMatch = + (ginstate->canPartialMatch[attnum - 1] && partial_matches) + ? partial_matches[i] : false; /* false unless canPartialMatch is set for the column and a partial_matches array was passed */ + this_extra = (extra_data) ? extra_data[i] : NULL; /* extra_data array may be NULL */ + + key->scanEntry[i] = ginFillScanEntry(so, attnum, + strategy, searchMode, + queryKey, queryCategory, + isPartialMatch, this_extra); + } + + /* + * For GIN_SEARCH_MODE_INCLUDE_EMPTY and GIN_SEARCH_MODE_EVERYTHING search + * modes, we add the "hidden" entry immediately. GIN_SEARCH_MODE_ALL is + * handled later, since we might be able to omit the hidden entry for it. + */ + if (searchMode == GIN_SEARCH_MODE_INCLUDE_EMPTY) + ginScanKeyAddHiddenEntry(so, key, GIN_CAT_EMPTY_ITEM); + else if (searchMode == GIN_SEARCH_MODE_EVERYTHING) + ginScanKeyAddHiddenEntry(so, key, GIN_CAT_EMPTY_QUERY); +} + +/* + * Release current scan keys, if any. 
+ */ +void +ginFreeScanKeys(GinScanOpaque so) +{ + uint32 i; + + if (so->keys == NULL) + return; /* nothing was built, so nothing to release */ + + for (i = 0; i < so->totalentries; i++) + { + GinScanEntry entry = so->entries[i]; + + if (entry->buffer != InvalidBuffer) + ReleaseBuffer(entry->buffer); + if (entry->list) + pfree(entry->list); + if (entry->matchIterator) + tbm_end_iterate(entry->matchIterator); + if (entry->matchBitmap) + tbm_free(entry->matchBitmap); + } + + MemoryContextResetAndDeleteChildren(so->keyCtx); /* ginNewScanKey allocates keys and entries in keyCtx */ + + so->keys = NULL; + so->nkeys = 0; + so->entries = NULL; + so->totalentries = 0; +} + +void /* Build so->keys and so->entries from scan->keyData by calling each key column's extractQueryFn; sets so->isVoidRes when a key shows the query is unsatisfiable */ +ginNewScanKey(IndexScanDesc scan) +{ + ScanKey scankey = scan->keyData; + GinScanOpaque so = (GinScanOpaque) scan->opaque; + int i; + bool hasNullQuery = false; + bool attrHasNormalScan[INDEX_MAX_KEYS] = {false}; + MemoryContext oldCtx; + + /* + * Allocate all the scan key information in the key context. (If + * extractQuery leaks anything there, it won't be reset until the end of + * scan or rescan, but that's OK.) 
+ */ + oldCtx = MemoryContextSwitchTo(so->keyCtx); + + /* if no scan keys provided, allocate extra EVERYTHING GinScanKey */ + so->keys = (GinScanKey) + palloc(Max(scan->numberOfKeys, 1) * sizeof(GinScanKeyData)); + so->nkeys = 0; + + /* initialize expansible array of GinScanEntry pointers */ + so->totalentries = 0; + so->allocentries = 32; + so->entries = (GinScanEntry *) + palloc(so->allocentries * sizeof(GinScanEntry)); + + so->isVoidRes = false; + + for (i = 0; i < scan->numberOfKeys; i++) + { + ScanKey skey = &scankey[i]; + Datum *queryValues; + int32 nQueryValues = 0; + bool *partial_matches = NULL; + Pointer *extra_data = NULL; + bool *nullFlags = NULL; + GinNullCategory *categories; + int32 searchMode = GIN_SEARCH_MODE_DEFAULT; + + /* + * We assume that GIN-indexable operators are strict, so a null query + * argument means an unsatisfiable query. 
+ */ + if (skey->sk_flags & SK_ISNULL) + { + so->isVoidRes = true; + break; + } + + /* OK to call the extractQueryFn */ + queryValues = (Datum *) + DatumGetPointer(FunctionCall7Coll(&so->ginstate.extractQueryFn[skey->sk_attno - 1], + so->ginstate.supportCollation[skey->sk_attno - 1], + skey->sk_argument, + PointerGetDatum(&nQueryValues), + UInt16GetDatum(skey->sk_strategy), + PointerGetDatum(&partial_matches), + PointerGetDatum(&extra_data), + PointerGetDatum(&nullFlags), + PointerGetDatum(&searchMode))); + + /* + * If bogus searchMode is returned, treat as GIN_SEARCH_MODE_ALL; note + * in particular we don't allow extractQueryFn to select + * GIN_SEARCH_MODE_EVERYTHING. + */ + if (searchMode < GIN_SEARCH_MODE_DEFAULT || + searchMode > GIN_SEARCH_MODE_ALL) + searchMode = GIN_SEARCH_MODE_ALL; + + /* Non-default modes require the index to have placeholders */ + if (searchMode != GIN_SEARCH_MODE_DEFAULT) + hasNullQuery = true; + + /* + * In default mode, no keys means an unsatisfiable query. + */ + if (queryValues == NULL || nQueryValues <= 0) + { + if (searchMode == GIN_SEARCH_MODE_DEFAULT) + { + so->isVoidRes = true; + break; + } + nQueryValues = 0; /* ensure sane value */ + } + + /* + * Create GinNullCategory representation. If the extractQueryFn + * didn't create a nullFlags array, we assume everything is non-null. + * While at it, detect whether any null keys are present. 
+ */ + categories = (GinNullCategory *) palloc0(nQueryValues * sizeof(GinNullCategory)); + if (nullFlags) + { + int32 j; + + for (j = 0; j < nQueryValues; j++) + { + if (nullFlags[j]) + { + categories[j] = GIN_CAT_NULL_KEY; + hasNullQuery = true; + } + } + } + + ginFillScanKey(so, skey->sk_attno, + skey->sk_strategy, searchMode, + skey->sk_argument, nQueryValues, + queryValues, categories, + partial_matches, extra_data); + + /* Remember if we had any non-excludeOnly keys */ + if (searchMode != GIN_SEARCH_MODE_ALL) + attrHasNormalScan[skey->sk_attno - 1] = true; + } + + /* + * Processing GIN_SEARCH_MODE_ALL scan keys requires us to make a second + * pass over the scan keys. Above we marked each such scan key as + * excludeOnly. If the involved column has any normal (not excludeOnly) + * scan key as well, then we can leave it like that. Otherwise, one + * excludeOnly scan key must receive a GIN_CAT_EMPTY_QUERY hidden entry + * and be set to normal (excludeOnly = false). + */ + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = &so->keys[i]; + + if (key->searchMode != GIN_SEARCH_MODE_ALL) + continue; + + if (!attrHasNormalScan[key->attnum - 1]) + { + key->excludeOnly = false; + ginScanKeyAddHiddenEntry(so, key, GIN_CAT_EMPTY_QUERY); + attrHasNormalScan[key->attnum - 1] = true; /* only the first such key per column gets the hidden entry */ + } + } + + /* + * If there are no regular scan keys, generate an EVERYTHING scankey to + * drive a full-index scan. + */ + if (so->nkeys == 0 && !so->isVoidRes) + { + hasNullQuery = true; + ginFillScanKey(so, FirstOffsetNumber, + InvalidStrategy, GIN_SEARCH_MODE_EVERYTHING, + (Datum) 0, 0, + NULL, NULL, NULL, NULL); + } + + /* + * If the index is version 0, it may be missing null and placeholder + * entries, which would render searches for nulls and full-index scans + * unreliable. Throw an error if so. 
+ */ + if (hasNullQuery && !so->isVoidRes) + { + GinStatsData ginStats; + + ginGetStats(scan->indexRelation, &ginStats); + if (ginStats.ginVersion < 1) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("old GIN indexes do not support whole-index scans nor searches for nulls"), + errhint("To fix this, do REINDEX INDEX \"%s\".", + RelationGetRelationName(scan->indexRelation)))); + } + + MemoryContextSwitchTo(oldCtx); + + pgstat_count_index_scan(scan->indexRelation); +} + +void /* Restart the scan: release the current scan keys and, if scankey is given and there are keys, copy it into scan->keyData; nscankeys, orderbys and norderbys are not used here */ +ginrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + + ginFreeScanKeys(so); + + if (scankey && scan->numberOfKeys > 0) + { + memmove(scan->keyData, scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + } +} + + +void /* End the scan: release the scan keys, delete the tempCtx and keyCtx memory contexts, and free the opaque struct */ +ginendscan(IndexScanDesc scan) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + + ginFreeScanKeys(so); + + MemoryContextDelete(so->tempCtx); + MemoryContextDelete(so->keyCtx); + + pfree(so); +} diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c new file mode 100644 index 0000000..cdd626f --- /dev/null +++ b/src/backend/access/gin/ginutil.c @@ -0,0 +1,707 @@ +/*------------------------------------------------------------------------- + * + * ginutil.c + * Utility routines for the Postgres inverted index access method. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginutil.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/reloptions.h" +#include "access/xloginsert.h" +#include "catalog/pg_collation.h" +#include "catalog/pg_type.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "utils/builtins.h" +#include "utils/index_selfuncs.h" +#include "utils/typcache.h" + + +/* + * GIN handler function: return IndexAmRoutine with access method parameters + * and callbacks. + */ +Datum +ginhandler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = 0; + amroutine->amsupport = GINNProcs; + amroutine->amoptsprocnum = GIN_OPTIONS_PROC; + amroutine->amcanorder = false; + amroutine->amcanorderbyop = false; + amroutine->amcanbackward = false; + amroutine->amcanunique = false; + amroutine->amcanmulticol = true; + amroutine->amoptionalkey = true; + amroutine->amsearcharray = false; + amroutine->amsearchnulls = false; + amroutine->amstorage = true; + amroutine->amclusterable = false; + amroutine->ampredlocks = true; + amroutine->amcanparallel = false; + amroutine->amcaninclude = false; + amroutine->amusemaintenanceworkmem = true; + amroutine->amparallelvacuumoptions = + VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_CLEANUP; + amroutine->amkeytype = InvalidOid; + + amroutine->ambuild = ginbuild; + amroutine->ambuildempty = ginbuildempty; + amroutine->aminsert = gininsert; + amroutine->ambulkdelete = ginbulkdelete; + amroutine->amvacuumcleanup = ginvacuumcleanup; + amroutine->amcanreturn = NULL; + amroutine->amcostestimate = gincostestimate; 
+ amroutine->amoptions = ginoptions; + amroutine->amproperty = NULL; + amroutine->ambuildphasename = NULL; + amroutine->amvalidate = ginvalidate; + amroutine->amadjustmembers = ginadjustmembers; + amroutine->ambeginscan = ginbeginscan; + amroutine->amrescan = ginrescan; + amroutine->amgettuple = NULL; /* no amgettuple callback; scans are served by amgetbitmap below */ + amroutine->amgetbitmap = gingetbitmap; + amroutine->amendscan = ginendscan; + amroutine->ammarkpos = NULL; + amroutine->amrestrpos = NULL; + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; + + PG_RETURN_POINTER(amroutine); +} + +/* + * initGinState: fill in an empty GinState struct to describe the index + * + * Note: assorted subsidiary data is allocated in the CurrentMemoryContext. + */ +void +initGinState(GinState *state, Relation index) +{ + TupleDesc origTupdesc = RelationGetDescr(index); + int i; + + MemSet(state, 0, sizeof(GinState)); + + state->index = index; + state->oneCol = (origTupdesc->natts == 1) ? true : false; + state->origTupdesc = origTupdesc; + + for (i = 0; i < origTupdesc->natts; i++) + { + Form_pg_attribute attr = TupleDescAttr(origTupdesc, i); + + if (state->oneCol) + state->tupdesc[i] = state->origTupdesc; + else + { + state->tupdesc[i] = CreateTemplateTupleDesc(2); /* two attributes: int2 column number, then the key */ + + TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 1, NULL, + INT2OID, -1, 0); + TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 2, NULL, + attr->atttypid, + attr->atttypmod, + attr->attndims); + TupleDescInitEntryCollation(state->tupdesc[i], (AttrNumber) 2, + attr->attcollation); + } + + /* + * If the compare proc isn't specified in the opclass definition, look + * up the index key type's default btree comparator. 
+ */ + if (index_getprocid(index, i + 1, GIN_COMPARE_PROC) != InvalidOid) + { + fmgr_info_copy(&(state->compareFn[i]), + index_getprocinfo(index, i + 1, GIN_COMPARE_PROC), + CurrentMemoryContext); + } + else + { + TypeCacheEntry *typentry; + + typentry = lookup_type_cache(attr->atttypid, + TYPECACHE_CMP_PROC_FINFO); + if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify a comparison function for type %s", + format_type_be(attr->atttypid)))); + fmgr_info_copy(&(state->compareFn[i]), + &(typentry->cmp_proc_finfo), + CurrentMemoryContext); + } + + /* Opclass must always provide extract procs */ + fmgr_info_copy(&(state->extractValueFn[i]), + index_getprocinfo(index, i + 1, GIN_EXTRACTVALUE_PROC), + CurrentMemoryContext); + fmgr_info_copy(&(state->extractQueryFn[i]), + index_getprocinfo(index, i + 1, GIN_EXTRACTQUERY_PROC), + CurrentMemoryContext); + + /* + * Check opclass capability to do tri-state or binary logic consistent + * check. + */ + if (index_getprocid(index, i + 1, GIN_TRICONSISTENT_PROC) != InvalidOid) + { + fmgr_info_copy(&(state->triConsistentFn[i]), + index_getprocinfo(index, i + 1, GIN_TRICONSISTENT_PROC), + CurrentMemoryContext); + } + + if (index_getprocid(index, i + 1, GIN_CONSISTENT_PROC) != InvalidOid) + { + fmgr_info_copy(&(state->consistentFn[i]), + index_getprocinfo(index, i + 1, GIN_CONSISTENT_PROC), + CurrentMemoryContext); + } + + if (state->consistentFn[i].fn_oid == InvalidOid && + state->triConsistentFn[i].fn_oid == InvalidOid) + { + elog(ERROR, "missing GIN support function (%d or %d) for attribute %d of index \"%s\"", + GIN_CONSISTENT_PROC, GIN_TRICONSISTENT_PROC, + i + 1, RelationGetRelationName(index)); + } + + /* + * Check opclass capability to do partial match. 
+ */ + if (index_getprocid(index, i + 1, GIN_COMPARE_PARTIAL_PROC) != InvalidOid) + { + fmgr_info_copy(&(state->comparePartialFn[i]), + index_getprocinfo(index, i + 1, GIN_COMPARE_PARTIAL_PROC), + CurrentMemoryContext); + state->canPartialMatch[i] = true; + } + else + { + state->canPartialMatch[i] = false; + } + + /* + * If the index column has a specified collation, we should honor that + * while doing comparisons. However, we may have a collatable storage + * type for a noncollatable indexed data type (for instance, hstore + * uses text index entries). If there's no index collation then + * specify default collation in case the support functions need + * collation. This is harmless if the support functions don't care + * about collation, so we just do it unconditionally. (We could + * alternatively call get_typcollation, but that seems like expensive + * overkill --- there aren't going to be any cases where a GIN storage + * type has a nondefault collation.) + */ + if (OidIsValid(index->rd_indcollation[i])) + state->supportCollation[i] = index->rd_indcollation[i]; + else + state->supportCollation[i] = DEFAULT_COLLATION_OID; + } +} + +/* + * Extract attribute (column) number of stored entry from GIN tuple + */ +OffsetNumber +gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple) +{ + OffsetNumber colN; + + if (ginstate->oneCol) + { + /* column number is not stored explicitly */ + colN = FirstOffsetNumber; + } + else + { + Datum res; + bool isnull; + + /* + * First attribute is always int16, so we can safely use any tuple + * descriptor to obtain first attribute of tuple + */ + res = index_getattr(tuple, FirstOffsetNumber, ginstate->tupdesc[0], + &isnull); + Assert(!isnull); + + colN = DatumGetUInt16(res); + Assert(colN >= FirstOffsetNumber && colN <= ginstate->origTupdesc->natts); + } + + return colN; +} + +/* + * Extract stored datum (and possible null category) from GIN tuple + */ +Datum +gintuple_get_key(GinState *ginstate, IndexTuple tuple, + 
GinNullCategory *category) +{ + Datum res; + bool isnull; + + if (ginstate->oneCol) + { + /* + * Single column index doesn't store attribute numbers in tuples + */ + res = index_getattr(tuple, FirstOffsetNumber, ginstate->origTupdesc, + &isnull); + } + else + { + /* + * Since the datum type depends on which index column it's from, we + * must be careful to use the right tuple descriptor here. + */ + OffsetNumber colN = gintuple_get_attrnum(ginstate, tuple); + + res = index_getattr(tuple, OffsetNumberNext(FirstOffsetNumber), + ginstate->tupdesc[colN - 1], + &isnull); + } + + if (isnull) + *category = GinGetNullCategory(tuple, ginstate); + else + *category = GIN_CAT_NORM_KEY; + + return res; +} + +/* + * Allocate a new page (either by recycling, or by extending the index file) + * The returned buffer is already pinned and exclusive-locked + * Caller is responsible for initializing the page by calling GinInitBuffer + */ +Buffer +GinNewBuffer(Relation index) +{ + Buffer buffer; + bool needLock; + + /* First, try to get a page from FSM */ + for (;;) + { + BlockNumber blkno = GetFreeIndexPage(index); + + if (blkno == InvalidBlockNumber) + break; + + buffer = ReadBuffer(index, blkno); + + /* + * We have to guard against the possibility that someone else already + * recycled this page; the buffer may be locked if so. 
+ */ + if (ConditionalLockBuffer(buffer)) + { + if (GinPageIsRecyclable(BufferGetPage(buffer))) + return buffer; /* OK to use */ + + LockBuffer(buffer, GIN_UNLOCK); + } + + /* Can't use it, so release buffer and try again */ + ReleaseBuffer(buffer); + } + + /* Must extend the file */ + needLock = !RELATION_IS_LOCAL(index); + if (needLock) + LockRelationForExtension(index, ExclusiveLock); + + buffer = ReadBuffer(index, P_NEW); + LockBuffer(buffer, GIN_EXCLUSIVE); + + if (needLock) + UnlockRelationForExtension(index, ExclusiveLock); + + return buffer; +} + +void /* Initialize page via PageInit with a GinPageOpaqueData special area, set its flags to f and its rightlink to InvalidBlockNumber */ +GinInitPage(Page page, uint32 f, Size pageSize) +{ + GinPageOpaque opaque; + + PageInit(page, pageSize, sizeof(GinPageOpaqueData)); + + opaque = GinPageGetOpaque(page); + opaque->flags = f; + opaque->rightlink = InvalidBlockNumber; +} + +void /* Convenience wrapper: GinInitPage on the page held in buffer b, using that buffer's page size */ +GinInitBuffer(Buffer b, uint32 f) +{ + GinInitPage(BufferGetPage(b), f, BufferGetPageSize(b)); +} + +void /* Initialize the page in buffer b as a GIN metapage: empty pending list, zeroed statistics, current GIN version */ +GinInitMetabuffer(Buffer b) +{ + GinMetaPageData *metadata; + Page page = BufferGetPage(b); + + GinInitPage(page, GIN_META, BufferGetPageSize(b)); + + metadata = GinPageGetMeta(page); + + metadata->head = metadata->tail = InvalidBlockNumber; + metadata->tailFreeSize = 0; + metadata->nPendingPages = 0; + metadata->nPendingHeapTuples = 0; + metadata->nTotalPages = 0; + metadata->nEntryPages = 0; + metadata->nDataPages = 0; + metadata->nEntries = 0; + metadata->ginVersion = GIN_CURRENT_VERSION; + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c compresses + * the page. 
+ */ + ((PageHeader) page)->pd_lower = + ((char *) metadata + sizeof(GinMetaPageData)) - (char *) page; +} + +/* + * Compare two keys of the same index column + */ +int +ginCompareEntries(GinState *ginstate, OffsetNumber attnum, + Datum a, GinNullCategory categorya, + Datum b, GinNullCategory categoryb) +{ + /* if not of same null category, sort by that first */ + if (categorya != categoryb) + return (categorya < categoryb) ? -1 : 1; + + /* all null items in same category are equal */ + if (categorya != GIN_CAT_NORM_KEY) + return 0; + + /* both not null, so safe to call the compareFn */ + return DatumGetInt32(FunctionCall2Coll(&ginstate->compareFn[attnum - 1], + ginstate->supportCollation[attnum - 1], + a, b)); +} + +/* + * Compare two keys of possibly different index columns + */ +int +ginCompareAttEntries(GinState *ginstate, + OffsetNumber attnuma, Datum a, GinNullCategory categorya, + OffsetNumber attnumb, Datum b, GinNullCategory categoryb) +{ + /* attribute number is the first sort key */ + if (attnuma != attnumb) + return (attnuma < attnumb) ? -1 : 1; + + return ginCompareEntries(ginstate, attnuma, a, categorya, b, categoryb); +} + + +/* + * Support for sorting key datums in ginExtractEntries + * + * Note: we only have to worry about null and not-null keys here; + * ginExtractEntries never generates more than one placeholder null, + * so it doesn't have to sort those. 
+ */ +typedef struct +{ + Datum datum; + bool isnull; +} keyEntryData; + +typedef struct +{ + FmgrInfo *cmpDatumFunc; + Oid collation; + bool haveDups; +} cmpEntriesArg; + +static int /* qsort_arg comparator over keyEntryData; arg is a cmpEntriesArg whose haveDups flag is set whenever two compared keys are equal */ +cmpEntries(const void *a, const void *b, void *arg) +{ + const keyEntryData *aa = (const keyEntryData *) a; + const keyEntryData *bb = (const keyEntryData *) b; + cmpEntriesArg *data = (cmpEntriesArg *) arg; + int res; + + if (aa->isnull) + { + if (bb->isnull) + res = 0; /* NULL "=" NULL */ + else + res = 1; /* NULL ">" not-NULL */ + } + else if (bb->isnull) + res = -1; /* not-NULL "<" NULL */ + else + res = DatumGetInt32(FunctionCall2Coll(data->cmpDatumFunc, + data->collation, + aa->datum, bb->datum)); + + /* + * Detect if we have any duplicates. If there are equal keys, qsort must + * compare them at some point, else it wouldn't know whether one should go + * before or after the other. + */ + if (res == 0) + data->haveDups = true; + + return res; +} + + +/* + * Extract the index key values from an indexable item + * + * The resulting key values are sorted, and any duplicates are removed. + * This avoids generating redundant index entries. + */ +Datum * +ginExtractEntries(GinState *ginstate, OffsetNumber attnum, + Datum value, bool isNull, + int32 *nentries, GinNullCategory **categories) +{ + Datum *entries; + bool *nullFlags; + int32 i; + + /* + * We don't call the extractValueFn on a null item. Instead generate a + * placeholder. 
+ */ + if (isNull) + { + *nentries = 1; + entries = (Datum *) palloc(sizeof(Datum)); + entries[0] = (Datum) 0; + *categories = (GinNullCategory *) palloc(sizeof(GinNullCategory)); + (*categories)[0] = GIN_CAT_NULL_ITEM; + return entries; + } + + /* OK, call the opclass's extractValueFn */ + nullFlags = NULL; /* in case extractValue doesn't set it */ + entries = (Datum *) + DatumGetPointer(FunctionCall3Coll(&ginstate->extractValueFn[attnum - 1], + ginstate->supportCollation[attnum - 1], + value, + PointerGetDatum(nentries), + PointerGetDatum(&nullFlags))); + + /* + * Generate a placeholder if the item contained no keys. + */ + if (entries == NULL || *nentries <= 0) + { + *nentries = 1; + entries = (Datum *) palloc(sizeof(Datum)); + entries[0] = (Datum) 0; + *categories = (GinNullCategory *) palloc(sizeof(GinNullCategory)); + (*categories)[0] = GIN_CAT_EMPTY_ITEM; + return entries; + } + + /* + * If the extractValueFn didn't create a nullFlags array, create one, + * assuming that everything's non-null. + */ + if (nullFlags == NULL) + nullFlags = (bool *) palloc0(*nentries * sizeof(bool)); + + /* + * If there's more than one key, sort and unique-ify. + * + * XXX Using qsort here is notationally painful, and the overhead is + * pretty bad too. For small numbers of keys it'd likely be better to use + * a simple insertion sort. 
+ */ + if (*nentries > 1) + { + keyEntryData *keydata; + cmpEntriesArg arg; + + keydata = (keyEntryData *) palloc(*nentries * sizeof(keyEntryData)); + for (i = 0; i < *nentries; i++) + { + keydata[i].datum = entries[i]; + keydata[i].isnull = nullFlags[i]; + } + + arg.cmpDatumFunc = &ginstate->compareFn[attnum - 1]; + arg.collation = ginstate->supportCollation[attnum - 1]; + arg.haveDups = false; + qsort_arg(keydata, *nentries, sizeof(keyEntryData), + cmpEntries, (void *) &arg); + + if (arg.haveDups) + { + /* there are duplicates, must get rid of 'em */ + int32 j; + + entries[0] = keydata[0].datum; + nullFlags[0] = keydata[0].isnull; + j = 1; /* j is the number of distinct keys written back so far */ + for (i = 1; i < *nentries; i++) + { + if (cmpEntries(&keydata[i - 1], &keydata[i], &arg) != 0) + { + entries[j] = keydata[i].datum; + nullFlags[j] = keydata[i].isnull; + j++; + } + } + *nentries = j; + } + else + { + /* easy, no duplicates */ + for (i = 0; i < *nentries; i++) + { + entries[i] = keydata[i].datum; + nullFlags[i] = keydata[i].isnull; + } + } + + pfree(keydata); + } + + /* + * Create GinNullCategory representation from nullFlags. + */ + *categories = (GinNullCategory *) palloc0(*nentries * sizeof(GinNullCategory)); + for (i = 0; i < *nentries; i++) + (*categories)[i] = (nullFlags[i] ? 
GIN_CAT_NULL_KEY : GIN_CAT_NORM_KEY); + + return entries; +} + +bytea * /* Parse the GIN reloptions "fastupdate" and "gin_pending_list_limit" into a GinOptions struct by way of build_reloptions */ +ginoptions(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"fastupdate", RELOPT_TYPE_BOOL, offsetof(GinOptions, useFastUpdate)}, + {"gin_pending_list_limit", RELOPT_TYPE_INT, offsetof(GinOptions, + pendingListCleanupSize)} + }; + + return (bytea *) build_reloptions(reloptions, validate, + RELOPT_KIND_GIN, + sizeof(GinOptions), + tab, lengthof(tab)); +} + +/* + * Fetch index's statistical data into *stats + * + * Note: in the result, nPendingPages can be trusted to be up-to-date, + * as can ginVersion; but the other fields are as of the last VACUUM. 
+ */ +void +ginGetStats(Relation index, GinStatsData *stats) +{ + Buffer metabuffer; + Page metapage; + GinMetaPageData *metadata; + + metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); + LockBuffer(metabuffer, GIN_SHARE); + metapage = BufferGetPage(metabuffer); + metadata = GinPageGetMeta(metapage); + + stats->nPendingPages = metadata->nPendingPages; + stats->nTotalPages = metadata->nTotalPages; + stats->nEntryPages = metadata->nEntryPages; + stats->nDataPages = metadata->nDataPages; + stats->nEntries = metadata->nEntries; + stats->ginVersion = metadata->ginVersion; + + UnlockReleaseBuffer(metabuffer); +} + +/* + * Write the given statistics to the index's metapage + * + * Note: nPendingPages and ginVersion are *not* copied over + */ +void +ginUpdateStats(Relation index, const GinStatsData *stats, bool is_build) +{ + Buffer metabuffer; + Page metapage; + GinMetaPageData *metadata; + + metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); + LockBuffer(metabuffer, GIN_EXCLUSIVE); + metapage = BufferGetPage(metabuffer); + metadata = GinPageGetMeta(metapage); + + START_CRIT_SECTION(); + + metadata->nTotalPages = stats->nTotalPages; + metadata->nEntryPages = stats->nEntryPages; + metadata->nDataPages = stats->nDataPages; + metadata->nEntries = stats->nEntries; + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c compresses + * the page. (We must do this here because pre-v11 versions of PG did not + * set the metapage's pd_lower correctly, so a pg_upgraded index might + * contain the wrong value.) 
+ */ + ((PageHeader) metapage)->pd_lower = + ((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage; + + MarkBufferDirty(metabuffer); + + if (RelationNeedsWAL(index) && !is_build) /* no WAL record is written here when is_build is true */ + { + XLogRecPtr recptr; + ginxlogUpdateMeta data; + + data.node = index->rd_node; + data.ntuples = 0; + data.newRightlink = data.prevTail = InvalidBlockNumber; + memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); + + XLogBeginInsert(); + XLogRegisterData((char *) &data, sizeof(ginxlogUpdateMeta)); + XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT | REGBUF_STANDARD); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE); + PageSetLSN(metapage, recptr); + } + + UnlockReleaseBuffer(metabuffer); + + END_CRIT_SECTION(); +} diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c new file mode 100644 index 0000000..a276eb0 --- /dev/null +++ b/src/backend/access/gin/ginvacuum.c @@ -0,0 +1,822 @@ +/*------------------------------------------------------------------------- + * + * ginvacuum.c + * delete & vacuum routines for the postgres GIN + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginvacuum.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/xloginsert.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "postmaster/autovacuum.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "utils/memutils.h" + +struct GinVacuumState +{ + Relation index; + IndexBulkDeleteResult *result; + IndexBulkDeleteCallback callback; + void *callback_state; + GinState ginstate; + BufferAccessStrategy strategy; + MemoryContext tmpCxt; +}; + +/* + * Vacuums an uncompressed posting list. 
The size of the list is given as a + * number of items (nitem). + * + * If none of the items need to be removed, returns NULL. Otherwise returns + * a new palloc'd array with the remaining items. The number of remaining + * items is returned in *nremaining. + */ +ItemPointer +ginVacuumItemPointers(GinVacuumState *gvs, ItemPointerData *items, + int nitem, int *nremaining) +{ + int i, + remaining = 0; + ItemPointer tmpitems = NULL; + + /* + * Iterate over TIDs array + */ + for (i = 0; i < nitem; i++) + { + if (gvs->callback(items + i, gvs->callback_state)) + { + gvs->result->tuples_removed += 1; + if (!tmpitems) + { + /* + * First TID to be deleted: allocate memory to hold the + * remaining items. + */ + tmpitems = palloc(sizeof(ItemPointerData) * nitem); + memcpy(tmpitems, items, sizeof(ItemPointerData) * i); /* the i items before this one were all kept */ + } + } + else + { + gvs->result->num_index_tuples += 1; + if (tmpitems) + tmpitems[remaining] = items[i]; + remaining++; + } + } + + *nremaining = remaining; + return tmpitems; +} + +/* + * Create a WAL record for vacuuming entry tree leaf page. + */ +static void +xlogVacuumPage(Relation index, Buffer buffer) +{ + Page page = BufferGetPage(buffer); + XLogRecPtr recptr; + + /* This is only used for entry tree leaf pages. */ + Assert(!GinPageIsData(page)); + Assert(GinPageIsLeaf(page)); + + if (!RelationNeedsWAL(index)) + return; + + /* + * Always create a full image, we don't track the changes on the page at + * any more fine-grained level. This could obviously be improved... 
+ */ + XLogBeginInsert(); + XLogRegisterBuffer(0, buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_PAGE); + PageSetLSN(page, recptr); +} + + +typedef struct DataPageDeleteStack +{ + struct DataPageDeleteStack *child; + struct DataPageDeleteStack *parent; + + BlockNumber blkno; /* current block number */ + Buffer leftBuffer; /* pinned and locked rightmost non-deleted page + * on left */ + bool isRoot; +} DataPageDeleteStack; + + +/* + * Delete a posting tree page. + */ +static void +ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno, + BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot) +{ + Buffer dBuffer; + Buffer lBuffer; + Buffer pBuffer; + Page page, + parentPage; + BlockNumber rightlink; + + /* + * This function MUST be called only if one of the parent pages holds an + * exclusive cleanup lock. This guarantees that no insertions currently + * happen in this subtree. Caller also acquires Exclusive locks on + * deletable, parent and left pages. + */ + lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno, + RBM_NORMAL, gvs->strategy); + dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno, + RBM_NORMAL, gvs->strategy); + pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno, + RBM_NORMAL, gvs->strategy); + + page = BufferGetPage(dBuffer); + rightlink = GinPageGetOpaque(page)->rightlink; + + /* + * Any insert which would have gone on the leaf block will now go to its + * right sibling. 
+ */ + PredicateLockPageCombine(gvs->index, deleteBlkno, rightlink); + + START_CRIT_SECTION(); + + /* Unlink the page by changing left sibling's rightlink */ + page = BufferGetPage(lBuffer); + GinPageGetOpaque(page)->rightlink = rightlink; + + /* Delete downlink from parent */ + parentPage = BufferGetPage(pBuffer); +#ifdef USE_ASSERT_CHECKING + do + { + PostingItem *tod = GinDataPageGetPostingItem(parentPage, myoff); + + Assert(PostingItemGetBlockNumber(tod) == deleteBlkno); + } while (0); +#endif + GinPageDeletePostingItem(parentPage, myoff); + + page = BufferGetPage(dBuffer); + + /* + * We must not change the deleted page's rightlink field, so that running + * search scans can still follow it + */ + + /* + * Mark page as deleted, and remember last xid which could know its + * address. + */ + GinPageSetDeleted(page); + GinPageSetDeleteXid(page, ReadNextTransactionId()); + + MarkBufferDirty(pBuffer); + MarkBufferDirty(lBuffer); + MarkBufferDirty(dBuffer); + + if (RelationNeedsWAL(gvs->index)) + { + XLogRecPtr recptr; + ginxlogDeletePage data; + + /* + * We can't pass REGBUF_STANDARD for the deleted page, because we + * didn't set pd_lower on pre-9.4 versions. The page might've been + * binary-upgraded from an older version, and hence not have pd_lower + * set correctly. Ditto for the left page, but removing the item from + * the parent updated its pd_lower, so we know that's OK at this + * point. 
+ */ + XLogBeginInsert(); + XLogRegisterBuffer(0, dBuffer, 0); + XLogRegisterBuffer(1, pBuffer, REGBUF_STANDARD); + XLogRegisterBuffer(2, lBuffer, 0); + + data.parentOffset = myoff; + data.rightLink = GinPageGetOpaque(page)->rightlink; + data.deleteXid = GinPageGetDeleteXid(page); + + XLogRegisterData((char *) &data, sizeof(ginxlogDeletePage)); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_PAGE); + PageSetLSN(page, recptr); + PageSetLSN(parentPage, recptr); + PageSetLSN(BufferGetPage(lBuffer), recptr); + } + + ReleaseBuffer(pBuffer); /* only the pins taken above are dropped; this function does not lock or unlock the buffers */ + ReleaseBuffer(lBuffer); + ReleaseBuffer(dBuffer); + + END_CRIT_SECTION(); + + gvs->result->pages_newly_deleted++; + gvs->result->pages_deleted++; +} + + +/* + * Scans posting tree and deletes empty pages. Caller must lock root page for + * cleanup. During scan path from root to current page is kept exclusively + * locked. Also keep left page exclusively locked, because ginDeletePage() + * needs it. If we try to relock left page later, it could deadlock with + * ginStepRight(). 
+ */
+static bool
+ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
+				DataPageDeleteStack *parent, OffsetNumber myoff)
+{
+	DataPageDeleteStack *me;
+	Buffer		buffer;
+	Page		page;
+	bool		meDelete = false;	/* becomes the return value */
+	bool		isempty;
+	/*
+	 * Returns true if page blkno was deleted.  myoff is the offset of this
+	 * page's downlink in its parent; it is only passed on to ginDeletePage().
+	 *
+	 * Pick the stack entry for this level: the root uses the entry supplied
+	 * by the caller, every other level uses parent->child, which is
+	 * allocated on first use and reused for later pages of the same level.
+	 */
+	if (isRoot)
+	{
+		me = parent;
+	}
+	else
+	{
+		if (!parent->child)
+		{
+			me = (DataPageDeleteStack *) palloc0(sizeof(DataPageDeleteStack));
+			me->parent = parent;
+			parent->child = me;
+			me->leftBuffer = InvalidBuffer;
+		}
+		else
+			me = parent->child;
+	}
+
+	buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
+								RBM_NORMAL, gvs->strategy);
+
+	if (!isRoot)				/* the root is not locked here */
+		LockBuffer(buffer, GIN_EXCLUSIVE);
+
+	page = BufferGetPage(buffer);
+
+	Assert(GinPageIsData(page));
+
+	if (!GinPageIsLeaf(page))
+	{
+		OffsetNumber i;
+
+		me->blkno = blkno;
+		for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff; i++)
+		{
+			PostingItem *pitem = GinDataPageGetPostingItem(page, i);
+
+			if (ginScanToDelete(gvs, PostingItemGetBlockNumber(pitem), false, me, i))
+				i--;			/* child deleted, so its downlink was removed
+								 * from this page: visit offset i again */
+		}
+		/*
+		 * All children of a rightmost internal page have been visited:
+		 * release the left-sibling buffer still remembered at the child
+		 * level, if any.
+		 */
+		if (GinPageRightMost(page) && BufferIsValid(me->child->leftBuffer))
+		{
+			UnlockReleaseBuffer(me->child->leftBuffer);
+			me->child->leftBuffer = InvalidBuffer;
+		}
+	}
+
+	if (GinPageIsLeaf(page))
+		isempty = GinDataLeafPageIsEmpty(page);
+	else
+		isempty = GinPageGetOpaque(page)->maxoff < FirstOffsetNumber;
+
+	if (isempty)
+	{
+		/*
+		 * We never delete the left- or rightmost branch: an empty page is
+		 * kept when no left sibling is remembered at this level (leftBuffer
+		 * is invalid) or when the page is the rightmost one.
+		 */
+		if (BufferIsValid(me->leftBuffer) && !GinPageRightMost(page))
+		{
+			Assert(!isRoot);
+			ginDeletePage(gvs, blkno, BufferGetBlockNumber(me->leftBuffer),
+						  me->parent->blkno, myoff, me->parent->isRoot);
+			meDelete = true;
+		}
+	}
+	/*
+	 * A surviving page becomes the remembered left sibling for this level,
+	 * replacing (and releasing) the previous one.  A deleted page is unlocked
+	 * and released, and the previously remembered left sibling is kept.
+	 */
+	if (!meDelete)
+	{
+		if (BufferIsValid(me->leftBuffer))
+			UnlockReleaseBuffer(me->leftBuffer);
+		me->leftBuffer = buffer;
+	}
+	else
+	{
+		if (!isRoot)
+			LockBuffer(buffer, GIN_UNLOCK);
+
+		ReleaseBuffer(buffer);
+	}
+
+	if (isRoot)					/* NOTE(review): presumably drops the extra
+								 * pin taken by ReadBufferExtended() above,
+								 * the caller keeping its own pin and lock on
+								 * the root -- confirm */
+		ReleaseBuffer(buffer);
+
+	return meDelete;
+}
+
+
+/*
+ * Scan through posting tree leafs, delete empty tuples.
Returns true if there
+ * is at least one empty page.
+ */
+static bool
+ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno)
+{
+	Buffer		buffer;
+	Page		page;
+	bool		hasVoidPage = false;
+	MemoryContext oldCxt;
+
+	/*
+	 * Find leftmost leaf page of posting tree and lock it in exclusive mode.
+	 * Internal pages are read under a share lock and left through their
+	 * first downlink; on reaching a leaf the share lock is released and an
+	 * exclusive lock is taken instead.
+	 */
+	while (true)
+	{
+		PostingItem *pitem;
+
+		buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
+									RBM_NORMAL, gvs->strategy);
+		LockBuffer(buffer, GIN_SHARE);
+		page = BufferGetPage(buffer);
+
+		Assert(GinPageIsData(page));
+
+		if (GinPageIsLeaf(page))
+		{
+			LockBuffer(buffer, GIN_UNLOCK);
+			LockBuffer(buffer, GIN_EXCLUSIVE);
+			break;
+		}
+
+		Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);
+
+		pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber);
+		blkno = PostingItemGetBlockNumber(pitem);
+		Assert(blkno != InvalidBlockNumber);
+
+		UnlockReleaseBuffer(buffer);
+	}
+
+	/*
+	 * Iterate all posting tree leaves using rightlinks and vacuum them.  Each
+	 * leaf is vacuumed with gvs->tmpCxt as the current memory context, and
+	 * that context is reset after every leaf.
+	 */
+	while (true)
+	{
+		oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
+		ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
+		MemoryContextSwitchTo(oldCxt);
+		MemoryContextReset(gvs->tmpCxt);
+
+		if (GinDataLeafPageIsEmpty(page))
+			hasVoidPage = true;
+
+		blkno = GinPageGetOpaque(page)->rightlink;	/* read before unlocking */
+
+		UnlockReleaseBuffer(buffer);
+
+		if (blkno == InvalidBlockNumber)
+			break;
+
+		buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
+									RBM_NORMAL, gvs->strategy);
+		LockBuffer(buffer, GIN_EXCLUSIVE);
+		page = BufferGetPage(buffer);
+	}
+
+	return hasVoidPage;
+}
+/*
+ * Vacuum the posting tree rooted at rootBlkno.  All leaves are vacuumed
+ * first; only if that pass reports an empty leaf is the root locked for
+ * cleanup and the tree rescanned with ginScanToDelete() to delete empty
+ * pages.
+ */
+static void
+ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
+{
+	if (ginVacuumPostingTreeLeaves(gvs, rootBlkno))
+	{
+		/*
+		 * There is at least one empty page.  So we have to rescan the tree
+		 * deleting empty pages.
+		 */
+		Buffer		buffer;
+		DataPageDeleteStack root,
+				   *ptr,
+				   *tmp;
+
+		buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, rootBlkno,
+									RBM_NORMAL, gvs->strategy);
+
+		/*
+		 * Lock posting tree root for cleanup to ensure there are no
+		 * concurrent inserts.
+		 */
+		LockBufferForCleanup(buffer);
+
+		memset(&root, 0, sizeof(DataPageDeleteStack));
+		root.leftBuffer = InvalidBuffer;
+		root.isRoot = true;
+
+		ginScanToDelete(gvs, rootBlkno, true, &root, InvalidOffsetNumber);
+
+		ptr = root.child;		/* free the per-level stack entries that
+								 * ginScanToDelete() allocated; root itself
+								 * is a local variable */
+
+		while (ptr)
+		{
+			tmp = ptr->child;
+			pfree(ptr);
+			ptr = tmp;
+		}
+
+		UnlockReleaseBuffer(buffer);
+	}
+}
+
+/*
+ * Vacuum the tuples of one entry-tree page.
+ *
+ * Returns the modified page, or NULL if the page wasn't modified.  The
+ * function works on the original page until the first change is needed; at
+ * that point the page is copied into a temporary one and every change is
+ * made on the copy.
+ *
+ * Block numbers of the posting-tree roots found on the page are stored in
+ * roots[] and their count in *nroot; the posting trees themselves are not
+ * touched here.
+ */
+static Page
+ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint32 *nroot)
+{
+	Page		origpage = BufferGetPage(buffer),
+				tmppage;
+	OffsetNumber i,
+				maxoff = PageGetMaxOffsetNumber(origpage);
+
+	tmppage = origpage;
+
+	*nroot = 0;
+
+	for (i = FirstOffsetNumber; i <= maxoff; i++)
+	{
+		IndexTuple	itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i));
+
+		if (GinIsPostingTree(itup))
+		{
+			/*
+			 * Store the posting tree's root for further processing; we can't
+			 * vacuum it just now due to risk of deadlocks with scans/inserts.
+			 */
+			roots[*nroot] = GinGetDownlink(itup);
+			(*nroot)++;
+		}
+		else if (GinGetNPosting(itup) > 0)
+		{
+			int			nitems;
+			ItemPointer items_orig;
+			bool		free_items_orig;
+			ItemPointer items;
+
+			/*
+			 * Get list of item pointers from the tuple.  A compressed list
+			 * is decoded into a separate array that must be freed below; an
+			 * uncompressed list is used directly from the tuple.
+			 */
+			if (GinItupIsCompressed(itup))
+			{
+				items_orig = ginPostingListDecode((GinPostingList *) GinGetPosting(itup), &nitems);
+				free_items_orig = true;
+			}
+			else
+			{
+				items_orig = (ItemPointer) GinGetPosting(itup);
+				nitems = GinGetNPosting(itup);
+				free_items_orig = false;
+			}
+
+			/* Remove any items from the list that need to be vacuumed. */
+			items = ginVacuumItemPointers(gvs, items_orig, nitems, &nitems);
+
+			if (free_items_orig)
+				pfree(items_orig);
+
+			/* If any item pointers were removed, recreate the tuple. */
+			if (items)
+			{
+				OffsetNumber attnum;
+				Datum		key;
+				GinNullCategory category;
+				GinPostingList *plist;
+				int			plistsize;
+
+				if (nitems > 0)
+				{
+					plist = ginCompressPostingList(items, nitems, GinMaxItemSize, NULL);
+					plistsize = SizeOfGinPostingList(plist);
+				}
+				else
+				{
+					plist = NULL;
+					plistsize = 0;
+				}
+
+				/*
+				 * If we have not created a temporary page yet, do so now;
+				 * once it exists, later changes are made on it in place.
+				 */
+				if (tmppage == origpage)
+				{
+					/*
+					 * On first difference, create a temporary copy of the
+					 * page and copy the tuple's posting list to it.
+					 */
+					tmppage = PageGetTempPageCopy(origpage);
+
+					/* set itup pointer to new page */
+					itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i));
+				}
+
+				attnum = gintuple_get_attrnum(&gvs->ginstate, itup);
+				key = gintuple_get_key(&gvs->ginstate, itup, &category);
+				itup = GinFormTuple(&gvs->ginstate, attnum, key, category,
+									(char *) plist, plistsize,
+									nitems, true);
+				if (plist)
+					pfree(plist);
+				PageIndexTupleDelete(tmppage, i);	/* replace the tuple at
+													 * the same offset */
+
+				if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i)
+					elog(ERROR, "failed to add item to index page in \"%s\"",
+						 RelationGetRelationName(gvs->index));
+
+				pfree(itup);
+				pfree(items);
+			}
+		}
+	}
+
+	return (tmppage == origpage) ? NULL : tmppage;
+}
+/*
+ * Bulk-delete entry point for a GIN index.
+ *
+ * Descends to the leftmost leaf page of the entry tree, then moves right
+ * along the leaf level.  Each page is vacuumed with ginVacuumEntryPage();
+ * the posting trees it references are vacuumed afterwards, once the entry
+ * page has been released.  callback and callback_state are only stored in
+ * the vacuum state here.
+ *
+ * On the first call (stats == NULL) the stats struct is allocated and
+ * pending inserts are cleaned up with ginInsertCleanup().  Returns the stats
+ * struct.
+ *
+ * NOTE(review): rootOfPostingTree[] is sized as BLCKSZ / (sizeof(IndexTupleData)
+ * + sizeof(ItemId)), presumably an upper bound on the number of tuples on one
+ * page -- confirm.
+ */
+IndexBulkDeleteResult *
+ginbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+			  IndexBulkDeleteCallback callback, void *callback_state)
+{
+	Relation	index = info->index;
+	BlockNumber blkno = GIN_ROOT_BLKNO;
+	GinVacuumState gvs;
+	Buffer		buffer;
+	BlockNumber rootOfPostingTree[BLCKSZ / (sizeof(IndexTupleData) + sizeof(ItemId))];
+	uint32		nRoot;
+
+	gvs.tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
+									   "Gin vacuum temporary context",
+									   ALLOCSET_DEFAULT_SIZES);
+	gvs.index = index;
+	gvs.callback = callback;
+	gvs.callback_state = callback_state;
+	gvs.strategy = info->strategy;
+	initGinState(&gvs.ginstate, index);
+
+	/* first time through? */
+	if (stats == NULL)
+	{
+		/* Yes, so initialize stats to zeroes */
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+
+		/*
+		 * and cleanup any pending inserts
+		 */
+		ginInsertCleanup(&gvs.ginstate, !IsAutoVacuumWorkerProcess(),
+						 false, true, stats);
+	}
+
+	/* we'll re-count the tuples each time */
+	stats->num_index_tuples = 0;
+	gvs.result = stats;
+
+	buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
+								RBM_NORMAL, info->strategy);
+
+	/*
+	 * Find the leftmost leaf page, following the first downlink of each
+	 * internal page.  The share lock on a leaf is swapped for an exclusive
+	 * one; if the page is the root and is no longer a leaf once relocked, it
+	 * is unlocked and examined again.
+	 */
+	for (;;)
+	{
+		Page		page = BufferGetPage(buffer);
+		IndexTuple	itup;
+
+		LockBuffer(buffer, GIN_SHARE);
+
+		Assert(!GinPageIsData(page));
+
+		if (GinPageIsLeaf(page))
+		{
+			LockBuffer(buffer, GIN_UNLOCK);
+			LockBuffer(buffer, GIN_EXCLUSIVE);
+
+			if (blkno == GIN_ROOT_BLKNO && !GinPageIsLeaf(page))
+			{
+				LockBuffer(buffer, GIN_UNLOCK);
+				continue;		/* check it once more */
+			}
+			break;
+		}
+
+		Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);
+
+		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber));
+		blkno = GinGetDownlink(itup);
+		Assert(blkno != InvalidBlockNumber);
+
+		UnlockReleaseBuffer(buffer);
+		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
+									RBM_NORMAL, info->strategy);
+	}
+
+	/*
+	 * We have now found the leftmost leaf page of the entry B-tree, and it
+	 * is exclusively locked.
+	 */
+
+	for (;;)
+	{
+		Page		page = BufferGetPage(buffer);
+		Page		resPage;
+		uint32		i;
+
+		Assert(!GinPageIsData(page));
+
+		resPage = ginVacuumEntryPage(&gvs, buffer, rootOfPostingTree, &nRoot);
+
+		blkno = GinPageGetOpaque(page)->rightlink;	/* read before unlocking */
+
+		if (resPage)
+		{
+			START_CRIT_SECTION();
+			PageRestoreTempPage(resPage, page);
+			MarkBufferDirty(buffer);
+			xlogVacuumPage(gvs.index, buffer);
+			UnlockReleaseBuffer(buffer);
+			END_CRIT_SECTION();
+		}
+		else
+		{
+			UnlockReleaseBuffer(buffer);
+		}
+
+		vacuum_delay_point();
+
+		for (i = 0; i < nRoot; i++)
+		{
+			ginVacuumPostingTree(&gvs, rootOfPostingTree[i]);
+			vacuum_delay_point();
+		}
+
+		if (blkno == InvalidBlockNumber)	/* rightmost page */
+			break;
+
+		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
+									RBM_NORMAL, info->strategy);
+		LockBuffer(buffer, GIN_EXCLUSIVE);
+	}
+
+	MemoryContextDelete(gvs.tmpCxt);
+
+	return gvs.result;
+}
+/*
+ * Post-vacuum cleanup entry point for a GIN index.
+ *
+ * An ANALYZE-only call returns stats unchanged; before that, pending
+ * insertions are cleaned up, but only in an autovacuum worker.  Otherwise
+ * every block of the index is read: recyclable pages are recorded in the
+ * free space map, and data pages, entry pages and leaf entries are counted
+ * and written to the metapage with ginUpdateStats().  num_index_tuples,
+ * estimated_count, pages_free and num_pages are filled in on the returned
+ * stats struct.
+ */
+IndexBulkDeleteResult *
+ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
+{
+	Relation	index = info->index;
+	bool		needLock;
+	BlockNumber npages,
+				blkno;
+	BlockNumber totFreePages;
+	GinState	ginstate;
+	GinStatsData idxStat;
+
+	/*
+	 * In an autovacuum analyze, we want to clean up pending insertions.
+	 * Otherwise, an ANALYZE-only call is a no-op.
+	 */
+	if (info->analyze_only)
+	{
+		if (IsAutoVacuumWorkerProcess())
+		{
+			initGinState(&ginstate, index);
+			ginInsertCleanup(&ginstate, false, true, true, stats);
+		}
+		return stats;
+	}
+
+	/*
+	 * Set up all-zero stats and cleanup pending inserts if ginbulkdelete
+	 * wasn't called
+	 */
+	if (stats == NULL)
+	{
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+		initGinState(&ginstate, index);
+		ginInsertCleanup(&ginstate, !IsAutoVacuumWorkerProcess(),
+						 false, true, stats);
+	}
+
+	memset(&idxStat, 0, sizeof(idxStat));
+
+	/*
+	 * XXX we always report the heap tuple count as the number of index
+	 * entries.  This is bogus if the index is partial, but it's real hard to
+	 * tell how many distinct heap entries are referenced by a GIN index.
+	 */
+	stats->num_index_tuples = Max(info->num_heap_tuples, 0);
+	stats->estimated_count = info->estimated_count;
+
+	/*
+	 * Need lock unless it's local to this backend.
+	 */
+	needLock = !RELATION_IS_LOCAL(index);
+
+	if (needLock)
+		LockRelationForExtension(index, ExclusiveLock);
+	npages = RelationGetNumberOfBlocks(index);
+	if (needLock)
+		UnlockRelationForExtension(index, ExclusiveLock);
+
+	totFreePages = 0;
+
+	for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++)
+	{
+		Buffer		buffer;
+		Page		page;
+
+		vacuum_delay_point();
+
+		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
+									RBM_NORMAL, info->strategy);
+		LockBuffer(buffer, GIN_SHARE);
+		page = (Page) BufferGetPage(buffer);
+
+		if (GinPageIsRecyclable(page))
+		{
+			Assert(blkno != GIN_ROOT_BLKNO);
+			RecordFreeIndexPage(index, blkno);
+			totFreePages++;
+		}
+		else if (GinPageIsData(page))
+		{
+			idxStat.nDataPages++;
+		}
+		else if (!GinPageIsList(page))	/* list pages are not counted */
+		{
+			idxStat.nEntryPages++;
+
+			if (GinPageIsLeaf(page))
+				idxStat.nEntries += PageGetMaxOffsetNumber(page);
+		}
+
+		UnlockReleaseBuffer(buffer);
+	}
+
+	/* Update the metapage with accurate page and entry counts */
+	idxStat.nTotalPages = npages;
+	ginUpdateStats(info->index, &idxStat, false);
+
+	/* Finally, vacuum the FSM */
+	IndexFreeSpaceMapVacuum(info->index);
+
+	stats->pages_free = totFreePages;
+
+	if (needLock)
+		LockRelationForExtension(index, ExclusiveLock);
+	stats->num_pages = RelationGetNumberOfBlocks(index);
+	if (needLock)
+		UnlockRelationForExtension(index, ExclusiveLock);
+
+	return stats;
+}
+
+/*
+ * Return whether Page can safely be recycled.
+ */
+bool
+GinPageIsRecyclable(Page page)
+{
+	TransactionId delete_xid;
+
+	if (PageIsNew(page))		/* a new page is always recyclable */
+		return true;
+
+	if (!GinPageIsDeleted(page))	/* a page not marked deleted never is */
+		return false;
+
+	delete_xid = GinPageGetDeleteXid(page);
+
+	if (!TransactionIdIsValid(delete_xid))	/* deleted page without a valid
+											 * delete xid: recyclable */
+		return true;
+
+	/*
+	 * If no backend could still see delete_xid as running, all scans
+	 * concurrent with ginDeletePage() must have finished.
+	 */
+	return GlobalVisCheckRemovableXid(NULL, delete_xid);
+}
diff --git a/src/backend/access/gin/ginvalidate.c b/src/backend/access/gin/ginvalidate.c
new file mode 100644
index 0000000..d2510da
--- /dev/null
+++ b/src/backend/access/gin/ginvalidate.c
@@ -0,0 +1,338 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginvalidate.c
+ *	  Opclass validator for GIN.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *			src/backend/access/gin/ginvalidate.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/amvalidate.h"
+#include "access/gin_private.h"
+#include "access/htup_details.h"
+#include "catalog/pg_amop.h"
+#include "catalog/pg_amproc.h"
+#include "catalog/pg_opclass.h"
+#include "catalog/pg_opfamily.h"
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/regproc.h"
+#include "utils/syscache.h"
+
+/*
+ * Validator for a GIN opclass.
+ */
+bool
+ginvalidate(Oid opclassoid)
+{
+	bool		result = true;
+	HeapTuple	classtup;
+	Form_pg_opclass classform;
+	Oid			opfamilyoid;
+	Oid			opcintype;
+	Oid			opckeytype;
+	char	   *opclassname;
+	HeapTuple	familytup;
+	Form_pg_opfamily familyform;
+	char	   *opfamilyname;
+	CatCList   *proclist,
+			   *oprlist;
+	List	   *grouplist;
+	OpFamilyOpFuncGroup *opclassgroup;
+	int			i;
+	ListCell   *lc;
+
+	/*
+	 * Overview: look up the opclass and its opfamily, check every support
+	 * function and operator of the family, then check that the opclass
+	 * itself has the support functions it needs.  Each problem found is
+	 * reported at INFO level and makes the function return false; only a
+	 * failed catalog lookup raises an error.
+	 *
+	 * Fetch opclass information.  The key type defaults to the input type
+	 * when the opclass does not specify one.
+	 */
+	classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid));
+	if (!HeapTupleIsValid(classtup))
+		elog(ERROR, "cache lookup failed for operator class %u", opclassoid);
+	classform = (Form_pg_opclass) GETSTRUCT(classtup);
+
+	opfamilyoid = classform->opcfamily;
+	opcintype = classform->opcintype;
+	opckeytype = classform->opckeytype;
+	if (!OidIsValid(opckeytype))
+		opckeytype = opcintype;
+	opclassname = NameStr(classform->opcname);
+
+	/* Fetch opfamily information */
+	familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid));
+	if (!HeapTupleIsValid(familytup))
+		elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid);
+	familyform = (Form_pg_opfamily) GETSTRUCT(familytup);
+
+	opfamilyname = NameStr(familyform->opfname);
+
+	/* Fetch all operators and support functions of the opfamily */
+	oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid));
+	proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid));
+
+	/* Check individual support functions */
+	for (i = 0; i < proclist->n_members; i++)
+	{
+		HeapTuple	proctup = &proclist->members[i]->tuple;
+		Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup);
+		bool		ok;
+
+		/*
+		 * All GIN support functions should be registered with matching
+		 * left/right types
+		 */
+		if (procform->amproclefttype != procform->amprocrighttype)
+		{
+			ereport(INFO,
+					(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+					 errmsg("operator family \"%s\" of access method %s contains support function %s with different left and right input types",
+							opfamilyname, "gin",
+							format_procedure(procform->amproc))));
+			result = false;
+		}
+
+		/*
+		 * We can't check signatures except within the specific opclass, since
+		 * we need to know the associated opckeytype in many cases.
+		 */
+		if (procform->amproclefttype != opcintype)
+			continue;
+
+		/*
+		 * Check procedure numbers and function signatures.  In each call
+		 * below, the two integers after the "false" argument are the minimum
+		 * and maximum number of arguments accepted, which is how the
+		 * optional trailing arguments mentioned in the per-case comments are
+		 * allowed for.
+		 */
+		switch (procform->amprocnum)
+		{
+			case GIN_COMPARE_PROC:
+				ok = check_amproc_signature(procform->amproc, INT4OID, false,
+											2, 2, opckeytype, opckeytype);
+				break;
+			case GIN_EXTRACTVALUE_PROC:
+				/* Some opclasses omit nullFlags */
+				ok = check_amproc_signature(procform->amproc, INTERNALOID, false,
+											2, 3, opcintype, INTERNALOID,
+											INTERNALOID);
+				break;
+			case GIN_EXTRACTQUERY_PROC:
+				/* Some opclasses omit nullFlags and searchMode */
+				ok = check_amproc_signature(procform->amproc, INTERNALOID, false,
+											5, 7, opcintype, INTERNALOID,
+											INT2OID, INTERNALOID, INTERNALOID,
+											INTERNALOID, INTERNALOID);
+				break;
+			case GIN_CONSISTENT_PROC:
+				/* Some opclasses omit queryKeys and nullFlags */
+				ok = check_amproc_signature(procform->amproc, BOOLOID, false,
+											6, 8, INTERNALOID, INT2OID,
+											opcintype, INT4OID,
+											INTERNALOID, INTERNALOID,
+											INTERNALOID, INTERNALOID);
+				break;
+			case GIN_COMPARE_PARTIAL_PROC:
+				ok = check_amproc_signature(procform->amproc, INT4OID, false,
+											4, 4, opckeytype, opckeytype,
+											INT2OID, INTERNALOID);
+				break;
+			case GIN_TRICONSISTENT_PROC:
+				ok = check_amproc_signature(procform->amproc, CHAROID, false,
+											7, 7, INTERNALOID, INT2OID,
+											opcintype, INT4OID,
+											INTERNALOID, INTERNALOID,
+											INTERNALOID);
+				break;
+			case GIN_OPTIONS_PROC:
+				ok = check_amoptsproc_signature(procform->amproc);
+				break;
+			default:
+				ereport(INFO,
+						(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+						 errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d",
+								opfamilyname, "gin",
+								format_procedure(procform->amproc),
+								procform->amprocnum)));
+				result = false;
+				continue;		/* don't want additional message */
+		}
+
+		if (!ok)
+		{
+			ereport(INFO,
+					(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+					 errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d",
+							opfamilyname, "gin",
+							format_procedure(procform->amproc),
+							procform->amprocnum)));
+			result = false;
+		}
+	}
+
+	/* Check individual operators */
+	for (i = 0; i < oprlist->n_members; i++)
+	{
+		HeapTuple	oprtup = &oprlist->members[i]->tuple;
+		Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup);
+
+		/*
+		 * TODO: Check that only allowed strategy numbers exist.  For now only
+		 * the range 1..63 is enforced.
+		 */
+		if (oprform->amopstrategy < 1 || oprform->amopstrategy > 63)
+		{
+			ereport(INFO,
+					(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+					 errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d",
+							opfamilyname, "gin",
+							format_operator(oprform->amopopr),
+							oprform->amopstrategy)));
+			result = false;
+		}
+
+		/* gin doesn't support ORDER BY operators */
+		if (oprform->amoppurpose != AMOP_SEARCH ||
+			OidIsValid(oprform->amopsortfamily))
+		{
+			ereport(INFO,
+					(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+					 errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s",
+							opfamilyname, "gin",
+							format_operator(oprform->amopopr))));
+			result = false;
+		}
+
+		/* Check operator signature --- same for all gin strategies */
+		if (!check_amop_signature(oprform->amopopr, BOOLOID,
+								  oprform->amoplefttype,
+								  oprform->amoprighttype))
+		{
+			ereport(INFO,
+					(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+					 errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature",
+							opfamilyname, "gin",
+							format_operator(oprform->amopopr))));
+			result = false;
+		}
+	}
+
+	/* Now check for inconsistent groups of operators/functions */
+	grouplist = identify_opfamily_groups(oprlist, proclist);
+	opclassgroup = NULL;
+	foreach(lc, grouplist)
+	{
+		OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc);
+
+		/* Remember the group exactly matching the test opclass */
+		if (thisgroup->lefttype == opcintype &&
+			thisgroup->righttype == opcintype)
+			opclassgroup = thisgroup;
+
+		/*
+		 * There is not a lot we can do to check the operator sets, since each
+		 * GIN opclass is more or less a law unto itself, and some contain
+		 * only operators that are binary-compatible with the opclass datatype
+		 * (meaning that empty operator sets can be OK).  That case also means
+		 * that we shouldn't insist on nonempty function sets except for the
+		 * opclass's own group.
+		 */
+	}
+
+	/*
+	 * Check that the originally-named opclass is complete.  The loop reports
+	 * each missing support function that is neither optional nor one of the
+	 * two consistent functions; the test after the loop reports the case
+	 * where both consistent functions are missing.
+	 *
+	 * NOTE(review): the masks after the loop are built from a plain int 1,
+	 * whereas the loop uses (uint64) 1; that looks equivalent as long as the
+	 * two support numbers involved are small -- confirm.
+	 */
+	for (i = 1; i <= GINNProcs; i++)
+	{
+		if (opclassgroup &&
+			(opclassgroup->functionset & (((uint64) 1) << i)) != 0)
+			continue;			/* got it */
+		if (i == GIN_COMPARE_PROC || i == GIN_COMPARE_PARTIAL_PROC ||
+			i == GIN_OPTIONS_PROC)
+			continue;			/* optional method */
+		if (i == GIN_CONSISTENT_PROC || i == GIN_TRICONSISTENT_PROC)
+			continue;			/* don't need both, see check below loop */
+		ereport(INFO,
+				(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+				 errmsg("operator class \"%s\" of access method %s is missing support function %d",
+						opclassname, "gin", i)));
+		result = false;
+	}
+	if (!opclassgroup ||
+		((opclassgroup->functionset & (1 << GIN_CONSISTENT_PROC)) == 0 &&
+		 (opclassgroup->functionset & (1 << GIN_TRICONSISTENT_PROC)) == 0))
+	{
+		ereport(INFO,
+				(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+				 errmsg("operator class \"%s\" of access method %s is missing support function %d or %d",
+						opclassname, "gin",
+						GIN_CONSISTENT_PROC, GIN_TRICONSISTENT_PROC)));
+		result = false;
+	}
+
+
+	ReleaseCatCacheList(proclist);
+	ReleaseCatCacheList(oprlist);
+	ReleaseSysCache(familytup);
+	ReleaseSysCache(classtup);
+
+	return result;
+}
+
+/*
+ * Prechecking function for adding operators/functions to a GIN opfamily.
+ */
+void
+ginadjustmembers(Oid opfamilyoid,
+				 Oid opclassoid,
+				 List *operators,
+				 List *functions)
+{
+	ListCell   *lc;
+
+	/*
+	 * Operator members of a GIN opfamily should never have hard dependencies,
+	 * since their connection to the opfamily depends only on what the support
+	 * functions think, and that can be altered.  For consistency, we make all
+	 * soft dependencies point to the opfamily, though a soft dependency on
+	 * the opclass would work as well in the CREATE OPERATOR CLASS case.
+	 *
+	 * Both loops below only rewrite the dependency fields of the list
+	 * members in place.  opclassoid is not referenced in this function.
+	 */
+	foreach(lc, operators)
+	{
+		OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+		op->ref_is_hard = false;
+		op->ref_is_family = true;
+		op->refobjid = opfamilyoid;
+	}
+
+	/*
+	 * Required support functions should have hard dependencies.  Preferably
+	 * those are just dependencies on the opclass, but if we're in ALTER
+	 * OPERATOR FAMILY, we leave the dependency pointing at the whole
+	 * opfamily.  (Given that GIN opclasses generally don't share opfamilies,
+	 * it seems unlikely to be worth working harder.)
+	 *
+	 * For required functions only ref_is_hard is set here; the other
+	 * dependency fields are left as the caller supplied them.  A support
+	 * number that matches none of the cases raises an ERROR.
+	 */
+	foreach(lc, functions)
+	{
+		OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+		switch (op->number)
+		{
+			case GIN_EXTRACTVALUE_PROC:
+			case GIN_EXTRACTQUERY_PROC:
+				/* Required support function */
+				op->ref_is_hard = true;
+				break;
+			case GIN_COMPARE_PROC:
+			case GIN_CONSISTENT_PROC:
+			case GIN_COMPARE_PARTIAL_PROC:
+			case GIN_TRICONSISTENT_PROC:
+			case GIN_OPTIONS_PROC:
+				/* Optional, so force it to be a soft family dependency */
+				op->ref_is_hard = false;
+				op->ref_is_family = true;
+				op->refobjid = opfamilyoid;
+				break;
+			default:
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+						 errmsg("support function number %d is invalid for access method %s",
+								op->number, "gin")));
+				break;
+		}
+	}
+}
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c
new file mode 100644
index 0000000..09ce4d6
--- /dev/null
+++ b/src/backend/access/gin/ginxlog.c
@@ -0,0 +1,813 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginxlog.c
+ *	  WAL replay logic for inverted index.
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginxlog.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/bufmask.h" +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/xlogutils.h" +#include "utils/memutils.h" + +static MemoryContext opCtx; /* working memory for operations */ + +static void +ginRedoClearIncompleteSplit(XLogReaderState *record, uint8 block_id) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buffer; + Page page; + + if (XLogReadBufferForRedo(record, block_id, &buffer) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + GinPageGetOpaque(page)->flags &= ~GIN_INCOMPLETE_SPLIT; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +ginRedoCreatePTree(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + ginxlogCreatePostingTree *data = (ginxlogCreatePostingTree *) XLogRecGetData(record); + char *ptr; + Buffer buffer; + Page page; + + buffer = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(buffer); + + GinInitBuffer(buffer, GIN_DATA | GIN_LEAF | GIN_COMPRESSED); + + ptr = XLogRecGetData(record) + sizeof(ginxlogCreatePostingTree); + + /* Place page data */ + memcpy(GinDataLeafPageGetPostingList(page), ptr, data->size); + + GinDataPageSetDataSize(page, data->size); + + PageSetLSN(page, lsn); + + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); +} + +static void +ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rdata) +{ + Page page = BufferGetPage(buffer); + ginxlogInsertEntry *data = (ginxlogInsertEntry *) rdata; + OffsetNumber offset = data->offset; + IndexTuple itup; + + if (rightblkno != InvalidBlockNumber) + { + /* update link to right 
page after split */ + Assert(!GinPageIsLeaf(page)); + Assert(offset >= FirstOffsetNumber && offset <= PageGetMaxOffsetNumber(page)); + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offset)); + GinSetDownlink(itup, rightblkno); + } + + if (data->isDelete) + { + Assert(GinPageIsLeaf(page)); + Assert(offset >= FirstOffsetNumber && offset <= PageGetMaxOffsetNumber(page)); + PageIndexTupleDelete(page, offset); + } + + itup = &data->tuple; + + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), offset, false, false) == InvalidOffsetNumber) + { + RelFileNode node; + ForkNumber forknum; + BlockNumber blknum; + + BufferGetTag(buffer, &node, &forknum, &blknum); + elog(ERROR, "failed to add item to index page in %u/%u/%u", + node.spcNode, node.dbNode, node.relNode); + } +} + +/* + * Redo recompression of posting list. Doing all the changes in-place is not + * always possible, because it might require more space than we've on the page. + * Instead, once modification is required we copy unprocessed tail of the page + * into separately allocated chunk of memory for further reading original + * versions of segments. Thanks to that we don't bother about moving page data + * in-place. + */ +static void +ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data) +{ + int actionno; + int segno; + GinPostingList *oldseg; + Pointer segmentend; + char *walbuf; + int totalsize; + Pointer tailCopy = NULL; + Pointer writePtr; + Pointer segptr; + + /* + * If the page is in pre-9.4 format, convert to new format first. + */ + if (!GinPageIsCompressed(page)) + { + ItemPointer uncompressed = (ItemPointer) GinDataPageGetData(page); + int nuncompressed = GinPageGetOpaque(page)->maxoff; + int npacked; + + /* + * Empty leaf pages are deleted as part of vacuum, but leftmost and + * rightmost pages are never deleted. So, pg_upgrade'd from pre-9.4 + * instances might contain empty leaf pages, and we need to handle + * them correctly. 
+ */ + if (nuncompressed > 0) + { + GinPostingList *plist; + + plist = ginCompressPostingList(uncompressed, nuncompressed, + BLCKSZ, &npacked); + totalsize = SizeOfGinPostingList(plist); + + Assert(npacked == nuncompressed); + + memcpy(GinDataLeafPageGetPostingList(page), plist, totalsize); + } + else + { + totalsize = 0; + } + + GinDataPageSetDataSize(page, totalsize); + GinPageSetCompressed(page); + GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber; + } + + oldseg = GinDataLeafPageGetPostingList(page); + writePtr = (Pointer) oldseg; + segmentend = (Pointer) oldseg + GinDataLeafPageGetPostingListSize(page); + segno = 0; + + walbuf = ((char *) data) + sizeof(ginxlogRecompressDataLeaf); + for (actionno = 0; actionno < data->nactions; actionno++) + { + uint8 a_segno = *((uint8 *) (walbuf++)); + uint8 a_action = *((uint8 *) (walbuf++)); + GinPostingList *newseg = NULL; + int newsegsize = 0; + ItemPointerData *items = NULL; + uint16 nitems = 0; + ItemPointerData *olditems; + int nolditems; + ItemPointerData *newitems; + int nnewitems; + int segsize; + + /* Extract all the information we need from the WAL record */ + if (a_action == GIN_SEGMENT_INSERT || + a_action == GIN_SEGMENT_REPLACE) + { + newseg = (GinPostingList *) walbuf; + newsegsize = SizeOfGinPostingList(newseg); + walbuf += SHORTALIGN(newsegsize); + } + + if (a_action == GIN_SEGMENT_ADDITEMS) + { + memcpy(&nitems, walbuf, sizeof(uint16)); + walbuf += sizeof(uint16); + items = (ItemPointerData *) walbuf; + walbuf += nitems * sizeof(ItemPointerData); + } + + /* Skip to the segment that this action concerns */ + Assert(segno <= a_segno); + while (segno < a_segno) + { + /* + * Once modification is started and page tail is copied, we've to + * copy unmodified segments. 
+ */ + segsize = SizeOfGinPostingList(oldseg); + if (tailCopy) + { + Assert(writePtr + segsize < PageGetSpecialPointer(page)); + memcpy(writePtr, (Pointer) oldseg, segsize); + } + writePtr += segsize; + oldseg = GinNextPostingListSegment(oldseg); + segno++; + } + + /* + * ADDITEMS action is handled like REPLACE, but the new segment to + * replace the old one is reconstructed using the old segment from + * disk and the new items from the WAL record. + */ + if (a_action == GIN_SEGMENT_ADDITEMS) + { + int npacked; + + olditems = ginPostingListDecode(oldseg, &nolditems); + + newitems = ginMergeItemPointers(items, nitems, + olditems, nolditems, + &nnewitems); + Assert(nnewitems == nolditems + nitems); + + newseg = ginCompressPostingList(newitems, nnewitems, + BLCKSZ, &npacked); + Assert(npacked == nnewitems); + + newsegsize = SizeOfGinPostingList(newseg); + a_action = GIN_SEGMENT_REPLACE; + } + + segptr = (Pointer) oldseg; + if (segptr != segmentend) + segsize = SizeOfGinPostingList(oldseg); + else + { + /* + * Positioned after the last existing segment. Only INSERTs + * expected here. + */ + Assert(a_action == GIN_SEGMENT_INSERT); + segsize = 0; + } + + /* + * We're about to start modification of the page. So, copy tail of + * the page if it's not done already. 
+ */ + if (!tailCopy && segptr != segmentend) + { + int tailSize = segmentend - segptr; + + tailCopy = (Pointer) palloc(tailSize); + memcpy(tailCopy, segptr, tailSize); + segptr = tailCopy; + oldseg = (GinPostingList *) segptr; + segmentend = segptr + tailSize; + } + + switch (a_action) + { + case GIN_SEGMENT_DELETE: + segptr += segsize; + segno++; + break; + + case GIN_SEGMENT_INSERT: + /* copy the new segment in place */ + Assert(writePtr + newsegsize <= PageGetSpecialPointer(page)); + memcpy(writePtr, newseg, newsegsize); + writePtr += newsegsize; + break; + + case GIN_SEGMENT_REPLACE: + /* copy the new version of segment in place */ + Assert(writePtr + newsegsize <= PageGetSpecialPointer(page)); + memcpy(writePtr, newseg, newsegsize); + writePtr += newsegsize; + segptr += segsize; + segno++; + break; + + default: + elog(ERROR, "unexpected GIN leaf action: %u", a_action); + } + oldseg = (GinPostingList *) segptr; + } + + /* Copy the rest of unmodified segments if any. */ + segptr = (Pointer) oldseg; + if (segptr != segmentend && tailCopy) + { + int restSize = segmentend - segptr; + + Assert(writePtr + restSize <= PageGetSpecialPointer(page)); + memcpy(writePtr, segptr, restSize); + writePtr += restSize; + } + + totalsize = writePtr - (Pointer) GinDataLeafPageGetPostingList(page); + GinDataPageSetDataSize(page, totalsize); +} + /* + * Apply the payload of an insert record to a data (posting tree) page. + * + * buffer holds the page to modify. If isLeaf is true, rdata is read as a + * ginxlogRecompressDataLeaf and applied with ginRedoRecompress(). Otherwise + * rdata is read as a ginxlogInsertDataInternal: the posting item currently + * at data->offset is repointed to rightblkno, and data->newitem is then + * added at that same offset. + * + * This function does not set the page LSN or mark the buffer dirty; the + * caller visible in this file (ginRedoInsert) does both afterwards. + */ +static void +ginRedoInsertData(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rdata) +{ + Page page = BufferGetPage(buffer); + + if (isLeaf) + { + ginxlogRecompressDataLeaf *data = (ginxlogRecompressDataLeaf *) rdata; + + Assert(GinPageIsLeaf(page)); + + ginRedoRecompress(page, data); + } + else + { + ginxlogInsertDataInternal *data = (ginxlogInsertDataInternal *) rdata; + PostingItem *oldpitem; + + Assert(!GinPageIsLeaf(page)); + + /* update link to right page after split */ + oldpitem = GinDataPageGetPostingItem(page, data->offset); + PostingItemSetBlockNumber(oldpitem, rightblkno); + + GinDataPageAddPostingItem(page, 
&data->newitem, data->offset); + } +} + /* + * Replay an XLOG_GIN_INSERT record. + * + * The main record data starts with a ginxlogInsert header. For an insert + * into a non-leaf page, the header is followed by two BlockIdData values + * (left child, then right child); only the right child block number is + * used here, and ginRedoClearIncompleteSplit() is called for block 1. + * + * Block 0 is the page receiving the insertion. When it needs redo, its + * block data is handed to ginRedoInsertData() or ginRedoInsertEntry() + * depending on the GIN_INSERT_ISDATA flag, after which the page LSN is + * set and the buffer is marked dirty. + */ +static void +ginRedoInsert(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + ginxlogInsert *data = (ginxlogInsert *) XLogRecGetData(record); + Buffer buffer; +#ifdef NOT_USED + BlockNumber leftChildBlkno = InvalidBlockNumber; +#endif + BlockNumber rightChildBlkno = InvalidBlockNumber; + bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0; + + /* + * First clear incomplete-split flag on child page if this finishes a + * split. + */ + if (!isLeaf) + { + char *payload = XLogRecGetData(record) + sizeof(ginxlogInsert); + /* payload now points just past the header, at the left child block id */ +#ifdef NOT_USED + leftChildBlkno = BlockIdGetBlockNumber((BlockId) payload); +#endif + payload += sizeof(BlockIdData); + rightChildBlkno = BlockIdGetBlockNumber((BlockId) payload); + payload += sizeof(BlockIdData); + + ginRedoClearIncompleteSplit(record, 1); + } + /* block 0 is the page the new item goes into */ + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + Page page = BufferGetPage(buffer); + Size len; + char *payload = XLogRecGetBlockData(record, 0, &len); + + /* How to insert the payload is tree-type specific */ + if (data->flags & GIN_INSERT_ISDATA) + { + Assert(GinPageIsData(page)); + ginRedoInsertData(buffer, isLeaf, rightChildBlkno, payload); + } + else + { + Assert(!GinPageIsData(page)); + ginRedoInsertEntry(buffer, isLeaf, rightChildBlkno, payload); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + /* + * Replay an XLOG_GIN_SPLIT record. + * + * Nothing is reconstructed here: block 0 (left page), block 1 (right page) + * and, when GIN_SPLIT_ROOT is set, block 2 (root page) must each come back + * from XLogReadBufferForRedo() as BLK_RESTORED, otherwise an ERROR is + * raised. For a non-leaf split, ginRedoClearIncompleteSplit() is first + * called for block 3. + */ +static void +ginRedoSplit(XLogReaderState *record) +{ + ginxlogSplit *data = (ginxlogSplit *) XLogRecGetData(record); + Buffer lbuffer, + rbuffer, + rootbuf; + bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0; + bool isRoot = (data->flags & GIN_SPLIT_ROOT) != 0; + + /* + * First clear incomplete-split flag on child page if this finishes a + * split. + */ + if (!isLeaf) + ginRedoClearIncompleteSplit(record, 3); + + if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED) + elog(ERROR, "GIN split record did not contain a 
full-page image of left page"); + + if (XLogReadBufferForRedo(record, 1, &rbuffer) != BLK_RESTORED) + elog(ERROR, "GIN split record did not contain a full-page image of right page"); + + if (isRoot) + { + if (XLogReadBufferForRedo(record, 2, &rootbuf) != BLK_RESTORED) + elog(ERROR, "GIN split record did not contain a full-page image of root page"); + UnlockReleaseBuffer(rootbuf); + } + + UnlockReleaseBuffer(rbuffer); + UnlockReleaseBuffer(lbuffer); +} + +/* + * VACUUM_PAGE record contains simply a full image of the page, similar to + * an XLOG_FPI record. + * + * Replay therefore only reads block 0 and raises an ERROR unless the read + * reports BLK_RESTORED; the buffer is then unlocked and released. + */ +static void +ginRedoVacuumPage(XLogReaderState *record) +{ + Buffer buffer; + + if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED) + { + elog(ERROR, "replay of gin entry tree page vacuum did not restore the page"); + } + UnlockReleaseBuffer(buffer); +} + /* + * Replay an XLOG_GIN_VACUUM_DATA_LEAF_PAGE record. + * + * If block 0 needs redo, its block data is read as a + * ginxlogVacuumDataLeafPage and the recompression record embedded in it + * (xlrec->data) is applied to the page with ginRedoRecompress(). The page + * is expected to be a leaf data page (checked by assertions only). + */ +static void +ginRedoVacuumDataLeafPage(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buffer; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + Page page = BufferGetPage(buffer); + Size len; + ginxlogVacuumDataLeafPage *xlrec; + + xlrec = (ginxlogVacuumDataLeafPage *) XLogRecGetBlockData(record, 0, &len); + + Assert(GinPageIsLeaf(page)); + Assert(GinPageIsData(page)); + + ginRedoRecompress(page, &xlrec->data); + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + /* + * Replay an XLOG_GIN_DELETE_PAGE record. + * + * Three blocks are involved, read in this order: block 2 is the left + * sibling, whose rightlink is set to data->rightLink; block 0 is the page + * being deleted, which is flagged deleted and stamped with + * data->deleteXid; block 1 is the parent, from which the posting item at + * data->parentOffset is removed. Each change is applied only when the + * corresponding read reports BLK_NEEDS_REDO. All buffers stay held until + * the end of the function. + */ +static void +ginRedoDeletePage(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + ginxlogDeletePage *data = (ginxlogDeletePage *) XLogRecGetData(record); + Buffer dbuffer; + Buffer pbuffer; + Buffer lbuffer; + Page page; + + /* + * Lock left page first in order to prevent possible deadlock with + * ginStepRight(). 
+ */ + if (XLogReadBufferForRedo(record, 2, &lbuffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(lbuffer); + Assert(GinPageIsData(page)); + GinPageGetOpaque(page)->rightlink = data->rightLink; + PageSetLSN(page, lsn); + MarkBufferDirty(lbuffer); + } + /* block 0: the page being deleted */ + if (XLogReadBufferForRedo(record, 0, &dbuffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(dbuffer); + Assert(GinPageIsData(page)); + GinPageSetDeleted(page); + GinPageSetDeleteXid(page, data->deleteXid); + PageSetLSN(page, lsn); + MarkBufferDirty(dbuffer); + } + /* block 1: the parent page, which loses its downlink item */ + if (XLogReadBufferForRedo(record, 1, &pbuffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(pbuffer); + Assert(GinPageIsData(page)); + Assert(!GinPageIsLeaf(page)); + GinPageDeletePostingItem(page, data->parentOffset); + PageSetLSN(page, lsn); + MarkBufferDirty(pbuffer); + } + /* release each buffer that is valid, only after all three pages are processed */ + if (BufferIsValid(lbuffer)) + UnlockReleaseBuffer(lbuffer); + if (BufferIsValid(pbuffer)) + UnlockReleaseBuffer(pbuffer); + if (BufferIsValid(dbuffer)) + UnlockReleaseBuffer(dbuffer); +} + /* + * Replay an XLOG_GIN_UPDATE_META_PAGE record. + * + * Block 0 is the metapage: it is re-initialized and its meta area is + * overwritten with data->metadata. After that, one of two optional steps + * is applied to block 1: + * - if data->ntuples > 0, that many index tuples taken from the block 1 + * data are appended to the page and its opaque maxoff is incremented; + * - otherwise, if data->prevTail is a valid block number, the rightlink + * of the page is set to data->newRightlink. + * The metapage buffer is held until the end of the function. + */ +static void +ginRedoUpdateMetapage(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + ginxlogUpdateMeta *data = (ginxlogUpdateMeta *) XLogRecGetData(record); + Buffer metabuffer; + Page metapage; + Buffer buffer; + + /* + * Restore the metapage. This is essentially the same as a full-page + * image, so restore the metapage unconditionally without looking at the + * LSN, to avoid torn page hazards. 
+ */ + metabuffer = XLogInitBufferForRedo(record, 0); + Assert(BufferGetBlockNumber(metabuffer) == GIN_METAPAGE_BLKNO); + metapage = BufferGetPage(metabuffer); + + GinInitMetabuffer(metabuffer); + memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData)); + PageSetLSN(metapage, lsn); + MarkBufferDirty(metabuffer); + + if (data->ntuples > 0) + { + /* + * Insert into tail page: append the logged tuples to the page in + * block 1. + */ + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + { + Page page = BufferGetPage(buffer); + OffsetNumber off; + int i; + Size tupsize; + char *payload; + IndexTuple tuples; + Size totaltupsize; + + payload = XLogRecGetBlockData(record, 1, &totaltupsize); + tuples = (IndexTuple) payload; + /* start right after the last item already on the page */ + if (PageIsEmpty(page)) + off = FirstOffsetNumber; + else + off = OffsetNumberNext(PageGetMaxOffsetNumber(page)); + + for (i = 0; i < data->ntuples; i++) + { + tupsize = IndexTupleSize(tuples); + + if (PageAddItem(page, (Item) tuples, tupsize, off, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page"); + /* tuples are stored back to back; step to the next one */ + tuples = (IndexTuple) (((char *) tuples) + tupsize); + + off++; + } + Assert(payload + totaltupsize == (char *) tuples); + + /* + * Increase counter of heap tuples (by one per record, not per + * index tuple added above) + */ + GinPageGetOpaque(page)->maxoff++; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + } + else if (data->prevTail != InvalidBlockNumber) + { + /* + * New tail: no tuples to insert, only the rightlink of block 1 is + * updated. NOTE(review): block 1 is presumably the previous tail + * page (data->prevTail); confirm against the code writing the + * record. + */ + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + { + Page page = BufferGetPage(buffer); + + GinPageGetOpaque(page)->rightlink = data->newRightlink; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + } + + UnlockReleaseBuffer(metabuffer); +} + /* + * Replay an XLOG_GIN_INSERT_LISTPAGE record. + * + * Block 0 is always rebuilt from scratch as a GIN_LIST page. Its rightlink + * comes from the record; when there is no right sibling the page is + * flagged full-row and its opaque maxoff is set to 1, otherwise maxoff is + * set to 0. The data->ntuples index tuples stored in the block data are + * then added in order, starting at FirstOffsetNumber. + */ +static void +ginRedoInsertListPage(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + ginxlogInsertListPage *data = (ginxlogInsertListPage *) XLogRecGetData(record); + Buffer buffer; + Page 
page; + OffsetNumber l, + off = FirstOffsetNumber; + int i, + tupsize; + char *payload; + IndexTuple tuples; + Size totaltupsize; + + /* We always re-initialize the page. */ + buffer = XLogInitBufferForRedo(record, 0); + page = BufferGetPage(buffer); + + GinInitBuffer(buffer, GIN_LIST); + GinPageGetOpaque(page)->rightlink = data->rightlink; + if (data->rightlink == InvalidBlockNumber) + { + /* tail of sublist */ + GinPageSetFullRow(page); + GinPageGetOpaque(page)->maxoff = 1; + } + else + { + GinPageGetOpaque(page)->maxoff = 0; + } + /* the block data holds data->ntuples index tuples stored back to back */ + payload = XLogRecGetBlockData(record, 0, &totaltupsize); + + tuples = (IndexTuple) payload; + for (i = 0; i < data->ntuples; i++) + { + tupsize = IndexTupleSize(tuples); + + l = PageAddItem(page, (Item) tuples, tupsize, off, false, false); + + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page"); + /* step to the next tuple and the next offset on the page */ + tuples = (IndexTuple) (((char *) tuples) + tupsize); + off++; + } + Assert((char *) tuples == payload + totaltupsize); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + + UnlockReleaseBuffer(buffer); +} + /* + * Replay an XLOG_GIN_DELETE_LISTPAGE record. + * + * Block 0 is the metapage: it is re-initialized and its meta area is + * overwritten with data->metadata. Blocks 1 .. data->ndeleted are the + * deleted list pages; each one is re-initialized as a GIN_DELETED page, + * one at a time. The metapage buffer is held until all of them are done. + */ +static void +ginRedoDeleteListPages(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + ginxlogDeleteListPages *data = (ginxlogDeleteListPages *) XLogRecGetData(record); + Buffer metabuffer; + Page metapage; + int i; + + metabuffer = XLogInitBufferForRedo(record, 0); + Assert(BufferGetBlockNumber(metabuffer) == GIN_METAPAGE_BLKNO); + metapage = BufferGetPage(metabuffer); + + GinInitMetabuffer(metabuffer); + + memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData)); + PageSetLSN(metapage, lsn); + MarkBufferDirty(metabuffer); + + /* + * In normal operation, shiftList() takes exclusive lock on all the + * pages-to-be-deleted simultaneously. During replay, however, it should + * be all right to lock them one at a time. 
This is dependent on the fact + * that we are deleting pages from the head of the list, and that readers + * share-lock the next page before releasing the one they are on. So we + * cannot get past a reader that is on, or due to visit, any page we are + * going to delete. New incoming readers will block behind our metapage + * lock and then see a fully updated page list. + * + * No full-page images are taken of the deleted pages. Instead, they are + * re-initialized as empty, deleted pages. Their right-links don't need to + * be preserved, because no new readers can see the pages, as explained + * above. + */ + for (i = 0; i < data->ndeleted; i++) + { + Buffer buffer; + Page page; + /* block 0 is the metapage, so deleted page number i is block i + 1 */ + buffer = XLogInitBufferForRedo(record, i + 1); + page = BufferGetPage(buffer); + GinInitBuffer(buffer, GIN_DELETED); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + + UnlockReleaseBuffer(buffer); + } + UnlockReleaseBuffer(metabuffer); +} + /* + * Redo entry point for GIN WAL records. + * + * The record type is taken from the info bits of the record with + * XLR_INFO_MASK cleared, and the matching ginRedo routine is called. The + * routine runs with opCtx as the current memory context; afterwards the + * previous context is restored and opCtx is reset. An unrecognized record + * type raises a PANIC. + */ +void +gin_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + MemoryContext oldCtx; + + /* + * GIN indexes do not require any conflict processing. NB: If we ever + * implement a similar optimization as we have in b-tree, and remove + * killed tuples outside VACUUM, we'll need to handle that here. 
+ */ + /* run the redo routine inside opCtx; that context is reset once it returns */ + oldCtx = MemoryContextSwitchTo(opCtx); + switch (info) + { + case XLOG_GIN_CREATE_PTREE: + ginRedoCreatePTree(record); + break; + case XLOG_GIN_INSERT: + ginRedoInsert(record); + break; + case XLOG_GIN_SPLIT: + ginRedoSplit(record); + break; + case XLOG_GIN_VACUUM_PAGE: + ginRedoVacuumPage(record); + break; + case XLOG_GIN_VACUUM_DATA_LEAF_PAGE: + ginRedoVacuumDataLeafPage(record); + break; + case XLOG_GIN_DELETE_PAGE: + ginRedoDeletePage(record); + break; + case XLOG_GIN_UPDATE_META_PAGE: + ginRedoUpdateMetapage(record); + break; + case XLOG_GIN_INSERT_LISTPAGE: + ginRedoInsertListPage(record); + break; + case XLOG_GIN_DELETE_LISTPAGE: + ginRedoDeleteListPages(record); + break; + default: + elog(PANIC, "gin_redo: unknown op code %u", info); + } + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(opCtx); +} + /* + * Create opCtx, the temporary memory context that gin_redo switches into + * for each record. It is created with CurrentMemoryContext passed as the + * first argument of AllocSetContextCreate and with the default allocation + * sizes. + */ +void +gin_xlog_startup(void) +{ + opCtx = AllocSetContextCreate(CurrentMemoryContext, + "GIN recovery temporary context", + ALLOCSET_DEFAULT_SIZES); +} + /* + * Delete the opCtx memory context and clear the pointer to it. + */ +void +gin_xlog_cleanup(void) +{ + MemoryContextDelete(opCtx); + opCtx = NULL; +} + +/* + * Mask a GIN page before running consistency checks on it. + * + * pagedata is the page image, modified in place. The LSN, checksum and + * hint bits are always masked. The blkno argument is not used here. + */ +void +gin_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + PageHeader pagehdr = (PageHeader) page; + GinPageOpaque opaque; + + mask_page_lsn_and_checksum(page); + opaque = GinPageGetOpaque(page); + + mask_page_hint_bits(page); + + /* + * For a GIN_DELETED page, the page is initialized to empty. Hence, mask + * the whole page content. For other pages, mask the hole if pd_lower + * appears to have been set correctly. + */ + if (opaque->flags & GIN_DELETED) + mask_page_content(page); + else if (pagehdr->pd_lower > SizeOfPageHeaderData) + mask_unused_space(page); +} |